diff --git a/.gitignore b/.gitignore index 828bbe9bd3363853ae3f58f54a8d5f60cefad837..1ef4c297ee4f369775c13b32a46a55887de719e7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,10 @@ __pycache__ *.swp .vscode/ cmake_build/ +tensorflow/contrib/cmake/_build/ .idea/** /build/ +[Bb]uild/ /tensorflow/core/util/version_info.cc /tensorflow/python/framework/fast_tensor_util.cpp Pods @@ -28,6 +30,8 @@ Podfile.lock /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite xcuserdata/** /api_init_files_list.txt +/estimator_api_init_files_list.txt +*.whl # Android .gradle diff --git a/CODEOWNERS b/CODEOWNERS index b9f0313cc6d59d3fbdcd014e1a528126d863075a..1725a5c4715c0ab0afef7c0c09d232163254067e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,53 +1,62 @@ -# NOTE: Disabled temporarily because it's too noisy on pushes. # Where component owners are known, add them here. -# /tensorflow/core/platform/windows/ @mrry -# /tensorflow/java/ @asimshankar -# /tensorflow/tensorboard/ @jart @dandelionmane -# /tensorflow/tools/docs/ @markdaoust +/tensorflow/core/debug @caisq +/tensorflow/core/platform/windows/ @mrry +/tensorflow/go @asimshankar +/tensorflow/java/ @asimshankar +/tensorflow/python/debug @caisq +/tensorflow/python/tools/api/generator/ @annarev +/tensorflow/tensorboard/ @jart +/tensorflow/tools/docs/ @markdaoust # contrib -# NEED OWNER: /tensorflow/contrib/avro/ -# /tensorflow/contrib/batching/ @alextp @chrisolston -# /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon -# /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva -# /tensorflow/contrib/cmake/ @mrry @benoitsteiner -# /tensorflow/contrib/copy_graph/ @tucker @poxvoculi -# /tensorflow/contrib/crf/ @kentonl -# /tensorflow/contrib/data/ @mrry -# /tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi -# /tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo -# /tensorflow/contrib/ffmpeg/ @fredbertsch -# NEED OWNER: /tensorflow/contrib/framework/ -# 
/tensorflow/contrib/graph_editor/ @purpledog +# NEED OWNER: /tensorflow/contrib/all_reduce +/tensorflow/contrib/batching/ @alextp @chrisolston +/tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon +/tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva +/tensorflow/contrib/checkpoint/ @allenlavoie +/tensorflow/contrib/cluster_resolver/ @frankchn +/tensorflow/contrib/cmake/ @mrry +/tensorflow/contrib/copy_graph/ @tucker @poxvoculi +/tensorflow/contrib/crf/ @kentonl +/tensorflow/contrib/data/ @mrry +/tensorflow/contrib/distribute @joshl @priyag @sourabhbajaj @frankchn +/tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi +/tensorflow/contrib/eager @alextp @asimshankar +/tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo +/tensorflow/contrib/ffmpeg/ @fredbertsch +/tensorflow/contrib/framework/ @ebrevdo +/tensorflow/contrib/gan/ @joel-shor +/tensorflow/contrib/graph_editor/ @purpledog # NEED OWNER: /tensorflow/contrib/grid_rnn/ -# /tensorflow/contrib/hvx/ @satok16 -# /tensorflow/contrib/integrate/ @shoyer -# /tensorflow/contrib/kernel_methods/ @petrosmol -# /tensorflow/contrib/ios_examples/ @petewarden -# /tensorflow/contrib/labeled_tensor/ @shoyer -# /tensorflow/contrib/layers/ @fchollet @martinwicke -# /tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp -# /tensorflow/contrib/linalg/ @langmore -# /tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis -# /tensorflow/contrib/lookup/ @ysuematsu @andreasst -# /tensorflow/contrib/losses/ @alextp @ispirmustafa -# /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg -# /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa -# /tensorflow/contrib/nccl/ @cwhipkey @zheng-xq -# /tensorflow/contrib/opt/ @strategist333 -# /tensorflow/contrib/pi_examples/ @maciekcc -# /tensorflow/contrib/quantization/ @petewarden @cwhipkey @keveman -# /tensorflow/contrib/rnn/ @ebrevdo -# /tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh 
-# /tensorflow/contrib/seq2seq/ @lukaszkaiser -# /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh -# /tensorflow/contrib/slim/ @sguada @thenbasilmanran -# /tensorflow/contrib/stateless/ @girving -# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank -# /tensorflow/contrib/testing/ @dandelionmane -# /tensorflow/contrib/timeseries/ @allenlavoie -# /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu -# /tensorflow/contrib/training/ @joel-shor @ebrevdo -# /tensorflow/contrib/util/ @sherrym +/tensorflow/contrib/hvx/ @satok16 +/tensorflow/contrib/integrate/ @shoyer +/tensorflow/contrib/kernel_methods/ @petrosmol +/tensorflow/contrib/ios_examples/ @petewarden +/tensorflow/contrib/labeled_tensor/ @shoyer +/tensorflow/contrib/layers/ @fchollet @martinwicke +/tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp +/tensorflow/contrib/linalg/ @langmore +/tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis +/tensorflow/contrib/lookup/ @ysuematsu @andreasst +/tensorflow/contrib/losses/ @alextp @ispirmustafa +/tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg +/tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa +/tensorflow/contrib/nccl/ @cwhipkey @zheng-xq +/tensorflow/contrib/opt/ @strategist333 @alextp +/tensorflow/contrib/pi_examples/ @maciekcc +/tensorflow/contrib/quantization/ @petewarden +/tensorflow/contrib/rnn/ @ebrevdo @scottzhu +/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenl +/tensorflow/contrib/seq2seq/ @ebrevdo @lmthang +/tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh +/tensorflow/contrib/slim/ @sguada @thenbasilmanran +/tensorflow/contrib/stateless/ @girving @alextp +/tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank +/tensorflow/contrib/tensorrt/ @aaroey +# NEED OWNER: /tensorflow/contrib/testing/ +/tensorflow/contrib/timeseries/ @allenlavoie +/tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj +/tensorflow/contrib/training/ 
@joel-shor @ebrevdo +/tensorflow/contrib/util/ @sherrym diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c25c452b53da48239bc20c9a2d3528e75422..f598999f351c10f8bd01dfbd3ad8897f19d570e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy @@ -107,7 +107,7 @@ diff /tmp/my_cc_file.cc #### Python coding style Changes to TensorFlow Python code should conform to -[Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) +[Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md) Use `pylint` to check your Python changes. To install `pylint` and retrieve TensorFlow's custom style definition: diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 2f3df7cda9cec29ed0c2266629022f0a22b37df9..52faed9297cfcaf8c93bb9c79686c9258a53c560 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -15,9 +15,10 @@ If you open a GitHub issue, here is our policy: ### System information - **Have I written custom code (as opposed to using a stock example script provided in TensorFlow)**: - **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: +- **Mobile device (e.g. 
iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device**: - **TensorFlow installed from (source or binary)**: - **TensorFlow version (use command below)**: -- **Python version**: +- **Python version**: - **Bazel version (if compiling from source)**: - **GCC/Compiler version (if compiling from source)**: - **CUDA/cuDNN version**: diff --git a/README.md b/README.md index 6fb4486d0de9ff476b5cf1dbd63d66879637df84..e3092e551e32d7f01e9bebd65323d1b5691f0269 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ data flow graphs. The graph nodes represent mathematical operations, while the graph edges represent the multidimensional data arrays (tensors) that flow between them. This flexible architecture enables you to deploy computation to one or more CPUs or GPUs in a desktop, server, or mobile device without rewriting -code. TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit. +code. TensorFlow also includes [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard), a data visualization toolkit. TensorFlow was originally developed by researchers and engineers working on the Google Brain team within Google's Machine Intelligence Research @@ -22,6 +22,8 @@ organization for the purposes of conducting machine learning and deep neural networks research. The system is general enough to be applicable in a wide variety of other domains, as well. +TensorFlow provides stable Python and C APIs, as well as APIs without a backwards compatibility guarantee for other languages such as C++, Go, Java, JavaScript and Swift. + Keep up to date with release announcements and security updates by subscribing to [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce). 
@@ -56,6 +58,7 @@ $ python 42 >>> sess.close() ``` +Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/). ## Contribution guidelines @@ -80,13 +83,15 @@ The TensorFlow project strives to abide by generally accepted best practices in | Build Type | Status | Artifacts | | --- | --- | --- | -| **Linux CPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Linux GPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | -| **Linux XLA** | TBA | TBA | -| **MacOS** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Windows CPU** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Windows GPU** | [![Status](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/badge/icon)](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | -| **Android** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) [build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) | +| **Linux CPU** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA | +| **MacOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) | +| **Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.html) 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv6l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) | +| **Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv7l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) | ### Community Supported Builds @@ -95,16 +100,21 @@ The TensorFlow project strives to abide by generally accepted best practices in | --- | --- | --- | | **IBM s390x** | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA | | **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA | +| **IBM ppc64le GPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/) | TBA | +| **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | +| **Linux CPU with Intel® MKL-DNN** Python 2.7
**Linux CPU with Intel® MKL-DNN** Python 3.5
**Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild)|[1.10.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp27-cp27mu-linux_x86_64.whl)
[1.10.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp35-cp35m-linux_x86_64.whl)
[1.10.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp36-cp36m-linux_x86_64.whl) | ## For more information - +* [TensorFlow Blog](https://medium.com/tensorflow) +* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) +* [TensorFlow Model Zoo](https://github.com/tensorflow/models) +* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) +* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) +* [TensorFlow Twitter](https://twitter.com/tensorflow) * [TensorFlow Website](https://www.tensorflow.org) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -* [TensorFlow Model Zoo](https://github.com/tensorflow/models) -* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) -* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate. diff --git a/RELEASE.md b/RELEASE.md index 84d9d52868ecd55d38d6073315749d11c2340e8c..763ef3b279dde209ed387534032deae40a33a9e4 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,149 @@ +# Release 1.10.0 + +## Major Features And Improvements + +* The `tf.lite` runtime now supports `complex64`. +* Initial [Google Cloud Bigtable integration](https://github.com/tensorflow/tensorflow/tree/r1.10/tensorflow/contrib/bigtable) for `tf.data`. +* Improved local run behavior in `tf.estimator.train_and_evaluate` which does not reload checkpoints for evaluation. +* `RunConfig` now sets device_filters to restrict how workers and PS can communicate. This can speed up training and ensure clean shutdowns in some situations. But if you have jobs that require communication between workers, you will have to set custom session_options in your `RunConfig`. 
+* Moved Distributions and Bijectors from `tf.contrib.distributions` to [Tensorflow Probability (TFP)](https://github.com/tensorflow/probability). `tf.contrib.distributions` is now deprecated and will be removed by the end of 2018. +* Adding new endpoints for existing tensorflow symbols. These endpoints are going to be the preferred endpoints going forward and may replace some of the existing endpoints in the future. See below for the complete list. New symbols have been added to the following modules: [`tf.debugging`](https://www.tensorflow.org/versions/master/api_docs/python/tf/debugging), [`tf.dtypes`](https://www.tensorflow.org/versions/master/api_docs/python/tf/dtypes), [`tf.image`](https://www.tensorflow.org/versions/master/api_docs/python/tf/image), [`tf.io`](https://www.tensorflow.org/versions/master/api_docs/python/tf/io), [`tf.linalg`](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg), [`tf.manip`](https://www.tensorflow.org/versions/master/api_docs/python/tf/manip), [`tf.math`](https://www.tensorflow.org/versions/master/api_docs/python/tf/math), [`tf.quantization`](https://www.tensorflow.org/versions/master/api_docs/python/tf/quantization), [`tf.strings`](https://www.tensorflow.org/versions/master/api_docs/python/tf/strings) + +## Breaking Changes + +* Prebuilt binaries are now (as of TensorFlow 1.10) built against NCCL 2.2 and no longer include NCCL in the binary install. TensorFlow usage with multiple GPUs and NCCL requires upgrade to [NCCL 2.2](https://developer.nvidia.com/nccl). See updated install guides: [Installing TensorFlow on Ubuntu](https://www.tensorflow.org/install/install_linux#tensorflow_gpu_support) and [Install TensorFlow from Sources](https://www.tensorflow.org/install/install_sources#optional_install_tensorflow_for_gpu_prerequisites). +* Starting from TensorFlow 1.11, Windows builds will use Bazel. Therefore, we will drop official support for cmake. 
+ +## Bug Fixes and Other Changes + +* `tf.data`: + * `tf.contrib.data.group_by_reducer()` is now available via the public API. + * `tf.contrib.data.choose_from_datasets()` is now available via the public API. + * Adding `drop_remainder` argument to `tf.data.Dataset.batch()` and `tf.data.Dataset.padded_batch()`, deprecating `tf.contrib.data.batch_and_drop_remainder()` and `tf.contrib.data.padded_batch_and_drop_remainder()`. +* `tf.estimator`: + * `Estimator`s now use custom savers included in `EstimatorSpec` scaffolds for saving SavedModels during export. + * `EstimatorSpec` will now add a default prediction output for export if no `export_output` is provided, eliminating the need to explicitly include a `PredictOutput` object in the `model_fn` for simple use-cases. + * Support sparse_combiner in canned Linear Estimators. + * Added batch normalization to `DNNClassifier`, `DNNRegressor`, and `DNNEstimator`. + * Adding ranking support for boosted trees. + * Adding center bias option for boosted trees. +* Add `synchronization` and `aggregation` args to get_variable(). These args will be used for distributed variables. +* Add `synchronization` and `aggregation` args to the layer `add_weight()` API. These args will be used for distributed variables. +* `tf.losses.*` do not add to the global collection when executing eagerly (to avoid leaking memory). +* Support different summary and checkpoint directories in `tf.train.MonitoredTrainingSession()`. +* Added IndRNN, IndyGRU, and IndyLSTM cells to `tf.contrib.rnn`. +* Add safe static factory functions for SparseTensor and convert all CHECKs to DCHECKs. Using the constructor directly is unsafe and deprecated. +* Make the Bigtable client connection pool configurable & increase the default # of connections for performance. +* Added derivative of `tf.random_gamma` with respect to the alpha parameter. +* Added derivative of `tf.igamma(a, x)` and `tf.igammac(a, x)` with respect to a. 
+* Modified Bessel functions of order zero and one. +* Add FillTriangular Bijector to create triangular matrices. +* Added support for Type III DCT, and `tf.spectral.idct(type=2|3)`. +* Correctly handle CuDNN RNN weight loading when nested in `TimeDistributed`. +* Adding per-element weight support for `WALSComputePartialLhsAndRhsOp`. +* ZerosLike and OnesLike ops treated as constants by Graph Transform Tool. +* Gamma distribution and the derived distributions (Beta, Dirichlet, Student's t, inverse Gamma) now fully reparameterized. +* Java: Experimental wrapper classes to make graph generation easier. Thanks @karllessard and @kbsriram +* Build & link in secure gRPC components (switch from the insecure grpc dependency to secure grpc dependency). +* Adding new endpoints for existing tensorflow symbols. These endpoints are going to be the preferred endpoints going forward and may replace some of the existing endpoints in the future. List of new endpoints: + * New endpoints in `tf.image` namespace: `tf.image.extract_image_patches` + * New endpoints in `tf.debugging` namespace: `tf.debugging.check_numerics`, `tf.debugging.is_finite`, `tf.debugging.is_inf`, `tf.debugging.is_nan`. + * New endpoints in `tf.dtypes` namespace: `tf.dtypes.as_string`. + * New endpoints in `tf.io` namespace: `tf.io.decode_base64`, `tf.io.decode_compressed`, `tf.io.decode_json_example`, `tf.io.decode_raw`, `tf.io.encode_base64`, `tf.io.matching_files`, `tf.io.parse_tensor`, `tf.io.read_file`, `tf.io.write_file`. + * New endpoints in tf.linalg namespace: `tf.linalg.cross`, `tf.linalg.tensor_diag` (corresponds to `tf.diag`), `tf.linalg.tensor_diag_part` (corresponds to `tf.diag_part`). 
+ * New endpoints in tf.manip namespace: `tf.manip.batch_to_space_nd`, `tf.manip.gather_nd`, `tf.manip.reshape`, `tf.manip.reverse`, `tf.manip.scatter_nd`, `tf.manip.space_to_batch_nd`, `tf.manip.tile` + * New endpoints in tf.math namespace: `tf.math.acos`, `tf.math.acosh`, `tf.math.add`, `tf.math.asin`, `tf.math.asinh`, `tf.math.atan`, `tf.math.atan2`, `tf.math.atanh`, `tf.math.betainc`, `tf.math.ceil`, `tf.math.cos`, `tf.math.cosh`, `tf.math.digamma`, `tf.math.equal`, `tf.math.erfc`, `tf.math.exp`, `tf.math.expm1`, `tf.math.floor`, `tf.math.greater`, `tf.math.greater_equal`, `tf.math.igamma`, `tf.math.igammac`, `tf.math.invert_permutation`, `tf.math.less`, `tf.math.less_equal`, `tf.math.lgamma`, `tf.math.log`, `tf.math.log1p`, `tf.math.logical_and`, `tf.math.logical_not`, `tf.math.logical_or`, `tf.math.maximum`, `tf.math.minimum`, `tf.math.not_equal`, `tf.math.polygamma`, `tf.math.reciprocal`, `tf.math.rint`, `tf.math.rsqrt`, `tf.math.segment_max`, `tf.math.segment_mean`, `tf.math.segment_min`, `tf.math.segment_prod`, `tf.math.segment_sum`, `tf.math.sin`, `tf.math.sinh`, `tf.math.softplus`, `tf.math.softsign`, `tf.math.squared_difference`, `tf.math.tan`, `tf.math.unsorted_segment_max`, `tf.math.unsorted_segment_min`, `tf.math.unsorted_segment_prod`, `tf.math.unsorted_segment_sum`, `tf.math.zeta`. + * New endpoints in `tf.quantization` namespace: `tf.quantization.dequantize`, `tf.quantization.fake_quant_with_min_max_args`, `tf.quantization.fake_quant_with_min_max_args_gradient`, `tf.quantization.fake_quant_with_min_max_vars`, `tf.quantization.fake_quant_with_min_max_vars_gradient`, `tf.quantization.fake_quant_with_min_max_vars_per_channel`, `tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient`. 
+ * New endpoints in tf.strings namespace: `tf.strings.join` (corresponds to `tf.string_join`), `tf.strings.regex_replace`, `tf.strings.to_number` (corresponds to `tf.string_to_number`), `tf.strings.strip` (corresponds to `tf.string_strip`), `tf.strings.substr`, `tf.strings.to_hash_bucket` (corresponds to `tf.string_to_hash_bucket`), `tf.strings.to_hash_bucket_fast` (corresponds to `tf.string_to_hash_bucket_fast`), `tf.strings.to_hash_bucket_strong` (corresponds to `tf.string_to_hash_bucket_strong`). + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, Andrei Nigmatulin, Andrew Ginns, BjøRn Moholt, Brett Koonce, Chengzhi Chen, Chinmay Das, Christian Ertler, Christoph Boeddeker, Clayne Robison, Courtial Florian, ctiijima, Dan Douthit, Dan J, Dan Ringwalt, EFanZh, Emanuele Ballarin, eqy, Evgeniy Zheltonozhskiy, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, G K, gracehoney, Guillaume Klein, Guozhong Zhuang, Hsien-Yang Li, hsm207, ImSheridan, Jayaram Bobba, Jiandong Ruan, Jie, Joel Shor, Jonas Rauber, Jongmin Baek, jsawruk, Karan Kaw, Karl Lessard, karl@kubx.ca, Kb Sriram, KinmanLam, leiiwang, Li, Yiqiang, Loo Rong Jie, Mahmoud Abuzaina, Mahmoud Aslan, ManHyuk, Martin Patz, Martin Zeitler, mktozk, Mohammad Ashraf Bhuiyan, mrTsjolder, Naman Bhalla, Nick Felt, Nicolas Lopez, Niranjan Hasabnis, Nishidha Panpaliya, Nitish, nrstott, Nutti, Parag Jain, PeterLee, Philipp Jund, Rach L, Rafal Wojdyla, Roland Zimmermann, Sergei Lebedev, SneakyFish5, Soila Kavulya, Sriram Veturi, Steven Schmatz, Taehoon Lee, Tang, Wenyi, Taras Sereda, Ted Chang, Tim Zaman, Tristan Rice, tucan, vchigrin, Vikram Tiwari, Vincent, WeberXie, William D. 
Irons, Yan Facai (颜发才), Yong Tang, Yu Yi, Yuxin Wu, Zé ViníCius + +# Release 1.9.0 + +## Major Features And Improvements +* Updated docs for `tf.keras`: New Keras-based [get started](http://tensorflow.org/versions/r1.9/get_started), + and [programmers guide page](http://tensorflow.org/versions/r1.9/programmers_guide/keras). +* Update `tf.keras` to the Keras 2.1.6 API. +* Added [`tf.keras.layers.CuDNNGRU`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNGRU) and [`tf.keras.layers.CuDNNLSTM`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNLSTM) layers. [Try it](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb?linkId=53292082). +* Adding support of core [feature columns](https://www.tensorflow.org/get_started/feature_columns) and [losses](https://www.tensorflow.org/api_docs/python/tf/losses) to [gradient boosted trees estimators](https://github.com/tensorflow/models/tree/master/official/boosted_trees). +* The [python interface](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/lite) + for the [TFLite Optimizing Converter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/toco/README.md) + has been expanded, and the command line interface (AKA: `toco`, `tflite_convert`) is once again + included in the standard `pip` installation. 
+* Improved data-loading and text processing with: + * [`tf.decode_compressed`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/decode_compressed) + * [`tf.string_strip`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/string_strip) + * [`tf.strings.regex_full_match`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/strings/regex_full_match) +* Added experimental support for new pre-made Estimators: + * [`tf.contrib.estimator.BaselineEstimator`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/estimator/BaselineEstimator) + * [`tf.contrib.estimator.RNNClassifier`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/estimator/RNNClassifier) + * [`tf.contrib.estimator.RNNEstimator`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/estimator/RNNEstimator) +* The [distributions.Bijector](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/distributions/bijectors/Bijector) + API supports broadcasting for Bijectors with new API changes. + +## Breaking Changes + * If you're opening empty variable scopes, replace `variable_scope('', ...)` by + `variable_scope(tf.get_variable_scope(), ...)`. + * Headers used for building custom ops have been moved from site-packages/external into site-packages/tensorflow/include/external. + +## Bug Fixes and Other Changes + +* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`. +* Layered variable names have changed in the following conditions: + * Using `tf.keras.layers` with custom variable scopes. + * Using `tf.layers` in a subclassed `tf.keras.Model` class. See + [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details +* `tf.data`: + * `Dataset.from_generator()` now accepts an `args` list, in order to create nested generators. + * `Dataset.list_files()` now produces deterministic results when `shuffle=False` or a `seed` is passed. 
+ * `tf.contrib.data.sample_from_datasets()` and `tf.contrib.data.choose_from_datasets()` make it easier to sample or deterministically choose elements from multiple datasets. + * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings, and two infrequently used arguments removed. + * (C++) `DatasetBase::DebugString()` is now `const`. + * (C++) `DatasetBase::MakeIterator()` has been renamed to `DatasetBase::MakeIteratorInternal()`. + * (C++) `IteratorBase::Initialize()` method was added to support raising errors during iterator construction. +* Eager Execution: + * Added the ability to pause recording operations for gradient computation via `tf.GradientTape.stop_recording`. + * Updated documentation, introductory notebooks. +* `tf.keras`: + * Move Keras code out of _impl folder and remove API files. + * `tf.keras.Model.save_weights` now saves in TensorFlow format by default. + * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods. +* TensorFlow Debugger (tfdbg) CLI: fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB). +* `tf.contrib`: + * `tf.contrib.framework.zero_initializer` supports ResourceVariable. + * Adding "constrained_optimization" to tensorflow/contrib. +* Other: + * Add GCS Configuration Ops. + * Changing signature of `MakeIterator` to enable propagating error status. + * KL divergence for two Dirichlet distributions. + * More consistent GcsFileSystem behavior for certain reads past EOF. + * Update benchmark for tf.scan to match ranges across eager and graph modes. + * Fixed bug in `tf.reduce_prod gradient` for complex dtypes. + * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr). 
To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)"). + * Benchmark for tf.scan in graph and eager modes. + * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D. + * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch. + * Support indicator column in boosted trees. + * Prevent `tf.gradients()` from backpropagating through integer tensors. + * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`. + * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary. + * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints. + * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product. + * Allow LinearOperator to broadcast. + * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other. + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. 
Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, "Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang + # Release 1.8.0 ## Major Features And Improvements @@ -406,15 +552,7 @@ answered questions, and were part of inspiring discussions. ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API.
-* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of +* [`tf.data`](http://tensorflow.org/guide/datasets) is now part of the core TensorFlow API. * The API is now subject to backwards compatibility guarantees. * For a guide to migrating from the `tf.contrib.data` API, see the @@ -434,7 +572,7 @@ answered questions, and were part of inspiring discussions. * TensorFlow Debugger (tfdbg): * Add `eval` command to allow evaluation of arbitrary Python/numpy expressions in tfdbg command-line interface. See - [Debugging TensorFlow Programs](https://www.tensorflow.org/programmers_guide/debugger) + [Debugging TensorFlow Programs](https://www.tensorflow.org/guide/debugger) for more details. * Usability improvement: The frequently used tensor filter `has_inf_or_nan` is now added to `Session` wrappers and hooks by default. So there is no need @@ -721,7 +859,7 @@ answered questions, and were part of inspiring discussions. * Support client-provided ClusterSpec's and propagate them to all workers to enable the creation of dynamic TensorFlow clusters. * TensorFlow C library now available for Windows. * We released a new open-source version of TensorBoard. -* [`SavedModel CLI`](https://www.tensorflow.org/versions/master/programmers_guide/saved_model_cli) tool available to inspect and execute MetaGraph in SavedModel +* [`SavedModel CLI`](https://www.tensorflow.org/versions/master/guide/saved_model_cli) tool available to inspect and execute MetaGraph in SavedModel * Android releases of TensorFlow are now pushed to jcenter for easier integration into apps. 
See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/android/README.md diff --git a/SECURITY.md b/SECURITY.md index 0a4be37cbc20665bf8be68616496d35c8b6d7fb7..0b52fdc7ab84b7bd5bce5d247ede81b40699005c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -242,12 +242,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc= -----END PGP PUBLIC KEY BLOCK----- ``` -### Known vulnerabilities - -| Type | Versions affected | Reported by | Additional Information | -|--------------------|:-----------------:|-----------------------|-----------------------------| -| TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-003.md) | -| GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-002.md) | -| BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-001.md) | -| Out Of Bounds Read | <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) | +### Known Vulnerabilities +For a list of known vulnerabilities and security advisories for TensorFlow, +[click here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/index.md). diff --git a/WORKSPACE b/WORKSPACE index 4ddfb9a3832ea1ea639ace887e1d601bdd857086..17961829a605c2d1f2d2ba86a7c30c47618c139b 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -18,30 +18,14 @@ closure_repositories() # files, in case the parsing of those build files depends on the bazel # version we require here. 
load("//tensorflow:version_check.bzl", "check_bazel_version_at_least") -check_bazel_version_at_least("0.10.0") +check_bazel_version_at_least("0.15.0") load("//tensorflow:workspace.bzl", "tf_workspace") -# Uncomment and update the paths in these entries to build the Android demo. -#android_sdk_repository( -# name = "androidsdk", -# api_level = 23, -# # Ensure that you have the build_tools_version below installed in the -# # SDK manager as it updates periodically. -# build_tools_version = "26.0.1", -# # Replace with path to Android SDK on your system -# path = "", -#) -# -#android_ndk_repository( -# name="androidndk", -# path="", -# # This needs to be 14 or higher to compile TensorFlow. -# # Please specify API level to >= 21 to build for 64-bit -# # archtectures or the Android NDK will automatically select biggest -# # API level that it supports without notice. -# # Note that the NDK version is not the API level. -# api_level=14) +load("//third_party/android:android_configure.bzl", "android_configure") +android_configure(name="local_config_android") +load("@local_config_android//:android.bzl", "android_workspace") +android_workspace() # Please add all new TensorFlow dependencies in workspace.bzl. 
tf_workspace() diff --git a/configure.py b/configure.py index b6c32543cf707983d48e390cc89abf13dafd55d3..361bd4764dc5c1900be7378f51c00aedf6f2ce41 100644 --- a/configure.py +++ b/configure.py @@ -35,8 +35,8 @@ except ImportError: _DEFAULT_CUDA_VERSION = '9.0' _DEFAULT_CUDNN_VERSION = '7' -_DEFAULT_NCCL_VERSION = '1.3' -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2' +_DEFAULT_NCCL_VERSION = '2.2' +_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0' _DEFAULT_CUDA_PATH = '/usr/local/cuda' _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda' _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing ' @@ -45,7 +45,7 @@ _DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/%s-linux-gnu' % platform.machine() _TF_OPENCL_VERSION = '1.2' _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include' -_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15] +_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -670,8 +670,9 @@ def create_android_ndk_rule(environ_cp): error_msg=('The path %s or its child file "source.properties" ' 'does not exist.') ) - - write_android_ndk_workspace_rule(android_ndk_home_path) + write_action_env_to_bazelrc('ANDROID_NDK_HOME', android_ndk_home_path) + write_action_env_to_bazelrc('ANDROID_NDK_API_LEVEL', + check_ndk_level(android_ndk_home_path)) def create_android_sdk_rule(environ_cp): @@ -679,7 +680,7 @@ def create_android_sdk_rule(environ_cp): if is_windows() or is_cygwin(): default_sdk_path = cygpath('%s/Android/Sdk' % environ_cp['APPDATA']) elif is_macos(): - default_sdk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME'] + default_sdk_path = '%s/library/Android/Sdk' % environ_cp['HOME'] else: default_sdk_path = '%s/Android/Sdk' % environ_cp['HOME'] @@ -733,41 +734,12 @@ def create_android_sdk_rule(environ_cp): error_msg=('The selected SDK does not have build-tools version %s ' 'available.')) - write_android_sdk_workspace_rule(android_sdk_home_path, - 
android_build_tools_version, - android_api_level) - - -def write_android_sdk_workspace_rule(android_sdk_home_path, - android_build_tools_version, - android_api_level): - print('Writing android_sdk_workspace rule.\n') - with open(_TF_WORKSPACE, 'a') as f: - f.write(""" -android_sdk_repository( - name="androidsdk", - api_level=%s, - path="%s", - build_tools_version="%s")\n -""" % (android_api_level, android_sdk_home_path, android_build_tools_version)) - - -def write_android_ndk_workspace_rule(android_ndk_home_path): - print('Writing android_ndk_workspace rule.') - ndk_api_level = check_ndk_level(android_ndk_home_path) - if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS: - print('WARNING: The API level of the NDK in %s is %s, which is not ' - 'supported by Bazel (officially supported versions: %s). Please use ' - 'another version. Compiling Android targets may result in confusing ' - 'errors.\n' % (android_ndk_home_path, ndk_api_level, - _SUPPORTED_ANDROID_NDK_VERSIONS)) - with open(_TF_WORKSPACE, 'a') as f: - f.write(""" -android_ndk_repository( - name="androidndk", - path="%s", - api_level=%s)\n -""" % (android_ndk_home_path, ndk_api_level)) + write_action_env_to_bazelrc('ANDROID_BUILD_TOOLS_VERSION', + android_build_tools_version) + write_action_env_to_bazelrc('ANDROID_SDK_API_LEVEL', + android_api_level) + write_action_env_to_bazelrc('ANDROID_SDK_HOME', + android_sdk_home_path) def check_ndk_level(android_ndk_home_path): @@ -780,18 +752,16 @@ def check_ndk_level(android_ndk_home_path): revision = re.search(r'Pkg.Revision = (\d+)', filedata) if revision: - return revision.group(1) - return None - - -def workspace_has_any_android_rule(): - """Check the WORKSPACE for existing android_*_repository rules.""" - with open(_TF_WORKSPACE, 'r') as f: - workspace = f.read() - has_any_rule = re.search(r'^android_[ns]dk_repository', - workspace, - re.MULTILINE) - return has_any_rule + ndk_api_level = revision.group(1) + else: + raise Exception('Unable to parse NDK 
revision.') + if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS: + print('WARNING: The API level of the NDK in %s is %s, which is not ' + 'supported by Bazel (officially supported versions: %s). Please use ' + 'another version. Compiling Android targets may result in confusing ' + 'errors.\n' % (android_ndk_home_path, ndk_api_level, + _SUPPORTED_ANDROID_NDK_VERSIONS)) + return ndk_api_level def set_gcc_host_compiler_path(environ_cp): @@ -865,16 +835,19 @@ def set_tf_cuda_version(environ_cp): '[Default is %s]: ') % (tf_cuda_version, default_cuda_path) cuda_toolkit_path = get_from_env_or_user_or_default( environ_cp, 'CUDA_TOOLKIT_PATH', ask_cuda_path, default_cuda_path) + if is_windows() or is_cygwin(): + cuda_toolkit_path = cygpath(cuda_toolkit_path) if is_windows(): - cuda_rt_lib_path = 'lib/x64/cudart.lib' + cuda_rt_lib_paths = ['lib/x64/cudart.lib'] elif is_linux(): - cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version + cuda_rt_lib_paths = ['%s/libcudart.so.%s' % (x, tf_cuda_version) + for x in ['lib64', 'lib/x86_64-linux-gnu']] elif is_macos(): - cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version + cuda_rt_lib_paths = ['lib/libcudart.%s.dylib' % tf_cuda_version] - cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path) - if os.path.exists(cuda_toolkit_path_full): + cuda_toolkit_paths_full = [os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths] + if any([os.path.exists(x) for x in cuda_toolkit_paths_full]): break # Reset and retry @@ -910,7 +883,7 @@ def set_tf_cudnn_version(environ_cp): default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH') ask_cudnn_path = (r'Please specify the location where cuDNN %s library is ' 'installed. Refer to README.md for more details. 
[Default' - ' is %s]:') % (tf_cudnn_version, default_cudnn_path) + ' is %s]: ') % (tf_cudnn_version, default_cudnn_path) cudnn_install_path = get_from_env_or_user_or_default( environ_cp, 'CUDNN_INSTALL_PATH', ask_cudnn_path, default_cudnn_path) @@ -973,6 +946,35 @@ def set_tf_cudnn_version(environ_cp): write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) +def is_cuda_compatible(lib, cuda_ver, cudnn_ver): + """Check compatibility between given library and cudnn/cudart libraries.""" + ldd_bin = which('ldd') or '/usr/bin/ldd' + ldd_out = run_shell([ldd_bin, lib], True) + ldd_out = ldd_out.split(os.linesep) + cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') + cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') + cudnn = None + cudart = None + cudnn_ok = True # assume no cudnn dependency by default + cuda_ok = True # assume no cuda dependency by default + for line in ldd_out: + if 'libcudnn.so' in line: + cudnn = cudnn_pattern.search(line) + cudnn_ok = False + elif 'libcudart.so' in line: + cudart = cuda_pattern.search(line) + cuda_ok = False + if cudnn and len(cudnn.group(1)): + cudnn = convert_version_to_int(cudnn.group(1)) + if cudart and len(cudart.group(1)): + cudart = convert_version_to_int(cudart.group(1)) + if cudnn is not None: + cudnn_ok = (cudnn == cudnn_ver) + if cudart is not None: + cuda_ok = (cudart == cuda_ver) + return cudnn_ok and cuda_ok + + def set_tf_tensorrt_install_path(environ_cp): """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION. @@ -989,8 +991,8 @@ def set_tf_tensorrt_install_path(environ_cp): raise ValueError('Currently TensorRT is only supported on Linux platform.') # Ask user whether to add TensorRT support. 
- if str(int(get_var( - environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1': + if str(int(get_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', + False))) != '1': return for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): @@ -1003,47 +1005,29 @@ def set_tf_tensorrt_install_path(environ_cp): # Result returned from "read" will be used unexpanded. That make "~" # unusable. Going through one more level of expansion to handle that. - trt_install_path = os.path.realpath( - os.path.expanduser(trt_install_path)) + trt_install_path = os.path.realpath(os.path.expanduser(trt_install_path)) def find_libs(search_path): """Search for libnvinfer.so in "search_path".""" fl = set() if os.path.exists(search_path) and os.path.isdir(search_path): - fl.update([os.path.realpath(os.path.join(search_path, x)) - for x in os.listdir(search_path) if 'libnvinfer.so' in x]) + fl.update([ + os.path.realpath(os.path.join(search_path, x)) + for x in os.listdir(search_path) + if 'libnvinfer.so' in x + ]) return fl possible_files = find_libs(trt_install_path) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib'))) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64'))) - - def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver): - """Check the compatibility between tensorrt and cudnn/cudart libraries.""" - ldd_bin = which('ldd') or '/usr/bin/ldd' - ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep) - cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') - cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') - cudnn = None - cudart = None - for line in ldd_out: - if 'libcudnn.so' in line: - cudnn = cudnn_pattern.search(line) - elif 'libcudart.so' in line: - cudart = cuda_pattern.search(line) - if cudnn and len(cudnn.group(1)): - cudnn = convert_version_to_int(cudnn.group(1)) - if cudart and len(cudart.group(1)): - cudart = convert_version_to_int(cudart.group(1)) - return (cudnn == cudnn_ver) and (cudart == cuda_ver) - cuda_ver = 
convert_version_to_int(environ_cp['TF_CUDA_VERSION']) cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION']) nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$') highest_ver = [0, None, None] for lib_file in possible_files: - if is_compatible(lib_file, cuda_ver, cudnn_ver): + if is_cuda_compatible(lib_file, cuda_ver, cudnn_ver): matches = nvinfer_pattern.search(lib_file) if len(matches.groups()) == 0: continue @@ -1059,12 +1043,13 @@ def set_tf_tensorrt_install_path(environ_cp): # Try another alternative from ldconfig. ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' ldconfig_output = run_shell([ldconfig_bin, '-p']) - search_result = re.search( - '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output) + search_result = re.search('.*libnvinfer.so\\.?([0-9.]*).* => (.*)', + ldconfig_output) if search_result: libnvinfer_path_from_ldconfig = search_result.group(2) if os.path.exists(libnvinfer_path_from_ldconfig): - if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver): + if is_cuda_compatible(libnvinfer_path_from_ldconfig, cuda_ver, + cudnn_ver): trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig) tf_tensorrt_version = search_result.group(1) break @@ -1113,8 +1098,10 @@ def set_tf_nccl_install_path(environ_cp): raise ValueError('Currently NCCL is only supported on Linux platforms.') ask_nccl_version = ( - 'Please specify the NCCL version you want to use. ' - '[Leave empty to default to NCCL %s]: ') % _DEFAULT_NCCL_VERSION + 'Please specify the NCCL version you want to use. If NCCL %s is not ' + 'installed, then you can use version 1.3 that can be fetched ' + 'automatically but it may have worse performance with multiple GPUs. 
' + '[Default is %s]: ') % (_DEFAULT_NCCL_VERSION, _DEFAULT_NCCL_VERSION) for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): tf_nccl_version = get_from_env_or_user_or_default( @@ -1215,7 +1202,7 @@ def set_tf_cuda_compute_capabilities(environ_cp): 'https://developer.nvidia.com/cuda-gpus.\nPlease' ' note that each additional compute ' 'capability significantly increases your ' - 'build time and binary size. [Default is: %s]' % + 'build time and binary size. [Default is: %s]: ' % default_cuda_compute_capabilities) tf_cuda_compute_capabilities = get_from_env_or_user_or_default( environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', @@ -1223,7 +1210,7 @@ def set_tf_cuda_compute_capabilities(environ_cp): # Check whether all capabilities from the input is valid all_valid = True # Remove all whitespace characters before splitting the string - # that users may insert by accident, as this will result in error + # that users may insert by accident, as this will result in error tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split()) for compute_capability in tf_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) @@ -1250,28 +1237,13 @@ def set_tf_cuda_compute_capabilities(environ_cp): def set_other_cuda_vars(environ_cp): """Set other CUDA related variables.""" - if is_windows(): - # The following three variables are needed for MSVC toolchain configuration - # in Bazel - environ_cp['CUDA_PATH'] = environ_cp.get('CUDA_TOOLKIT_PATH') - environ_cp['CUDA_COMPUTE_CAPABILITIES'] = environ_cp.get( - 'TF_CUDA_COMPUTE_CAPABILITIES') - environ_cp['NO_WHOLE_ARCHIVE_OPTION'] = 1 - write_action_env_to_bazelrc('CUDA_PATH', environ_cp.get('CUDA_PATH')) - write_action_env_to_bazelrc('CUDA_COMPUTE_CAPABILITIE', - environ_cp.get('CUDA_COMPUTE_CAPABILITIE')) - write_action_env_to_bazelrc('NO_WHOLE_ARCHIVE_OPTION', - environ_cp.get('NO_WHOLE_ARCHIVE_OPTION')) - write_to_bazelrc('build --config=win-cuda') - write_to_bazelrc('test --config=win-cuda') + # If 
CUDA is enabled, always use GPU during build and test. + if environ_cp.get('TF_CUDA_CLANG') == '1': + write_to_bazelrc('build --config=cuda_clang') + write_to_bazelrc('test --config=cuda_clang') else: - # If CUDA is enabled, always use GPU during build and test. - if environ_cp.get('TF_CUDA_CLANG') == '1': - write_to_bazelrc('build --config=cuda_clang') - write_to_bazelrc('test --config=cuda_clang') - else: - write_to_bazelrc('build --config=cuda') - write_to_bazelrc('test --config=cuda') + write_to_bazelrc('build --config=cuda') + write_to_bazelrc('test --config=cuda') def set_host_cxx_compiler(environ_cp): @@ -1427,14 +1399,43 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') -def set_windows_build_flags(): - if is_windows(): - # The non-monolithic build is not supported yet - write_to_bazelrc('build --config monolithic') - # Suppress warning messages - write_to_bazelrc('build --copt=-w --host_copt=-w') - # Output more verbose information when something goes wrong - write_to_bazelrc('build --verbose_failures') +def set_system_libs_flag(environ_cp): + syslibs = environ_cp.get('TF_SYSTEM_LIBS', '') + syslibs = ','.join(sorted(syslibs.split(','))) + if syslibs and syslibs != '': + write_action_env_to_bazelrc('TF_SYSTEM_LIBS', syslibs) + + +def set_windows_build_flags(environ_cp): + """Set Windows specific build options.""" + # The non-monolithic build is not supported yet + write_to_bazelrc('build --config monolithic') + # Suppress warning messages + write_to_bazelrc('build --copt=-w --host_copt=-w') + # Output more verbose information when something goes wrong + write_to_bazelrc('build --verbose_failures') + # The host and target platforms are the same in Windows build. So we don't + # have to distinguish between them. This avoids building the same targets twice. + write_to_bazelrc('build --distinct_host_configuration=false') + # Enable short object file path to avoid long path issue on Windows.
+ # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0 + # Short object file path will be enabled by default. + write_to_bazelrc('build --experimental_shortened_obj_file_path=true') + + if get_var( + environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline', + True, + ('Would you like to override eigen strong inline for some C++ ' + 'compilation to reduce the compilation time?'), + 'Eigen strong inline overridden.', + 'Not overriding eigen strong inline, ' + 'some compilations could take more than 20 mins.'): + # Due to a known MSVC compiler issue + # https://github.com/tensorflow/tensorflow/issues/10521 + # Overriding eigen strong inline speeds up the compiling of + # conv_grad_ops_3d.cc and conv_ops_3d.cc by 20 minutes, + # but this also hurts the performance. Let users decide what they want. + write_to_bazelrc('build --define=override_eigen_strong_inline=true') def config_info_line(name, help_text): @@ -1454,14 +1455,14 @@ def main(): # environment variables. environ_cp = dict(os.environ) - check_bazel_version('0.10.0') + check_bazel_version('0.15.0') reset_tf_configure_bazelrc(args.workspace) cleanup_makefile() setup_python(environ_cp) if is_windows(): - environ_cp['TF_NEED_S3'] = '0' + environ_cp['TF_NEED_AWS'] = '0' environ_cp['TF_NEED_GCP'] = '0' environ_cp['TF_NEED_HDFS'] = '0' environ_cp['TF_NEED_JEMALLOC'] = '0' @@ -1474,19 +1475,31 @@ def main(): # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on # Windows. environ_cp['TF_DOWNLOAD_CLANG'] = '0' + environ_cp['TF_ENABLE_XLA'] = '0' + environ_cp['TF_NEED_GDR'] = '0' + environ_cp['TF_NEED_VERBS'] = '0' + environ_cp['TF_NEED_MPI'] = '0' + environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0' if is_macos(): environ_cp['TF_NEED_JEMALLOC'] = '0' environ_cp['TF_NEED_TENSORRT'] = '0' + # The numpy package on ppc64le uses OpenBLAS which has multi-threading + # issues that lead to incorrect answers. 
Set OMP_NUM_THREADS=1 at + # runtime to allow the Tensorflow testcases which compare numpy + # results to Tensorflow results to succeed. + if is_ppc64le(): + write_action_env_to_bazelrc("OMP_NUM_THREADS", 1) + set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc', 'with_jemalloc', True) set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform', 'with_gcp_support', True, 'gcp') set_build_var(environ_cp, 'TF_NEED_HDFS', 'Hadoop File System', 'with_hdfs_support', True, 'hdfs') - set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System', - 'with_s3_support', True, 's3') + set_build_var(environ_cp, 'TF_NEED_AWS', 'Amazon AWS Platform', + 'with_aws_support', True, 'aws') set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform', 'with_kafka_support', True, 'kafka') set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', @@ -1495,6 +1508,8 @@ def main(): False, 'gdr') set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support', False, 'verbs') + set_build_var(environ_cp, 'TF_NEED_NGRAPH', 'nGraph', + 'with_ngraph_support', False, 'ngraph') set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False) if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': @@ -1528,6 +1543,10 @@ def main(): if environ_cp.get('TF_DOWNLOAD_CLANG') != '1': # Set up which clang we should use as the cuda / host compiler. set_clang_cuda_compiler_path(environ_cp) + else: + # Use downloaded LLD for linking. 
+ write_to_bazelrc('build:cuda_clang --config=download_clang_use_lld') + write_to_bazelrc('test:cuda_clang --config=download_clang_use_lld') else: # Set up which gcc nvcc should use as the host compiler # No need to set this on Windows @@ -1549,29 +1568,29 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) - set_windows_build_flags() + set_system_libs_flag(environ_cp) + if is_windows(): + set_windows_build_flags(environ_cp) - if workspace_has_any_android_rule(): - print('The WORKSPACE file has at least one of ["android_sdk_repository", ' - '"android_ndk_repository"] already set. Will not ask to help ' - 'configure the WORKSPACE. Please delete the existing rules to ' - 'activate the helper.\n') - else: - if get_var( - environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace', - False, - ('Would you like to interactively configure ./WORKSPACE for ' - 'Android builds?'), - 'Searching for NDK and SDK installations.', - 'Not configuring the WORKSPACE for Android builds.'): - create_android_ndk_rule(environ_cp) - create_android_sdk_rule(environ_cp) - - print('Preconfigured Bazel build configs. You can use any of the below by ' - 'adding "--config=<>" to your build command. See tools/bazel.rc for ' - 'more details.') - config_info_line('mkl', 'Build with MKL support.') - config_info_line('monolithic', 'Config for mostly static monolithic build.') + if get_var( + environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace', + False, + ('Would you like to interactively configure ./WORKSPACE for ' + 'Android builds?'), + 'Searching for NDK and SDK installations.', + 'Not configuring the WORKSPACE for Android builds.'): + create_android_ndk_rule(environ_cp) + create_android_sdk_rule(environ_cp) + + # On Windows, we don't have MKL support and the build is always monolithic. + # So no need to print the following message. 
+ # TODO(pcloudy): remove the following if check when they make sense on Windows + if not is_windows(): + print('Preconfigured Bazel build configs. You can use any of the below by ' + 'adding "--config=<>" to your build command. See tools/bazel.rc for ' + 'more details.') + config_info_line('mkl', 'Build with MKL support.') + config_info_line('monolithic', 'Config for mostly static monolithic build.') if __name__ == '__main__': main() diff --git a/tensorflow/BUILD b/tensorflow/BUILD index f2ad16fa04f5beb6616c58c28d0f0c460c3e3a17..b5e0a4e98b0c183454afa4a4389dcf73802b219b 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -19,6 +19,26 @@ load( "//tensorflow/core:platform/default/build_config.bzl", "tf_additional_binary_deps", ) +load( + "//tensorflow/python/tools/api/generator:api_gen.bzl", + "gen_api_init_files", # @unused +) +load( + "//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", + "TENSORFLOW_API_INIT_FILES_V1", # @unused +) +load( + "//third_party/ngraph:build_defs.bzl", + "if_ngraph", +) + +# Config setting used when building for products +# which requires restricted licenses to be avoided. +config_setting( + name = "no_lgpl_deps", + values = {"define": "__TENSORFLOW_NO_LGPL_DEPS__=1"}, + visibility = ["//visibility:public"], +) # Config setting for determining if we are building for Android. 
config_setting( @@ -111,12 +131,6 @@ config_setting( visibility = ["//visibility:public"], ) -config_setting( - name = "windows_msvc", - values = {"cpu": "x64_windows_msvc"}, - visibility = ["//visibility:public"], -) - config_setting( name = "no_tensorflow_py_deps", define_values = {"no_tensorflow_py_deps": "true"}, @@ -150,6 +164,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux_s390x", + values = {"cpu": "s390x"}, + visibility = ["//visibility:public"], +) + config_setting( name = "debug", values = { @@ -206,8 +226,8 @@ config_setting( ) config_setting( - name = "with_s3_support", - define_values = {"with_s3_support": "true"}, + name = "with_aws_support", + define_values = {"with_aws_support": "true"}, visibility = ["//visibility:public"], ) @@ -234,8 +254,8 @@ config_setting( ) config_setting( - name = "with_s3_support_windows_override", - define_values = {"with_s3_support": "true"}, + name = "with_aws_support_windows_override", + define_values = {"with_aws_support": "true"}, values = {"cpu": "x64_windows"}, visibility = ["//visibility:public"], ) @@ -247,6 +267,13 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_cuda_support_windows_override", + define_values = {"using_cuda_nvcc": "true"}, + values = {"cpu": "x64_windows"}, + visibility = ["//visibility:public"], +) + config_setting( name = "with_gcp_support_android_override", define_values = {"with_gcp_support": "true"}, @@ -262,8 +289,8 @@ config_setting( ) config_setting( - name = "with_s3_support_android_override", - define_values = {"with_s3_support": "true"}, + name = "with_aws_support_android_override", + define_values = {"with_aws_support": "true"}, values = {"crosstool_top": "//external:android/crosstool"}, visibility = ["//visibility:public"], ) @@ -283,8 +310,8 @@ config_setting( ) config_setting( - name = "with_s3_support_ios_override", - define_values = {"with_s3_support": "true"}, + name = 
"with_aws_support_ios_override", + define_values = {"with_aws_support": "true"}, values = {"crosstool_top": "//tools/osx/crosstool:crosstool"}, visibility = ["//visibility:public"], ) @@ -356,6 +383,15 @@ config_setting( }, ) +# Setting to use when loading kernels dynamically +config_setting( + name = "dynamic_loaded_kernels", + define_values = { + "dynamic_loaded_kernels": "true", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "using_cuda_nvcc", define_values = { @@ -383,17 +419,18 @@ config_setting( visibility = ["//visibility:public"], ) -# TODO(laigd): consider removing this option and make TensorRT enabled -# automatically when CUDA is enabled. +# This flag is set from the configure step when the user selects with nGraph option. +# By default it should be false config_setting( - name = "with_tensorrt_support", - values = {"define": "with_tensorrt_support=true"}, + name = "with_ngraph_support", + values = {"define": "with_ngraph_support=true"}, visibility = ["//visibility:public"], ) package_group( name = "internal", packages = [ + "-//third_party/tensorflow/python/estimator", "//learning/meta_rank/...", "//tensorflow/...", "//tensorflow_fold/llgtm/...", @@ -403,21 +440,32 @@ package_group( load( "//third_party/mkl:build_defs.bzl", - "if_mkl", + "if_mkl_ml", ) filegroup( name = "intel_binary_blob", - data = if_mkl( + data = if_mkl_ml( [ "//third_party/mkl:intel_binary_blob", ], ), ) -filegroup( - name = "docs_src", - data = glob(["docs_src/**/*.md"]), +cc_library( + name = "grpc", + deps = select({ + ":linux_s390x": ["@grpc//:grpc_unsecure"], + "//conditions:default": ["@grpc"], + }), +) + +cc_library( + name = "grpc++", + deps = select({ + ":linux_s390x": ["@grpc//:grpc++_unsecure"], + "//conditions:default": ["@grpc//:grpc++"], + }), ) # A shared object which includes registration mechanisms for ops and @@ -447,6 +495,14 @@ filegroup( tf_cc_shared_object( name = "libtensorflow_framework.so", framework_so = [], + linkopts = select({ + 
"//tensorflow:darwin": [], + "//tensorflow:windows": [], + "//conditions:default": [ + "-Wl,--version-script", # This line must be directly followed by the version_script.lds file + "$(location //tensorflow:tf_framework_version_script.lds)", + ], + }), linkstatic = 1, visibility = ["//visibility:public"], deps = [ @@ -456,6 +512,7 @@ tf_cc_shared_object( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", "//tensorflow/core:lib_internal_impl", "//tensorflow/stream_executor:stream_executor_impl", + "//tensorflow:tf_framework_version_script.lds", ] + tf_additional_binary_deps(), ) @@ -471,7 +528,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. tf_cc_shared_object( name = "libtensorflow.so", @@ -482,10 +539,8 @@ tf_cc_shared_object( "-Wl,-install_name,@rpath/libtensorflow.so", ], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -508,10 +563,8 @@ tf_cc_shared_object( "$(location //tensorflow:tf_exported_symbols.lds)", ], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], @@ -526,7 +579,7 @@ tf_cc_shared_object( "//tensorflow/cc:scope", "//tensorflow/cc/profiler", "//tensorflow/core:tensorflow", - ], + ] + if_ngraph(["@ngraph_tf//:ngraph_tf"]), ) exports_files( @@ -536,13 +589,30 @@ exports_files( ], ) +gen_api_init_files( + name = "tensorflow_python_api_gen", + srcs = ["api_template.__init__.py"], + 
api_version = 1, + output_files = TENSORFLOW_API_INIT_FILES_V1, + root_init_template = "api_template.__init__.py", +) + py_library( name = "tensorflow_py", - srcs = ["__init__.py"], + srcs = ["//tensorflow/python/estimator/api:estimator_python_api_gen"], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/python", - "//tensorflow/tools/api/generator:python_api", + ":tensorflow_py_no_contrib", + "//tensorflow/contrib:contrib_py", + "//tensorflow/python/estimator:estimator_py", ], ) + +py_library( + name = "tensorflow_py_no_contrib", + srcs = [":tensorflow_python_api_gen"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = ["//tensorflow/python:no_contrib"], +) diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py index c8683e3976c90add3f1f54d8e575c798327e9273..21677512b63828fa2035527ed573bf4dc4603085 100644 --- a/tensorflow/__init__.py +++ b/tensorflow/__init__.py @@ -22,16 +22,14 @@ from __future__ import print_function # pylint: disable=g-bad-import-order from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import -# pylint: disable=wildcard-import -from tensorflow.tools.api.generator.api import * # pylint: disable=redefined-builtin -# pylint: enable=wildcard-import from tensorflow.python.util.lazy_loader import LazyLoader contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib') del LazyLoader from tensorflow.python.platform import flags # pylint: disable=g-import-not-at-top -app.flags = flags # pylint: disable=undefined-variable +from tensorflow.python.platform import app # pylint: disable=g-import-not-at-top +app.flags = flags del absolute_import del division diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..779f65d5b17c350833f67f07985b00e8eb561e72 --- /dev/null +++ b/tensorflow/api_template.__init__.py @@ -0,0 +1,59 @@ +# Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Bring in all of the public TensorFlow interface into this module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=g-bad-import-order +from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import + +try: + import os # pylint: disable=g-import-not-at-top + # Add `estimator` attribute to allow access to estimator APIs via + # "tf.estimator..." + from tensorflow.python.estimator.api import estimator # pylint: disable=g-import-not-at-top + + # Add `estimator` to the __path__ to allow "from tensorflow.estimator..." + # style imports. 
+ from tensorflow.python.estimator import api as estimator_api # pylint: disable=g-import-not-at-top + __path__ += [os.path.dirname(estimator_api.__file__)] + del estimator_api + del os +except (ImportError, AttributeError): + print('tf.estimator package not installed.') + +# API IMPORTS PLACEHOLDER + +from tensorflow.python.util.lazy_loader import LazyLoader # pylint: disable=g-import-not-at-top +contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib') +del LazyLoader + +from tensorflow.python.platform import flags # pylint: disable=g-import-not-at-top +app.flags = flags # pylint: disable=undefined-variable + +del absolute_import +del division +del print_function + +# These symbols appear because we import the python package which +# in turn imports from tensorflow.core and tensorflow.python. They +# must come from this module. So python adds these symbols for the +# resolution to succeed. +# pylint: disable=undefined-variable +del python +del core +# pylint: enable=undefined-variable diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 8a9301d584775cff3ae315e6fd856b00d1734248..2c3a877edfc6b310a3165f2414deee357ee63539 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -127,6 +127,15 @@ tf_cuda_library( ], ) +cc_library( + name = "c_api_headers", + hdrs = [ + "c_api.h", + ], + copts = tf_copts(), + visibility = ["//tensorflow:__subpackages__"], +) + exports_files( [ "version_script.lds", diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index b86b277ac3200b88ae03490a6c1b64d464e81950..173bbea596a4276559f5cd67824e5cc75313985c 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eval_const_tensor.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/kernel_def.pb.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_kernel.h" @@ -45,11 +46,13 @@ limitations under the License. #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" @@ -199,7 +202,8 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, buf->len_ = len; if (dtype != TF_STRING && dtype != TF_RESOURCE && tensorflow::DataTypeCanUseMemcpy(static_cast(dtype)) && - reinterpret_cast(data) % EIGEN_MAX_ALIGN_BYTES != 0) { + reinterpret_cast(data) % std::max(1, EIGEN_MAX_ALIGN_BYTES) != + 0) { // TF_STRING and TF_RESOURCE tensors have a different representation in // TF_Tensor than they do in tensorflow::Tensor. 
So a copy here is a waste // (any alignment requirements will be taken care of by TF_TensorToTensor @@ -326,6 +330,7 @@ TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) { } void TF_DeleteBuffer(TF_Buffer* buffer) { + if (buffer == nullptr) return; if (buffer->data_deallocator != nullptr) { (*buffer->data_deallocator)(const_cast(buffer->data), buffer->length); @@ -355,6 +360,7 @@ void TF_CloseDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) { void TF_DeleteDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) { status->status = Status::OK(); + if (s == nullptr) return; delete s->session; delete s; } @@ -390,64 +396,6 @@ void TF_Reset_Helper(const TF_SessionOptions* opt, const char** containers, status->status = Reset(opt->options, container_names); } -// This traverses the specified nodes in topological order to verify there are -// no cycles. Starting with inputless nodes, it visits nodes whose inputs have -// all been visited, and counts the total number of visited nodes. If there is a -// cycle, nodes in the cycle will never be visited, and the visited count will -// be less than the total node count. -Status ValidateNoCycles(const Graph& g) { - // TODO(nolivia): check this on a subset of the graph instead of all of it. - // A node is ready when all of its inputs have been visited. - std::vector ready; - std::vector pending_count(g.num_node_ids(), 0); - - for (int i = 0; i < g.num_node_ids(); ++i) { - const Node* n = g.FindNodeId(i); - if (n == nullptr) continue; - pending_count[i] = n->in_edges().size(); - if (n->IsMerge()) { - // While-loop cycles are legal cycles so we manually adjust the - // pending_count to make sure that the loop is visited. 
- for (const Edge* e : n->in_edges()) { - if (!e->IsControlEdge() && e->src()->IsNextIteration()) { - pending_count[i]--; - } - } - } - if (pending_count[i] == 0) { - ready.push_back(n); - } - } - - int processed = 0; - while (!ready.empty()) { - const Node* node = ready.back(); - ready.pop_back(); - ++processed; - - for (const Edge* out : node->out_edges()) { - const int output_id = out->dst()->id(); - pending_count[output_id]--; - if (pending_count[output_id] == 0) { - ready.push_back(out->dst()); - } - } - } - - if (processed < g.num_nodes()) { - std::vector nodes_in_cycle; - for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3; - ++i) { - if (pending_count[i] != 0) { - nodes_in_cycle.push_back(g.FindNodeId(i)->name()); - } - } - return errors::InvalidArgument( - "Graph is invalid, contains a cycle with ", g.num_nodes() - processed, - " nodes, including: ", str_util::Join(nodes_in_cycle, ", ")); - } - return Status::OK(); -} } // namespace } // namespace tensorflow @@ -631,7 +579,22 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in, "Failed to allocate memory to serialize message of type '", in.GetTypeName(), "' and size ", proto_size); } - in.SerializeToArray(buf, proto_size); + // SerializeToArray takes size as an int. 
+ // This next 'if' is a workaround till we update to depend on a version + // of protocol buffers that includes + // https://github.com/google/protobuf/pull/4739 + if (proto_size > std::numeric_limits::max()) { + return InvalidArgument("Cannot serialize protocol buffer of type ", + in.GetTypeName(), " as the serialized size (", + proto_size, + "bytes) would be larger than the limit (", + std::numeric_limits::max(), " bytes)"); + } + if (!in.SerializeToArray(buf, proto_size)) { + return InvalidArgument("Unable to serialize ", in.GetTypeName(), + " protocol buffer, perhaps the serialized size (", + proto_size, " bytes) is too large?"); + } out->data = buf; out->length = proto_size; out->data_deallocator = [](void* data, size_t length) { @@ -731,7 +694,9 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) { const auto num_nodes = graph.num_node_ids(); if (session->last_num_graph_nodes < num_nodes) { - status->status = tensorflow::ValidateNoCycles(session->graph->graph); + // TODO(nolivia): check this on a subset of the graph instead of all of + // it. 
+ status->status = graph::ValidateGraphHasNoCycle(session->graph->graph); if (!status->status.ok()) { session->graph->mu.unlock(); return false; @@ -946,6 +911,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; } void TF_DeleteLibraryHandle(TF_Library* lib_handle) { + if (lib_handle == nullptr) return; tensorflow::port::Free(const_cast(lib_handle->op_list.data)); delete lib_handle; } @@ -1003,6 +969,7 @@ TF_DEVICELIST_METHOD(const char*, TF_DeviceListName, name().c_str(), nullptr); TF_DEVICELIST_METHOD(const char*, TF_DeviceListType, device_type().c_str(), nullptr); TF_DEVICELIST_METHOD(int64_t, TF_DeviceListMemoryBytes, memory_limit(), -1); +TF_DEVICELIST_METHOD(uint64_t, TF_DeviceListIncarnation, incarnation(), 0); #undef TF_DEVICELIST_METHOD @@ -1273,7 +1240,7 @@ void TF_SetAttrTypeList(TF_OperationDescription* desc, const char* attr_name, void TF_SetAttrFuncName(TF_OperationDescription* desc, const char* attr_name, const char* value, size_t length) { tensorflow::NameAttrList func_name; - func_name.set_name(std::string(value, value + length)); + func_name.set_name(string(value, value + length)); desc->node_builder.Attr(attr_name, func_name); } @@ -1892,6 +1859,7 @@ TF_Graph::TF_Graph() TF_Graph* TF_NewGraph() { return new TF_Graph; } void TF_DeleteGraph(TF_Graph* g) { + if (g == nullptr) return; g->mu.lock(); g->delete_requested = true; const bool del = g->sessions.empty(); @@ -2097,7 +2065,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def, for (int i = 0; i < size; ++i) { TensorId id = results.missing_unused_input_map_keys[i]; - tf_results->missing_unused_key_names_data.push_back(std::string(id.first)); + tf_results->missing_unused_key_names_data.emplace_back(id.first); tf_results->missing_unused_key_names[i] = tf_results->missing_unused_key_names_data.back().c_str(); tf_results->missing_unused_key_indexes[i] = id.second; @@ 
-2108,7 +2076,8 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status) { GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return nullptr; } @@ -2138,7 +2107,8 @@ void TF_GraphImportGraphDefWithReturnOutputs( return; } GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return; } @@ -2421,6 +2391,12 @@ void TF_AbortWhile(const TF_WhileParams* params) { FreeWhileResources(params); } void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, TF_Output* dx, TF_Status* status, TF_Output* dy) { + TF_AddGradientsWithPrefix(g, nullptr, y, ny, x, nx, dx, status, dy); +} + +void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, TF_Output* y, + int ny, TF_Output* x, int nx, TF_Output* dx, + TF_Status* status, TF_Output* dy) { #ifdef __ANDROID__ status->status = tensorflow::errors::Unimplemented( "Adding gradients is not supported in Android. 
File a bug at " @@ -2437,9 +2413,29 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, const int first_new_node_id = g->graph.num_node_ids(); + string prefix_cmp; + const char* child_scope_name; + if (prefix == nullptr) { + child_scope_name = "gradients"; + } else { + prefix_cmp = string(prefix) + "/"; + // The operation should fail if the provided name prefix has already been + // used in this graph + for (const auto& pair : g->name_map) { + const string& name = pair.first; + if (name.compare(prefix) == 0 || + tensorflow::str_util::StartsWith(name, prefix_cmp)) { + status->status = InvalidArgument( + "prefix [", prefix, + "] conflicts with existing node in the graph named [", name, "]"); + return; + } + } + child_scope_name = prefix; + } tensorflow::Scope scope = NewInternalScope(&g->graph, &status->status, &g->refiner) - .NewSubScope("gradients"); + .NewSubScope(child_scope_name); if (dx != nullptr) { std::vector dx_arg = OutputsFromTFOutputs(dx, ny); @@ -2454,7 +2450,30 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, for (int i = first_new_node_id; i < g->graph.num_node_ids(); ++i) { Node* n = g->graph.FindNodeId(i); if (n == nullptr) continue; - g->name_map[n->name()] = n; + + // Adding the gradients to the graph can alter the prefix to prevent + // name collisions only if this prefix has not been provided explicitly + // by the user. If it was provided, assert that it remained intact. + if (prefix != nullptr && + !tensorflow::str_util::StartsWith(n->name(), prefix_cmp)) { + status->status = tensorflow::errors::Internal( + "BUG: The gradients prefix have been unexpectedly altered when " + "adding the nodes to the graph. This is a bug. 
Please file an " + "issue at https://github.com/tensorflow/tensorflow/issues."); + return; + } + // We have a convoluted scheme here: Using the C++ graph construction API + // to add potentially many nodes to the graph without running the checks + // (such as uniqueness of the names of nodes) we run with other functions + // that add a node to the graph (like TF_FinishOperation). + if (!g->name_map.insert(std::make_pair(n->name(), n)).second) { + status->status = tensorflow::errors::Internal( + "BUG: The API allowed construction of a graph with duplicate node " + "names (", + n->name(), + "). This is a bug. Please file an issue at " + "https://github.com/tensorflow/tensorflow/issues."); + } } } @@ -2554,6 +2573,7 @@ void TF_CloseSession(TF_Session* s, TF_Status* status) { void TF_DeleteSession(TF_Session* s, TF_Status* status) { status->status = Status::OK(); + if (s == nullptr) return; TF_Graph* const graph = s->graph; if (graph != nullptr) { graph->mu.lock(); @@ -2752,7 +2772,34 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name, TF_Buffer* ret = TF_NewBuffer(); status->status = MessageToBuffer(*api_def, ret); + if (!status->status.ok()) { + TF_DeleteBuffer(ret); + return nullptr; + } return ret; #endif // __ANDROID__ } + +TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status) { + tensorflow::KernelList kernel_list = tensorflow::GetAllRegisteredKernels(); + TF_Buffer* ret = TF_NewBuffer(); + status->status = MessageToBuffer(kernel_list, ret); + if (!status->status.ok()) { + TF_DeleteBuffer(ret); + return nullptr; + } + return ret; +} + +TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { + tensorflow::KernelList kernel_list = + tensorflow::GetRegisteredKernelsForOp(name); + TF_Buffer* ret = TF_NewBuffer(); + status->status = MessageToBuffer(kernel_list, ret); + if (!status->status.ok()) { + TF_DeleteBuffer(ret); + return nullptr; + } + return ret; +} } // end extern "C" diff --git a/tensorflow/c/c_api.h 
b/tensorflow/c/c_api.h index c8594347451dffd465d7fa926cc53818dc9e38d4..850f6ecd637d768bca99720e0add07680829e17a 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -44,6 +44,7 @@ limitations under the License. // * size_t is used to represent byte sizes of objects that are // materialized in the address space of the calling process. // * int is used as an index into arrays. +// * Deletion functions are safe to call on nullptr. // // Questions left to address: // * Might at some point need a way for callers to provide their own Env. @@ -894,7 +895,8 @@ TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions( TF_ImportGraphDefOptions* opts); // Set the prefix to be prepended to the names of nodes in `graph_def` that will -// be imported into `graph`. +// be imported into `graph`. `prefix` is copied and has no lifetime +// requirements. TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix( TF_ImportGraphDefOptions* opts, const char* prefix); @@ -915,6 +917,7 @@ TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyPrefix( // Set any imported nodes with input `src_name:src_index` to have that input // replaced with `dst`. `src_name` refers to a node in the graph to be imported, // `dst` references a node already existing in the graph being imported into. +// `src_name` is copied and has no lifetime requirements. TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddInputMapping( TF_ImportGraphDefOptions* opts, const char* src_name, int src_index, TF_Output dst); @@ -922,7 +925,7 @@ TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddInputMapping( // Set any imported nodes with control input `src_name` to have that input // replaced with `dst`. `src_name` refers to a node in the graph to be imported, // `dst` references an operation already existing in the graph being imported -// into. +// into. `src_name` is copied and has no lifetime requirements. 
TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsRemapControlDependency( TF_ImportGraphDefOptions* opts, const char* src_name, TF_Operation* dst); @@ -934,6 +937,7 @@ TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddControlDependency( // Add an output in `graph_def` to be returned via the `return_outputs` output // parameter of TF_GraphImportGraphDef(). If the output is remapped via an input // mapping, the corresponding existing tensor in `graph` will be returned. +// `oper_name` is copied and has no lifetime requirements. TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOutput( TF_ImportGraphDefOptions* opts, const char* oper_name, int index); @@ -943,7 +947,8 @@ TF_CAPI_EXPORT extern int TF_ImportGraphDefOptionsNumReturnOutputs( const TF_ImportGraphDefOptions* opts); // Add an operation in `graph_def` to be returned via the `return_opers` output -// parameter of TF_GraphImportGraphDef(). +// parameter of TF_GraphImportGraphDef(). `oper_name` is copied and has no +// lifetime requirements. TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOperation( TF_ImportGraphDefOptions* opts, const char* oper_name); @@ -1126,6 +1131,7 @@ TF_CAPI_EXPORT extern void TF_AbortWhile(const TF_WhileParams* params); // Adds operations to compute the partial derivatives of sum of `y`s w.r.t `x`s, // i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2... +// // `dx` are used as initial gradients (which represent the symbolic partial // derivatives of some loss function `L` w.r.t. `y`). // `dx` must be nullptr or have size `ny`. @@ -1134,6 +1140,12 @@ TF_CAPI_EXPORT extern void TF_AbortWhile(const TF_WhileParams* params); // The partial derivatives are returned in `dy`. `dy` should be allocated to // size `nx`. // +// Gradient nodes are automatically named under the "gradients/" prefix. To +// guarantee name uniqueness, subsequent calls to the same graph will +// append an incremental tag to the prefix: "gradients_1/", "gradients_2/", ... 
+// See TF_AddGradientsWithPrefix, which provides a means to specify a custom +// name prefix for operations added to a graph to compute the gradients. +// // WARNING: This function does not yet support all the gradients that python // supports. See // https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md @@ -1142,6 +1154,33 @@ TF_CAPI_EXPORT void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, TF_Output* dx, TF_Status* status, TF_Output* dy); +// Adds operations to compute the partial derivatives of sum of `y`s w.r.t `x`s, +// i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2... +// This is a variant of TF_AddGradients that allows the caller to pass a custom +// name prefix to the operations added to a graph to compute the gradients. +// +// `dx` are used as initial gradients (which represent the symbolic partial +// derivatives of some loss function `L` w.r.t. `y`). +// `dx` must be nullptr or have size `ny`. +// If `dx` is nullptr, the implementation will use dx of `OnesLike` for all +// shapes in `y`. +// The partial derivatives are returned in `dy`. `dy` should be allocated to +// size `nx`. +// `prefix` names the scope into which all gradients operations are being added. +// `prefix` must be unique within the provided graph otherwise this operation +// will fail. If `prefix` is nullptr, the default prefixing behaviour takes +// place, see TF_AddGradients for more details. +// +// WARNING: This function does not yet support all the gradients that python +// supports. See +// https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md +// for instructions on how to add more C++ gradients. 
+TF_CAPI_EXPORT void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, + TF_Output* y, int ny, + TF_Output* x, int nx, + TF_Output* dx, TF_Status* status, + TF_Output* dy); + // Create a TF_Function from a TF_Graph // // Params: @@ -1231,6 +1270,11 @@ TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction( int noutputs, const TF_Output* outputs, const char* const* output_names, const TF_FunctionOptions* opts, const char* description, TF_Status* status); +// Returns the name of the graph function. +// The return value points to memory that is only usable until the next +// mutation to *func. +TF_CAPI_EXPORT extern const char* TF_FunctionName(TF_Function* func); + // Write out a serialized representation of `func` (as a FunctionDef protocol // message) to `output_func_def` (allocated by TF_NewBuffer()). // `output_func_def`'s underlying buffer will be freed when TF_DeleteBuffer() @@ -1517,6 +1561,13 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list, TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes( const TF_DeviceList* list, int index, TF_Status* status); +// Retrieve the incarnation number of a given device. +// +// If index is out of bounds, an error code will be set in the status object, +// and 0 will be returned. +TF_CAPI_EXPORT extern uint64_t TF_DeviceListIncarnation( + const TF_DeviceList* list, int index, TF_Status* status); + // -------------------------------------------------------------------------- // Load plugins containing custom ops and kernels @@ -1599,6 +1650,18 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, size_t name_len, TF_Status* status); +// -------------------------------------------------------------------------- +// Kernel definition information. + +// Returns a serialized KernelList protocol buffer containing KernelDefs for all +// registered kernels. 
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status); + +// Returns a serialized KernelList protocol buffer containing KernelDefs for all +// kernels registered for the operation named `name`. +TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( + const char* name, TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 95b04f9058afdfaadbc24f0238860279fcd3e800..69b3ffe2a1f620e346405607ecf742fb863aa644 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -57,6 +57,45 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) { } } +TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation, + unsigned char gpu_memory_allow_growth) { + tensorflow::ConfigProto config; + auto* optimizer_options = + config.mutable_graph_options()->mutable_optimizer_options(); + if (enable_xla_compilation) { + optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::ON_1); + + // These XLA flags are needed to trigger XLA properly from C (more generally + // non-Python) clients. If this API is called again with `enable` set to + // false, it is safe to keep these flag values as is. 
+ tensorflow::legacy_flags::MarkForCompilationPassFlags* flags = + tensorflow::legacy_flags::GetMarkForCompilationPassFlags(); + flags->tf_xla_cpu_global_jit = true; + flags->tf_xla_min_cluster_size = 1; + } else { + optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::OFF); + } + + auto* gpu_options = config.mutable_gpu_options(); + gpu_options->set_allow_growth(gpu_memory_allow_growth); + + TF_Buffer* ret = TF_NewBuffer(); + TF_CHECK_OK(MessageToBuffer(config, ret)); + return ret; +} + +TF_Buffer* TF_CreateRunOptions(unsigned char enable_full_trace) { + tensorflow::RunOptions options; + if (enable_full_trace) { + options.set_trace_level(tensorflow::RunOptions::FULL_TRACE); + } else { + options.set_trace_level(tensorflow::RunOptions::NO_TRACE); + } + TF_Buffer* ret = TF_NewBuffer(); + TF_CHECK_OK(MessageToBuffer(options, ret)); + return ret; +} + const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) { tensorflow::mutex_lock c(graph->mu); const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString(); diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 20bdace40f1272ded06e710034053a7610326e7f..6617c5a572e90e78369f73d714f39942f213040f 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -55,11 +55,27 @@ extern "C" { // set XLA flag values to prepare for XLA compilation. Otherwise set // global_jit_level to OFF. // -// This API is syntax sugar over TF_SetConfig(), and is used by clients that -// cannot read/write the tensorflow.ConfigProto proto. +// This and the next API are syntax sugar over TF_SetConfig(), and are used by +// clients that cannot read/write the tensorflow.ConfigProto proto. +// TODO: Migrate to TF_CreateConfig() below. 
TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable); +// Create a serialized tensorflow.ConfigProto proto, where: +// +// a) ConfigProto.optimizer_options.global_jit_level is set to ON_1 if +// `enable_xla_compilation` is non-zero, and OFF otherwise. +// b) ConfigProto.gpu_options.allow_growth is set to `gpu_memory_allow_growth`. +TF_CAPI_EXPORT extern TF_Buffer* TF_CreateConfig( + unsigned char enable_xla_compilation, + unsigned char gpu_memory_allow_growth); + +// Create a serialized tensorflow.RunOptions proto, where RunOptions.trace_level +// is set to FULL_TRACE if `enable_full_trace` is non-zero, and NO_TRACE +// otherwise. +TF_CAPI_EXPORT extern TF_Buffer* TF_CreateRunOptions( + unsigned char enable_full_trace); + // Returns the graph content in a human-readable format, with length set in // `len`. The format is subject to change in the future. // The returned string is heap-allocated, and caller should call free() on it. diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index 384e6c8cb97022264c5327da5ca5861057608fbe..a2c5a42c11361779de61b515e0f08dcc45e609b9 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -536,6 +536,10 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name, return tf_function; } +const char* TF_FunctionName(TF_Function* func) { + return func->fdef.signature().name().c_str(); +} + void TF_GraphCopyFunction(TF_Graph* g, const TF_Function* func, const TF_Function* grad, TF_Status* status) { if (func == nullptr) { diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index 610274696f5940c063e68f2310cfd9cc1e0bd964..73fe73769bc1219ce865149d67d333c53371ccc5 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -193,6 +193,7 @@ class CApiFunctionTest : public ::testing::Test { ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); 
ASSERT_NE(func_, nullptr); + ASSERT_EQ(std::string(func_name_), std::string(TF_FunctionName(func_))); TF_GraphCopyFunction(host_graph_, func_, nullptr, s_); ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); } @@ -1516,7 +1517,8 @@ void DefineStatefulFunction(const char* name, TF_Function** func) { TF_Output inputs[] = {}; TF_Output outputs[] = {{random, 0}}; - *func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/false, -1, + *func = TF_GraphToFunction(func_graph.get(), name, + /*append_hash_to_fn_name=*/false, -1, /*opers=*/nullptr, 0, inputs, 1, outputs, /*output_names=*/nullptr, /*opts=*/nullptr, "", s.get()); @@ -1617,5 +1619,66 @@ TEST_F(CApiFunctionTest, GetFunctionsFromGraph) { TF_DeleteFunction(func1); } +// This test only works when the TF build includes XLA compiler. One way to set +// this up is via bazel build option "--define with_xla_support=true". +// +// FIXME: generalize the macro name TENSORFLOW_EAGER_USE_XLA to +// something like TENSORFLOW_CAPI_USE_XLA. 
+#ifdef TENSORFLOW_EAGER_USE_XLA +TEST_F(CApiFunctionTest, StatelessIf_XLA) { + TF_Function* func; + const std::string funcName = "BranchFunc"; + DefineFunction(funcName.c_str(), &func); + TF_GraphCopyFunction(host_graph_, func, nullptr, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* feed = Placeholder(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* true_cond = ScalarConst(true, host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_OperationDescription* desc = + TF_NewOperation(host_graph_, "StatelessIf", "IfNode"); + TF_AddInput(desc, {true_cond, 0}); + TF_Output inputs[] = {{feed, 0}}; + TF_AddInputList(desc, inputs, TF_ARRAYSIZE(inputs)); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_SetAttrType(desc, "Tcond", TF_BOOL); + TF_DataType inputType = TF_INT32; + TF_SetAttrTypeList(desc, "Tin", &inputType, 1); + TF_SetAttrTypeList(desc, "Tout", &inputType, 1); + TF_SetAttrFuncName(desc, "then_branch", funcName.data(), funcName.size()); + TF_SetAttrFuncName(desc, "else_branch", funcName.data(), funcName.size()); + TF_SetDevice(desc, "/device:XLA_CPU:0"); + auto op = TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(op, nullptr); + + // Create a session for this graph. + CSession csession(host_graph_, s_, /*use_XLA*/ true); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Run the graph. 
+ csession.SetInputs({{feed, Int32Tensor(17)}}); + csession.SetOutputs({op}); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Tensor* out = csession.output_tensor(0); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(0, TF_NumDims(out)); // scalar + ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out)); + int32* output_contents = static_cast(TF_TensorData(out)); + EXPECT_EQ(-17, *output_contents); + + // Clean up + csession.CloseAndDelete(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_DeleteFunction(func); +} +#endif // TENSORFLOW_EAGER_USE_XLA + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 577f10c5e69ea9ecbe8ce821c6bd5167e98bef25..03516c39dc970aa23967107d3a0446da94669465 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -29,9 +29,11 @@ limitations under the License. #include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/graph.pb_text.h" +#include "tensorflow/core/framework/kernel_def.pb.h" #include "tensorflow/core/framework/node_def.pb_text.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -257,8 +259,8 @@ TEST(CAPI, DeprecatedSession) { TF_Run(session, run_options, nullptr, nullptr, 0, nullptr, nullptr, 0, nullptr, 0, run_metadata, s); EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s)) << TF_Message(s); - EXPECT_EQ(std::string("Session was not created with a graph before Run()!"), - std::string(TF_Message(s))); + EXPECT_EQ("Session was not created with a graph before Run()!", + string(TF_Message(s))); TF_DeleteBuffer(run_metadata); 
TF_DeleteBuffer(run_options); @@ -1160,7 +1162,7 @@ TEST(CAPI, GetOpDef) { } void StringVectorToArrays(const std::vector& v, - std::unique_ptr* ptrs, + std::unique_ptr* ptrs, std::unique_ptr* lens) { ptrs->reset(new const void*[v.size()]); lens->reset(new size_t[v.size()]); @@ -1196,7 +1198,7 @@ class CApiColocationTest : public ::testing::Test { void SetViaStringList(TF_OperationDescription* desc, const std::vector& list) { - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; StringVectorToArrays(list, &list_ptrs, &list_lens); TF_SetAttrStringList(desc, tensorflow::kColocationAttrName, list_ptrs.get(), @@ -1222,8 +1224,8 @@ class CApiColocationTest : public ::testing::Test { TF_OperationGetAttrMetadata(op, tensorflow::kColocationAttrName, s_); if (expected.empty()) { ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); - EXPECT_EQ(std::string("Operation 'add' has no attr named '_class'."), - std::string(TF_Message(s_))); + EXPECT_EQ("Operation 'add' has no attr named '_class'.", + string(TF_Message(s_))); return; } EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); @@ -1367,16 +1369,16 @@ TEST(CAPI, SavedModel) { input.flat()(i) = example.SerializeAsString(); } - const tensorflow::string input_op_name = - std::string(tensorflow::ParseTensorName(input_name).first); + const tensorflow::string input_op_name( + tensorflow::ParseTensorName(input_name).first); TF_Operation* input_op = TF_GraphOperationByName(graph, input_op_name.c_str()); ASSERT_TRUE(input_op != nullptr); csession.SetInputs({{input_op, TF_TensorFromTensor(input, s)}}); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); - const tensorflow::string output_op_name = - std::string(tensorflow::ParseTensorName(output_name).first); + const tensorflow::string output_op_name( + tensorflow::ParseTensorName(output_name).first); TF_Operation* output_op = TF_GraphOperationByName(graph, output_op_name.c_str()); ASSERT_TRUE(output_op != nullptr); @@ -1424,6 +1426,29 @@ 
TEST(CAPI, SavedModelNullArgsAreValid) { TF_DeleteStatus(s); } +TEST(CAPI, DeletingNullPointerIsSafe) { + TF_Status* status = TF_NewStatus(); + + TF_DeleteStatus(nullptr); + TF_DeleteBuffer(nullptr); + TF_DeleteTensor(nullptr); + TF_DeleteSessionOptions(nullptr); + TF_DeleteGraph(nullptr); + TF_DeleteImportGraphDefOptions(nullptr); + TF_DeleteImportGraphDefResults(nullptr); + TF_DeleteFunction(nullptr); + TF_DeleteSession(nullptr, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeletePRunHandle(nullptr); + TF_DeleteDeprecatedSession(nullptr, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteDeviceList(nullptr); + TF_DeleteLibraryHandle(nullptr); + TF_DeleteApiDefMap(nullptr); + + TF_DeleteStatus(status); +} + REGISTER_OP("TestOpWithNoGradient") .Input("x: T") .Output("y: T") @@ -1458,8 +1483,8 @@ class CApiGradientsTest : public ::testing::Test { BuildSuccessGraph(inputs, outputs); BuildExpectedGraph(grad_inputs_provided, expected_grad_outputs); - AddGradients(grad_inputs_provided, inputs, 2, outputs, 1, grad_outputs); - + AddGradients(grad_inputs_provided, nullptr, inputs, 2, outputs, 1, + grad_outputs); EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); // Compare that the graphs match. @@ -1480,7 +1505,8 @@ class CApiGradientsTest : public ::testing::Test { BuildErrorGraph(inputs, outputs); - AddGradients(grad_inputs_provided, inputs, 1, outputs, 1, grad_outputs); + AddGradients(grad_inputs_provided, nullptr, inputs, 1, outputs, 1, + grad_outputs); string expected_msg = "No gradient defined for op: TestOpWithNoGradient. 
Please see " @@ -1524,19 +1550,20 @@ class CApiGradientsTest : public ::testing::Test { EXPECT_EQ(*a_data, *b_data); } - void AddGradients(bool grad_inputs_provided, TF_Output* inputs, int ninputs, - TF_Output* outputs, int noutputs, TF_Output* grad_outputs) { + void AddGradients(bool grad_inputs_provided, const char* prefix, + TF_Output* inputs, int ninputs, TF_Output* outputs, + int noutputs, TF_Output* grad_outputs) { if (grad_inputs_provided) { TF_Output grad_inputs[1]; const float grad_inputs_val[] = {1.0, 1.0, 1.0, 1.0}; TF_Operation* grad_inputs_op = FloatConst2x2(graph_, s_, grad_inputs_val, "GradInputs"); grad_inputs[0] = TF_Output{grad_inputs_op, 0}; - TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, grad_inputs, - s_, grad_outputs); + TF_AddGradientsWithPrefix(graph_, prefix, outputs, noutputs, inputs, + ninputs, grad_inputs, s_, grad_outputs); } else { - TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, nullptr, s_, - grad_outputs); + TF_AddGradientsWithPrefix(graph_, prefix, outputs, noutputs, inputs, + ninputs, nullptr, s_, grad_outputs); } } @@ -1681,6 +1708,20 @@ class CApiGradientsTest : public ::testing::Test { return op; } + void BuildGraphAndAddGradientsWithPrefixes(const char* prefix1, + const char* prefix2 = nullptr) { + TF_Output inputs[2]; + TF_Output outputs[1]; + TF_Output grad_outputs[2]; + + BuildSuccessGraph(inputs, outputs); + + AddGradients(false, prefix1, inputs, 2, outputs, 1, grad_outputs); + if (prefix2 != nullptr) { + AddGradients(false, prefix2, inputs, 2, outputs, 1, grad_outputs); + } + } + TF_Status* s_; TF_Graph* graph_; TF_Graph* expected_graph_; @@ -1700,6 +1741,111 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { TestGradientsError(false); } +TEST_F(CApiGradientsTest, GradientsPrefix_PrefixIsOk) { + BuildGraphAndAddGradientsWithPrefixes("gradients"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, 
GradientsPrefix_TwoGradientsWithDistinctPrefixes) { + BuildGraphAndAddGradientsWithPrefixes("gradients", "gradients_1"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_TwoGradientsInSameScope) { + BuildGraphAndAddGradientsWithPrefixes("scope/gradients", "scope/gradients_1"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_TwoGradientsInDifferentScopes) { + BuildGraphAndAddGradientsWithPrefixes("scope/gradients", "scope_1/gradients"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_2ndGradientsAsSubScopeOf1st) { + BuildGraphAndAddGradientsWithPrefixes("gradients", "gradients/sub"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_PrefixMatchesExistingNodeName) { + BuildGraphAndAddGradientsWithPrefixes("Const_0"); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_TwoGradientsWithIdenticalPrefixes) { + BuildGraphAndAddGradientsWithPrefixes("gradients", "gradients"); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_2ndGradientsMatchingNodeOf1st) { + BuildGraphAndAddGradientsWithPrefixes("gradients", "gradients/MatMul"); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_1stGradientsMatchingNodeOf2nd) { + BuildGraphAndAddGradientsWithPrefixes("gradients/MatMul", "gradients"); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); +} + +TEST_F(CApiGradientsTest, GradientsPrefix_2ndGradientsAsParentScopeOf1st) { + BuildGraphAndAddGradientsWithPrefixes("gradients/sub", "gradients"); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_); +} + +void ScalarFloatFromTensor(const TF_Tensor* t, float* f) { + ASSERT_TRUE(t != nullptr); + 
ASSERT_EQ(TF_FLOAT, TF_TensorType(t)); + ASSERT_EQ(0, TF_NumDims(t)); + ASSERT_EQ(4, TF_TensorByteSize(t)); + float* p = static_cast(TF_TensorData(t)); + *f = *p; +} + +TEST_F(CApiGradientsTest, MultipleCallsToAddGradients) { + const float X = 3.0f, Y = 7.0f; + TF_Operation* x = Placeholder(graph_, s_, "x", TF_FLOAT); + TF_Operation* y = Placeholder(graph_, s_, "y", TF_FLOAT); + TF_Operation* xy = Mul(x, y, graph_, s_, "xy"); + TF_Output dxy_dx, dxy_dy; + + TF_Output outputs[1] = {{xy, 0}}; + TF_Output inputs[1] = {{x, 0}}; + TF_AddGradients(graph_, outputs, 1, inputs, 1, nullptr, s_, &dxy_dx); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + inputs[0] = {y, 0}; + TF_AddGradients(graph_, outputs, 1, inputs, 1, nullptr, s_, &dxy_dy); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_SessionOptions* opts = TF_NewSessionOptions(); + TF_Session* sess = TF_NewSession(graph_, opts, s_); + TF_DeleteSessionOptions(opts); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Output feeds[] = {{x, 0}, {y, 0}}; + TF_Tensor* feedValues[] = {FloatTensor(X), FloatTensor(Y)}; + TF_Output fetches[] = {dxy_dx, dxy_dy}; + TF_Tensor* fetchValues[] = {nullptr, nullptr}; + + TF_SessionRun(sess, nullptr /* run_options */, feeds, feedValues, 2, fetches, + fetchValues, 2, nullptr /* target_opers */, 0, + nullptr /* run_metadata */, s_); + TF_DeleteTensor(feedValues[0]); + TF_DeleteTensor(feedValues[1]); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_DeleteSession(sess, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + float dxy_dxValue = 0.0f, dxy_dyValue = 0.0f; + ScalarFloatFromTensor(fetchValues[0], &dxy_dxValue); + EXPECT_EQ(Y, dxy_dxValue); + + ScalarFloatFromTensor(fetchValues[1], &dxy_dyValue); + EXPECT_EQ(X, dxy_dyValue); + + TF_DeleteTensor(fetchValues[0]); + TF_DeleteTensor(fetchValues[1]); +} + // REGISTER_OP for CApiAttributesTest test cases. // Registers two ops, each with a single attribute called 'v'. 
// The attribute in one op will have a type 'type', the other @@ -1784,7 +1930,7 @@ TEST_F(CApiAttributesTest, String) { TEST_F(CApiAttributesTest, StringList) { std::vector list = {"bugs", "bunny", "duck"}; - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; StringVectorToArrays(list, &list_ptrs, &list_lens); int list_total_size = 0; @@ -1800,7 +1946,7 @@ TEST_F(CApiAttributesTest, StringList) { ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); EXPECT_TF_META("v", list.size(), TF_ATTR_STRING, list_total_size); - std::unique_ptr values(new void*[list.size()]); + std::unique_ptr values(new void*[list.size()]); std::unique_ptr lens(new size_t[list.size()]); std::unique_ptr storage(new char[list_total_size]); TF_OperationGetAttrStringList(oper, "v", values.get(), lens.get(), @@ -2025,7 +2171,7 @@ TEST_F(CApiAttributesTest, TensorShapeProtoList) { tensorflow::PartialTensorShape(pts2).AsProto(&proto); proto.SerializeToString(&bytes2); - std::unique_ptr list_ptrs; + std::unique_ptr list_ptrs; std::unique_ptr list_lens; const std::vector list = {bytes1, bytes2}; StringVectorToArrays(list, &list_ptrs, &list_lens); @@ -2257,6 +2403,57 @@ TEST(TestApiDef, TestCreateApiDefWithOverwrites) { TF_DeleteLibraryHandle(lib); } +class DummyKernel : public tensorflow::OpKernel { + public: + explicit DummyKernel(tensorflow::OpKernelConstruction* context) + : OpKernel(context) {} + void Compute(tensorflow::OpKernelContext* context) override {} +}; + +// Test we can query kernels +REGISTER_OP("TestOpWithSingleKernel") + .Input("a: float") + .Input("b: float") + .Output("o: float"); +REGISTER_KERNEL_BUILDER( + Name("TestOpWithSingleKernel").Device(tensorflow::DEVICE_CPU), DummyKernel); + +TEST(TestKernel, TestGetAllRegisteredKernels) { + TF_Status* status = TF_NewStatus(); + TF_Buffer* kernel_list_buf = TF_GetAllRegisteredKernels(status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + KernelList kernel_list; + 
kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length); + ASSERT_GT(kernel_list.kernel_size(), 0); + TF_DeleteBuffer(kernel_list_buf); + TF_DeleteStatus(status); +} + +TEST(TestKernel, TestGetRegisteredKernelsForOp) { + TF_Status* status = TF_NewStatus(); + TF_Buffer* kernel_list_buf = + TF_GetRegisteredKernelsForOp("TestOpWithSingleKernel", status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + KernelList kernel_list; + kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length); + ASSERT_EQ(kernel_list.kernel_size(), 1); + EXPECT_EQ(kernel_list.kernel(0).op(), "TestOpWithSingleKernel"); + EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU"); + TF_DeleteBuffer(kernel_list_buf); + TF_DeleteStatus(status); +} + +TEST(TestKernel, TestGetRegisteredKernelsForOpNoKernels) { + TF_Status* status = TF_NewStatus(); + TF_Buffer* kernel_list_buf = TF_GetRegisteredKernelsForOp("Unknown", status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + KernelList kernel_list; + kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length); + ASSERT_EQ(kernel_list.kernel_size(), 0); + TF_DeleteBuffer(kernel_list_buf); + TF_DeleteStatus(status); +} + #undef EXPECT_TF_META } // namespace diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc index f3b28c1708129d39e451d927a89c0d10e2193b63..f15d9ee20adb31a0b76e2cd0d1e67f17a9deff05 100644 --- a/tensorflow/c/c_test_util.cc +++ b/tensorflow/c/c_test_util.cc @@ -26,6 +26,10 @@ limitations under the License. 
using tensorflow::GraphDef; using tensorflow::NodeDef; +static void BoolDeallocator(void* data, size_t, void* arg) { + delete[] static_cast(data); +} + static void Int32Deallocator(void* data, size_t, void* arg) { delete[] static_cast(data); } @@ -38,6 +42,14 @@ static void FloatDeallocator(void* data, size_t, void* arg) { delete[] static_cast(data); } +TF_Tensor* BoolTensor(bool v) { + const int num_bytes = sizeof(bool); + bool* values = new bool[1]; + values[0] = v; + return TF_NewTensor(TF_BOOL, nullptr, 0, values, num_bytes, &BoolDeallocator, + nullptr); +} + TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) { int64_t num_values = 1; for (int i = 0; i < num_dims; ++i) { @@ -131,6 +143,12 @@ TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, return op; } +TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s, + const char* name) { + unique_tensor_ptr tensor(BoolTensor(v), TF_DeleteTensor); + return Const(tensor.get(), graph, s, name); +} + TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, const char* name) { unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor); @@ -216,6 +234,13 @@ TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph, return MinWithDevice(l, r, graph, /*op_device=*/"", s, name); } +TF_Operation* Mul(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name) { + TF_Operation* op; + BinaryOpHelper("Mul", l, r, graph, s, name, &op, "", true); + return op; +} + TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, const char* name) { TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h index c16aba666ee6974fed5351c2d9ac291dcbcdecab..7eeb1ee5e17ad7e5644f8bc8a18ca967b108475d 100644 --- a/tensorflow/c/c_test_util.h +++ b/tensorflow/c/c_test_util.h @@ -31,6 +31,8 @@ using ::tensorflow::string; typedef std::unique_ptr unique_tensor_ptr; 
+TF_Tensor* BoolTensor(int32_t v); + // Create a tensor with values of type TF_INT8 provided by `values`. TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values); @@ -55,6 +57,9 @@ TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name = "const"); +TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, const char* name = "scalar"); @@ -80,6 +85,9 @@ TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s, const char* name = "min"); +TF_Operation* Mul(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name = "mul"); + // If `op_device` is non-empty, set the created op on that device. TF_Operation* MinWithDevice(TF_Operation* l, TF_Operation* r, TF_Graph* graph, const string& op_device, TF_Status* s, diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index 74bc25a491ac01cb725d1c004197e48727c30230..d3311f0cd06f2b151c3567735eb41b5baf72e102 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -125,7 +125,7 @@ CheckpointReader::BuildV2VarMaps() { const auto& slice_proto = entry.slices(i); CHECK(filtered_keys .insert(EncodeTensorNameSlice( - std::string(v2_reader_->key()) /* full var's name */, + string(v2_reader_->key()) /* full var's name */, TensorSlice(slice_proto))) .second); } @@ -138,11 +138,11 @@ CheckpointReader::BuildV2VarMaps() { new TensorSliceReader::VarToDataTypeMap); v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { - if (filtered_keys.count(std::string(v2_reader_->key())) > 0) continue; + if (filtered_keys.count(string(v2_reader_->key())) > 0) continue; 
CHECK(entry.ParseFromArray(v2_reader_->value().data(), v2_reader_->value().size())) << entry.InitializationErrorString(); - string key = std::string(v2_reader_->key()); + string key(v2_reader_->key()); (*var_to_shape_map)[key] = TensorShape(entry.shape()); (*var_to_data_type_map)[key] = DataType(entry.dtype()); } diff --git a/tensorflow/c/checkpoint_reader.h b/tensorflow/c/checkpoint_reader.h index 4de1300a7f66a8b4eb8074819432fd7dd597bb15..91654c8d4fb8067ae1fb525ebaa6c54689085545 100644 --- a/tensorflow/c/checkpoint_reader.h +++ b/tensorflow/c/checkpoint_reader.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_CHECKPOINT_READER_H -#define TENSORFLOW_C_CHECKPOINT_READER_H +#ifndef TENSORFLOW_C_CHECKPOINT_READER_H_ +#define TENSORFLOW_C_CHECKPOINT_READER_H_ #include #include @@ -79,4 +79,4 @@ class CheckpointReader { } // namespace checkpoint } // namespace tensorflow -#endif // TENSORFLOW_C_CHECKPOINT_READER_H +#endif // TENSORFLOW_C_CHECKPOINT_READER_H_ diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index f265da2c2c89c0e9caf14f2213c606fcb69997e0..37be52f57d865c1e59611540d5dab04b59e89444 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -54,7 +54,6 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", - "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", @@ -93,10 +92,10 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime/eager:eager_client", 
"//tensorflow/core/distributed_runtime/eager:remote_tensor_handle", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", - "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", ], ) @@ -122,6 +121,7 @@ tf_cuda_library( tf_cuda_cc_test( name = "c_api_test", + size = "small", srcs = [ "c_api_debug_test.cc", "c_api_test.cc", @@ -139,7 +139,7 @@ tf_cuda_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", ], ) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc old mode 100644 new mode 100755 index 81221c4078bec9820ee187efdf0314da378be62b..1ccae3f138920b1908f18387ea87b11388115d37 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -36,9 +36,9 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/execute.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" -#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" #include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/node_def_util.h" @@ -46,10 +46,12 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -107,7 +109,8 @@ tensorflow::Status GetAllRemoteDevices( } tensorflow::Status CreateRemoteContexts( - const std::vector& remote_workers, + const std::vector& remote_workers, int64 rendezvous_id, + int keep_alive_secs, const tensorflow::ServerDef& server_def, tensorflow::eager::EagerClientCache* remote_eager_workers, bool async, tensorflow::gtl::FlatMap* remote_contexts) { for (int i = 0; i < remote_workers.size(); i++) { @@ -115,15 +118,18 @@ tensorflow::Status CreateRemoteContexts( tensorflow::eager::CreateContextRequest request; tensorflow::eager::CreateContextResponse response; + request.set_rendezvous_id(rendezvous_id); tensorflow::DeviceNameUtils::ParsedName parsed_name; if (!tensorflow::DeviceNameUtils::ParseFullName(remote_worker, &parsed_name)) { return tensorflow::errors::InvalidArgument( "Unable to parse ", remote_worker, " as a device name"); } + *request.mutable_server_def() = server_def; request.mutable_server_def()->set_job_name(parsed_name.job); request.mutable_server_def()->set_task_index(parsed_name.task); request.set_async(async); + request.set_keep_alive_secs(keep_alive_secs); auto* eager_client = remote_eager_workers->GetClient(remote_worker); if (eager_client == nullptr) { return tensorflow::errors::Internal( @@ -145,48 +151,86 @@ tensorflow::Status CreateRemoteContexts( return tensorflow::Status::OK(); } -tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, 
- TFE_Context** ctx) { - string worker_name = tensorflow::strings::StrCat( - "/job:", opts->server_def.job_name(), - "/replica:0/task:", opts->server_def.task_index()); - std::unique_ptr server; - TF_RETURN_IF_ERROR( - tensorflow::eager::EagerGrpcServer::Create(opts->server_def, &server)); +tensorflow::Status UpdateTFE_ContextWithServerDef( + int keep_alive_secs, const tensorflow::ServerDef& server_def, + TFE_Context* ctx) { + // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the + // server object (which currently CHECK-fails) and we miss the error, instead, + // we log the error, and then return to allow the user to see the error + // message. +#define LOG_AND_RETURN_IF_ERROR(...) \ + do { \ + const ::tensorflow::Status _status = (__VA_ARGS__); \ + if (TF_PREDICT_FALSE(!_status.ok())) { \ + LOG(ERROR) << _status.error_message(); \ + return _status; \ + } \ + } while (0); + + string worker_name = + tensorflow::strings::StrCat("/job:", server_def.job_name(), + "/replica:0/task:", server_def.task_index()); + + std::unique_ptr server; + LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &server)); + + tensorflow::GrpcServer* grpc_server = + dynamic_cast(server.get()); + if (grpc_server == nullptr) { + LOG_AND_RETURN_IF_ERROR(tensorflow::errors::Internal( + "Currently, TFE_NewContext only supports tensorflow::GrpcServer.")); + } - TF_RETURN_IF_ERROR(server->Start()); + LOG_AND_RETURN_IF_ERROR(grpc_server->Start()); + + int64 rendezvous_id = tensorflow::random::New64(); std::vector remote_workers; - server->master_env()->worker_cache->ListWorkers(&remote_workers); + grpc_server->master_env()->worker_cache->ListWorkers(&remote_workers); remote_workers.erase( std::remove(remote_workers.begin(), remote_workers.end(), worker_name), remote_workers.end()); std::unique_ptr remote_device_mgr; - TF_RETURN_IF_ERROR(GetAllRemoteDevices( - remote_workers, server->master_env()->worker_cache, &remote_device_mgr)); + 
LOG_AND_RETURN_IF_ERROR(GetAllRemoteDevices( + remote_workers, grpc_server->master_env()->worker_cache, + &remote_device_mgr)); std::shared_ptr channel_cache = - server->channel_cache(); + grpc_server->channel_cache(); std::unique_ptr remote_eager_workers( tensorflow::eager::NewGrpcEagerClientCache(channel_cache)); // Initialize remote eager workers. tensorflow::gtl::FlatMap remote_contexts; - TF_RETURN_IF_ERROR(CreateRemoteContexts(remote_workers, - remote_eager_workers.get(), - opts->async, &remote_contexts)); + LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts( + remote_workers, rendezvous_id, keep_alive_secs, server_def, + remote_eager_workers.get(), ctx->context.Async(), &remote_contexts)); tensorflow::RemoteRendezvous* r = - server->worker_env()->rendezvous_mgr->Find(0); + grpc_server->worker_env()->rendezvous_mgr->Find(rendezvous_id); + + auto session_name = tensorflow::strings::StrCat("eager_", rendezvous_id); + TF_RETURN_IF_ERROR(grpc_server->worker_env()->session_mgr->CreateSession( + session_name, server_def, true)); + + std::shared_ptr worker_session; + TF_RETURN_IF_ERROR( + grpc_server->worker_env()->session_mgr->WorkerSessionForSession( + session_name, &worker_session)); - auto* device_mgr = server->worker_env()->device_mgr; - *ctx = new TFE_Context(opts->session_options.options, opts->policy, - opts->async, device_mgr, r, std::move(server), - std::move(remote_eager_workers), - std::move(remote_device_mgr), remote_contexts); + // Initialize remote tensor communication based on worker session. 
+ TF_RETURN_IF_ERROR(r->Initialize(worker_session.get())); + + auto* device_mgr = grpc_server->worker_env()->device_mgr; + + ctx->context.InitializeRemote(std::move(server), + std::move(remote_eager_workers), + std::move(remote_device_mgr), remote_contexts, + r, device_mgr, keep_alive_secs); return tensorflow::Status::OK(); +#undef LOG_AND_RETURN_IF_ERROR } } // namespace @@ -200,38 +244,23 @@ void TFE_ContextOptionsSetConfig(TFE_ContextOptions* options, const void* proto, } void TFE_ContextOptionsSetAsync(TFE_ContextOptions* options, - unsigned char async) { - options->async = async; + unsigned char enable) { + options->async = enable; } void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions* options, TFE_ContextDevicePlacementPolicy policy) { options->policy = policy; } -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( - TFE_ContextOptions* options, const void* proto, size_t proto_len, - TF_Status* status) { - if (!options->server_def.ParseFromArray(proto, proto_len)) { - status->status = tensorflow::errors::InvalidArgument( - "Invalid tensorflow.ServerDef protocol buffer"); - } -} - TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, - unsigned char async, + unsigned char enable, TF_Status* status) { - status->status = ctx->context.SetAsyncForThread(async); + status->status = ctx->context.SetAsyncForThread(enable); } void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { - if (!opts->server_def.job_name().empty()) { - TFE_Context* ctx = nullptr; - status->status = NewRemoteAwareTFE_Context(opts, &ctx); - return ctx; - } - std::vector devices; status->status = tensorflow::DeviceFactory::AddDevices( opts->session_options.options, "/job:localhost/replica:0/task:0", @@ -247,7 +276,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { opts->async, std::move(device_mgr), r); } -void 
TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; } +void TFE_DeleteContext(TFE_Context* ctx) { delete ctx; } TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) { TF_DeviceList* list = new TF_DeviceList; @@ -260,6 +289,22 @@ TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) { void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context.ClearCaches(); } +// Set server_def on the context, possibly updating it. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status) { + tensorflow::ServerDef server_def; + if (!server_def.ParseFromArray(proto, proto_len)) { + status->status = tensorflow::errors::InvalidArgument( + "Invalid tensorflow.ServerDef protocol buffer"); + return; + } + status->status = + UpdateTFE_ContextWithServerDef(keep_alive_secs, server_def, ctx); +} + void TFE_ContextSetThreadLocalDevicePlacementPolicy( TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) { ctx->context.SetThreadLocalDevicePlacementPolicy( @@ -295,7 +340,7 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { } void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { - DCHECK(h); + if (h == nullptr) return; if (h->handle) { h->handle->Unref(); } @@ -307,19 +352,34 @@ TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) { } int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) { - const tensorflow::Tensor* t = nullptr; - status->status = h->handle->Tensor(&t); - return t == nullptr ? 
0 : t->dims(); + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return -1; + } + int result; + status->status = h->handle->NumDims(&result); + return result; } int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index, TF_Status* status) { - const tensorflow::Tensor* t = nullptr; - status->status = h->handle->Tensor(&t); - return t == nullptr ? 0 : t->dim_size(dim_index); + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return -1; + } + tensorflow::int64 result; + status->status = h->handle->Dim(dim_index, &result); + return result; } const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return nullptr; + } tensorflow::Device* d = nullptr; status->status = h->handle->OpDevice(&d); return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0" @@ -327,6 +387,11 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { } TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return nullptr; + } // TODO(agarwal): move this implementation inside TFE_TensorHandle. 
tensorflow::Device* d = nullptr; tensorflow::Device* op_device = nullptr; @@ -421,8 +486,11 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, return ret; } -void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) { - op->operation.MutableAttrs()->Set(attr_name, value); +void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const void* value, + size_t length) { + op->operation.MutableAttrs()->Set( + attr_name, + tensorflow::StringPiece(static_cast(value), length)); } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { @@ -473,16 +541,22 @@ void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, op->operation.MutableAttrs()->Set(attr_name, attr_value); } -#define TFE_OP_SET_ATTR_LIST(fn, type) \ - void fn(TFE_Op* op, const char* attr_name, const type* values, \ - int num_values) { \ - op->operation.MutableAttrs()->Set( \ - attr_name, \ - tensorflow::gtl::ArraySlice(values, num_values)); \ +void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, + const void* const* values, const size_t* lengths, + int num_values) { + std::vector v(num_values); + for (int i = 0; i < num_values; ++i) { + v[i] = tensorflow::StringPiece(static_cast(values[i]), + lengths[i]); } -TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*) -TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) -#undef TFE_OP_SET_ATTR_LIST + op->operation.MutableAttrs()->Set(attr_name, v); +} + +void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, + const float* values, int num_values) { + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice(values, num_values)); +} void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { @@ -614,17 +688,17 @@ TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t) { const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory( TFE_TensorHandle* h, TF_Status* status) { - tensorflow::Device* d = nullptr; - 
tensorflow::Device* op_device = nullptr; - const tensorflow::Tensor* t = nullptr; - status->status = h->handle->TensorAndDevice(&t, &d, &op_device); - if (!status->status.ok()) return nullptr; - if (d != nullptr) { + if (!h->handle->OnHostCPU()) { status->status = tensorflow::errors::FailedPrecondition( "TFE_TensorHandle is placed in device (not host) memory. Cannot return " "a tensorflow::Tensor"); return nullptr; } + tensorflow::Device* d = nullptr; + tensorflow::Device* op_device = nullptr; + const tensorflow::Tensor* t = nullptr; + status->status = h->handle->TensorAndDevice(&t, &d, &op_device); + if (!status->status.ok()) return nullptr; return t; } @@ -650,14 +724,20 @@ TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, } } // namespace +void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context.StartStep(); } + +void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context.EndStep(); } + namespace tensorflow { void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value, const char* attr_name, TF_Status* status) { switch (default_value.value_case()) { - case tensorflow::AttrValue::kS: - TFE_OpSetAttrString(op, attr_name, default_value.s().data()); + case tensorflow::AttrValue::kS: { + const string& v = default_value.s(); + TFE_OpSetAttrString(op, attr_name, v.data(), v.size()); break; + } case tensorflow::AttrValue::kI: TFE_OpSetAttrInt(op, attr_name, static_cast(default_value.i())); break; diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h old mode 100644 new mode 100755 index 1862af3ce2f505a6e83b4805417eaf335ed07bc0..eec2750d6eb3bceed8da3ed44812ac2e8fd5c877 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -76,21 +76,11 @@ typedef enum TFE_ContextDevicePlacementPolicy { // Sets the default execution mode (sync/async). Note that this can be // overridden per thread using TFE_ContextSetAsyncForThread. 
TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, - unsigned char async); + unsigned char enable); TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); -// A tensorflow.ServerDef specifies remote workers (in addition to the current -// workers name). Operations created on this context can then be executed on -// any of these remote workers by setting an appropriate device. -// -// If the following is set, all servers identified by the -// ServerDef must be up when the context is created. -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( - TFE_ContextOptions* options, const void* proto, size_t proto_len, - TF_Status* status); - // Destroy an options object. TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*); @@ -102,8 +92,7 @@ typedef struct TFE_Context TFE_Context; TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext( const TFE_ContextOptions* opts, TF_Status* status); -TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx, - TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx); TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status); @@ -125,9 +114,21 @@ TFE_ContextGetDevicePlacementPolicy(TFE_Context*); // Overrides the execution mode (sync/async) for the current thread. TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context*, - unsigned char async, + unsigned char enable, TF_Status* status); +// A tensorflow.ServerDef specifies remote workers (in addition to the current +// workers name). Operations created on this context can then be executed on +// any of these remote workers by setting an appropriate device. +// +// If the following is set, all servers identified by the +// ServerDef must be up when the context is created. 
+TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status); + // Causes the calling thread to block till all ops dispatched in async mode // have been executed. Note that "execution" here refers to kernel execution / // scheduling of copies, etc. Similar to sync execution, it doesn't guarantee @@ -278,7 +279,8 @@ TF_CAPI_EXPORT extern TF_AttrType TFE_OpNameGetAttrType( TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, - const char* value); + const void* value, + size_t length); TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value); TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, @@ -305,7 +307,8 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrFunction(TFE_Op* op, TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, - const char** value, + const void* const* values, + const size_t* lengths, int num_values); TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, @@ -378,6 +381,16 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status); +// Some TF ops need a step container to be set to limit the lifetime of some +// resources (mostly TensorArray and Stack, used in while loop gradients in +// graph mode). Calling this on a context tells it to start a step. +TF_CAPI_EXPORT extern void TFE_ContextStartStep(TFE_Context* ctx); + +// Ends a step. When there is no active step (that is, every started step has +// been ended) step containers will be cleared. Note: it is not safe to call +// TFE_ContextEndStep while ops which rely on the step container may be running. 
+TF_CAPI_EXPORT extern void TFE_ContextEndStep(TFE_Context* ctx); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 04a6efc47c5177c82b7e88168b67cc584587de7c..a5c0681e2e4eddae08954d9d0178ca96a3f8f29a 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -39,7 +39,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/distributed_runtime/eager/eager_client.h" #include "tensorflow/core/distributed_runtime/remote_device.h" -#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h" #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" @@ -59,7 +59,6 @@ struct TFE_ContextOptions { // true if async execution is enabled. 
bool async = false; TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT}; - tensorflow::ServerDef server_def; }; struct TFE_Context { @@ -73,23 +72,6 @@ struct TFE_Context { default_policy), async, std::move(device_mgr), rendezvous) {} - explicit TFE_Context( - const tensorflow::SessionOptions& opts, - TFE_ContextDevicePlacementPolicy default_policy, bool async, - tensorflow::DeviceMgr* local_device_mgr, - tensorflow::Rendezvous* rendezvous, - std::unique_ptr server, - std::unique_ptr remote_eager_workers, - std::unique_ptr remote_device_mgr, - const tensorflow::gtl::FlatMap& - remote_contexts) - : context(opts, - static_cast( - default_policy), - async, local_device_mgr, rendezvous, std::move(server), - std::move(remote_eager_workers), std::move(remote_device_mgr), - remote_contexts) {} - tensorflow::EagerContext context; }; diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 27ff5f7211b0592637a173d337f93c10d376443f..7126227cf529023eadf38984668a40118641bb1b 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include #include "tensorflow/c/eager/c_api_test_util.h" -#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -49,7 +49,7 @@ void BM_InitOp(int iters) { } tensorflow::testing::StopTiming(); TFE_DeleteTensorHandle(m); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -80,7 +80,7 @@ void BM_Execute(int iters, int async) { tensorflow::testing::StopTiming(); TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(m); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -95,7 +95,7 @@ TEST(CAPI, Context) { TF_DeviceList* devices = TFE_ContextListDevices(ctx, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); const int num_devices = TF_DeviceListCount(devices); @@ -108,14 +108,14 @@ TEST(CAPI, Context) { TF_DeleteStatus(status); } -tensorflow::ServerDef GetServerDef(int num_tasks) { +tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { tensorflow::ServerDef server_def; server_def.set_protocol("grpc"); - server_def.set_job_name("localhost"); + server_def.set_job_name(job_name); server_def.set_task_index(0); tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name("localhost"); + job_def->set_name(job_name); for (int i = 0; i < num_tasks; i++) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( @@ -124,6 +124,10 @@ tensorflow::ServerDef GetServerDef(int num_tasks) { 
return server_def; } +tensorflow::ServerDef GetServerDef(int num_tasks) { + return GetServerDef("localhost", num_tasks); +} + void TestRemoteExecute(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -132,22 +136,24 @@ void TestRemoteExecute(bool async) { server_def.set_task_index(1); - std::unique_ptr worker_server; - ASSERT_TRUE( - tensorflow::eager::EagerGrpcServer::Create(server_def, &worker_server) - .ok()); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); ASSERT_TRUE(worker_server->Start().ok()); TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(), - status); - TFE_ContextOptionsSetAsync(opts, static_cast(1)); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, + TFE_DEVICE_PLACEMENT_EXPLICIT); TFE_Context* ctx = TFE_NewContext(opts, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(); const char remote_device_name[] = @@ -193,8 +199,8 @@ void TestRemoteExecute(bool async) { TFE_DeleteOp(matmul); TFE_ContextAsyncWait(ctx, status); - TFE_DeleteContext(ctx, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); @@ -205,6 +211,236 @@ void TestRemoteExecute(bool async) { TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } +void TestRemoteExecuteSilentCopies(bool async) { + tensorflow::ServerDef 
server_def = GetServerDef(3); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(); + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + auto* h1_task2 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, task2_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Handles are on task0 (local), and task2, but op is on task1. 
+ TFE_Op* matmul = MatMulOp(ctx, h0_task0, h1_task2); + TFE_OpSetDevice(matmul, task1_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + auto* retval_task0 = TFE_TensorHandleCopyToDevice( + retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retval_task0); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(h1_task0); + TFE_DeleteTensorHandle(h1_task2); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_ContextAsyncWait(ctx, status); + TFE_DeleteContext(ctx); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_DeleteStatus(status); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. 
+ worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, RemoteExecuteSilentCopies) { TestRemoteExecuteSilentCopies(false); } +TEST(CAPI, RemoteExecuteSilentCopiesAsync) { + TestRemoteExecuteSilentCopies(true); +} + +void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, + const std::vector& expected_values) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = TFE_TensorHandleResolve(handle, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + std::unique_ptr actual_values(new float[expected_values.size()]); + EXPECT_EQ(sizeof(float) * expected_values.size(), TF_TensorByteSize(t)); + memcpy(actual_values.get(), TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + + for (int i = 0; i < expected_values.size(); i++) { + EXPECT_EQ(expected_values[i], actual_values[i]) + << "Mismatch in expected values at (zero-based) index " << i; + } +} + +void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, + const char* remote_device_name, + const char* local_device_name) { + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); + + TFE_Op* matmul = MatMulOp(ctx, h0_task0, h0_task0); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + auto* retval_task0 = + TFE_TensorHandleCopyToDevice(retvals[0], ctx, local_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + CheckTFE_TensorHandleHasFloats(retval_task0, {7, 10, 15, 22}); + + TFE_DeleteTensorHandle(retval_task0); + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + 
TF_DeleteStatus(status); +} + +void TestRemoteExecuteChangeServerDef(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + const char local_device_name[] = + "/job:localhost/replica:0/task:0/device:CPU:0"; + CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. + worker_server.release(); + + // Update the server def with a new set of names (worker instead of + // localhost). 
+ tensorflow::ServerDef updated_server_def = GetServerDef("worker", 2); + serialized = updated_server_def.SerializeAsString(); + + updated_server_def.set_task_index(1); + tensorflow::Status s = tensorflow::GrpcServer::Create( + updated_server_def, tensorflow::Env::Default(), &worker_server); + ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(worker_server->Start().ok()); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Create a new tensor_handle. + TFE_TensorHandle* h0_task0_new = TestMatrixTensorHandle(); + + // Check that copying it to the old remote device (named localhost) fails. + TFE_TensorHandleCopyToDevice(h0_task0_new, ctx, remote_device_name, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Copying and executing on the new remote device works. + const char new_remote_device_name[] = + "/job:worker/replica:0/task:1/device:CPU:0"; + const char new_local_device_name[] = + "/job:worker/replica:0/task:0/device:CPU:0"; + + auto* h0_task1_new = TFE_TensorHandleCopyToDevice( + h0_task0_new, ctx, new_remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_DeleteTensorHandle(h0_task0_new); + TFE_DeleteTensorHandle(h0_task1_new); + + CheckRemoteMatMulExecutesOK(ctx, new_remote_device_name, + new_local_device_name); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_DeleteStatus(status); + + TFE_DeleteContext(ctx); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. 
+ worker_server.release(); +} + +TEST(CAPI, RemoteExecuteChangeServerDef) { + TestRemoteExecuteChangeServerDef(false); +} +TEST(CAPI, RemoteExecuteChangeServerDefAsync) { + TestRemoteExecuteChangeServerDef(true); +} + TEST(CAPI, TensorHandle) { TFE_TensorHandle* h = TestMatrixTensorHandle(); EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h)); @@ -289,8 +525,7 @@ void TensorHandleCopyBetweenDevices(bool async) { TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); - TFE_DeleteContext(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContext(ctx); } TEST(CAPI, TensorHandleCopyBetweenDevices) { @@ -327,7 +562,7 @@ void TensorHandleCopyBetweenDevicesError(bool async) { TFE_DeleteTensorHandle(hcopy); TFE_DeleteTensorHandle(hcpu); if (hdevice != nullptr) TFE_DeleteTensorHandle(hdevice); - TFE_DeleteContext(ctx, status.get()); + TFE_DeleteContext(ctx); } TEST(CAPI, TensorHandleCopyBetweenDevicesError) { @@ -360,7 +595,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) { TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); - TFE_DeleteContext(ctx, status.get()); + TFE_DeleteContext(ctx); return; } const string gpu_1_name(TF_DeviceListName(devices, 1, status.get())); @@ -393,8 +628,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) { TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); - TFE_DeleteContext(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContext(ctx); } TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) { @@ -442,8 +676,7 @@ void TensorHandleSilentCopy(bool async) { TFE_DeleteTensorHandle(hcpu); TFE_ContextAsyncWait(ctx, status.get()); EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TFE_DeleteContext(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContext(ctx); } TEST(CAPI, 
TensorHandleSilentCopy) { TensorHandleSilentCopy(false); } @@ -489,8 +722,7 @@ void TensorHandleSilentCopyLocal(bool async) { TFE_DeleteTensorHandle(hcpu); TFE_ContextAsyncWait(ctx, status.get()); EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TFE_DeleteContext(ctx, status.get()); - EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteContext(ctx); } TEST(CAPI, TensorHandleSilentCopyLocal) { TensorHandleSilentCopyLocal(false); } TEST(CAPI, TensorHandleSilentCopyLocalAsync) { @@ -523,11 +755,47 @@ void SetAndGetOpDevices(bool async) { TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(m); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } +TEST(CAPI, TensorHandleNullptr) { + TFE_TensorHandle* h = nullptr; + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + TF_Tensor* t = TFE_TensorHandleResolve(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(t, nullptr); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + const char* device_name = TFE_TensorHandleDeviceName(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(device_name, nullptr); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + int num_dims = TFE_TensorHandleNumDims(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(num_dims, -1); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + int dim = TFE_TensorHandleDim(h, 0, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(dim, -1); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); 
+} + void Execute_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -549,7 +817,7 @@ void Execute_MatMul_CPU(bool async) { TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteTensorHandle(retvals[0]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); float product[4] = {0}; EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); @@ -621,7 +889,7 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) { TFE_DeleteTensorHandle(m1); TFE_DeleteTensorHandle(m2); TFE_DeleteTensorHandle(retvals[0]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); } TEST(CAPI, Execute_MatMul_CPU_Runtime_Error) { @@ -652,7 +920,7 @@ void Execute_MatMul_CPU_Type_Error(bool async) { if (retvals[0] != nullptr) { TFE_DeleteTensorHandle(retvals[0]); } - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); } @@ -690,7 +958,7 @@ TEST(CAPI, Execute_Min_CPU) { TF_DeleteTensor(t); EXPECT_EQ(1, output[0]); EXPECT_EQ(3, output[1]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -732,7 +1000,7 @@ void Execute_MatMul_XLA_CPU(bool async) { EXPECT_EQ(10, product[1]); EXPECT_EQ(15, product[2]); EXPECT_EQ(22, product[3]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); } TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); } @@ -771,7 +1039,7 @@ void Execute_Min_XLA_CPU(bool async) { TF_DeleteTensor(t); EXPECT_EQ(1, output[0]); EXPECT_EQ(3, output[1]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); } TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); } @@ -807,7 +1075,7 @@ void ExecuteWithTracing(bool async) { TF_Tensor* t = 
TFE_TensorHandleResolve(retvals[0], status); TFE_DeleteTensorHandle(retvals[0]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); float product[4] = {0}; EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); @@ -883,7 +1151,7 @@ TEST(CAPI, Function_ident_CPU) { TF_DeleteTensor(r); TFE_DeleteTensorHandle(result[0]); } - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); TF_DeleteStatus(status); } @@ -953,7 +1221,7 @@ TEST(CAPI, Function_ident_XLA_CPU) { TF_DeleteTensor(r); TFE_DeleteTensorHandle(result[0]); } - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); TF_DeleteStatus(status); } @@ -1029,7 +1297,7 @@ void FunctionDefAndExecute(bool async) { EXPECT_EQ(10, product[1]); EXPECT_EQ(15, product[2]); EXPECT_EQ(22, product[3]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -1070,7 +1338,7 @@ void BM_ExecuteFunction(int iters, int async) { tensorflow::testing::StopTiming(); TFE_DeleteTensorHandle(m); TFE_DeleteTensorHandle(retval[0]); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -1083,8 +1351,8 @@ TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, if (TF_GetCode(status) != TF_OK) return nullptr; TFE_OpSetAttrType(op, "dtype", TF_FLOAT); TFE_OpSetAttrShape(op, "shape", {}, 0, status); - TFE_OpSetAttrString(op, "container", ""); - TFE_OpSetAttrString(op, "shared_name", ""); + TFE_OpSetAttrString(op, "container", "", 0); + TFE_OpSetAttrString(op, "shared_name", "", 0); if (TF_GetCode(status) != TF_OK) return nullptr; TFE_TensorHandle* var_handle = nullptr; int num_retvals = 1; @@ -1158,7 +1426,7 @@ TEST(CAPI, Variables) { 
TFE_DeleteTensorHandle(var_handle); TFE_DeleteTensorHandle(value_handle); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } @@ -1197,10 +1465,67 @@ void BM_ReadVariable(int iters) { TFE_DeleteOp(op); TFE_DeleteTensorHandle(var_handle); - TFE_DeleteContext(ctx, status); + TFE_DeleteContext(ctx); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteStatus(status); } BENCHMARK(BM_ReadVariable); +TEST(CAPI, StringAttributes) { + // Test that TFE_OpSetAttrString doesn't hold on to the value after it + // returns. + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::vector dims(4, 1); + TFE_Op* op = TFE_NewOp(ctx, "AvgPool", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* tensor = + TF_AllocateTensor(TF_FLOAT, dims.data(), dims.size(), sizeof(float)); + float tensor_data[] = {1}; + memcpy(TF_TensorData(tensor), tensor_data, TF_TensorByteSize(tensor)); + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandle(tensor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, tensor_handle, status); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(tensor_handle); + + std::vector values(4, 1); + TFE_OpSetAttrIntList(op, "ksize", values.data(), values.size()); + TFE_OpSetAttrIntList(op, "strides", values.data(), values.size()); + + const int BUFFER_SIZE = 10; + char buffer[BUFFER_SIZE]; + std::strncpy(buffer, "VALID", BUFFER_SIZE); + TFE_OpSetAttrString(op, "padding", buffer, std::strlen(buffer)); + // Overwriting value in "buffer", should be fine since TFE_Op + // shouldn't be holding on to it. 
+ std::strncpy(buffer, "NHWC", BUFFER_SIZE); + TFE_OpSetAttrString(op, "data_format", buffer, std::strlen(buffer)); + + TFE_OpSetAttrType(op, "T", TF_FLOAT); + + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(op, &retvals[0], &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + + tensor = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(4, TF_TensorByteSize(tensor)); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(op); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} } // namespace diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 734e712daa39c03f0177eb199b1acb1b19e5d845..ce038a4b57b2699c6d09fcf75ef41cecec4e97b8 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -440,6 +440,15 @@ Status InitialGradients(const VSpace& vspace, return Status::OK(); } +gtl::FlatMap>* FunctionsAcceptingNoneForIndicesMap() { + static auto* const m = new gtl::FlatMap>({ + {"SoftmaxCrossEntropyWithLogits", {1}}, + {"SparseSoftmaxCrossEntropyWithLogits", {1}}, + {"FusedBatchNorm", {1, 2, 3, 4}}, + }); + return m; +} + } // namespace // If over kMinAggregateCount gradients are accumulated and the total @@ -485,10 +494,6 @@ Status GradientTape::ComputeGradient( VLOG(1) << " " << t; } } - gtl::FlatMap> functions_accept_none_for_indices({ - {"SoftmaxCrossEntropyWithLogits", {1}}, - {"FusedBatchNorm", {1, 2, 3, 4}}, - }); while (!op_stack.empty()) { const int64 op = op_stack.back(); VLOG(1) << "Popped " << op; @@ -509,8 +514,8 @@ Status GradientTape::ComputeGradient( auto grad_it = gradients.find(id); if (grad_it == gradients.end()) { auto func_name_it = - functions_accept_none_for_indices.find(trace.op_type); - if (func_name_it != functions_accept_none_for_indices.end() && + 
FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); + if (func_name_it != FunctionsAcceptingNoneForIndicesMap()->end() && func_name_it->second.find(i) != func_name_it->second.end()) { out_gradients.push_back(nullptr); } else { @@ -520,7 +525,12 @@ Status GradientTape::ComputeGradient( } } else { any_gradient_nonzero = true; - auto new_gradients = vspace.AggregateGradients(grad_it->second); + Gradient* new_gradients = nullptr; + if (grad_it->second.size() == 1) { + new_gradients = grad_it->second.at(0); + } else { + new_gradients = vspace.AggregateGradients(grad_it->second); + } if (sources_set.find(grad_it->first) == sources_set.end()) { gradients.erase(grad_it); } else { diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b6153bb78c684f9290ef95900f96e9357..7184ad68fb79f2598067d68d5ab5ba8f2c7a22c8 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. 
@@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index e18fdf6c57bd3f432d8cb73536fb816df90b3963..8486b585c8587e18e8eea18a893fac0a40ff4a27 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -155,7 +155,7 @@ void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, tensorflow::shape_inference::ShapeHandle shape; status->status = ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); - if (status->status.ok()) return; + if (!status->status.ok()) return; shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); } ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h index 86e687df205617018d94c19ac34fdc3bf54dcc6f..7661a01de4afcefbb66b33a05534e22d2ba1baa0 100644 --- a/tensorflow/c/tf_status_helper.h +++ b/tensorflow/c/tf_status_helper.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H -#define TENSORFLOW_C_TF_STATUS_HELPER_H +#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H_ +#define TENSORFLOW_C_TF_STATUS_HELPER_H_ #include "tensorflow/c/c_api.h" #include "tensorflow/core/lib/core/status.h" @@ -29,4 +29,4 @@ Status StatusFromTF_Status(const TF_Status* tf_status); } // namespace tensorflow -#endif // TENSORFLOW_C_TF_STATUS_HELPER_H +#endif // TENSORFLOW_C_TF_STATUS_HELPER_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 079e063d3e3fbdaf833e9031f5f9438853c14099..f56521dac0374849081fe94f16feb08e55647b56 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -121,6 +121,7 @@ cc_library( deps = [ ":array_grad", ":data_flow_grad", + ":image_grad", ":math_grad", ":nn_grad", ], @@ -331,6 +332,36 @@ tf_cc_test( ], ) +cc_library( + name = "image_grad", + srcs = ["gradients/image_grad.cc"], + deps = [ + ":cc_ops", + ":cc_ops_internal", + ":grad_op_registry", + ":gradients", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "gradients_image_grad_test", + srcs = ["gradients/image_grad_test.cc"], + deps = [ + ":cc_ops", + ":client_session", + ":grad_op_registry", + ":grad_testutil", + ":gradient_checker", + ":image_grad", + ":testutil", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "math_grad", srcs = ["gradients/math_grad.cc"], @@ -348,9 +379,11 @@ tf_cc_test( srcs = ["gradients/math_grad_test.cc"], deps = [ ":cc_ops", + ":client_session", ":grad_op_registry", ":grad_testutil", ":gradient_checker", + ":gradients", ":math_grad", ":testutil", "//tensorflow/core:lib_internal", @@ -530,7 +563,7 @@ cc_library_with_android_deps( "//tensorflow/core/api_def:base_api_def", ], deps = [ - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", 
"//tensorflow/core:op_gen_lib", @@ -595,7 +628,6 @@ tf_cc_binary( copts = tf_copts(), linkopts = select({ "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//tensorflow:darwin": [ "-lm", "-lpthread", diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index ba056a8f3a84910aebf5079573cb64c19f41469d..0e61089a5950ee894ad5489317757cff8a85e966 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -127,4 +127,22 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs, target_node_names, outputs, run_metadata); } +Status ClientSession::MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) { + TF_RETURN_IF_ERROR(impl()->MaybeExtendGraph()); + return impl()->session_->MakeCallable(callable_options, out_handle); +} + +Status ClientSession::RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) { + return impl()->session_->RunCallable(handle, feed_tensors, fetch_tensors, + run_metadata); +} + +Status ClientSession::ReleaseCallable(CallableHandle handle) { + return impl()->session_->ReleaseCallable(handle); +} + } // end namespace tensorflow diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h index 5fb4109f7d15d5997f745acd913e60a02855fd73..7dd653eec4ec729b652cb779d06e820bfb437b3c 100644 --- a/tensorflow/cc/client/client_session.h +++ b/tensorflow/cc/client/client_session.h @@ -87,7 +87,33 @@ class ClientSession { const std::vector& run_outputs, std::vector* outputs, RunMetadata* run_metadata) const; - // TODO(keveman): Add support for partial run. + /// \brief A handle to a subgraph, created with + /// `ClientSession::MakeCallable()`. + typedef int64 CallableHandle; + + /// \brief Creates a `handle` for invoking the subgraph defined by + /// `callable_options`. 
+ /// NOTE: This API is still experimental and may change. + Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle); + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. + /// + /// The order of tensors in `feed_tensors` must match the order of names in + /// `CallableOptions::feed()` and the order of tensors in `fetch_tensors` will + /// match the order of names in `CallableOptions::fetch()` when this subgraph + /// was created. + /// NOTE: This API is still experimental and may change. + Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata); + + /// \brief Releases resources associated with the given `handle` in this + /// session. + /// NOTE: This API is still experimental and may change. + Status ReleaseCallable(CallableHandle handle); private: class Impl; diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc index ea5cf5a1f12be316cc6e0d0a02cd3caf4d177400..559ffea7e817526e7f1396cd0e8187d01364f23b 100644 --- a/tensorflow/cc/client/client_session_test.cc +++ b/tensorflow/cc/client/client_session_test.cc @@ -95,5 +95,26 @@ TEST(ClientSessionTest, MultiThreaded) { test::ExpectTensorEqual(outputs[0], test::AsTensor({-1, 2}, {2})); } +TEST(ClientSessionTest, Callable) { + Scope root = Scope::NewRootScope(); + auto a = Placeholder(root, DT_INT32); + auto b = Placeholder(root, DT_INT32); + auto c = Add(root, a, b); + ClientSession session(root); + std::vector outputs; + + CallableOptions options; + options.add_feed(a.node()->name()); + options.add_feed(b.node()->name()); + options.add_fetch(c.node()->name()); + ClientSession::CallableHandle callable; + TF_CHECK_OK(session.MakeCallable(options, &callable)); + TF_EXPECT_OK(session.RunCallable( + callable, {test::AsTensor({1}, {}), test::AsTensor({41}, {})}, + &outputs, nullptr)); + 
test::ExpectTensorEqual(outputs[0], test::AsTensor({42}, {})); + TF_EXPECT_OK(session.ReleaseCallable(callable)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index d6a4f141b6bb8ccadb77f1fa83b5fb742d78f70f..a32d1b1eb50fc715084f5ee663a732770db1883c 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -273,6 +273,12 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) { return ""; // Prevent missing return warning } +bool IsEmptyList(const AttrValue::ListValue& list) { + return list.s_size() == 0 && list.i_size() == 0 && list.f_size() == 0 && + list.b_size() == 0 && list.type_size() == 0 && + list.shape_size() == 0 && list.tensor_size() == 0; +} + string ToCamelCase(const string& str) { string result; const char joiner = '_'; @@ -297,9 +303,9 @@ string ToCamelCase(const string& str) { // indicate whether to treat the type as const when accepting the C++ type as an // argument to a function. 
std::pair AttrTypeName(StringPiece attr_type) { - static const std::unordered_map, - StringPieceHasher> - attr_type_map{ + static const auto* attr_type_map = + new std::unordered_map, + StringPieceHasher>{ {"string", {"StringPiece", false}}, {"list(string)", {"gtl::ArraySlice", true}}, {"int", {"int64", false}}, @@ -317,14 +323,34 @@ std::pair AttrTypeName(StringPiece attr_type) { {"func", {"NameAttrList", true}}, }; - auto entry = attr_type_map.find(attr_type); - if (entry == attr_type_map.end()) { + auto entry = attr_type_map->find(attr_type); + if (entry == attr_type_map->end()) { LOG(FATAL) << "Unsupported Attr type: " << attr_type; return {"", false}; } return entry->second; } +const char* ListElementTypeName(StringPiece attr_type) { + static const auto* attr_list_type_map = + new std::unordered_map{ + {"list(string)", "string"}, + {"list(int)", "int"}, + {"list(float)", "float"}, + {"list(bool)", "bool"}, + {"list(type)", "DataType"}, + {"list(shape)", "PartialTensorShape"}, + {"list(tensor)", "TensorProto"}, + }; + + auto entry = attr_list_type_map->find(attr_type); + if (entry == attr_list_type_map->end()) { + LOG(FATAL) << "Unsupported or non-list Attr type: " << attr_type; + return ""; + } + return entry->second; +} + bool IsCPPKeyword(StringPiece name) { static const std::unordered_set // Keywords obtained from http://en.cppreference.com/w/cpp/keyword @@ -440,7 +466,7 @@ string AvoidCPPKeywords(StringPiece name) { if (IsCPPKeyword(name)) { return strings::StrCat(name, "_"); } - return std::string(name); + return string(name); } void InferArgAttributes(const OpDef::ArgDef& arg, @@ -482,15 +508,6 @@ bool HasOptionalAttrs( return false; } -const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) { - for (int i = 0; i < api_def.in_arg_size(); ++i) { - if (api_def.in_arg(i).name() == name) { - return &api_def.in_arg(i); - } - } - return nullptr; -} - struct OpInfo { // graph_op_def: The OpDef used by the runtime, has the names that // must be 
used when calling NodeBuilder. @@ -668,6 +685,7 @@ OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def, string OpInfo::GetOpAttrStruct() const { string struct_fields; string setters; + string defaults_static_storage; for (int i = 0; i < graph_op_def.attr_size(); ++i) { const auto& attr(graph_op_def.attr(i)); @@ -705,11 +723,32 @@ string OpInfo::GetOpAttrStruct() const { "_ = x;\n"); strings::StrAppend(&setters, " return ret;\n }\n\n"); - strings::StrAppend( - &struct_fields, " ", attr_type_name, " ", api_def_attr.rename_to(), - "_ = ", - PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()), - ";\n"); + string field_initiliazer; + auto& default_value = api_def_attr.default_value(); + if (default_value.value_case() == AttrValue::kList && + !IsEmptyList(default_value.list())) { + // Non-empty lists need static storage for their defaults. Define a + // function with static local variable that stores the array. + strings::StrAppend(&defaults_static_storage, " static ", + attr_type_name, " Default_", api_def_attr.rename_to(), + "() {\n"); + strings::StrAppend( + &defaults_static_storage, " static const ", + ListElementTypeName(attr.type()), " kStorage[] = ", + PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()), + ";\n"); + strings::StrAppend(&defaults_static_storage, " return ", + attr_type_name, "(kStorage);\n }\n"); + // Set the field_initializer to call the defined function. 
+ strings::StrAppend(&field_initiliazer, "Default_", + api_def_attr.rename_to(), "()"); + } else { + field_initiliazer = + PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()); + } + strings::StrAppend(&struct_fields, " ", attr_type_name, " ", + api_def_attr.rename_to(), "_ = ", field_initiliazer, + ";\n"); } if (struct_fields.empty()) { @@ -721,6 +760,9 @@ string OpInfo::GetOpAttrStruct() const { string struct_decl = MakeComment(attrs_comment, " "); strings::StrAppend(&struct_decl, " struct Attrs {\n"); strings::StrAppend(&struct_decl, setters, struct_fields); + if (!defaults_static_storage.empty()) { + strings::StrAppend(&struct_decl, " private:\n", defaults_static_storage); + } strings::StrAppend(&struct_decl, " };\n"); return struct_decl; diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc index de2645cb440bda1f35e764af9197ca97bb760c08..e9f9c59e3aa0e8a9dc5d5e658540e9da73adaca5 100644 --- a/tensorflow/cc/framework/gradient_checker.cc +++ b/tensorflow/cc/framework/gradient_checker.cc @@ -247,7 +247,7 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, auto y_pos_flat = y_pos[y_idx].flat(); auto y_neg_flat = y_neg[y_idx].flat(); const int64 y_size = y_shapes[y_idx].num_elements(); - const Y_T scale = Y_T{2 * delta}; + const Y_T scale = 2 * delta; auto jacobian = (*jacobian_ts)[x_idx * y_num + y_idx].matrix(); for (int c = 0; c < y_size; ++c) { SetJacobian(&jacobian, r * x_stride + unit_dimension, @@ -351,7 +351,14 @@ Status ComputeGradientErrorInternal(const Scope& scope, const OutputList& xs, auto jac_n = jacobian_ns[i].matrix(); for (int r = 0; r < jacobian_ts[i].dim_size(0); ++r) { for (int c = 0; c < jacobian_ts[i].dim_size(1); ++c) { - *max_error = std::max(*max_error, std::fabs(jac_t(r, c) - jac_n(r, c))); + auto cur_error = std::fabs(jac_t(r, c) - jac_n(r, c)); + // Treat any NaN as max_error and immediately return. 
+ // (Note that std::max may ignore NaN arguments.) + if (std::isnan(cur_error)) { + *max_error = cur_error; + return Status::OK(); + } + *max_error = std::max(*max_error, cur_error); } } } @@ -409,6 +416,7 @@ Status ComputeGradientError(const Scope& scope, const Output& x, const Output& y, const TensorShape& y_shape, JAC_T* max_error); INSTANTIATE_GRAD_ERR_TYPE(float, float, float); +INSTANTIATE_GRAD_ERR_TYPE(double, float, double); INSTANTIATE_GRAD_ERR_TYPE(double, double, double); INSTANTIATE_GRAD_ERR_TYPE(complex64, float, float); INSTANTIATE_GRAD_ERR_TYPE(float, complex64, float); diff --git a/tensorflow/cc/framework/gradient_checker_test.cc b/tensorflow/cc/framework/gradient_checker_test.cc index d4f0a7f5ab3716be41e22c02a21aca028f76fb88..8dd762c282eff287bddd49ea6f38b2b8060949b0 100644 --- a/tensorflow/cc/framework/gradient_checker_test.cc +++ b/tensorflow/cc/framework/gradient_checker_test.cc @@ -28,12 +28,14 @@ namespace { using ops::Complex; using ops::Const; +using ops::Div; using ops::MatMul; using ops::Placeholder; using ops::Real; using ops::Split; using ops::Square; using ops::Stack; +using ops::Sub; using ops::Unstack; TEST(GradientCheckerTest, BasicFloat) { @@ -104,6 +106,20 @@ TEST(GradientCheckerTest, Complex64ToFloat) { EXPECT_LT(max_error, 1e-4); } +// When calculating gradients that are undefined, test we get NaN +// as the computed error rather than 0. 
+TEST(GradientCheckerTest, BasicNan) { + Scope scope = Scope::NewRootScope(); + TensorShape shape({2, 4, 3}); + auto x = Placeholder(scope, DT_FLOAT, Placeholder::Shape(shape)); + // y = x/(x-x) should always return NaN + auto y = Div(scope, x, Sub(scope, x, x)); + float max_error; + TF_ASSERT_OK((ComputeGradientError( + scope, {x}, {shape}, {y}, {shape}, &max_error))); + EXPECT_TRUE(std::isnan(max_error)); +} + TEST(GradientCheckerTest, MatMulGrad) { Scope scope = Scope::NewRootScope(); diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index 62a889181e787f2e181135ab0563c45e1bab8812..7f6ac4cae78d8d6e118837fce9ae5270336cdc89 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -37,6 +37,11 @@ Scope& Scope::operator=(const Scope& other) { return *this; } +namespace { +const char kScopeSeparator[] = "/"; +const char kSuffixSeparator[] = "_"; +} // namespace + Scope::Impl::Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner, bool disable_shape_inference) : graph_(graph), @@ -220,7 +225,7 @@ std::unordered_set Scope::Impl::GetColocationConstraints( for (const string& entry : node_constraints) { StringPiece s(entry); if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) { - current_constraints.insert(std::string(s)); + current_constraints.emplace(s); } } } else { @@ -308,19 +313,23 @@ string Scope::Impl::GetUniqueName(const string& prefix, return prefix; } auto entry = name_map_->find(prefix); - string unique_name = prefix; if (entry == name_map_->end()) { name_map_->insert({prefix, 0}); - } else { - unique_name = strings::StrCat(unique_name, "_", ++entry->second); + return prefix; } + string unique_name; + do { + unique_name = strings::StrCat(prefix, kSuffixSeparator, ++entry->second); + } while (name_map_->find(unique_name) != name_map_->end()); + name_map_->insert({unique_name, 0}); return unique_name; } string Scope::Impl::GetNameForOp(const string& default_name) const { 
const string unique_name = GetUniqueName(default_name, true /* check_single_use */); - const string sep = name_.empty() || unique_name.empty() ? "" : "/"; + const string sep = + name_.empty() || unique_name.empty() ? "" : kScopeSeparator; return strings::StrCat(name_, sep, unique_name); } @@ -345,7 +354,8 @@ Scope Scope::NewSubScope(const string& child_scope_name) const { } const string unique_name = impl()->GetUniqueName(child_scope_name, false /* check_single_use */); - const string sep = impl()->name_.empty() || unique_name.empty() ? "" : "/"; + const string sep = + impl()->name_.empty() || unique_name.empty() ? "" : kScopeSeparator; return Scope(new Impl(*this, Impl::Tags::ScopeName(), strings::StrCat(impl()->name_, sep, unique_name), false /* copy_names */)); @@ -412,7 +422,7 @@ CompositeOpScopes Scope::GetCompositeOpScopes( if (!impl()->single_use_scope()) { Scope child = NewSubScope(impl()->op_name_.empty() ? composite_op_name : impl()->op_name_); - const string child_op_sep = impl()->name_.empty() ? "" : "_"; + const string child_op_sep = impl()->name_.empty() ? "" : kSuffixSeparator; const string child_name = strings::StrCat(impl()->name_, child_op_sep, child.impl()->name_); return {child, @@ -435,7 +445,13 @@ class InternalScope { static Scope NewScope(Graph* graph, Status* status, ShapeRefiner* refiner) { Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap; for (const Node* node : graph->nodes()) { - (*name_map)[node->name()] = 0; + const string& name = node->name(); + (*name_map)[name] = 0; + // Add all name prefixes ('/' separated). + size_t idx = -1; + while ((idx = name.find(kScopeSeparator, idx + 1)) != string::npos) { + (*name_map)[name.substr(0, idx)] = 0; + } } // We provide null destructors for these shared ptrs (except for name_map) // since the caller owns them and doesn't want the scope to destroy them. 
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h index 8efcfed20d0b86d86d8c20a3d8630c7c6bc909c3..58adaef2e942a7fa6b0ce8d5534ac3e2fd380580 100644 --- a/tensorflow/cc/framework/scope_internal.h +++ b/tensorflow/cc/framework/scope_internal.h @@ -34,8 +34,7 @@ class Scope::Impl { // name that has not been used so far in a scope will get no suffix. Later // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes // can share the same NameMap. For instance, a new scope created using - // WithControlDependencies() should would share the same NameMap with the - // parent. + // WithControlDependencies() would share the same NameMap with the parent. typedef std::unordered_map NameMap; Impl(const std::shared_ptr& graph, diff --git a/tensorflow/cc/framework/scope_test.cc b/tensorflow/cc/framework/scope_test.cc index 9eca9d3face34319413e1acbc2f5ac0b2ba85374..b40b345eb84237c34ea593021bea022ad28095f7 100644 --- a/tensorflow/cc/framework/scope_test.cc +++ b/tensorflow/cc/framework/scope_test.cc @@ -26,6 +26,16 @@ TEST(ScopeTest, BasicNames) { EXPECT_EQ(root.GetUniqueNameForOp("mul"), "mul"); } +TEST(ScopeTest, OpAndScopeNameCollision) { + Scope root = Scope::NewRootScope(); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo"); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo_1"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_1"), "foo_1_1"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_2"), "foo_2"); + EXPECT_EQ(root.GetUniqueNameForOp("foo"), "foo_3"); + EXPECT_EQ(root.GetUniqueNameForOp("foo_2"), "foo_2_1"); +} + TEST(ScopeTest, HierarchicalNames) { Scope root = Scope::NewRootScope(); Scope child = root.NewSubScope("child"); diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index ff348fadb24e29a83bd6c8853aa67931f6df4182..e9173227aadbf86eab666e6c17bacacb92888572 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -120,6 +120,24 @@ Status 
SplitGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Split", SplitGrad); +Status FillGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // y = fill(fill_shape, x) + // No gradient returned for the fill_shape argument. + grad_outputs->push_back(NoGradient()); + // The gradient for x (which must be a scalar) is just the sum of + // all the gradients from the shape it fills. + // We use ReduceSum to implement this, which needs an argument providing + // the indices of all the dimensions of the incoming gradient. + // grad(x) = reduce_sum(grad(y), [0..rank(grad(y))]) + auto all_dims = Range(scope, Const(scope, 0), Rank(scope, grad_inputs[0]), + Const(scope, 1)); + grad_outputs->push_back(ReduceSum(scope, grad_inputs[0], all_dims)); + return scope.status(); +} +REGISTER_GRADIENT_OP("Fill", FillGrad); + Status DiagGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { @@ -421,6 +439,58 @@ Status StridedSliceGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper); +Status SliceGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // Propagate the incoming gradient along all the selected values, + // and zero everywhere else. Use the Pad operator for this. + // + // First create an Nx2 padding where N is the number of input + // dimensions. The first column is the number of prepended zeros + // for each dimension, and the second column is the number of + // appended zeros. + // + // The first column is just the begin vector. 
+ // The second column is the shape of the input element-wise + // subtracted by begin+size + + // Running example: + // input.shape = [3, 5, 3] + // begin = [1, 2, 1], size = [1, 3, 2] + Input input = op.input(0); + Input begin = op.input(1); + // input_rank = 3 + auto input_rank = Rank(scope, input); + // slice_size = [1, 3, 2] + auto slice_size = Shape(scope, op.output(0)); + // padding_shape = [3, 1] + auto padding_shape = Stack(scope, {input_rank, 1}); + // before_padding = [[1] + // [2] + // [1]] + Input before_padding = Reshape(scope, begin, padding_shape); + // after_padding_sizes = shape(input) - slice_size - begin + // = [3, 5, 3] - [1, 3, 2] - [1, 2, 1] + // = [1, 0, 0] + auto after_padding_sizes = + Sub(scope, Sub(scope, Shape(scope, input), slice_size), begin); + // after_padding = [[1] + // [0] + // [0]] + Input after_padding = Reshape(scope, after_padding_sizes, padding_shape); + // paddings = [[1 1] + // [2 0] + // [1 0]] + auto paddings = + Concat(scope, {before_padding, after_padding}, Const(scope, 1)); + grad_outputs->push_back(Pad(scope, grad_inputs[0], paddings)); + // Nothing propagated for "begin" and "size" inputs + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("Slice", SliceGrad); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc index de3bd0fc9e2493f8ff76163f5be6bd4327c58c5a..f41de3dc2098df55fbbb616557f264a4e70db6b6 100644 --- a/tensorflow/cc/gradients/array_grad_test.cc +++ b/tensorflow/cc/gradients/array_grad_test.cc @@ -108,6 +108,14 @@ TEST_F(ArrayGradTest, SplitGrad) { RunTest({x}, {x_shape}, y.output, {y_shape, y_shape}); } +TEST_F(ArrayGradTest, FillGrad) { + TensorShape x_shape({}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + TensorShape y_shape({2, 5, 3}); + auto y = Fill(scope_, {2, 5, 3}, x); + 
RunTest(x, x_shape, y, y_shape); +} + TEST_F(ArrayGradTest, DiagGrad) { TensorShape x_shape({5, 2}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); @@ -378,5 +386,12 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { RunTest(x, x_shape, y, {1, 2, 2, 2}); } +TEST_F(ArrayGradTest, SliceGrad) { + TensorShape x_shape({3, 5, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Slice(scope_, x, {1, 2, 1}, {1, 3, 2}); + RunTest(x, x_shape, y, {1, 3, 2}); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc new file mode 100644 index 0000000000000000000000000000000000000000..882709e1e2817431a32c453fe0f35f2b2e6c69b0 --- /dev/null +++ b/tensorflow/cc/gradients/image_grad.cc @@ -0,0 +1,74 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" +#include "tensorflow/cc/ops/image_ops_internal.h" +#include "tensorflow/cc/ops/standard_ops.h" + +namespace tensorflow { +namespace ops { +namespace { + +Status ResizeNearestNeighborGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + // The internal gradient implementation needs the shape of the input image. + // x_shape = shape(x)[1:3] + // = slice(shape(x), {1}, {3 - 1}) + auto x_shape = Slice(scope, Shape(scope, op.input(0)), {1}, {2}); + grad_outputs->push_back(internal::ResizeNearestNeighborGrad( + scope, grad_inputs[0], x_shape, + internal::ResizeNearestNeighborGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeNearestNeighbor", ResizeNearestNeighborGradHelper); + +Status ResizeBilinearGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + grad_outputs->push_back(internal::ResizeBilinearGrad( + scope, grad_inputs[0], op.input(0), + internal::ResizeBilinearGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeBilinear", ResizeBilinearGradHelper); + +Status ResizeBicubicGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + grad_outputs->push_back(internal::ResizeBicubicGrad( + 
scope, grad_inputs[0], op.input(0), + internal::ResizeBicubicGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeBicubic", ResizeBicubicGradHelper); + +} // anonymous namespace +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e55c7561b030c50bd67bd53fd0d55710085c5d2 --- /dev/null +++ b/tensorflow/cc/gradients/image_grad_test.cc @@ -0,0 +1,157 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/client/client_session.h" +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradient_checker.h" +#include "tensorflow/cc/framework/testutil.h" +#include "tensorflow/cc/gradients/grad_testutil.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +using ops::Const; +using ops::ResizeBicubic; +using ops::ResizeBilinear; +using ops::ResizeNearestNeighbor; + +class ImageGradTest : public ::testing::Test { + protected: + ImageGradTest() : scope_(Scope::NewRootScope()) {} + + enum OpType { RESIZE_NEAREST, RESIZE_BILINEAR, RESIZE_BICUBIC }; + + template + Tensor MakeData(const TensorShape& data_shape) { + DataType data_type = DataTypeToEnum::v(); + Tensor data(data_type, data_shape); + auto data_flat = data.flat(); + for (int i = 0; i < data_flat.size(); ++i) { + data_flat(i) = T(i); + } + return data; + } + + template + void MakeOp(const OpType op_type, const Tensor& x_data, const Input& y_shape, + const bool align_corners, Output* x, Output* y) { + *x = Const(scope_, x_data); + switch (op_type) { + case RESIZE_NEAREST: + *y = ResizeNearestNeighbor( + scope_, *x, y_shape, + ResizeNearestNeighbor::AlignCorners(align_corners)); + return; + case RESIZE_BILINEAR: + *y = ResizeBilinear(scope_, *x, y_shape, + ResizeBilinear::AlignCorners(align_corners)); + return; + case RESIZE_BICUBIC: + *y = ResizeBicubic(scope_, *x, y_shape, + ResizeBicubic::AlignCorners(align_corners)); + return; + } + assert(false); + } + + template + void TestResizedShapeForType(const OpType op_type, const bool align_corners) { + TensorShape x_shape({1, 2, 2, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {4, 6}, align_corners, &x, &y); + 
+ ClientSession session(scope_); + std::vector outputs; + TF_ASSERT_OK(session.Run({y}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + EXPECT_EQ(outputs[0].shape(), TensorShape({1, 4, 6, 1})); + } + + void TestResizedShape(OpType op_type) { + for (const bool align_corners : {true, false}) { + TestResizedShapeForType(op_type, align_corners); + TestResizedShapeForType(op_type, align_corners); + TestResizedShapeForType(op_type, align_corners); + } + } + + template + void TestResizeToSmallerAndAlign(const OpType op_type, + const bool align_corners) { + TensorShape x_shape({1, 4, 6, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {2, 3}, align_corners, &x, &y); + JAC_T max_error; + TF_ASSERT_OK((ComputeGradientError( + scope_, x, x_data, y, {1, 2, 3, 1}, &max_error))); + EXPECT_LT(max_error, 1e-3); + } + + template + void TestResizeToLargerAndAlign(const OpType op_type, + const bool align_corners) { + TensorShape x_shape({1, 2, 3, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {4, 6}, align_corners, &x, &y); + JAC_T max_error; + TF_ASSERT_OK((ComputeGradientError( + scope_, x, x_data, y, {1, 4, 6, 1}, &max_error))); + EXPECT_LT(max_error, 1e-3); + } + + template + void TestResize(OpType op_type) { + for (const bool align_corners : {true, false}) { + TestResizeToSmallerAndAlign(op_type, align_corners); + TestResizeToLargerAndAlign(op_type, align_corners); + } + } + + Scope scope_; +}; + +TEST_F(ImageGradTest, TestNearestNeighbor) { + TestResizedShape(RESIZE_NEAREST); + TestResize(RESIZE_NEAREST); + TestResize(RESIZE_NEAREST); +} + +TEST_F(ImageGradTest, TestBilinear) { + TestResizedShape(RESIZE_BILINEAR); + TestResize(RESIZE_BILINEAR); + // Note that Y_T is always float for this op. We choose + // double for the jacobian to capture the higher precision + // between X_T and Y_T. 
+ TestResize(RESIZE_BILINEAR); +} + +TEST_F(ImageGradTest, TestBicubic) { + TestResizedShape(RESIZE_BICUBIC); + TestResize(RESIZE_BICUBIC); + // Note that Y_T is always float for this op. We choose + // double for the jacobian to capture the higher precision + // between X_T and Y_T. + TestResize(RESIZE_BICUBIC); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a8c88f1857defcc38de4a01ac47dab0..1329b568ab8d4cc5cc5eed554e74bf1100d9bdcf 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. @@ -440,6 +441,21 @@ Status RealDivGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("RealDiv", RealDivGrad); +Status DivNoNanGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto x_1 = ConjugateHelper(scope, op.input(0)); + auto x_2 = ConjugateHelper(scope, op.input(1)); + // y = x_1 / x_2 + // dy/dx_1 = 1/x_2 + // dy/dx_2 = -x_1/x_2^2 + auto gx_1 = DivNoNan(scope, grad_inputs[0], x_2); + auto gx_2 = Mul(scope, grad_inputs[0], + DivNoNan(scope, DivNoNan(scope, Neg(scope, x_1), x_2), x_2)); + return BinaryGradCommon(scope, op, grad_outputs, gx_1, gx_2); +} +REGISTER_GRADIENT_OP("DivNoNan", DivNoNanGrad); + Status SquaredDifferenceGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { @@ -1006,6 +1022,26 @@ Status ProdGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Prod", ProdGrad); +Status SegmentSumGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* 
grad_outputs) { + // The SegmentSum operation sums segments of the Tensor that have the same + // index in the segment_ids parameter. + // i.e. z = [2, 3, 4, 5], segment_ids [0, 0, 0, 1] + // will produce [2 + 3 + 4, 5] = [9, 5] + // The gradient that will flow back to the gather operation will look like + // [x1, x2], it will have the same shape as the output of the SegmentSum + // operation. The differentiation step of the SegmentSum operation just + // broadcasts the gradient in order to recover z's shape. + // dy/dz = [x1, x1, x1, x2] + grad_outputs->push_back(Gather(scope, grad_inputs[0], op.input(1))); + + // stop propagation along segment_ids + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("SegmentSum", SegmentSumGrad); + // MatMulGrad helper function used to compute two MatMul operations // based on input matrix transposition combinations. Status MatMulGradHelper(const Scope& scope, const bool is_batch, diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index fd7b6fe6625f27bda92e2f56f60908658cdecd7e..c16938322c3555939ace1013f3bb95c5689b503e 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License.
==============================================================================*/ +#include "tensorflow/cc/client/client_session.h" #include "tensorflow/cc/framework/grad_op_registry.h" #include "tensorflow/cc/framework/gradient_checker.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/framework/testutil.h" #include "tensorflow/cc/gradients/grad_testutil.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -31,6 +33,7 @@ using ops::AddN; using ops::BatchMatMul; using ops::Const; using ops::Div; +using ops::DivNoNan; using ops::MatMul; using ops::Max; using ops::Maximum; @@ -42,6 +45,7 @@ using ops::Placeholder; using ops::Pow; using ops::Prod; using ops::RealDiv; +using ops::SegmentSum; using ops::SquaredDifference; using ops::Sub; using ops::Sum; @@ -475,11 +479,7 @@ TEST_F(CWiseUnaryGradTest, Tan_Complex) { auto x_fn = [this](const int i) { return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); }; - // TODO(kbsriram) - // Enable when tan kernel supports complex inputs - if (false) { - TestCWiseGrad(TAN, x_fn); - } + TestCWiseGrad(TAN, x_fn); } TEST_F(CWiseUnaryGradTest, Atan) { @@ -854,6 +854,36 @@ TEST_F(NaryGradTest, RealDiv) { RunTest({x}, {x_shape}, {y}, {x_shape}); } +TEST_F(NaryGradTest, DivNoNan) { + { + TensorShape x_shape({3, 2, 5}); + const auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Test x / (1 + |x|) rather than x_1 / x_2 to avoid triggering large + // division errors in the numeric estimator used by the gradient checker. + const auto y = DivNoNan( + scope_, x, Add(scope_, Const(scope_, 1), Abs(scope_, x))); + RunTest({x}, {x_shape}, {y}, {x_shape}); + } + { + // Return 0 gradient (rather than NaN) for division by zero. 
+ const auto x = Placeholder(scope_, DT_FLOAT); + const auto zero = Const(scope_, 0.0); + const auto y = DivNoNan(scope_, x, zero); + + std::vector grad_outputs; + TF_EXPECT_OK(AddSymbolicGradients(scope_, {y}, {x}, &grad_outputs)); + ClientSession session(scope_); + std::vector grad_result; + TF_EXPECT_OK( + session.Run({{x, {-3.0f, 0.0f, 3.0f}}}, grad_outputs, &grad_result)); + EXPECT_EQ(grad_result.size(), 1); + EXPECT_EQ(grad_result[0].NumElements(), 3); + EXPECT_EQ(grad_result[0].flat()(0), 0.0f); + EXPECT_EQ(grad_result[0].flat()(1), 0.0f); + EXPECT_EQ(grad_result[0].flat()(2), 0.0f); + } +} + TEST_F(NaryGradTest, SquaredDifference) { TensorShape x1_shape({3, 2, 5}); TensorShape x2_shape({2, 5}); @@ -902,5 +932,14 @@ TEST_F(NaryGradTest, Prod) { RunTest({x}, {x_shape}, {y}, {y_shape}); } +TEST_F(NaryGradTest, SegmentSum) { + TensorShape x_shape({3, 4}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = SegmentSum(scope_, x, {0, 0, 1}); + // the sum is always on the first dimension + TensorShape y_shape({2, 4}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index c73482d5f4d13ade0dc0412941251d1651371b6e..588e96cb196189780037f66266484962ba0385e4 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -47,6 +47,72 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad); +bool IsZero(const Scope& scope, const Output& grad) { + string op_type_name = grad.op().node()->type_string(); + if (op_type_name == "ZerosLike" || op_type_name == "Zeros") { + return true; + } + // The Operation we were provided is not named something obvious so + // we need to actually look at its contents. + // The original python code did this by calling a utility function called + // tensor_util.constant_value. 
+ // There is no C++ equivalent to tensor_util.constant_value so we do nothing + // for the moment. + return false; +} + +// Multiply after broadcasting vec to match dimensions of mat. +// Args: +// vec: A 1-D tensor of dimension [D0] +// mat: A 2-D tensor of dimension [D0, D1] +// +// Returns: +// A tensor of dimension [D0, D1], the result of vec * mat. +Output BroadcastMul(const Scope& scope, const Output& vec, const Output& mat) { + auto reshaped = ExpandDims(scope, vec, -1); + return Multiply(scope, reshaped, mat); +} + +Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope, + const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // Softmax gradient with cross entropy logits function. + // We multiply the backprop for cost with the gradients - op.output[1]. + // There is no gradient for labels. + + // The outputs of the network are at input index 0. + auto logits = op.input(0); + // The "truth" labels are at input index 1; backprop is at output index 1. + auto softmax_grad = op.output(1); + + // The loss is the output at index 0, and backprop is the output at index 1.
+ auto grad_loss = grad_inputs[0]; + auto grad_grad = grad_inputs[1]; + + auto grad = BroadcastMul(scope, grad_loss, softmax_grad); + if (!IsZero(scope, grad_grad)) { + std::vector axis; + auto logits_softmax = Softmax(scope, logits); + + auto grad_grad_expand = ExpandDims(scope, grad_grad, 1); + auto logits_softmax_expand = ExpandDims(scope, logits_softmax, 2); + auto matmul_result = + BatchMatMul(scope, grad_grad_expand, logits_softmax_expand); + axis.push_back(1); + auto squeeze_result = Squeeze(scope, matmul_result, Squeeze::Axis(axis)); + auto subtraction_result = Subtract(scope, grad_grad, squeeze_result); + auto multiply_result = Multiply(scope, subtraction_result, logits_softmax); + grad = Add(scope, grad, multiply_result); + } + auto minus_log_softmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f); + grad_outputs->push_back(grad); + grad_outputs->push_back(BroadcastMul(scope, grad_loss, minus_log_softmax)); + return scope.status(); +} +REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits", + SoftmaxCrossEntropyWithLogitsGrad); + Status LogSoftmaxGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { @@ -195,9 +261,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); MaxPool3DGrad::Attrs grad_attrs; - auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = + MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize, + strides, padding, grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } @@ -216,10 +282,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); 
internal::AvgPoolGrad::Attrs grad_attrs; - auto dx = - internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), + grad_inputs[0], ksize, strides, padding, + grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } @@ -238,9 +303,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op, TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); AvgPool3DGrad::Attrs grad_attrs; - auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], - ksize, strides, padding, - grad_attrs.DataFormat(data_format)); + auto dx = + AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize, + strides, padding, grad_attrs.DataFormat(data_format)); grad_outputs->push_back(dx); return scope.status(); } diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index b4d457a9d14eb79232cda9412fa0050f6a9968cc..aa72cf7ba2a958f54d50b59f0edaefb27edf0e86 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -25,6 +25,8 @@ limitations under the License. 
namespace tensorflow { namespace { +using ops::AvgPool; +using ops::AvgPool3D; using ops::BiasAdd; using ops::Conv2D; using ops::Elu; @@ -33,11 +35,9 @@ using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; -using ops::AvgPool; -using ops::AvgPool3D; using ops::MaxPool; -using ops::MaxPoolV2; using ops::MaxPool3D; +using ops::MaxPoolV2; using ops::Placeholder; using ops::Relu; using ops::Relu6; @@ -111,6 +111,20 @@ TEST_F(NNGradTest, SoftmaxGrad) { RunTest(x, shape, y, shape); } +TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) { + TensorShape logits_shape({5, 3}); + TensorShape loss_shape({5}); + + auto logits = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape)); + auto labels = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape)); + auto y = + tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels); + // Note the reversal of the backprop and loss orders. Issue #18734 has been + // opened for this. + RunTest({logits, labels}, {logits_shape, logits_shape}, {y.backprop, y.loss}, + {logits_shape, loss_shape}); +} + TEST_F(NNGradTest, LogSoftmaxGrad) { TensorShape shape({5, 3}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); @@ -253,7 +267,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) { RunTest(x, x_shape, y, y_shape); } -TEST_F(NNGradTest, LRN){ +TEST_F(NNGradTest, LRN) { TensorShape x_shape({1, 1, 2, 1}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); auto y = LRN(scope_, x); diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 06a3be18e08f611d3ecf9804908d791d15fdab13..3d3895c8fa82c3c0e2974228e9cad767d0e00df4 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -33,6 +33,46 @@ cc_library( hdrs = ["tag_constants.h"], ) +cc_library( + name = "reader", + srcs = ["reader.cc"], + hdrs = ["reader.h"], + deps = [ + ":constants", + ] + if_not_mobile([ + # TODO(b/111634734): :lib and 
:protos_all contain dependencies that + # cannot be built on mobile platforms. Instead, include the appropriate + # tf_lib depending on the build platform. + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ]) + if_mobile([ + # Mobile-friendly SavedModel proto. See go/portable-proto for more info. + "//tensorflow/core:saved_model_portable_proto", + ]) + if_android([ + "//tensorflow/core:android_tensorflow_lib", + ]) + if_ios([ + "//tensorflow/core:ios_tensorflow_lib", + ]), +) + +tf_cc_test( + name = "reader_test", + srcs = ["reader_test.cc"], + data = [ + ":saved_model_half_plus_two", + ], + linkstatic = 1, + deps = [ + ":constants", + ":reader", + ":tag_constants", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "loader", hdrs = ["loader.h"], @@ -54,6 +94,7 @@ cc_library( hdrs = ["loader.h"], deps = [ ":constants", + ":reader", ] + if_not_mobile([ "//tensorflow/core:core_cpu", "//tensorflow/core:framework", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index faa1e378d07ea94ad08ee084d18bf6a113f054af..c6abe2f41b9b5ec2faee6f65b429ff606f8ac08e 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -18,8 +18,10 @@ limitations under the License. 
#include #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/protobuf_internal.h" @@ -43,56 +45,6 @@ auto* load_latency = monitoring::Counter<1>::New( constexpr char kLoadAttemptFail[] = "fail"; constexpr char kLoadAttemptSuccess[] = "success"; -Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { - const string saved_model_pb_path = - io::JoinPath(export_dir, kSavedModelFilenamePb); - if (Env::Default()->FileExists(saved_model_pb_path).ok()) { - return ReadBinaryProto(Env::Default(), saved_model_pb_path, - saved_model_proto); - } - const string saved_model_pbtxt_path = - io::JoinPath(export_dir, kSavedModelFilenamePbTxt); - if (Env::Default()->FileExists(saved_model_pbtxt_path).ok()) { - return ReadTextProto(Env::Default(), saved_model_pbtxt_path, - saved_model_proto); - } - return Status(error::Code::NOT_FOUND, - "Could not find SavedModel .pb or .pbtxt at supplied export " - "directory path: " + - export_dir); -} - -string GetTagsAsString(const std::unordered_set& tags) { - string tags_as_string = "{ "; - for (const string& tag : tags) { - tags_as_string = strings::StrCat(tags_as_string, tag, " "); - } - tags_as_string = strings::StrCat(tags_as_string, "}"); - return tags_as_string; -} - -Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto, - const std::unordered_set& tags, - MetaGraphDef* meta_graph_def_to_load) { - for (const MetaGraphDef& meta_graph_def : saved_model_proto.meta_graphs()) { - // Get tags from the meta_graph_def. - std::unordered_set graph_tags; - for (const string& tag : meta_graph_def.meta_info_def().tags()) { - graph_tags.insert(tag); - } - // Match with the set of tags provided. 
- if (graph_tags == tags) { - *meta_graph_def_to_load = meta_graph_def; - return Status::OK(); - } - } - return Status(error::Code::NOT_FOUND, - "Could not find meta graph def matching supplied tags: " + - GetTagsAsString(tags) + - ". To inspect available tag-sets in the SavedModel, please " - "use the SavedModel CLI: `saved_model_cli`"); -} - Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, const SessionOptions& session_options, std::unique_ptr* session) { @@ -122,6 +74,54 @@ void AddAssetsTensorsToInputs(const StringPiece export_dir, } } +// Like Session::Run(), but uses the Make/Run/ReleaseCallable() API to avoid +// leaving behind non-GC'ed state. +// +// Detailed motivation behind this approach, from ashankar@: +// +// Each call to Session::Run() that identifies a new subgraph (based on feeds +// and fetches) creates some datastructures that live as long as the session +// (the partitioned graph, associated executors etc.). +// +// A pathological case of this would be if say the initialization op +// (main_op/legacy_init_op) involves the use of a large constant. Then we +// allocate memory for that large constant that will just stick around till the +// session dies. With this Callable mechanism, that memory will be released +// right after ReleaseCallable returns. +// +// However, the resource manager state remains. 
+Status RunOnce(const RunOptions& run_options, + const std::vector>& inputs, + const std::vector& output_tensor_names, + const std::vector& target_node_names, + std::vector* outputs, RunMetadata* run_metadata, + Session* session) { + CallableOptions callable_options; + std::vector feed_tensors; + *callable_options.mutable_run_options() = run_options; + for (const auto& input : inputs) { + const string& name = input.first; + const Tensor& tensor = input.second; + callable_options.add_feed(name); + feed_tensors.push_back(tensor); + } + for (const string& output_tensor_name : output_tensor_names) { + callable_options.add_fetch(output_tensor_name); + } + for (const string& target_node_name : target_node_names) { + callable_options.add_target(target_node_name); + } + + Session::CallableHandle callable_handle; + TF_RETURN_IF_ERROR(session->MakeCallable(callable_options, &callable_handle)); + const Status run_status = session->RunCallable(callable_handle, feed_tensors, + outputs, run_metadata); + // Be sure to call ReleaseCallable() regardless of the outcome of + // RunCallable(). 
+ session->ReleaseCallable(callable_handle).IgnoreError(); + return run_status; +} + bool HasMainOp(const MetaGraphDef& meta_graph_def) { const auto& collection_def_map = meta_graph_def.collection_def(); if (collection_def_map.find(kSavedModelMainOpKey) != @@ -134,10 +134,11 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) { Status RunMainOp(const RunOptions& run_options, const string& export_dir, const MetaGraphDef& meta_graph_def, const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running MainOp on SavedModel bundle."; + Session* session, const string& main_op_key) { + LOG(INFO) << "Running MainOp with key " << main_op_key + << " on SavedModel bundle."; const auto& collection_def_map = meta_graph_def.collection_def(); - const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey); + const auto main_op_it = collection_def_map.find(main_op_key); if (main_op_it != collection_def_map.end()) { if (main_op_it->second.node_list().value_size() != 1) { return errors::FailedPrecondition( @@ -147,8 +148,8 @@ Status RunMainOp(const RunOptions& run_options, const string& export_dir, AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; const StringPiece main_op_name = main_op_it->second.node_list().value(0); - return session->Run(run_options, inputs, {}, {main_op_name.ToString()}, - nullptr /* outputs */, &run_metadata); + return RunOnce(run_options, inputs, {}, {string(main_op_name)}, + nullptr /* outputs */, &run_metadata, session); } return Status::OK(); } @@ -169,7 +170,8 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, variables_directory, MetaFilename(kSavedModelVariablesFilename)); if (!Env::Default()->FileExists(variables_index_path).ok()) { LOG(INFO) << "The specified SavedModel has no variables; no checkpoints " - "were restored."; + "were restored. 
File does not exist: " + << variables_index_path; return Status::OK(); } const string variables_path = @@ -180,37 +182,13 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, variables_path_tensor.scalar()() = variables_path; std::vector> inputs = { - {variable_filename_const_op_name.ToString(), variables_path_tensor}}; + {string(variable_filename_const_op_name), variables_path_tensor}}; AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); RunMetadata run_metadata; - return session->Run(run_options, inputs, {}, {restore_op_name.ToString()}, - nullptr /* outputs */, &run_metadata); -} - -Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir, - const MetaGraphDef& meta_graph_def, - const std::vector& asset_file_defs, - Session* session) { - LOG(INFO) << "Running LegacyInitOp on SavedModel bundle."; - const auto& collection_def_map = meta_graph_def.collection_def(); - const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey); - if (init_op_it != collection_def_map.end()) { - if (init_op_it->second.node_list().value_size() != 1) { - return errors::FailedPrecondition(strings::StrCat( - "Expected exactly one serving init op in : ", export_dir)); - } - std::vector> inputs; - AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); - RunMetadata run_metadata; - const StringPiece legacy_init_op_name = - init_op_it->second.node_list().value(0); - return session->Run(run_options, inputs, {}, - {legacy_init_op_name.ToString()}, nullptr /* outputs */, - &run_metadata); - } - return Status::OK(); + return RunOnce(run_options, inputs, {}, {string(restore_op_name)}, + nullptr /* outputs */, &run_metadata, session); } Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, @@ -235,18 +213,8 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, const string& export_dir, const std::unordered_set& tags, SavedModelBundle* const bundle) { - if 
(!MaybeSavedModelDirectory(export_dir)) { - return Status(error::Code::NOT_FOUND, - "SavedModel not found in export directory: " + export_dir); - } - LOG(INFO) << "Loading SavedModel with tags: " << GetTagsAsString(tags) - << "; from: " << export_dir; - - SavedModel saved_model_proto; - TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); - - TF_RETURN_IF_ERROR( - FindMetaGraphDefToLoad(saved_model_proto, tags, &bundle->meta_graph_def)); + TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags, + &bundle->meta_graph_def)); TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession( bundle->meta_graph_def, session_options, &bundle->session)); @@ -262,11 +230,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, if (HasMainOp(bundle->meta_graph_def)) { TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir, bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + bundle->session.get(), kSavedModelMainOpKey)); } else { - TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, - bundle->meta_graph_def, asset_file_defs, - bundle->session.get())); + TF_RETURN_IF_ERROR(RunMainOp( + run_options, export_dir, bundle->meta_graph_def, asset_file_defs, + bundle->session.get(), kSavedModelLegacyInitOpKey)); } return Status::OK(); } @@ -288,8 +256,8 @@ Status LoadSavedModel(const SessionOptions& session_options, return end_microseconds - start_microseconds; }(); auto log_and_count = [&](const string& status_str) { - LOG(INFO) << "SavedModel load for tags " << GetTagsAsString(tags) - << "; Status: " << status_str << ". Took " + LOG(INFO) << "SavedModel load for tags { " << str_util::Join(tags, " ") + << " }; Status: " << status_str << ". 
Took " << load_latency_microsecs << " microseconds."; load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1); }; diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..2146c8a19745fa9ea2484c4bb4a2104a38d85144 --- /dev/null +++ b/tensorflow/cc/saved_model/reader.cc @@ -0,0 +1,88 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/saved_model/reader.h" + +#include + +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" + +namespace tensorflow { +namespace { + +Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { + LOG(INFO) << "Reading SavedModel from: " << export_dir; + + const string saved_model_pb_path = + io::JoinPath(export_dir, kSavedModelFilenamePb); + if (Env::Default()->FileExists(saved_model_pb_path).ok()) { + return ReadBinaryProto(Env::Default(), saved_model_pb_path, + saved_model_proto); + } + const string saved_model_pbtxt_path = + io::JoinPath(export_dir, kSavedModelFilenamePbTxt); + if 
(Env::Default()->FileExists(saved_model_pbtxt_path).ok()) { + return ReadTextProto(Env::Default(), saved_model_pbtxt_path, + saved_model_proto); + } + return Status(error::Code::NOT_FOUND, + "Could not find SavedModel .pb or .pbtxt at supplied export " + "directory path: " + + export_dir); +} + +Status FindMetaGraphDef(const SavedModel& saved_model_proto, + const std::unordered_set& tags, + MetaGraphDef* meta_graph_def) { + LOG(INFO) << "Reading meta graph with tags { " << str_util::Join(tags, " ") + << " }"; + for (const MetaGraphDef& graph_def : saved_model_proto.meta_graphs()) { + // Get tags from the graph_def. + std::unordered_set graph_tags; + for (const string& tag : graph_def.meta_info_def().tags()) { + graph_tags.insert(tag); + } + // Match with the set of tags provided. + if (graph_tags == tags) { + *meta_graph_def = graph_def; + return Status::OK(); + } + } + return Status( + error::Code::NOT_FOUND, + strings::StrCat( + "Could not find meta graph def matching supplied tags: { ", + str_util::Join(tags, " "), + " }. To inspect available tag-sets in the SavedModel, please " + "use the SavedModel CLI: `saved_model_cli`")); +} + +} // namespace + +Status ReadMetaGraphDefFromSavedModel(const string& export_dir, + const std::unordered_set& tags, + MetaGraphDef* const meta_graph_def) { + SavedModel saved_model_proto; + TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); + TF_RETURN_IF_ERROR(FindMetaGraphDef(saved_model_proto, tags, meta_graph_def)); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h new file mode 100644 index 0000000000000000000000000000000000000000..5815108df2a1883b6618e801f30c1915cde8c895 --- /dev/null +++ b/tensorflow/cc/saved_model/reader.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// Functions to read the SavedModel proto, or parts of it. + +#ifndef TENSORFLOW_CC_SAVED_MODEL_READER_H_ +#define TENSORFLOW_CC_SAVED_MODEL_READER_H_ + +#include <string> +#include <unordered_set> + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { + +// Reads the SavedModel proto from saved_model.pb(txt) in the given directory, +// finds the MetaGraphDef that matches the given set of tags and writes it to +// the `meta_graph_def` parameter. Returns a failure status when the SavedModel +// file does not exist or no MetaGraphDef matches the tags. +Status ReadMetaGraphDefFromSavedModel(const string& export_dir, + const std::unordered_set<string>& tags, + MetaGraphDef* const meta_graph_def); + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_READER_H_ diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..620e9c2eece886c9600a8c93cede3b132ccbccaa --- /dev/null +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/saved_model/reader.h" + +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/tag_constants.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +constexpr char kTestDataPbTxt[] = + "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; +constexpr char kTestDataSharded[] = + "cc/saved_model/testdata/half_plus_two/00000123"; + +class ReaderTest : public ::testing::Test { + protected: + ReaderTest() {} + + void CheckMetaGraphDef(const MetaGraphDef& meta_graph_def) { + const auto& tags = meta_graph_def.meta_info_def().tags(); + EXPECT_TRUE(std::find(tags.begin(), tags.end(), kSavedModelTagServe) != + tags.end()); + EXPECT_NE(meta_graph_def.meta_info_def().tensorflow_version(), ""); + EXPECT_EQ( + meta_graph_def.signature_def().at("serving_default").method_name(), + "tensorflow/serving/predict"); + } +}; + +TEST_F(ReaderTest, TagMatch) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def)); + CheckMetaGraphDef(meta_graph_def); +} + +TEST_F(ReaderTest, NoTagMatch) { + MetaGraphDef meta_graph_def; + + const string export_dir = + 
io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = ReadMetaGraphDefFromSavedModel(export_dir, {"missing-tag"}, + &meta_graph_def); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(str_util::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: { missing-tag }")) + << st.error_message(); +} + +TEST_F(ReaderTest, NoTagMatchMultiple) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = ReadMetaGraphDefFromSavedModel( + export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(str_util::StrContains( + st.error_message(), + "Could not find meta graph def matching supplied tags: ")) + << st.error_message(); +} + +TEST_F(ReaderTest, PbtxtFormat) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPbTxt); + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def)); + CheckMetaGraphDef(meta_graph_def); +} + +TEST_F(ReaderTest, InvalidExportPath) { + MetaGraphDef meta_graph_def; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + Status st = ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &meta_graph_def); + EXPECT_FALSE(st.ok()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 2119c8ec47f941a76e81346ae5d20da78eae11a3..6c29f09cde7ee17c11cb44ce48d8e9128daae4d0 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -8,28 +8,6 @@ load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") -# Optional runtime utilities for use by code generated by tfcompile. 
-cc_library( - name = "runtime", - srcs = ["runtime.cc"], - hdrs = ["runtime.h"], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/core:framework_lite", - ], -) - -tf_cc_test( - name = "runtime_test", - srcs = ["runtime_test.cc"], - deps = [ - ":runtime", - "//tensorflow/core:framework", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], -) - # Don't depend on this directly; this is only used for the benchmark test # generated by tf_library. cc_library( @@ -53,9 +31,8 @@ cc_library( ], deps = [ ":embedded_protocol_buffers", - ":runtime", # needed by codegen to print aligned_buffer_bytes "//tensorflow/compiler/tf2xla", - "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:cpu_function_runtime", "//tensorflow/compiler/tf2xla:tf2xla_proto", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -68,13 +45,18 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:compile_only_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service/cpu:buffer_info_util", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -91,6 +73,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", "@llvm//:support", # fixdeps: keep "@llvm//:x86_code_gen", # fixdeps: keep ], @@ -119,6 +102,7 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -208,11 +192,13 @@ cc_library( srcs 
= ["embedded_protocol_buffers.cc"], hdrs = ["embedded_protocol_buffers.h"], deps = [ - "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", "@llvm//:target", @@ -237,7 +223,6 @@ test_suite( tests = [ ":benchmark_test", ":codegen_test", - ":runtime_test", ":test_graph_tfadd_test", ":test_graph_tfunknownop2_test", ":test_graph_tfunknownop3_test", diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 0025842aead53973befc794378a26fa8db2ae1cb..2b1ce34b3770a47e31d4f623b1b4f4650206737e 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -19,16 +19,18 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_replace.h" +#include "absl/types/span.h" #include "tensorflow/compiler/aot/embedded_protocol_buffers.h" -#include "tensorflow/compiler/aot/runtime.h" -#include "tensorflow/compiler/tf2xla/str_util.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { @@ -36,6 +38,8 @@ namespace tfcompile { namespace { +using BufferInfo = cpu_function_runtime::BufferInfo; + bool IsAlpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } 
@@ -85,27 +89,36 @@ Status XLATypeToCpp(xla::PrimitiveType type, string* str) { return Status::OK(); } -// total_buffer_bytes returns the sum of each size in `sizes`, skipping -1 -// values. There are `n` entries in `sizes`. -size_t total_buffer_bytes(const intptr_t* sizes, size_t n) { - size_t total = 0; - for (size_t i = 0; i < n; ++i) { - if (sizes[i] != -1) { - total += sizes[i]; - } - } - return total; +// Returns the sum of the size of each buffer in `buffer_infos`. +size_t TotalBufferBytes(const std::vector& buffer_infos) { + return std::accumulate(buffer_infos.begin(), buffer_infos.end(), size_t{0}, + [](size_t size, const BufferInfo& buffer_info) { + return size + buffer_info.size(); + }); } -// Fills in arg_sizes with the byte size of each positional arg. -Status ComputeArgSizes(const CompileResult& compile_result, - std::vector* arg_sizes) { - const xla::ProgramShape& ps = compile_result.program_shape; - for (int i = 0; i < ps.parameters_size(); ++i) { - arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf( - ps.parameters(i), compile_result.pointer_size)); - } - return Status::OK(); +// Returns a vector of BufferInfo instances in `buffer_infos` that are entry +// parameter buffers. +std::vector ExtractEntryParamBufferInfos( + const std::vector& buffer_infos) { + std::vector result; + std::copy_if(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(result), [](const BufferInfo& buffer_info) { + return buffer_info.is_entry_parameter(); + }); + return result; +} + +// Returns a vector of BufferInfo instances in `buffer_infos` that are temp +// buffers. +std::vector ExtractTempBufferInfos( + const std::vector& buffer_infos) { + std::vector result; + std::copy_if(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(result), [](const BufferInfo& buffer_info) { + return buffer_info.is_temp_buffer(); + }); + return result; } // Add (from,to) rewrite pairs based on the given shape. 
These rewrite pairs @@ -129,7 +142,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, } rewrites->push_back({"{{I}}", strings::StrCat(i)}); rewrites->push_back({"{{TYPE}}", type}); - rewrites->push_back({"{{DIM_VARS}}", str_util::Join(dim_vars, ", ")}); + rewrites->push_back({"{{DIM_VARS}}", absl::StrJoin(dim_vars, ", ")}); rewrites->push_back({"{{DIM_SIZES}}", dim_sizes}); rewrites->push_back({"{{INDICES}}", indices}); return Status::OK(); @@ -145,8 +158,9 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, // text-templating mechanism. string RewriteWithName(const string& name, string code, const std::vector>& rewrites) { - str_util::ReplaceAllPairs(&code, rewrites); - return str_util::StringReplace(code, "{{NAME}}", name, /*replace_all=*/true); + absl::StrReplaceAll(rewrites, &code); + absl::StrReplaceAll({{"{{NAME}}", name}}, &code); + return code; } // Generate methods for args (inputs). @@ -278,6 +292,25 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) { return Status::OK(); } +// Returns a list of C++ expressions that, when executed, will construct the +// BufferInfo instances in `buffer_infos`. +std::vector BufferInfosToCppExpression( + const std::vector& buffer_infos) { + std::vector buffer_infos_as_strings; + std::transform(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(buffer_infos_as_strings), + [](const BufferInfo& buffer_info) { + std::pair encoded = buffer_info.Encode(); + string encoded_second_as_str = + encoded.second == ~0ULL + ? 
"~0ULL" + : strings::StrCat(encoded.second, "ULL"); + return strings::StrCat( + "::tensorflow::cpu_function_runtime::BufferInfo({", + encoded.first, "ULL, ", encoded_second_as_str, "})"); + }); + return buffer_infos_as_strings; +} } // namespace Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, @@ -286,29 +319,35 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, TF_RETURN_IF_ERROR(ValidateConfig(config)); TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); const int64 result_index = compile_result.aot->result_buffer_index(); - const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes(); - if (result_index < 0 || result_index > temp_sizes.size()) { + const std::vector& buffer_infos = + compile_result.aot->buffer_infos(); + const std::vector arg_index_table = + ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); + std::vector buffer_infos_as_strings = + BufferInfosToCppExpression(buffer_infos); + if (result_index < 0 || result_index >= buffer_infos.size()) { return errors::InvalidArgument("result index: ", result_index, " is outside the range of temp sizes: [0,", - temp_sizes.size(), ")"); + buffer_infos.size(), ")"); } // Compute sizes and generate methods. 
- std::vector arg_sizes; - TF_RETURN_IF_ERROR(ComputeArgSizes(compile_result, &arg_sizes)); + std::vector buffer_infos_for_args = + ExtractEntryParamBufferInfos(buffer_infos); + std::vector buffer_infos_for_temps = + ExtractTempBufferInfos(buffer_infos); const xla::ProgramShape& ps = compile_result.program_shape; string methods_arg, methods_result; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); - const std::vector iarg(arg_sizes.begin(), arg_sizes.end()); - const std::vector itemp(temp_sizes.begin(), temp_sizes.end()); - const size_t arg_bytes_aligned = - runtime::aligned_buffer_bytes(iarg.data(), iarg.size()); - const size_t arg_bytes_total = total_buffer_bytes(iarg.data(), iarg.size()); - const size_t temp_bytes_aligned = - runtime::aligned_buffer_bytes(itemp.data(), itemp.size()); - const size_t temp_bytes_total = - total_buffer_bytes(itemp.data(), itemp.size()); + const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes( + buffer_infos_for_args.data(), buffer_infos_for_args.size(), + /*allocate_entry_params=*/true); + const size_t arg_bytes_total = TotalBufferBytes(buffer_infos_for_args); + const size_t temp_bytes_aligned = cpu_function_runtime::AlignedBufferBytes( + buffer_infos_for_temps.data(), buffer_infos_for_temps.size(), + /*allocate_entry_params=*/true); + const size_t temp_bytes_total = TotalBufferBytes(buffer_infos_for_temps); // Create rewrite strings for namespace start and end. string ns_start; @@ -343,8 +382,8 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, // calling HloProfilePrinter::profile_counters_size. const string assign_profile_counters_size = opts.gen_hlo_profile_printer_data - ? "data->profile_counters_size = " - "data->hlo_profile_printer_data->profile_counters_size();" + ? 
"data->set_profile_counters_size(" + "data->hlo_profile_printer_data()->profile_counters_size());" : ""; // Use a poor-man's text templating mechanism; first populate the full header @@ -414,9 +453,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static constexpr size_t kNumArgs = {{ARG_NUM}}; // Byte size of each argument buffer. There are kNumArgs entries. - static const intptr_t* ArgSizes() { - static constexpr intptr_t kArgSizes[kNumArgs] = {{{ARG_SIZES}}}; - return kArgSizes; + static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { + return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); } // Returns static data used to create an XlaCompiledCpuFunction. @@ -424,17 +462,17 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static XlaCompiledCpuFunction::StaticData* kStaticData = [](){ XlaCompiledCpuFunction::StaticData* data = new XlaCompiledCpuFunction::StaticData; - data->raw_function = {{ENTRY}}; - data->arg_sizes = ArgSizes(); - data->num_args = kNumArgs; - data->temp_sizes = TempSizes(); - data->num_temps = kNumTemps; - data->result_index = kResultIndex; - data->arg_names = StaticArgNames(); - data->result_names = StaticResultNames(); - data->program_shape = StaticProgramShape(); - data->hlo_profile_printer_data = StaticHloProfilePrinterData(); - {{ASSIGN_PROFILE_COUNTERS_SIZE}} + data->set_raw_function({{ENTRY}}); + data->set_buffer_infos(BufferInfos()); + data->set_num_buffers(kNumBuffers); + data->set_arg_index_table(ArgIndexToBufferIndex()); + data->set_num_args(kNumArgs); + data->set_result_index(kResultIndex); + data->set_arg_names(StaticArgNames()); + data->set_result_names(StaticResultNames()); + data->set_program_shape(StaticProgramShape()); + data->set_hlo_profile_printer_data(StaticHloProfilePrinterData()); +{{ASSIGN_PROFILE_COUNTERS_SIZE}} return data; }(); return *kStaticData; @@ -482,17 +520,27 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {{METHODS_RESULT}} private: - // 
Number of result and temporary buffers for the compiled computation. - static constexpr size_t kNumTemps = {{TEMP_NUM}}; - // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = {{RESULT_INDEX}}; + // Number of buffers for the compiled computation. + static constexpr size_t kNumBuffers = {{NUM_BUFFERS}}; + + static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() { + static const ::tensorflow::cpu_function_runtime::BufferInfo + kBufferInfos[kNumBuffers] = { +{{BUFFER_INFOS_AS_STRING}} + }; + return kBufferInfos; + } - // Byte size of each result / temporary buffer. There are kNumTemps entries. - static const intptr_t* TempSizes() { - static constexpr intptr_t kTempSizes[kNumTemps] = {{{TEMP_SIZES}}}; - return kTempSizes; + static const ::tensorflow::int32* ArgIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { +{{ARG_INDEX_TABLE}} + }; + return kArgIndexToBufferIndex; } + // The 0-based index of the result tuple in the temporary buffers. + static constexpr size_t kResultIndex = {{RESULT_INDEX}}; + // Array of names of each positional argument, terminated by nullptr. 
static const char** StaticArgNames() {{ARG_NAMES_CODE}} @@ -523,12 +571,12 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{ARG_BYTES_ALIGNED}}", strings::StrCat(arg_bytes_aligned)}, {"{{ARG_BYTES_TOTAL}}", strings::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, - {"{{ARG_NUM}}", strings::StrCat(arg_sizes.size())}, - {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")}, + {"{{ARG_NUM}}", strings::StrCat(arg_index_table.size())}, + {"{{ARG_INDEX_TABLE}}", absl::StrJoin(arg_index_table, ", ")}, {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size}, {"{{CLASS}}", opts.class_name}, {"{{DECLS_FROM_OBJ_FILE}}", - str_util::Join(metadata_result.header_variable_decls, "\n")}, + absl::StrJoin(metadata_result.header_variable_decls, "\n")}, {"{{ENTRY}}", compile_result.entry_point}, {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}", metadata_result.hlo_profile_printer_data_access_shim}, @@ -546,9 +594,10 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{RESULT_NAMES_CODE}}", result_names_code}, {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)}, {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)}, - {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())}, - {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}}; - str_util::ReplaceAllPairs(header, rewrites); + {"{{NUM_BUFFERS}}", strings::StrCat(buffer_infos.size())}, + {"{{BUFFER_INFOS_AS_STRING}}", + absl::StrJoin(buffer_infos_as_strings, ",\n")}}; + absl::StrReplaceAll(rewrites, header); return Status::OK(); } @@ -570,7 +619,8 @@ Status GenerateMetadata(const CodegenOpts& opts, if (opts.gen_program_shape) { program_shape = - tensorflow::MakeUnique(compile_result.program_shape); + absl::make_unique(compile_result.program_shape); + // The parameter names are currently meaningless, and redundant with the // rest of our metadata, so clear them out to avoid confusion and save // space. 
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 29bc9c13b889c86c2ba8776c7b067c54cb05bc43..e3a53edb7368c209bea16a9e34b1f452a8ff4bf8 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -18,13 +18,13 @@ limitations under the License. #include #include +#include "absl/strings/match.h" #include "llvm/Support/TargetSelect.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" @@ -32,9 +32,11 @@ namespace tensorflow { namespace tfcompile { namespace { -void ExpectErrorContains(const Status& status, StringPiece str) { +using ::tensorflow::cpu_function_runtime::BufferInfo; + +void ExpectErrorContains(const Status& status, absl::string_view str) { EXPECT_NE(Status::OK(), status); - EXPECT_TRUE(str_util::StrContains(status.error_message(), str)) + EXPECT_TRUE(absl::StrContains(status.error_message(), str)) << "expected error: " << status.error_message() << " to contain: " << str; } @@ -171,8 +173,14 @@ TEST(CodegenTest, Golden) { fetch->mutable_id()->set_node_name("fetch0"); fetch->set_name("myfetch"); CompileResult compile_result; - compile_result.aot.reset( - new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5, {})); + compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult( + {}, + {BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/8, /*param_number=*/0), + BufferInfo::MakeTempBuffer(2), + BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), + BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)}, + 5, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { 
xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6641d45e83020f4144616a6a2837c844330298f5..e4d8a02877c75fa72c5747650ab9c7ac229955b3 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -65,9 +65,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { static constexpr size_t kNumArgs = 2; // Byte size of each argument buffer. There are kNumArgs entries. - static const intptr_t* ArgSizes() { - static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96}; - return kArgSizes; + static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { + return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); } // Returns static data used to create an XlaCompiledCpuFunction. @@ -75,17 +74,17 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { static XlaCompiledCpuFunction::StaticData* kStaticData = [](){ XlaCompiledCpuFunction::StaticData* data = new XlaCompiledCpuFunction::StaticData; - data->raw_function = entry_point; - data->arg_sizes = ArgSizes(); - data->num_args = kNumArgs; - data->temp_sizes = TempSizes(); - data->num_temps = kNumTemps; - data->result_index = kResultIndex; - data->arg_names = StaticArgNames(); - data->result_names = StaticResultNames(); - data->program_shape = StaticProgramShape(); - data->hlo_profile_printer_data = StaticHloProfilePrinterData(); - + data->set_raw_function(entry_point); + data->set_buffer_infos(BufferInfos()); + data->set_num_buffers(kNumBuffers); + data->set_arg_index_table(ArgIndexToBufferIndex()); + data->set_num_args(kNumArgs); + data->set_result_index(kResultIndex); + data->set_arg_names(StaticArgNames()); + data->set_result_names(StaticResultNames()); + data->set_program_shape(StaticProgramShape()); + data->set_hlo_profile_printer_data(StaticHloProfilePrinterData()); + return data; }(); return *kStaticData; @@ -215,17 +214,32 @@ class 
MyClass : public tensorflow::XlaCompiledCpuFunction { } private: - // Number of result and temporary buffers for the compiled computation. - static constexpr size_t kNumTemps = 6; - // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = 5; + // Number of buffers for the compiled computation. + static constexpr size_t kNumBuffers = 6; + + static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() { + static const ::tensorflow::cpu_function_runtime::BufferInfo + kBufferInfos[kNumBuffers] = { +::tensorflow::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({34ULL, 0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({9ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({386ULL, 1ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({13ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({481ULL, ~0ULL}) + }; + return kBufferInfos; + } - // Byte size of each result / temporary buffer. There are kNumTemps entries. - static const intptr_t* TempSizes() { - static constexpr intptr_t kTempSizes[kNumTemps] = {1, -1, 2, -1, 3, 120}; - return kTempSizes; + static const ::tensorflow::int32* ArgIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { +1, 3 + }; + return kArgIndexToBufferIndex; } + // The 0-based index of the result tuple in the temporary buffers. + static constexpr size_t kResultIndex = 5; + // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() { static const char* kNames[] = {"myfeed", nullptr}; diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index bbc35da2ef6d14ff0d3570ef2d5cf6743456c674..2b5f97b34cd928d32eb220536342c715d91d45bb 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/compile_only_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc index 4e27aafec7747655d8e4ea3ddd1788d495ca0710..f1e8e5c08482e15d989c19a43aa7c5f437cd091d 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_replace.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" @@ -26,8 +28,6 @@ limitations under the License. 
#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "tensorflow/compiler/tf2xla/str_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/util.h" @@ -65,14 +65,13 @@ static string CreateCPPShimExpression(StringPiece qualified_cpp_protobuf_name, " return proto;\n" " }()"; - str_util::ReplaceAllPairs( - &code, + return absl::StrReplaceAll( + code, { {"{{ARRAY_SYMBOL}}", strings::StrCat(protobuf_array_symbol_name)}, {"{{ARRAY_SIZE}}", strings::StrCat(protobuf_array_size)}, {"{{PROTOBUF_NAME}}", strings::StrCat(qualified_cpp_protobuf_name)}, }); - return code; } static StatusOr CodegenModule(llvm::TargetMachine* target_machine, @@ -97,7 +96,7 @@ static StatusOr> GetTargetMachineFromTriple(StringPiece target_triple) { std::string error; std::string normalized_triple = - llvm::Triple::normalize(AsStringRef(target_triple)); + llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple))); const llvm::Target* target = llvm::TargetRegistry::lookupTarget(normalized_triple, error); if (target == nullptr) { @@ -105,20 +104,20 @@ GetTargetMachineFromTriple(StringPiece target_triple) { error.c_str()); } - return WrapUnique(target->createTargetMachine( + return absl::WrapUnique(target->createTargetMachine( normalized_triple, /*CPU=*/"", /*Features=*/"", llvm::TargetOptions(), llvm::None)); } StatusOr CreateEmbeddedProtocolBuffers( StringPiece target_triple, - gtl::ArraySlice protobufs_to_embed) { + absl::Span protobufs_to_embed) { TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, GetTargetMachineFromTriple(target_triple)); llvm::LLVMContext llvm_context; std::unique_ptr module_with_serialized_proto = - MakeUnique("embedded_data_module", llvm_context); + absl::make_unique("embedded_data_module", llvm_context); EmbeddedProtocolBuffers result; diff --git 
a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index 4e194a6aba9a9efcad27c47c42e148d8e537ae68..4f940c019750f49da4ad2386aa4b23281cc5a9fc 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -20,8 +20,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ #define TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ +#include "absl/types/span.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { @@ -84,7 +84,7 @@ struct ProtobufToEmbed { // EmbeddedProtocolBuffers instance. StatusOr CreateEmbeddedProtocolBuffers( StringPiece target_triple, - gtl::ArraySlice protobufs_to_embed); + absl::Span protobufs_to_embed); } // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc deleted file mode 100644 index 5e74079fc158379b8977ada6412141e39142c3d3..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/aot/runtime.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/aot/runtime.h" - -#include - -#include "tensorflow/core/platform/dynamic_annotations.h" - -namespace tensorflow { -namespace tfcompile { -namespace runtime { - -namespace { - -// Inline memory allocation routines here, because depending on '//base' brings -// in libraries which use c++ streams, which adds considerable code size on -// android. -inline void* aligned_malloc(size_t size, int minimum_alignment) { -#if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN) - return memalign(minimum_alignment, size); -#elif defined(_WIN32) - return _aligned_malloc(size, minimum_alignment); -#else // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN - void* ptr = nullptr; - // posix_memalign requires that the requested alignment be at least - // sizeof(void*). In this case, fall back on malloc which should return memory - // aligned to at least the size of a pointer. - const int required_alignment = sizeof(void*); - if (minimum_alignment < required_alignment) return malloc(size); - if (posix_memalign(&ptr, minimum_alignment, size) != 0) - return nullptr; - else - return ptr; -#endif -} - -inline void aligned_free(void* aligned_memory) { -#if defined(_WIN32) - _aligned_free(aligned_memory); -#else - free(aligned_memory); -#endif -} - -size_t align_to(size_t n, size_t align) { - return (((n - 1) / align) + 1) * align; -} - -} // namespace - -size_t aligned_buffer_bytes(const intptr_t* sizes, size_t n) { - size_t total = 0; - for (size_t i = 0; i < n; ++i) { - if (sizes[i] != -1) { - total += align_to(sizes[i], kAlign); - } - } - return total; -} - -void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs, - bool annotate_initialized) { - const size_t total = aligned_buffer_bytes(sizes, n); - void* contiguous = nullptr; - if (total > 0) { - contiguous = aligned_malloc(total, kAlign); - if (annotate_initialized) { - // Since the memory for 
temp buffers is written to by JITed code, msan has - // no way of knowing the memory was initialized, so explicitly mark it. - TF_ANNOTATE_MEMORY_IS_INITIALIZED(contiguous, total); - } - } - uintptr_t pos = reinterpret_cast(contiguous); - for (size_t i = 0; i < n; ++i) { - if (sizes[i] == -1) { - bufs[i] = nullptr; - } else { - bufs[i] = reinterpret_cast(pos); - pos += align_to(sizes[i], kAlign); - } - } - return contiguous; -} - -void FreeContiguous(void* contiguous) { - if (contiguous != nullptr) { - aligned_free(contiguous); - } -} - -} // namespace runtime -} // namespace tfcompile -} // namespace tensorflow diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h deleted file mode 100644 index d1a669ceb17b9fd71d26e978035283f8824b0376..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/aot/runtime.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file contains utilities to make it easier to invoke functions generated -// by tfcompile. Usage of these utilities is optional. - -#ifndef TENSORFLOW_COMPILER_AOT_RUNTIME_H_ -#define TENSORFLOW_COMPILER_AOT_RUNTIME_H_ - -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { -namespace tfcompile { -namespace runtime { - -// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. 
-static constexpr size_t kAlign = 64; - -// aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 -// values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign -// byte boundaries. -size_t aligned_buffer_bytes(const intptr_t* sizes, size_t n); - -// MallocContiguousBuffers allocates buffers for use by the entry point -// generated by tfcompile. `sizes` is an array of byte sizes for each buffer, -// where -1 causes the buffer pointer to be nullptr. There are `n` entries in -// `sizes`. If `annotate_initialized` is set, the allocated memory will be -// annotated as having been initialized - this is useful when allocating -// temporary buffers. -// -// A single contiguous block of memory is allocated, and portions of it are -// parceled out into `bufs`, which must have space for `n` entries. Returns the -// head of the allocated contiguous block, which should be passed to -// FreeContiguous when the buffers are no longer in use. -void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs, - bool annotate_initialized); - -// FreeContiguous frees the contiguous block of memory allocated by -// MallocContiguousBuffers. -void FreeContiguous(void* contiguous); - -} // namespace runtime -} // namespace tfcompile -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_AOT_RUNTIME_H_ diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc deleted file mode 100644 index 06ec623eb2dce5f8dc7156fb7e7b9ad57d90c8ee..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/aot/runtime_test.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/aot/runtime.h" - -#include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace tfcompile { -namespace runtime { -namespace { - -TEST(Runtime, AlignmentValue) { - // We've chosen 64 byte alignment for the tfcompile runtime to mimic the - // regular tensorflow allocator, which was chosen to play nicely with Eigen. - // The tfcompile runtime also has a requirement that comes from the xla - // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 - // So any value that we choose must abide by that constraint as well. - EXPECT_EQ(kAlign, Allocator::kAllocatorAlignment); -} - -TEST(Runtime, AlignedBufferBytes) { - EXPECT_EQ(aligned_buffer_bytes(nullptr, 0), 0); - - static constexpr intptr_t sizesA[1] = {-1}; - EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); - - static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); - - static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); - - static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); -} - -void* add_ptr(void* base, uintptr_t delta) { - return reinterpret_cast(reinterpret_cast(base) + delta); -} - -// To test MallocContiguousBuffers and FreeContiguous, we just check for -// expected nullptrs, and write to each byte of allocated memory. 
We rely on -// the leak checker to tell us if there's an inconsistency between malloc and -// free. We also check the contiguous property. -TEST(Runtime, MallocFreeContiguousBuffers) { - // Test empty sizes. - void* base = MallocContiguousBuffers(nullptr, 0, nullptr, false); - EXPECT_EQ(base, nullptr); - FreeContiguous(base); - - // Test non-empty sizes with 0 sum. - static constexpr intptr_t sizesA[1] = {-1}; - void* bufA[1]; - base = MallocContiguousBuffers(sizesA, 1, bufA, false); - EXPECT_EQ(base, nullptr); - EXPECT_EQ(bufA[0], nullptr); - FreeContiguous(base); - - // Test non-empty sizes with non-0 sum. - static constexpr intptr_t sizesB[1] = {3}; - void* bufB[1]; - base = MallocContiguousBuffers(sizesB, 1, bufB, false); - EXPECT_NE(base, nullptr); - EXPECT_EQ(bufB[0], add_ptr(base, 0)); - char* bufB0_bytes = static_cast(bufB[0]); - bufB0_bytes[0] = 'A'; - bufB0_bytes[1] = 'B'; - bufB0_bytes[2] = 'C'; - FreeContiguous(base); - - // Test non-empty sizes with non-0 sum, and annotate_initialized. - static constexpr intptr_t sizesC[1] = {3}; - void* bufC[1]; - base = MallocContiguousBuffers(sizesC, 1, bufC, true); - EXPECT_NE(base, nullptr); - EXPECT_EQ(bufC[0], add_ptr(base, 0)); - char* bufC0_bytes = static_cast(bufC[0]); - bufC0_bytes[0] = 'A'; - bufC0_bytes[1] = 'B'; - bufC0_bytes[2] = 'C'; - FreeContiguous(base); - - // Test mixed sizes. 
- static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - void* bufD[7]; - base = MallocContiguousBuffers(sizesD, 7, bufD, false); - EXPECT_NE(base, nullptr); - EXPECT_EQ(bufD[0], add_ptr(base, 0)); - EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 64)); - EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 128)); - EXPECT_EQ(bufD[5], add_ptr(base, 192)); - EXPECT_EQ(bufD[6], add_ptr(base, 256)); - for (int i = 0; i < 7; ++i) { - const intptr_t size = sizesD[i]; - if (size != -1) { - char* bufD_bytes = static_cast(bufD[i]); - for (size_t j = 0; j < size; ++j) { - bufD_bytes[j] = 'A' + j; - } - } - } - FreeContiguous(base); -} - -} // namespace -} // namespace runtime -} // namespace tfcompile -} // namespace tensorflow diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc index 6b098049cbd7539a2b2e2696b13139a8a6b28e0f..5deb47d12310d24dce847227bd119249210ffb8d 100644 --- a/tensorflow/compiler/aot/test.cc +++ b/tensorflow/compiler/aot/test.cc @@ -51,11 +51,9 @@ namespace tensorflow { namespace tfcompile { namespace { -void zero_buffers(void** bufs, const intptr_t* sizes, size_t n) { - for (int i = 0; i < n; ++i) { - if (sizes[i] != -1) { - memset(bufs[i], 0, sizes[i]); - } +void zero_buffers(XlaCompiledCpuFunction* computation) { + for (int i = 0; i < computation->num_args(); ++i) { + memset(computation->arg_data(i), 0, computation->arg_size(i)); } } @@ -66,7 +64,7 @@ TEST(TEST_NAME, NoCrash) { CPP_CLASS computation; computation.set_thread_pool(&device); - zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs); + zero_buffers(&computation); EXPECT_TRUE(computation.Run()); } @@ -80,7 +78,7 @@ void BM_NAME(int iters) { CPP_CLASS computation; computation.set_thread_pool(&device); - zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs); + zero_buffers(&computation); testing::StartTiming(); while (--iters) { diff --git a/tensorflow/compiler/aot/tests/BUILD 
b/tensorflow/compiler/aot/tests/BUILD index fd2cf2b67d4618dd626b8eef78eed044d7fde0a4..723e9bec8afcfbf7ceeeb59c63e4e12442fdb7ab 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -7,6 +7,10 @@ package( load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +# We disable some tfcompile tests in the open source build with the +# "manual" tag to avoid making our OSS users build LLVM twice +# (once for host and once for target). + test_suite( name = "all_tests", tags = ["manual"], @@ -183,6 +187,9 @@ tf_library( cpp_class = "MatMulAndAddCompWithProfiling", enable_xla_hlo_profiling = True, graph = "test_graph_tfmatmulandadd.pb", + tags = [ + "manual", + ], ) tf_library( @@ -222,5 +229,6 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//third_party/eigen3", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index fee46280e9a0e7ba2cf7c3ed46469ae8cc0841d4..dd2b151098f2054571ac32b8b506cbc00659588a 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -16,6 +16,7 @@ limitations under the License. #define EIGEN_USE_THREADS #define EIGEN_USE_CUSTOM_THREAD_POOL +#include "absl/strings/str_split.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h" @@ -32,7 +33,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -44,8 +44,8 @@ using ::testing::IsSupersetOf; TEST(TFCompileTest, Add) { AddComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); - EXPECT_EQ(add.arg1_data(), add.args()[1]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); + EXPECT_EQ(add.arg1_data(), add.arg_data(1)); add.arg0() = 1; add.arg1() = 2; @@ -67,10 +67,10 @@ TEST(TFCompileTest, Add) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 123); EXPECT_EQ(add_const.arg0_data()[0], 123); - EXPECT_EQ(add_const.arg0_data(), add.args()[0]); + EXPECT_EQ(add_const.arg0_data(), add.arg_data(0)); EXPECT_EQ(add_const.arg1(), 456); EXPECT_EQ(add_const.arg1_data()[0], 456); - EXPECT_EQ(add_const.arg1_data(), add.args()[1]); + EXPECT_EQ(add_const.arg1_data(), add.arg_data(1)); EXPECT_EQ(add_const.result0(), 579); EXPECT_EQ(add_const.result0_data()[0], 579); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -85,8 +85,8 @@ TEST(TFCompileTest, Add_SetArg) { int32 arg_y = 32; add.set_arg0_data(&arg_x); add.set_arg1_data(&arg_y); - EXPECT_EQ(add.arg0_data(), add.args()[0]); - EXPECT_EQ(add.arg1_data(), add.args()[1]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); + EXPECT_EQ(add.arg1_data(), add.arg_data(1)); EXPECT_TRUE(add.Run()); EXPECT_EQ(add.error_msg(), ""); @@ -97,7 +97,7 @@ TEST(TFCompileTest, Add_SetArg) { TEST(TFCompileTest, AddWithCkpt) { AddWithCkptComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); add.arg0() = 1; EXPECT_TRUE(add.Run()); @@ -117,7 +117,7 @@ TEST(TFCompileTest, AddWithCkpt) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 111); EXPECT_EQ(add_const.arg0_data()[0], 111); - EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]); + 
EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -125,7 +125,7 @@ TEST(TFCompileTest, AddWithCkpt) { TEST(TFCompileTest, AddWithCkptSaver) { AddWithCkptSaverComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); add.arg0() = 1; EXPECT_TRUE(add.Run()); @@ -145,7 +145,7 @@ TEST(TFCompileTest, AddWithCkptSaver) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 111); EXPECT_EQ(add_const.arg0_data()[0], 111); - EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]); + EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -153,9 +153,9 @@ TEST(TFCompileTest, AddWithCkptSaver) { TEST(TFCompileTest, Cond) { CondComp cond; - EXPECT_EQ(cond.arg0_data(), cond.args()[0]); - EXPECT_EQ(cond.arg1_data(), cond.args()[1]); - EXPECT_EQ(cond.arg2_data(), cond.args()[2]); + EXPECT_EQ(cond.arg0_data(), cond.arg_data(0)); + EXPECT_EQ(cond.arg1_data(), cond.arg_data(1)); + EXPECT_EQ(cond.arg2_data(), cond.arg_data(2)); cond.arg1() = 10; cond.arg2() = 20; { @@ -178,8 +178,8 @@ TEST(TFCompileTest, Cond) { TEST(TFCompileTest, Gather) { GatherComp gather; - EXPECT_EQ(gather.arg0_data(), gather.args()[0]); - EXPECT_EQ(gather.arg1_data(), gather.args()[1]); + EXPECT_EQ(gather.arg0_data(), gather.arg_data(0)); + EXPECT_EQ(gather.arg1_data(), gather.arg_data(1)); // Successful gather. 
{ @@ -202,12 +202,12 @@ TEST(TFCompileTest, Gather) { EXPECT_EQ(gather_const.arg0(i), params[i]); EXPECT_EQ(gather_const.arg0_data()[i], params[i]); } - EXPECT_EQ(gather_const.arg0_data(), gather_const.args()[0]); + EXPECT_EQ(gather_const.arg0_data(), gather_const.arg_data(0)); for (int i = 0; i < 2; ++i) { EXPECT_EQ(gather_const.arg1(i), indices[i]); EXPECT_EQ(gather_const.arg1_data()[i], indices[i]); } - EXPECT_EQ(gather_const.arg1_data(), gather_const.args()[1]); + EXPECT_EQ(gather_const.arg1_data(), gather_const.arg_data(1)); for (int i = 0; i < 2; ++i) { EXPECT_EQ(gather_const.result0(i), results[i]); EXPECT_EQ(gather_const.result0_data()[i], results[i]); @@ -222,8 +222,8 @@ TEST(TFCompileTest, MatMul2) { foo::bar::MatMulComp matmul; matmul.set_thread_pool(&device); - EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]); - EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0)); + EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1)); // Test using the argN() methods. 
{ @@ -271,12 +271,12 @@ TEST(TFCompileTest, MatMul2) { EXPECT_EQ(matmul_const.arg0(i / 3, i % 3), args[i]); EXPECT_EQ(matmul_const.arg0_data()[i], args[i]); } - EXPECT_EQ(matmul_const.arg0_data(), matmul.args()[0]); + EXPECT_EQ(matmul_const.arg0_data(), matmul.arg_data(0)); for (int i = 0; i < 6; ++i) { EXPECT_EQ(matmul_const.arg1(i / 2, i % 2), args[i + 6]); EXPECT_EQ(matmul_const.arg1_data()[i], args[i + 6]); } - EXPECT_EQ(matmul_const.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul_const.arg1_data(), matmul.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(matmul_const.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul_const.result0_data()[i], results[i]); @@ -300,8 +300,8 @@ TEST(TFCompileTest, MatMul2_SetArg) { float arg1[3][2] = {{7, 8}, {9, 10}, {11, 12}}; matmul.set_arg0_data(&arg0); matmul.set_arg1_data(&arg1); - EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]); - EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0)); + EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1)); EXPECT_TRUE(matmul.Run()); EXPECT_EQ(matmul.error_msg(), ""); @@ -319,8 +319,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { MatMulAndAddComp muladd; muladd.set_thread_pool(&device); - EXPECT_EQ(muladd.arg0_data(), muladd.args()[0]); - EXPECT_EQ(muladd.arg1_data(), muladd.args()[1]); + EXPECT_EQ(muladd.arg0_data(), muladd.arg_data(0)); + EXPECT_EQ(muladd.arg1_data(), muladd.arg_data(1)); // Test methods with positional args and results. 
{ @@ -346,12 +346,12 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.arg0(i / 2, i % 2), args[i]); EXPECT_EQ(muladd_const.arg0_data()[i], args[i]); } - EXPECT_EQ(muladd_const.arg0_data(), muladd.args()[0]); + EXPECT_EQ(muladd_const.arg0_data(), muladd.arg_data(0)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.arg1(i / 2, i % 2), args[i + 4]); EXPECT_EQ(muladd_const.arg1_data()[i], args[i + 4]); } - EXPECT_EQ(muladd_const.arg1_data(), muladd.args()[1]); + EXPECT_EQ(muladd_const.arg1_data(), muladd.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.result0(i / 2, i % 2), results0[i]); EXPECT_EQ(muladd_const.result0_data()[i], results0[i]); @@ -387,12 +387,12 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.arg_x(i / 2, i % 2), args[i]); EXPECT_EQ(muladd_const.arg_x_data()[i], args[i]); } - EXPECT_EQ(muladd_const.arg_x_data(), muladd.args()[0]); + EXPECT_EQ(muladd_const.arg_x_data(), muladd.arg_data(0)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.arg_y(i / 2, i % 2), args[i + 4]); EXPECT_EQ(muladd_const.arg_y_data()[i], args[i + 4]); } - EXPECT_EQ(muladd_const.arg_y_data(), muladd.args()[1]); + EXPECT_EQ(muladd_const.arg_y_data(), muladd.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.result_x_y_prod(i / 2, i % 2), results0[i]); EXPECT_EQ(muladd_const.result_x_y_prod_data()[i], results0[i]); @@ -407,8 +407,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { TEST(TFCompileTest, Function) { // The function is equivalent to an addition FunctionComp add_fn; - EXPECT_EQ(add_fn.arg0_data(), add_fn.args()[0]); - EXPECT_EQ(add_fn.arg1_data(), add_fn.args()[1]); + EXPECT_EQ(add_fn.arg0_data(), add_fn.arg_data(0)); + EXPECT_EQ(add_fn.arg1_data(), add_fn.arg_data(1)); add_fn.arg0() = 1; add_fn.arg1() = 2; @@ -451,8 +451,8 @@ TEST(TFCompileTest, AssertEqAndReturnDiff) { // Assert is converted into a no-op in XLA, so there is no failure even if the // two args are different. 
AssertComp assert; - EXPECT_EQ(assert.arg0_data(), assert.args()[0]); - EXPECT_EQ(assert.arg1_data(), assert.args()[1]); + EXPECT_EQ(assert.arg0_data(), assert.arg_data(0)); + EXPECT_EQ(assert.arg1_data(), assert.arg_data(1)); assert.arg0() = 2; assert.arg1() = 1; @@ -546,7 +546,7 @@ TEST(TFCompileTest, HloProfiling) { VLOG(1) << "HLO profile string:\n" << hlo_profile_as_string; std::vector hlo_profile_lines = - tensorflow::str_util::Split(hlo_profile_as_string, '\n'); + absl::StrSplit(hlo_profile_as_string, '\n'); auto header = HasSubstr("Execution profile for"); auto total_cycles_profile_line = HasSubstr("[total]"); diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 5c57fee326ca743dcb8aaae354d261ed4d7f44be..326f73b975aec3a7a6bc7cdc9a92f540ad545ad6 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -16,339 +16,365 @@ tf_library( ) """ -load("//tensorflow:tensorflow.bzl", - "if_android", "tf_cc_test", "tf_copts") - -def tf_library(name, graph, config, - freeze_checkpoint=None, freeze_saver=None, - cpp_class=None, gen_test=True, gen_benchmark=True, - visibility=None, testonly=None, - tfcompile_flags=None, - tfcompile_tool="//tensorflow/compiler/aot:tfcompile", - include_standard_runtime_deps=True, - enable_xla_hlo_profiling=False, deps=None, tags=None): - """Runs tfcompile to compile a TensorFlow graph into executable code. - - Given an invocation of tf_library(name="foo", ...), generates the following - build targets: - foo: A cc_library containing the generated header and computation. - foo_test: A cc_test with simple tests and benchmarks. Only created if - gen_test=True. - foo_benchmark: A cc_binary that runs a minimal-dependency benchmark, useful - for mobile devices or other platforms that can't compile the - full test libraries. Only created if gen_benchmark=True. - - Args: - name: The name of the build rule. - graph: The TensorFlow GraphDef to compile. 
If the file ends in '.pbtxt' it - is expected to be in the human-readable proto text format, otherwise it is - expected to be in the proto binary format. - config: File containing tensorflow.tf2xla.Config proto. If the file ends - in '.pbtxt' it is expected to be in the human-readable proto text format, - otherwise it is expected to be in the proto binary format. - freeze_checkpoint: If provided, run freeze_graph with this checkpoint to - convert variables into constants. - freeze_saver: If provided, run freeze_graph with this saver, in SaverDef - binary form, to convert variables into constants. - cpp_class: The name of the generated C++ class, wrapping the generated - function. The syntax of this flag is - [[::],...]. This mirrors the C++ syntax - for referring to a class, where multiple namespaces may precede the class - name, separated by double-colons. The class will be generated in the - given namespace(s), or if no namespaces are given, within the global - namespace. - gen_test: If True, also generate a cc_test rule that builds a simple - test and benchmark. - gen_benchmark: If True, also generate a binary with a simple benchmark. - Unlike the output of gen_test, this benchmark can be run on android. - visibility: Bazel build visibility. - testonly: Bazel testonly attribute. - tfcompile_flags: Extra flags to pass to tfcompile to control compilation. - tfcompile_tool: The tfcompile binary. A non-default can be passed to - use a tfcompile built with extra dependencies. - include_standard_runtime_deps: If True, the standard list of kernel/runtime - deps is added to deps. If False, deps must contain the full set of deps - needed by the generated library. - enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated program, - and emit metadata that lets us pretty-print the gathered profile counters. - deps: a list of deps to include on the build rules for the generated - library, added to the standard deps if standard_runtime_deps is True. 
- tags: tags to apply to subsidiary build rules. - - The output header is called .h. - """ - if not cpp_class: - fail("cpp_class must be specified") - - tfcompile_graph = graph - if freeze_checkpoint or freeze_saver: - if not freeze_checkpoint: - fail("freeze_checkpoint must be specified when freeze_saver is specified") +load( + "//tensorflow:tensorflow.bzl", + "if_android", + "tf_cc_test", + "tf_copts", +) - freeze_name = "freeze_" + name - freeze_file = freeze_name + ".pb" +def tf_library( + name, + graph, + config, + freeze_checkpoint = None, + freeze_saver = None, + cpp_class = None, + gen_test = True, + gen_benchmark = True, + visibility = None, + testonly = None, + tfcompile_flags = None, + tfcompile_tool = "//tensorflow/compiler/aot:tfcompile", + include_standard_runtime_deps = True, + enable_xla_hlo_profiling = False, + deps = None, + tags = None): + """Runs tfcompile to compile a TensorFlow graph into executable code. - # First run tfcompile to generate the list of out_nodes. - out_nodes_file = "out_nodes_" + freeze_name - native.genrule( - name=("gen_" + out_nodes_file), - srcs=[config], - outs=[out_nodes_file], - cmd=("$(location " + tfcompile_tool + ")" + - " --config=$(location " + config + ")" + - " --dump_fetch_nodes > $@"), - tools=[tfcompile_tool], - # Run tfcompile on the build host, rather than forge, since it's - # typically way faster on the local machine. - local=1, - tags=tags, - ) + Given an invocation of tf_library(name="foo", ...), generates the following + build targets: + foo: A cc_library containing the generated header and + computation. + foo_test: A cc_test with simple tests and benchmarks. Only created if + gen_test=True. + foo_benchmark: A cc_binary that runs a minimal-dependency benchmark, + useful for mobile devices or other platforms that can't + compile the full test libraries. Only created if + gen_benchmark=True. + The output header is called .h. - # Now run freeze_graph to convert variables into constants. 
- freeze_args = (" --input_graph=$(location " + graph + ")" + - " --checkpoint_version=1" + - " --input_binary=" + str(not graph.endswith(".pbtxt")) + - " --input_checkpoint=$(location " + freeze_checkpoint + ")" + - " --output_graph=$(location " + freeze_file + ")" + - " --output_node_names=$$(<$(location " + out_nodes_file + - "))") - freeze_saver_srcs = [] - if freeze_saver: - freeze_args += " --input_saver=$(location " + freeze_saver + ")" - freeze_saver_srcs += [freeze_saver] - native.genrule( - name=freeze_name, - srcs=[ - graph, - freeze_checkpoint, - out_nodes_file, - ] + freeze_saver_srcs, - outs=[freeze_file], - cmd=("$(location //tensorflow/python/tools:freeze_graph)" + - freeze_args), - tools=["//tensorflow/python/tools:freeze_graph"], - tags=tags, - ) - tfcompile_graph = freeze_file + Args: + name: The name of the build rule. + graph: The TensorFlow GraphDef to compile. If the file ends in '.pbtxt' + it is expected to be in the human-readable proto text format, otherwise + it is expected to be in the proto binary format. + config: File containing tensorflow.tf2xla.Config proto. If the file ends + in '.pbtxt' it is expected to be in the human-readable proto text + format, otherwise it is expected to be in the proto binary format. + freeze_checkpoint: If provided, run freeze_graph with this checkpoint to + convert variables into constants. + freeze_saver: If provided, run freeze_graph with this saver, in SaverDef + binary form, to convert variables into constants. + cpp_class: The name of the generated C++ class, wrapping the generated + function. The syntax of this flag is + [[::],...]. This mirrors the C++ syntax + for referring to a class, where multiple namespaces may precede the + class name, separated by double-colons. The class will be generated in + the given namespace(s), or if no namespaces are given, within the global + namespace. + gen_test: If True, also generate a cc_test rule that builds a simple + test and benchmark. 
+ gen_benchmark: If True, also generate a binary with a simple benchmark. + Unlike the output of gen_test, this benchmark can be run on android. + visibility: Bazel build visibility. + testonly: Bazel testonly attribute. + tfcompile_flags: Extra flags to pass to tfcompile to control compilation. + tfcompile_tool: The tfcompile binary. A non-default can be passed to + use a tfcompile built with extra dependencies. + include_standard_runtime_deps: If True, the standard list of + kernel/runtime deps is added to deps. If False, deps must contain the + full set of deps needed by the generated library. + enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated + program, and emit metadata that lets us pretty-print the gathered + profile counters. + deps: a list of deps to include on the build rules for the generated + library, added to the standard deps if include_standard_runtime_deps is True. + tags: tags to apply to subsidiary build rules. + """ + if not cpp_class: + fail("cpp_class must be specified") - # Rule that runs tfcompile to produce the header and object file.
- header_file = name + ".h" - metadata_object_file = name + "_tfcompile_metadata.o" - function_object_file = name + "_tfcompile_function.o" - ep = ("__" + native.package_name() + "__" + name).replace("/", "_") - if type(tfcompile_flags) == type(""): - flags = tfcompile_flags - else: - flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])]) - if enable_xla_hlo_profiling: - profiling_flag = "--xla_hlo_profile" - else: - profiling_flag = "" - native.genrule( - name=("gen_" + name), - srcs=[ - tfcompile_graph, - config, - ], - outs=[ - header_file, - metadata_object_file, - function_object_file, - ], - cmd=("$(location " + tfcompile_tool + ")" + - " --graph=$(location " + tfcompile_graph + ")" + - " --config=$(location " + config + ")" + - " --entry_point=" + ep + - " --cpp_class=" + cpp_class + - " --target_triple=" + target_llvm_triple() + - " --out_header=$(@D)/" + header_file + - " --out_metadata_object=$(@D)/" + metadata_object_file + - " --out_function_object=$(@D)/" + function_object_file + - " " + flags + " " + profiling_flag), - tools=[tfcompile_tool], - visibility=visibility, - testonly=testonly, - # Run tfcompile on the build host since it's typically faster on the local - # machine. - # - # Note that setting the local=1 attribute on a *test target* causes the - # test infrastructure to skip that test. However this is a genrule, not a - # test target, and runs with --genrule_strategy=forced_forge, meaning the - # local=1 attribute is ignored, and the genrule is still run. - # - # https://www.bazel.io/versions/master/docs/be/general.html#genrule - local=1, - tags=tags, - ) + tfcompile_graph = graph + if freeze_checkpoint or freeze_saver: + if not freeze_checkpoint: + fail("freeze_checkpoint must be specified when freeze_saver is " + + "specified") - # Rule that runs tfcompile to produce the SessionModule proto, useful for - # debugging. 
TODO(b/64813587): Once the SessionModule proto is - # deterministic, move this into the main rule above. - session_module_pb = name + "_session_module.pb" - native.genrule( - name=(name + "_session_module"), - srcs=[ - tfcompile_graph, - config, - ], - outs=[ - session_module_pb, - ], - cmd=("$(location " + tfcompile_tool + ")" + - " --graph=$(location " + tfcompile_graph + ")" + - " --config=$(location " + config + ")" + - " --entry_point=" + ep + - " --cpp_class=" + cpp_class + - " --target_triple=" + target_llvm_triple() + - " --out_session_module=$(@D)/" + session_module_pb + - " " + flags), - tools=[tfcompile_tool], - visibility=visibility, - testonly=testonly, - local=1, - tags=tags, - ) + freeze_name = "freeze_" + name + freeze_file = freeze_name + ".pb" - # The cc_library rule packaging up the header and object file, and needed - # kernel implementations. - need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1) - native.cc_library( - name=name, - srcs=[function_object_file, metadata_object_file], - hdrs=[header_file], - visibility=visibility, - testonly=testonly, - deps = [ - # These deps are required by all tf_library targets even if - # include_standard_runtime_deps is False. Without them, the - # generated code will fail to compile. - "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", - "//tensorflow/core:framework_lite", - ] + (need_xla_data_proto and [ - # If we're generating the program shape, we must depend on the proto. - "//tensorflow/compiler/xla:xla_data_proto", - ] or []) + (enable_xla_hlo_profiling and [ - "//tensorflow/compiler/xla/service:hlo_profile_printer_data" - ] or []) + (include_standard_runtime_deps and [ - # TODO(cwhipkey): only depend on kernel code that the model actually needed. 
- "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", - "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", - "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", - "//tensorflow/compiler/xla/service/cpu:runtime_matmul", - "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", - "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", - "//third_party/eigen3", - ] or []) + (deps or []), - tags=tags, - ) + # First run tfcompile to generate the list of out_nodes. + out_nodes_file = "out_nodes_" + freeze_name + native.genrule( + name = ("gen_" + out_nodes_file), + srcs = [config], + outs = [out_nodes_file], + cmd = ("$(location " + tfcompile_tool + ")" + + " --config=$(location " + config + ")" + + " --dump_fetch_nodes > $@"), + tools = [tfcompile_tool], + # Run tfcompile on the build host, rather than forge, since it's + # typically way faster on the local machine. + local = 1, + tags = tags, + ) - # Variables used for gen_test and gen_benchmark. - no_ns_name = "" - cpp_class_split = cpp_class.rsplit("::", maxsplit=2) - if len(cpp_class_split) == 1: - no_ns_name = cpp_class_split[0] - else: - no_ns_name = cpp_class_split[1] - sed_replace = ( - "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " + - "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " + - "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" ") + # Now run freeze_graph to convert variables into constants. 
+ freeze_args = ( + " --input_graph=$(location " + graph + ")" + + " --checkpoint_version=1" + + " --input_binary=" + str(not graph.endswith(".pbtxt")) + + " --input_checkpoint=$(location " + freeze_checkpoint + ")" + + " --output_graph=$(location " + freeze_file + ")" + + " --output_node_names=$$(<$(location " + out_nodes_file + + "))" + ) + freeze_saver_srcs = [] + if freeze_saver: + freeze_args += " --input_saver=$(location " + freeze_saver + ")" + freeze_saver_srcs += [freeze_saver] + native.genrule( + name = freeze_name, + srcs = [ + graph, + freeze_checkpoint, + out_nodes_file, + ] + freeze_saver_srcs, + outs = [freeze_file], + cmd = ("$(location " + + "//tensorflow/python/tools:freeze_graph)" + + freeze_args), + tools = ["//tensorflow/python/tools:freeze_graph"], + tags = tags, + ) + tfcompile_graph = freeze_file - if gen_test: - test_name = name + "_test" - test_file = test_name + ".cc" - # Rule to rewrite test.cc to produce the test_file. + # Rule that runs tfcompile to produce the header and object file. + header_file = name + ".h" + metadata_object_file = name + "_tfcompile_metadata.o" + function_object_file = name + "_tfcompile_function.o" + ep = ("__" + native.package_name() + "__" + name).replace("/", "_") + if type(tfcompile_flags) == type(""): + flags = tfcompile_flags + else: + flags = " ".join([ + "'" + arg.replace("'", "'\\''") + "'" + for arg in (tfcompile_flags or []) + ]) + if enable_xla_hlo_profiling: + profiling_flag = "--xla_hlo_profile" + else: + profiling_flag = "" native.genrule( - name=("gen_" + test_name), - testonly=1, - srcs=[ - "//tensorflow/compiler/aot:test.cc", + name = ("gen_" + name), + srcs = [ + tfcompile_graph, + config, + ], + outs = [ header_file, + metadata_object_file, + function_object_file, ], - outs=[test_file], - cmd=("sed " + sed_replace + - " $(location //tensorflow/compiler/aot:test.cc) " + - "> $(OUTS)"), - tags=tags, - ) - - # The cc_test rule for the generated code. 
To ensure that this works - # reliably across build configurations, we must use tf_cc_test instead of - # native.cc_test. This is related to how we build - # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD - # for more details. - tf_cc_test( - name=test_name, - srcs=[test_file], - deps=[ - ":" + name, - "//tensorflow/compiler/aot:runtime", - "//tensorflow/compiler/aot:tf_library_test_main", - "//tensorflow/compiler/xla:executable_run_options", - "//third_party/eigen3", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], - tags=tags, + cmd = ("$(location " + tfcompile_tool + ")" + + " --graph=$(location " + tfcompile_graph + ")" + + " --config=$(location " + config + ")" + + " --entry_point=" + ep + + " --cpp_class=" + cpp_class + + " --target_triple=" + target_llvm_triple() + + " --out_header=$(@D)/" + header_file + + " --out_metadata_object=$(@D)/" + metadata_object_file + + " --out_function_object=$(@D)/" + function_object_file + + " " + flags + " " + profiling_flag), + tools = [tfcompile_tool], + visibility = visibility, + testonly = testonly, + # Run tfcompile on the build host since it's typically faster on the + # local machine. + # + # Note that setting the local=1 attribute on a *test target* causes the + # test infrastructure to skip that test. However this is a genrule, not + # a test target, and runs with --genrule_strategy=forced_forge, meaning + # the local=1 attribute is ignored, and the genrule is still run. + # + # https://www.bazel.io/versions/master/docs/be/general.html#genrule + local = 1, + tags = tags, ) - if gen_benchmark: - benchmark_name = name + "_benchmark" - benchmark_file = benchmark_name + ".cc" - benchmark_main = ("//tensorflow/compiler/aot:" + - "benchmark_main.template") - - # Rule to rewrite benchmark.cc to produce the benchmark_file. + # Rule that runs tfcompile to produce the SessionModule proto, useful for + # debugging. 
TODO(b/64813587): Once the SessionModule proto is + # deterministic, move this into the main rule above. + session_module_pb = name + "_session_module.pb" native.genrule( - name=("gen_" + benchmark_name), - srcs=[ - benchmark_main, - header_file, + name = (name + "_session_module"), + srcs = [ + tfcompile_graph, + config, ], + outs = [ + session_module_pb, + ], + cmd = ("$(location " + tfcompile_tool + ")" + + " --graph=$(location " + tfcompile_graph + ")" + + " --config=$(location " + config + ")" + + " --entry_point=" + ep + + " --cpp_class=" + cpp_class + + " --target_triple=" + target_llvm_triple() + + " --out_session_module=$(@D)/" + session_module_pb + + " " + flags), + tools = [tfcompile_tool], + visibility = visibility, testonly = testonly, - outs=[benchmark_file], - cmd=("sed " + sed_replace + - " $(location " + benchmark_main + ") " + - "> $(OUTS)"), - tags=tags, + local = 1, + tags = tags, ) - # The cc_benchmark rule for the generated code. This does not need the - # tf_cc_binary since we (by deliberate design) do not depend on - # //tensorflow/core:lib. - # - # Note: to get smaller size on android for comparison, compile with: - # --copt=-fvisibility=hidden - # --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN - # --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN - native.cc_binary( - name=benchmark_name, - srcs=[benchmark_file], + # The cc_library rule packaging up the header and object file, and needed + # kernel implementations. 
+ need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1) + native.cc_library( + name = name, + srcs = [function_object_file, metadata_object_file], + hdrs = [header_file], + visibility = visibility, testonly = testonly, - copts = tf_copts(), - linkopts = if_android(["-pie", "-s"]), - deps=[ - ":" + name, - "//tensorflow/compiler/aot:benchmark", - "//tensorflow/compiler/aot:runtime", - "//tensorflow/compiler/xla:executable_run_options", + deps = [ + # These deps are required by all tf_library targets even if + # include_standard_runtime_deps is False. Without them, the + # generated code will fail to compile. + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/core:framework_lite", + ] + (need_xla_data_proto and [ + # If we're generating the program shape, we must depend on the + # proto. + "//tensorflow/compiler/xla:xla_data_proto", + ] or []) + (enable_xla_hlo_profiling and [ + "//tensorflow/compiler/xla/service:hlo_profile_printer_data", + ] or []) + (include_standard_runtime_deps and [ + # TODO(cwhipkey): only depend on kernel code that the model actually + # needed. + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", + "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_matmul", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//third_party/eigen3", - ] + if_android([ - "//tensorflow/compiler/aot:benchmark_extra_android", - ]), - tags=tags, + ] or []) + (deps or []), + tags = tags, + ) + + # Variables used for gen_test and gen_benchmark. 
+ cpp_class_split = cpp_class.rsplit("::", maxsplit = 2) + if len(cpp_class_split) == 1: + no_ns_name = cpp_class_split[0] + else: + no_ns_name = cpp_class_split[1] + sed_replace = ( + "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " + + "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " + + "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" " ) + if gen_test: + test_name = name + "_test" + test_file = test_name + ".cc" + + # Rule to rewrite test.cc to produce the test_file. + native.genrule( + name = ("gen_" + test_name), + testonly = 1, + srcs = [ + "//tensorflow/compiler/aot:test.cc", + header_file, + ], + outs = [test_file], + cmd = ( + "sed " + sed_replace + + " $(location //tensorflow/compiler/aot:test.cc) " + + "> $(OUTS)" + ), + tags = tags, + ) + + # The cc_test rule for the generated code. To ensure that this works + # reliably across build configurations, we must use tf_cc_test instead + # of native.cc_test. This is related to how we build + # //tensorflow/core:lib -- see the note in + # tensorflow/core/BUILD for more details. + tf_cc_test( + name = test_name, + srcs = [test_file], + deps = [ + ":" + name, + "//tensorflow/compiler/aot:tf_library_test_main", + "//tensorflow/compiler/xla:executable_run_options", + "//third_party/eigen3", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], + tags = tags, + ) + + if gen_benchmark: + benchmark_name = name + "_benchmark" + benchmark_file = benchmark_name + ".cc" + benchmark_main = ("//tensorflow/compiler/aot:" + + "benchmark_main.template") + + # Rule to rewrite benchmark.cc to produce the benchmark_file. + native.genrule( + name = ("gen_" + benchmark_name), + srcs = [ + benchmark_main, + header_file, + ], + testonly = testonly, + outs = [benchmark_file], + cmd = ("sed " + sed_replace + + " $(location " + benchmark_main + ") " + + "> $(OUTS)"), + tags = tags, + ) + + # The cc_benchmark rule for the generated code. 
This does not need the + # tf_cc_binary since we (by deliberate design) do not depend on + # //tensorflow/core:lib. + # + # Note: to get smaller size on android for comparison, compile with: + # --copt=-fvisibility=hidden + # --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN + # --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN + native.cc_binary( + name = benchmark_name, + srcs = [benchmark_file], + testonly = testonly, + copts = tf_copts(), + linkopts = if_android(["-pie", "-s"]), + deps = [ + ":" + name, + "//tensorflow/compiler/aot:benchmark", + "//tensorflow/compiler/xla:executable_run_options", + "//third_party/eigen3", + ] + if_android([ + "//tensorflow/compiler/aot:benchmark_extra_android", + ]), + tags = tags, + ) + def target_llvm_triple(): - """Returns the target LLVM triple to be used for compiling the target.""" - # TODO(toddw): Add target_triple for other targets. For details see: - # http://llvm.org/docs/doxygen/html/Triple_8h_source.html - return select({ - "//tensorflow:android_armeabi": "armv5-none-android", - "//tensorflow:android_arm": "armv7-none-android", - "//tensorflow:android_arm64": "aarch64-none-android", - "//tensorflow:android_x86": "i686-none-android", - "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", - "//tensorflow:darwin": "x86_64-none-darwin", - "//conditions:default": "x86_64-pc-linux", - }) + """Returns the target LLVM triple to be used for compiling the target.""" + + # TODO(toddw): Add target_triple for other targets. 
For details see: + # http://llvm.org/docs/doxygen/html/Triple_8h_source.html + return select({ + "//tensorflow:android_armeabi": "armv5-none-android", + "//tensorflow:android_arm": "armv7-none-android", + "//tensorflow:android_arm64": "aarch64-none-android", + "//tensorflow:android_x86": "i686-none-android", + "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", + "//tensorflow:darwin": "x86_64-none-darwin", + "//conditions:default": "x86_64-pc-linux", + }) diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index 839e1588b7be6c91cf30c87bbaf75402446bd169..f3c44e9dda8ce96a268420a7f4d0f22e50ddfe41 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "absl/strings/match.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/aot/codegen.h" #include "tensorflow/compiler/aot/compile.h" #include "tensorflow/compiler/aot/flags.h" @@ -34,7 +36,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -55,7 +56,7 @@ const char kUsageHeader[] = "\n"; Status ReadProtoFile(const string& fname, protobuf::Message* proto) { - if (str_util::EndsWith(fname, ".pbtxt")) { + if (absl::EndsWith(fname, ".pbtxt")) { return ReadTextProto(Env::Default(), fname, proto); } else { return ReadBinaryProto(Env::Default(), fname, proto); @@ -75,7 +76,7 @@ Status Main(const MainFlags& flags) { for (const tf2xla::Fetch& fetch : config.fetch()) { nodes.insert(fetch.id().node_name()); } - std::cout << str_util::Join(nodes, ","); + std::cout << absl::StrJoin(nodes, ","); return Status::OK(); } diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 6d6c030a26fd2edc99a429a301acb00e77116307..df81f3c23e38a2ec2cea827cd0adb123855e7714 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -25,6 +25,7 @@ load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") # Target that bundles up the XLA CPU and GPU JIT devices. 
cc_library( @@ -127,11 +128,11 @@ cc_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", ], ) @@ -159,12 +160,14 @@ cc_library( "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", + "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -175,12 +178,20 @@ cc_library( "//tensorflow/core/kernels:cast_op", "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:fifo_queue", + "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:no_op", + "//tensorflow/core/kernels:queue_op", "//tensorflow/core/kernels:resource_variable_ops", "//tensorflow/core/kernels:sendrecv_ops", + "//tensorflow/core/kernels:shape_ops", "//tensorflow/core/kernels:variable_ops", + "//tensorflow/core/kernels/data:generator_dataset_op", + "//tensorflow/core/kernels/data:iterator_ops", + "//tensorflow/core/kernels/data:prefetch_dataset_op", + "@com_google_absl//absl/memory", ], ) @@ -225,6 +236,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", + "@com_google_absl//absl/memory", ], ) @@ -273,6 +285,7 @@ 
cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/memory", ], alwayslink = 1, ) @@ -293,6 +306,52 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "@com_google_absl//absl/memory", + ], +) + +cc_library( + name = "resource_operation_safety_analysis", + srcs = ["resource_operation_safety_analysis.cc"], + hdrs = ["resource_operation_safety_analysis.h"], + deps = [ + "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/compiler/tf2xla:resource_operation_table", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "resource_operation_safety_analysis_test", + srcs = ["resource_operation_safety_analysis_test.cc"], + deps = [ + ":common", + ":resource_operation_safety_analysis", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/compiler/jit/kernels:xla_launch_op", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "@com_google_absl//absl/strings", ], ) @@ -300,27 +359,34 @@ cc_library( name = "compilation_passes", srcs = [ "build_xla_launch_ops_pass.cc", + "deadness_analysis.cc", + "deadness_analysis_internal.h", "encapsulate_subgraphs_pass.cc", "mark_for_compilation_pass.cc", + "mark_for_compilation_pass_test_helper.cc", + 
"partially_decluster_pass.cc", ], hdrs = [ "build_xla_launch_ops_pass.h", + "deadness_analysis.h", "encapsulate_subgraphs_pass.h", "mark_for_compilation_pass.h", + "mark_for_compilation_pass_test_helper.h", + "partially_decluster_pass.h", ], deps = [ ":common", ":shape_inference_helpers", ":union_find", + ":xla_cluster_util", "//tensorflow/compiler/jit/graphcycles", - "//tensorflow/compiler/jit/kernels:parallel_check_op", - "//tensorflow/compiler/jit/legacy_flags:encapsulate_subgraphs_pass_flags", "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", - "//tensorflow/compiler/jit/ops:parallel_check_op", "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:dump_graph", + "//tensorflow/compiler/tf2xla:resource_operation_table", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -329,6 +395,22 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:bounds_check", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "xla_cluster_util", + srcs = ["xla_cluster_util.cc"], + hdrs = ["xla_cluster_util.h"], + deps = [ + ":resource_operation_safety_analysis", + "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/kernels:bounds_check", + "@com_google_absl//absl/types:optional", ], ) @@ -356,16 +438,76 @@ tf_cc_test( ], ) +tf_cc_test( + name = "deadness_analysis_test", + size = "small", + srcs = [ + "deadness_analysis_internal.h", + "deadness_analysis_test.cc", + ], + deps = [ + ":common", + ":compilation_passes", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:sendrecv_ops", + 
"//tensorflow/compiler/jit/kernels:xla_launch_op", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_cc_test( name = "compilation_passes_test", size = "small", srcs = [ "encapsulate_subgraphs_pass_test.cc", "mark_for_compilation_pass_test.cc", + "partially_decluster_pass_test.cc", ], deps = [ ":common", ":compilation_passes", + ":xla_cluster_util", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/compiler/jit/kernels:xla_launch_op", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "xla_cluster_util_test", + size = "small", + srcs = [ + "xla_cluster_util_test.cc", + ], + deps = [ + ":common", + ":xla_cluster_util", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:function_ops", @@ -408,6 +550,42 @@ tf_cc_test( ], ) +cc_library( + name = "xla_fusion_optimizer", + srcs = ["xla_fusion_optimizer.cc"], + hdrs = ["xla_fusion_optimizer.h"], + visibility = ["//visibility:public"], + deps = [ + ":common", + ":compilation_passes", + ":union_find", + ":xla_cluster_util", + "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + 
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", + ], +) + +tf_cuda_cc_test( + name = "xla_fusion_optimizer_test", + srcs = ["xla_fusion_optimizer_test.cc"], + deps = [ + ":common", + ":xla_cluster_util", + ":xla_fusion_optimizer", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/core:graph", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler/utils:grappler_test", + ], +) + # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. cc_header_only_library( name = "xla_jit_headers_lib", diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc index 731b8ebfdc6262500940274c94a03ae7c0376096..56b034a30b7bddb023e54ead22c91a7a18095d2d 100644 --- a/tensorflow/compiler/jit/create_xla_launch_op.cc +++ b/tensorflow/compiler/jit/create_xla_launch_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/create_xla_launch_op.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/kernels/xla_launch_op.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" @@ -66,8 +67,28 @@ class SinglePassSearch { Status CompilationRequested(const FunctionLibraryRuntime& flr, const NodeDef& node_def) { + const FunctionDef* function_def = + flr.GetFunctionLibraryDefinition()->Find(node_def.name()); + if (function_def == nullptr) { + // The node def is not calling a function. Individual ops can be + // run directly using on-demand mode, no need to create XlaLaunch + // kernel for them. + // TODO(b/110359382): Make custom kernel creation return a bool instead of + // status. 
+ // We don't set error messages here to avoid unnecessary string copy. + // Similarly below. + return Status(error::INVALID_ARGUMENT, ""); + } + + // If kXlaCompileAttr is set on the node_def, use its value. + const auto& it = node_def.attr().find(kXlaCompileAttr); + if (it != node_def.attr().end()) { + return it->second.b() ? Status::OK() : Status(error::INVALID_ARGUMENT, ""); + } + + // kXlaCompileAttr is not set on node_def, check if it is set on + // FunctionDef. bool xla_compile = false; - // Check if op is marked _XlaCompile=true. Status status = flr.GetFunctionLibraryDefinition()->GetAttr( node_def, kXlaCompileAttr, &xla_compile); if (!status.ok() || !xla_compile) { @@ -105,7 +126,8 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, const DataTypeVector& arg_types = (*fbody)->arg_types; std::vector const_args(arg_types.size()); // If we can't analyze the const args. Bail out. - TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*((*fbody)->graph), &const_args)); + TF_RETURN_IF_ERROR(BackwardsConstAnalysis( + *((*fbody)->graph), &const_args, /*compile_time_const_nodes=*/nullptr)); for (int i = 0; i < const_args.size(); ++i) { if (const_args[i]) { @@ -187,8 +209,13 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def, // device memory. // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory + // in device memory except for resources. MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); + for (int i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == DT_RESOURCE) { + output_memory_types[i] = HOST_MEMORY; + } + } // Create the kernel. 
NameAttrList function; @@ -203,8 +230,8 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def, &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types, fbody->ret_types, output_memory_types, flr->graph_def_version(), &s); - *kernel = MakeUnique(&construction, constant_arg_indices, - resource_arg_indices, function); + *kernel = absl::make_unique( + &construction, constant_arg_indices, resource_arg_indices, function); return s; } diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc index b75ab486b80e098bc0a59f9ea8cdbaa23a28fef9..73866607621cd745f6e640a14405daebf0dd9985 100644 --- a/tensorflow/compiler/jit/create_xla_launch_op_test.cc +++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/jit/create_xla_launch_op.h" +#include "absl/memory/memory.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function_testlib.h" @@ -65,11 +66,11 @@ class CreateXlaLaunchOpTest : public ::testing::Test { for (const auto& fdef : flib) { *(proto.add_function()) = fdef; } - lib_def_ = - MakeUnique(OpRegistry::Global(), proto); + lib_def_ = absl::make_unique( + OpRegistry::Global(), proto); OptimizerOptions opts; - device_mgr_ = MakeUnique(devices_); - pflr_ = MakeUnique( + device_mgr_ = absl::make_unique(devices_); + pflr_ = absl::make_unique( device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(), opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr); flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc new file mode 100644 index 0000000000000000000000000000000000000000..82aa03810bc0ecee8ae92ed6f286867eea893287 --- /dev/null +++ 
b/tensorflow/compiler/jit/deadness_analysis.cc @@ -0,0 +1,944 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/deadness_analysis.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/jit/deadness_analysis_internal.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/hash/hash.h" + +// ALGORITHM OVERVIEW +// ================== +// +// We map every output produced by each node in the TensorFlow graph (including +// control dependence) into an instance of the Predicate class. Instances of +// Predicate denote logical formulas and mapping a node `n` to a predicate +// `pred` implies that `n` is live whenever `pred` is true. Then we can deduce +// mismatching liveness in the inputs to node by comparing the predicate those +// inputs are mapped to. The core logic of this pass resides in creating the +// map from TensorFlow nodes to predicates. +// +// +// MAPPING NODES TO PREDICATES, MODULO CYCLES +// ------------------------------------------ +// +// If we ignore cycles for a moment, computing predicates is fairly +// straightforward. We traverse the graph in RPO, mapping each node to a +// predicate based on the predicates its inputs are mapped to. 
For instance a +// Merge(X, Y) node will be mapped to OR(PredicateFor(X), PredicateFor(Y)). +// Roughly speaking, we abstract interpret each node on the "liveness" domain, +// where values in the domain represent if a tensor carries a dead signal or +// not. +// +// +// DEALING WITH CYCLES +// ------------------- +// +// We map Merge nodes that are the target of a backedge to AndRecurrence +// instances. An AndRecurrence with start() = S and step() = X, printed as +// {S,&,X}, *roughly* represents the infinite list of predicates +// [S,S&X,S&X&X,S&X&X&X, ...]. So {S,&,X} can be used to represent the predicate +// for Merge in a graph like: +// +// Init +// | +// v +// Merge <-----------+ +// | | +// v | +// Incr | +// | | +// v | +// Switch <- Cond | +// | | +// v (oidx: 1) | +// | | +// +---------------+ +// +// Where S is the predicate for Init and X is the predicate that asserts that +// Cond is true. {S,&,X} states that Merge is live on the first "iteration" iff +// S is true, live on the second iteration iff "S&X" is true, live on the third +// iteration iff "S&X&X" is true etc. There is a subtlety here, S&X&X would +// normally be equivalent to S&X which isn't quite what we want to represent. +// Instead we want {S,&,X} to denote the infinite list [S, S&X, +// S&X&X',S&X&X'&X'', ...] where X, X', X'' are predicates that assert Cond is +// true on iteration 0, 1, 2 respectively. This is made more precise in the +// comment on the AndRecurrence class. +// +// The general algorithm that deals with cycles does two RPO (reverse post +// order) passes over the graph. On the first pass it assigns a symbolic +// predicate to merge nodes with backedges. On the second pass it tries to +// pattern match the predicates for the backedges of these merges and infer an +// AndRecurrence for the merge. +// +// In other words, we do a pessimistic data flow analysis where the data-flow +// lattice has two elements, Symbolic and NonSymbolic with Symbolic > +// NonSymbolic.
The lattice has height = 2 so two iterations are sufficient to +// converge. We don't do an optimistic data flow analysis to make pattern +// matching easier: if we assigned the predicate of the initial value to the +// merge during the first pass, on the second pass the backedge may see a +// simplified value that would be difficult to pattern match. +// +// We still use symbolic predicates for merges for which we can't pattern match +// on the backedge predicate. This is conservatively correct. + +namespace tensorflow { + +namespace { + +// Represents a logical predicate, used as described in the algorithm overview +// above. +class Predicate { + public: + enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol }; + + virtual string ToString() const = 0; + int64 hash() const { return hash_; } + virtual absl::Span GetOperands() const = 0; + + virtual Kind kind() const = 0; + virtual ~Predicate() {} + + // Invokes func on p and on all of its operands recursively. Does not invoke + // `func` on the same Predicate instance twice. Aborts the search if `func` + // returns true. + template + static void Visit(Predicate* p, const FunctionTy& func); + + protected: + explicit Predicate(int64 hash) : hash_(hash) {} + + private: + const int64 hash_; + + TF_DISALLOW_COPY_AND_ASSIGN(Predicate); +}; + +int64 HashPredicateSequence(Predicate::Kind kind, + absl::Span preds) { + int64 hash = ::tensorflow::hash()(kind); + for (Predicate* pred : preds) { + hash = Hash64Combine(hash, pred->hash()); + } + return hash; +} + +// Represents a logical conjunction of a set of predicates. 
+class AndPredicate : public Predicate { + public: + explicit AndPredicate(std::vector operands) + : Predicate(HashPredicateSequence(Kind::kAnd, operands)), + operands_(std::move(operands)) {} + + string ToString() const override { + if (operands().empty()) { + return "#true"; + } + + std::vector operands_str; + std::transform(operands().begin(), operands().end(), + std::back_inserter(operands_str), + [](Predicate* pred) { return pred->ToString(); }); + + return strings::StrCat("(", absl::StrJoin(operands_str, " & "), ")"); + } + + Kind kind() const override { return Kind::kAnd; } + + absl::Span GetOperands() const override { + return operands_; + } + absl::Span operands() const { return operands_; } + + private: + std::vector operands_; +}; + +// Represents a logical disjunction of a set of predicates. +class OrPredicate : public Predicate { + public: + explicit OrPredicate(std::vector operands) + : Predicate(HashPredicateSequence(Kind::kOr, operands)), + operands_(std::move(operands)) {} + + string ToString() const override { + if (operands().empty()) { + return "#false"; + } + + std::vector operands_str; + std::transform(operands().begin(), operands().end(), + std::back_inserter(operands_str), + [](Predicate* pred) { return pred->ToString(); }); + + return strings::StrCat("(", absl::StrJoin(operands_str, " | "), ")"); + } + + Kind kind() const override { return Kind::kOr; } + absl::Span GetOperands() const override { + return operands_; + } + absl::Span operands() const { return operands_; } + + private: + std::vector operands_; +}; + +// Represents a logical negation of a set of predicates. 
+class NotPredicate : public Predicate { + public: + explicit NotPredicate(Predicate* operand) + : Predicate(HashPredicateSequence(Kind::kNot, {operand})), + operands_({operand}) {} + + string ToString() const override { + return strings::StrCat("~", operand()->ToString()); + } + + Kind kind() const override { return Kind::kNot; } + Predicate* operand() const { return operands_[0]; } + absl::Span GetOperands() const override { + return operands_; + } + + private: + std::array operands_; +}; + +// Represents an infinite list of predicates. +// +// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands +// for the list of predicates: +// +// S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ... +// +// where GenSym(, ) renames every SymbolPredicate in +// by appending to it, in effect creating a "fresh" symbol. +// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on +// subsequent iterations". +class AndRecurrencePredicate : public Predicate { + public: + explicit AndRecurrencePredicate(Predicate* start, Predicate* step) + : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})), + operands_({start, step}) {} + + Predicate* start() const { return operands_[0]; } + Predicate* step() const { return operands_[1]; } + + string ToString() const override { + return strings::StrCat("{", start()->ToString(), ",&,", step()->ToString(), + "}"); + } + + Kind kind() const override { return Kind::kAndRecurrence; } + + absl::Span GetOperands() const override { + return operands_; + } + + private: + std::array operands_; +}; + +// Represents an uninterpreted symbol in a logical predicate. +// +// Two predicates are equivalent iff they are equivalent for all assignments to +// the symbols contained in them, i.e. predicates are forall qualified over +// symbols. 
+class SymbolPredicate : public Predicate { + public: + explicit SymbolPredicate(TensorId tensor_id, bool must_be_true) + : Predicate(Hash(tensor_id, must_be_true)), + tensor_id_(std::move(tensor_id)), + must_be_true_(must_be_true) {} + + string ToString() const override { + return must_be_true() ? strings::StrCat("*", tensor_id_.ToString()) + : tensor_id_.ToString(); + } + + Kind kind() const override { return Kind::kSymbol; } + absl::Span GetOperands() const override { return {}; } + + // If `must_be_true()` is true this SymbolPredicate represents the proposition + // "tensor_id() is live and evaluates to true". + // + // If `must_be_true()` is false then this SymbolPredicate represents the + // proposition "tensor_id() is live (and may evaluate to any value)" + TensorId tensor_id() const { return tensor_id_; } + bool must_be_true() const { return must_be_true_; } + + private: + TensorId tensor_id_; + bool must_be_true_; + + static int64 Hash(const TensorId tensor_id, bool must_be_true) { + return Hash64Combine( + ::tensorflow::hash()(must_be_true), + Hash64Combine(::tensorflow::hash()(Kind::kSymbol), + TensorId::Hasher{}(tensor_id))); + } +}; + +template +/*static*/ void Predicate::Visit(Predicate* p, const FunctionTy& func) { + gtl::FlatSet visited; + std::vector stack; + + stack.push_back(p); + visited.insert(p); + + while (!stack.empty()) { + Predicate* current = stack.back(); + stack.pop_back(); + bool done = func(current); + if (done) { + return; + } + for (Predicate* op : current->GetOperands()) { + if (visited.insert(op).second) { + stack.push_back(op); + } + } + } +} + +// Creates and owns Predicate instances. Simplifies predicates as it creates +// them.
+class PredicateFactory { + public: + Predicate* MakeAndPredicate(absl::Span operands) { + return MakeAndOrImpl(operands, /*is_and=*/true); + } + + Predicate* MakeOrPredicate(absl::Span operands) { + return MakeAndOrImpl(operands, /*is_and=*/false); + } + + Predicate* MakeNotPredicate(Predicate* pred) { + SignatureForNot signature = pred; + auto it = interned_not_instances_.find(signature); + if (it == interned_not_instances_.end()) { + std::unique_ptr new_pred = Make(pred); + Predicate* new_pred_ptr = new_pred.get(); + interned_not_instances_.emplace(signature, std::move(new_pred)); + return new_pred_ptr; + } else { + return it->second.get(); + } + } + + Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) { + auto it = interned_and_rec_instances_.find({start, step}); + if (it != interned_and_rec_instances_.end()) { + return it->second.get(); + } + + std::unique_ptr new_pred = + Make(start, step); + Predicate* new_pred_ptr = new_pred.get(); + CHECK(interned_and_rec_instances_ + .emplace(SignatureForAndRec(start, step), std::move(new_pred)) + .second); + return new_pred_ptr; + } + + Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) { + SignatureForSymbol signature = {tensor_id, must_be_true}; + auto it = interned_symbol_instances_.find(signature); + if (it == interned_symbol_instances_.end()) { + std::unique_ptr new_pred = + Make(tensor_id, must_be_true); + Predicate* new_pred_ptr = new_pred.get(); + interned_symbol_instances_.emplace(std::move(signature), + std::move(new_pred)); + return new_pred_ptr; + } else { + return it->second.get(); + } + } + + Predicate* MakeTrue() { return MakeAndPredicate({}); } + Predicate* MakeFalse() { return MakeOrPredicate({}); } + + private: + template + std::unique_ptr Make(Args&&... 
args) { + return std::unique_ptr( + new PredicateT(std::forward(args)...)); + } + + Predicate* MakeAndOrImpl(absl::Span operands, bool is_and); + + // Predicate instances are interned, meaning that there is only a single + // instance of a Predicate object with a given content. This makes checking + // for structural equality super-cheap -- we can just compare pointers. + // + // We intern predicates by maintaining a map from the content of a Predicate + // to the only instance of said predicate we allow to exist in the + // interned_and_or_instances_, interned_not_instances_ and + // interned_symbol_instances_ fields. These maps also double up as storage + // for the owning pointers to predicate instances. + + using SignatureForAndOr = + std::pair>; + using SignatureForNot = Predicate*; + using SignatureForAndRec = std::pair; + using SignatureForSymbol = std::pair; + + struct HashSignatureForAndOr { + size_t operator()(const SignatureForAndOr& signature) const { + size_t hash = ::tensorflow::hash()(signature.first); + for (Predicate* p : signature.second) { + hash = Hash64Combine(hash, ::tensorflow::hash()(p)); + } + return hash; + } + }; + + struct HashSignatureForSymbol { + size_t operator()(const SignatureForSymbol& signature) const { + return Hash64Combine(SafeTensorId::Hasher()(signature.first), + ::tensorflow::hash()(signature.second)); + } + }; + + gtl::FlatMap, + HashSignatureForAndOr> + interned_and_or_instances_; + gtl::FlatMap> + interned_not_instances_; + gtl::FlatMap> + interned_and_rec_instances_; + gtl::FlatMap, + HashSignatureForSymbol> + interned_symbol_instances_; +}; + +// Common code to create AndPredicate or OrPredicate instances. +Predicate* PredicateFactory::MakeAndOrImpl( + absl::Span operands, bool is_and) { + Predicate::Kind pred_kind = + is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr; + gtl::FlatSet simplified_ops_set; + std::vector simplified_ops; + for (Predicate* op : operands) { + // Simplify A&A => A and A|A => A. 
+ if (!simplified_ops_set.insert(op).second) { + continue; + } + + if (op->kind() == pred_kind) { + // "Inline" the operands of an inner And/Or into the parent And/Or. + for (Predicate* subop : op->GetOperands()) { + if (simplified_ops_set.insert(subop).second) { + simplified_ops.push_back(subop); + } + } + } else { + simplified_ops.push_back(op); + } + } + + if (simplified_ops.size() == 1) { + return simplified_ops[0]; + } + + // Simplify "A&~A=>False" and "A|~A=>True". + gtl::FlatSet negated_ops; + for (Predicate* op : simplified_ops) { + if (op->kind() == Predicate::Kind::kNot) { + negated_ops.insert(dynamic_cast(*op).operand()); + } + } + + for (Predicate* op : simplified_ops) { + if (negated_ops.count(op)) { + return is_and ? MakeFalse() : MakeTrue(); + } + } + + std::stable_sort( + simplified_ops.begin(), simplified_ops.end(), + [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); }); + + auto it = interned_and_or_instances_.find({pred_kind, simplified_ops}); + if (it == interned_and_or_instances_.end()) { + simplified_ops.shrink_to_fit(); + // NB! Because we'll use a non-owning reference to simplified_ops in the + // key for interned_and_or_instances_ we need to be careful to std::move() + // it all the way through. + absl::Span operands_slice = simplified_ops; + std::unique_ptr new_pred = + is_and ? 
Make(std::move(simplified_ops)) + : Make(std::move(simplified_ops)); + + Predicate* new_pred_ptr = new_pred.get(); + CHECK(interned_and_or_instances_ + .emplace(SignatureForAndOr(pred_kind, operands_slice), + std::move(new_pred)) + .second); + return new_pred_ptr; + } else { + return it->second.get(); + } +} + +class DeadnessAnalysisImpl : public DeadnessAnalysis { + public: + explicit DeadnessAnalysisImpl(const Graph* graph) + : graph_(*graph), vlog_(VLOG_IS_ON(2)) {} + + Status Populate(); + Status PopulateWithReversePostOrder(absl::Span rpo); + bool HasInputsWithMismatchingDeadness(const Node& node) override; + void Print() const override; + gtl::FlatMap PredicateMapAsString() const; + + private: + enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly }; + + std::vector GetIncomingPreds(Node* n, EdgeKind edge_kind); + + // Sets the predicate for output `output_idx` of `n` to `pred`. Sets the i'th + // bit of `should_revisit` if `pred` is different from the current predicate + // for the `output_idx` output of `n`. 
+ void SetPredicate(Node* n, int output_idx, Predicate* pred, + std::vector* should_revisit) { + auto insert_result = + predicate_map_.insert({TensorId(n->name(), output_idx), pred}); + if (!insert_result.second && insert_result.first->second != pred) { + VLOG(4) << "For " << n->name() << ":" << output_idx << " from " + << insert_result.first->second->ToString() << " " + << insert_result.first->second << " to " << pred->ToString() + << " " << pred; + insert_result.first->second = pred; + if (should_revisit != nullptr) { + for (const Edge* e : n->out_edges()) { + (*should_revisit)[e->dst()->id()] = true; + } + } + } + } + + void SetPredicate(Node* n, absl::Span output_idxs, Predicate* pred, + std::vector* should_revisit) { + for (int output_idx : output_idxs) { + SetPredicate(n, output_idx, pred, should_revisit); + } + } + + Status HandleSwitch(Node* n, std::vector* should_revisit); + Status HandleMerge(Node* n, std::vector* should_revisit); + Status HandleRecv(Node* n, std::vector* should_revisit); + Status HandleGeneric(Node* n, std::vector* should_revisit); + Status HandleNode(Node* n, std::vector* should_revisit); + + const Graph& graph_; + gtl::FlatMap predicate_map_; + PredicateFactory predicate_factory_; + bool vlog_; +}; + +TensorId InputEdgeToTensorId(const Edge* e) { + return TensorId(e->src()->name(), e->src_output()); +} + +std::vector DeadnessAnalysisImpl::GetIncomingPreds( + Node* n, DeadnessAnalysisImpl::EdgeKind edge_kind) { + std::vector incoming_preds; + for (const Edge* in_edge : n->in_edges()) { + bool should_process = + edge_kind == EdgeKind::kDataAndControl || + (in_edge->IsControlEdge() && edge_kind == EdgeKind::kControlOnly) || + (!in_edge->IsControlEdge() && edge_kind == EdgeKind::kDataOnly); + + if (should_process) { + auto it = predicate_map_.find(InputEdgeToTensorId(in_edge)); + CHECK(it != predicate_map_.end()) << n->name(); + incoming_preds.push_back(it->second); + } + } + return incoming_preds; +} + +Status 
DeadnessAnalysisImpl::HandleSwitch(Node* n, + std::vector* should_revisit) { + std::vector input_preds = + GetIncomingPreds(n, EdgeKind::kDataAndControl); + const Edge* pred_edge; + TF_RETURN_IF_ERROR(n->input_edge(1, &pred_edge)); + Predicate* true_switch = predicate_factory_.MakeSymbolPredicate( + TensorId(pred_edge->src()->name(), pred_edge->src_output()), + /*must_be_true=*/true); + Predicate* false_switch = predicate_factory_.MakeNotPredicate(true_switch); + + // Output 0 is alive iff all inputs are alive and the condition is false. + input_preds.push_back(false_switch); + SetPredicate(n, 0, predicate_factory_.MakeAndPredicate(input_preds), + should_revisit); + input_preds.pop_back(); + + // Output 1 is alive iff all inputs are alive and the condition is true. + input_preds.push_back(true_switch); + SetPredicate(n, 1, predicate_factory_.MakeAndPredicate(input_preds), + should_revisit); + input_preds.pop_back(); + + // Control is alive iff all inputs are alive. + SetPredicate(n, Graph::kControlSlot, + predicate_factory_.MakeAndPredicate(input_preds), + should_revisit); + + return Status::OK(); +} + +namespace { +const Edge* FindUniqueBackedge(Node* merge) { + CHECK(merge->IsMerge()); + const Edge* result = nullptr; + for (const Edge* e : merge->in_edges()) { + if (e->src()->IsNextIteration()) { + CHECK_EQ(result, nullptr) + << "Multiple backedges to " << merge->DebugString(); + result = e; + } + } + return result; +} + +// If `backedge_predicate` is equal to `symbolic_predicate` & Step where Step +// does not contain `symbolic_predicate` as an inner (not top-level) operand +// then returns `Step`. Otherwise returns nullptr. 
+Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory, + Predicate* symbolic_predicate, + Predicate* backedge_predicate) { + CHECK(dynamic_cast(symbolic_predicate)); + if (backedge_predicate->kind() != Predicate::Kind::kAnd) { + return nullptr; + } + + std::vector and_ops; + absl::Span recurrent_pred_ops = + backedge_predicate->GetOperands(); + + bool found_sym = false; + for (Predicate* and_op : recurrent_pred_ops) { + // We want the `symbolic_predicate` to be one of the operands of + // `backedge_predicate`, + if (and_op == symbolic_predicate) { + found_sym = true; + continue; + } + + // but we don't want it to be present anywhere else in the formula. E.g. we + // don't want the recurrent predicate to be + // symbolic_predicate&(X|symbolic_predicate). + bool found_sym_as_inner_operand = false; + auto has_self_as_inner_operand = [&](Predicate* p) { + if (p == symbolic_predicate) { + found_sym_as_inner_operand = true; + return true; // Stop searching, we're done. + } + + // Continue searching. + return false; + }; + + Predicate::Visit(and_op, has_self_as_inner_operand); + if (found_sym_as_inner_operand) { + return nullptr; + } + and_ops.push_back(and_op); + } + + return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr; +} +} // namespace + +Status DeadnessAnalysisImpl::HandleMerge(Node* n, + std::vector* should_revisit) { + // Merge ignores deadness of its control inputs. A merge that isn't the + // target of a backedge is alive iff any of its data inputs are. The + // liveness of a merge that is the target of a backedge can sometimes be + // represented using an AndRecurrencePredicate. If neither apply, we represent + // the liveness of the merge symbolically.
+ + bool has_unvisited_backedge = false; + for (const Edge* e : n->in_edges()) { + if (!e->IsControlEdge() && e->src()->IsNextIteration()) { + has_unvisited_backedge |= !predicate_map_.count(InputEdgeToTensorId(e)); + } + } + + auto it = predicate_map_.find(TensorId(n->name(), 0)); + if (it == predicate_map_.end()) { + if (has_unvisited_backedge) { + // We're visiting this merge for the first time and it has an unvisited + // backedge. + Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate( + TensorId(n->name(), 0), /*must_be_true=*/false); + SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred, + should_revisit); + return Status::OK(); + } + + // We're visiting this merge for the first time and it is a acyclic merge. + Predicate* input_data_pred = predicate_factory_.MakeOrPredicate( + GetIncomingPreds(n, EdgeKind::kDataOnly)); + SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred, + should_revisit); + return Status::OK(); + } + + if (it->second->kind() == Predicate::Kind::kSymbol) { + // Last time we visited this merge we only got a symbolic predicate because + // of an unvisited backedge. Try to pattern match the predicate expression + // for that backedge (which should be visited now) into an and recurrence + // for the merge node. + if (const Edge* unique_backedge = FindUniqueBackedge(n)) { + if (Predicate* step = DeduceStepPredicate( + &predicate_factory_, it->second, + predicate_map_[InputEdgeToTensorId(unique_backedge)])) { + // If the predicate for the backedge is "Sym&X" where "Sym" is the + // predicate for the merge then the merge has predicate {S,&,X} where S + // is the predicate for the merge ignoring the backedge. 
+ std::vector non_recurrent_inputs; + for (const Edge* e : n->in_edges()) { + if (e != unique_backedge) { + non_recurrent_inputs.push_back( + predicate_map_[InputEdgeToTensorId(e)]); + } + } + + Predicate* start = + predicate_factory_.MakeOrPredicate(non_recurrent_inputs); + Predicate* and_rec = + predicate_factory_.MakeAndRecurrencePredicate(start, step); + SetPredicate(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit); + return Status::OK(); + } + } + } + return Status::OK(); +} + +Status DeadnessAnalysisImpl::HandleRecv(Node* n, + std::vector* should_revisit) { + // In addition to being alive or dead based on the inputs, a _Recv can also + // acquire a dead signal from a _Send. + std::vector input_preds = + GetIncomingPreds(n, EdgeKind::kDataAndControl); + input_preds.push_back(predicate_factory_.MakeSymbolPredicate( + TensorId(n->name(), 0), /*must_be_true=*/false)); + SetPredicate(n, {0, Graph::kControlSlot}, + predicate_factory_.MakeAndPredicate(input_preds), + should_revisit); + return Status::OK(); +} + +Status DeadnessAnalysisImpl::HandleGeneric(Node* n, + std::vector* should_revisit) { + // Generally nodes are alive iff all their inputs are alive. 
+ Predicate* pred = predicate_factory_.MakeAndPredicate( + GetIncomingPreds(n, EdgeKind::kDataAndControl)); + for (int output_idx = 0; output_idx < n->num_outputs(); output_idx++) { + SetPredicate(n, output_idx, pred, should_revisit); + } + SetPredicate(n, Graph::kControlSlot, pred, should_revisit); + return Status::OK(); +} + +Status DeadnessAnalysisImpl::HandleNode(Node* n, + std::vector* should_revisit) { + if (n->IsSwitch()) { + TF_RETURN_IF_ERROR(HandleSwitch(n, should_revisit)); + } else if (n->IsMerge()) { + TF_RETURN_IF_ERROR(HandleMerge(n, should_revisit)); + } else if (n->IsControlTrigger()) { + SetPredicate(n, Graph::kControlSlot, predicate_factory_.MakeTrue(), + nullptr); + } else if (n->IsRecv() || n->IsHostRecv()) { + TF_RETURN_IF_ERROR(HandleRecv(n, should_revisit)); + } else if (n->IsNextIteration()) { + TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit)); + } else { + TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit)); + } + return Status::OK(); +} + +Status DeadnessAnalysisImpl::Populate() { + std::vector rpo; + GetReversePostOrder(graph_, &rpo, /*stable_comparator=*/NodeComparatorName(), + /*edge_filter=*/[](const Edge& edge) { + return !edge.src()->IsNextIteration(); + }); + return PopulateWithReversePostOrder(rpo); +} + +Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( + absl::Span rpo) { + // This is an abstract interpretation over the deadness propagation semantics of + // the graph executor. + // + // We iterate over the graph twice, each time in RPO. On the first iteration + // merge nodes with backedges are mapped to symbolic predicates. On the + // second iteration we use the predicates assigned to the backedges in the + // previous iteration to infer a more precise predicate for the backedge merge + // nodes and all the nodes that transitively use it. + // + // We don't track the output indices for should_revisit.
Instead, putting a + // node in `should_revisit` denotes that the deadness flowing out from any + // output from said node may have changed. This is fine; only switches + // propagate different deadness along different output edges, and since the + // delta is solely due to the input *values* (and not input deadness), the + // delta should not change in the second iteration. + std::vector should_revisit; + should_revisit.resize(graph_.num_node_ids()); + for (Node* n : rpo) { + VLOG(4) << "Visiting " << n->name(); + TF_RETURN_IF_ERROR(HandleNode(n, /*should_revisit=*/nullptr)); + if (n->IsNextIteration()) { + // If this is a backedge for a merge node then remember to reprocess the + // merge the next time we run. + for (const Edge* e : n->out_edges()) { + if (e->dst()->IsMerge()) { + should_revisit[e->dst()->id()] = true; + } + } + } + } + + for (Node* n : rpo) { + // The nodes added to should_revisit in the previous loop need to be + // revisited now. Reprocessing these initial nodes may add *their* consumers + // to should_revisit, and these newly added nodes will also be processed by + // this very same loop. Since we're traversing the graph in reverse post + // order (producers before consumers) and HandleNode(n) can only ever add + // n's consumers to should_revisit, we won't "miss" an addition to + // should_revisit.
+ if (should_revisit[n->id()]) { + VLOG(4) << "Revisiting " << n->name(); + TF_RETURN_IF_ERROR(HandleNode(n, &should_revisit)); + } + } + + return Status::OK(); +} + +bool DeadnessAnalysisImpl::HasInputsWithMismatchingDeadness(const Node& node) { + CHECK(!node.IsMerge()); + + if (vlog_) { + VLOG(2) << "HasInputsWithMismatchingDeadness(" << node.name() << ")"; + } + + Predicate* pred = nullptr; + for (const Edge* edge : node.in_edges()) { + auto it = predicate_map_.find(InputEdgeToTensorId(edge)); + CHECK(it != predicate_map_.end()); + if (vlog_) { + VLOG(2) << " " << InputEdgeToTensorId(edge).ToString() << ": " + << it->second->ToString(); + } + + // Today we just compare the predicates for equality (with some + // canonicalization/simplification happening before) but we could be more + // sophisticated here if need be. Comparing pointers is sufficient because + // we intern Predicate instances by their content. + if (pred != nullptr && pred != it->second) { + if (vlog_) { + VLOG(2) << "HasInputsWithMismatchingDeadness(" << node.name() + << ") -> true"; + } + return true; + } + pred = it->second; + } + + if (vlog_) { + VLOG(2) << "HasInputsWithMismatchingDeadness(" << node.name() + << ") -> false"; + } + + return false; +} + +void DeadnessAnalysisImpl::Print() const { + std::vector tensor_ids; + for (const auto& kv_pair : predicate_map_) { + tensor_ids.push_back(kv_pair.first); + } + + std::sort(tensor_ids.begin(), tensor_ids.end()); + + for (TensorId tensor_id : tensor_ids) { + auto it = predicate_map_.find(tensor_id); + CHECK(it != predicate_map_.end()) << tensor_id.ToString(); + VLOG(2) << tensor_id.ToString() << " -> " << it->second->ToString(); + } +} + +} // namespace + +DeadnessAnalysis::~DeadnessAnalysis() {} + +/*static*/ Status DeadnessAnalysis::Run( + const Graph& graph, std::unique_ptr* result) { + std::unique_ptr analysis( + new DeadnessAnalysisImpl(&graph)); + TF_RETURN_IF_ERROR(analysis->Populate()); + + if (VLOG_IS_ON(2)) { + analysis->Print(); + } 
+ + *result = std::move(analysis); + return Status::OK(); +} + +gtl::FlatMap +DeadnessAnalysisImpl::PredicateMapAsString() const { + gtl::FlatMap result; + std::vector tensor_ids; + for (const auto& kv_pair : predicate_map_) { + CHECK(result.insert({kv_pair.first, kv_pair.second->ToString()}).second); + } + return result; +} + +namespace deadness_analysis_internal { +Status ComputePredicates(const Graph& graph, + PredicateMapTy* out_predicate_map) { + DeadnessAnalysisImpl impl(&graph); + TF_RETURN_IF_ERROR(impl.Populate()); + *out_predicate_map = impl.PredicateMapAsString(); + return Status::OK(); +} + +Status ComputePredicates(const Graph& graph, + absl::Span reverse_post_order, + PredicateMapTy* out_predicate_map) { + DeadnessAnalysisImpl impl(&graph); + TF_RETURN_IF_ERROR(impl.PopulateWithReversePostOrder(reverse_post_order)); + *out_predicate_map = impl.PredicateMapAsString(); + return Status::OK(); +} +} // namespace deadness_analysis_internal + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/deadness_analysis.h b/tensorflow/compiler/jit/deadness_analysis.h new file mode 100644 index 0000000000000000000000000000000000000000..6e7ab411619ba08060aa4925e91dce06299d1d23 --- /dev/null +++ b/tensorflow/compiler/jit/deadness_analysis.h @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// This analyzes a TensorFlow graph to identify nodes which may have partially +// dead inputs (i.e. these nodes may have some dead inputs and some alive +// inputs). +// +// For example, the ADD node in the following graph +// +// V0 PRED0 V1 PRED1 +// | | | | +// v v v v +// SWITCH SWITCH +// | | +// +---+ + ---+ +// | | +// v v +// ADD +// +// can have its inputs independently dead or alive based on the runtime values +// of PRED0 and PRED1. +// +// It is tempting to call this a liveness analysis but I avoided that because +// "liveness" already has other connotations. +class DeadnessAnalysis { + public: + // Returns true if `node` may have some live inputs and some dead inputs. + // + // This is a conservatively correct routine -- if it returns false then `node` + // is guaranteed to not have inputs with mismatching liveness, but not the + // converse. + // + // REQUIRES: node is not a Merge operation. + virtual bool HasInputsWithMismatchingDeadness(const Node& node) = 0; + + // Prints out the internal state of this instance. For debugging purposes + // only. + virtual void Print() const = 0; + virtual ~DeadnessAnalysis(); + + // Run the deadness analysis over `graph` and returns an error or a populated + // instance of DeadnessAnalysis in `result`. 
+ static Status Run(const Graph& graph, + std::unique_ptr* result); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_H_ diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..3df2679c629ce801fc6c9006415dcd27b40c078e --- /dev/null +++ b/tensorflow/compiler/jit/deadness_analysis_internal.h @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ +#define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ + +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { +namespace deadness_analysis_internal { + +// Returns a map describing the predicate each Tensor was mapped to. For +// testing purposes only. +using PredicateMapTy = gtl::FlatMap; +Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map); + +// Returns a map describing the predicate each Tensor was mapped to. For +// testing purposes only. Makes deadness analysis visit the graph in the order +// specified in `reverse_post_order` which must be a valid RPO for the graph +// minus NextIteration->Merge edges. 
+Status ComputePredicates(const Graph& graph, + absl::Span reverse_post_order, + PredicateMapTy* out_predicate_map); +} // namespace deadness_analysis_internal +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..28a56044d5e3795fc3ecf5d1092491b87cb90f01 --- /dev/null +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -0,0 +1,799 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/deadness_analysis.h" + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/control_flow_ops_internal.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/deadness_analysis_internal.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using deadness_analysis_internal::ComputePredicates; +using deadness_analysis_internal::PredicateMapTy; + +Status AnalyzeDeadness(Graph* graph, + std::unique_ptr* result) { + FixupSourceAndSinkEdges(graph); + return DeadnessAnalysis::Run(*graph, result); +} + +ops::Switch CreateSwitch(const Scope& root, const string& prefix) { + Output value = ops::Placeholder(root.WithOpName(prefix + "/value"), DT_FLOAT); + Output predicate = + ops::Placeholder(root.WithOpName(prefix + "/pred"), DT_BOOL); + return ops::Switch(root.WithOpName(prefix + "/switch"), value, predicate); +} + +TensorId ControlOutputFor(const Output& o) { + return {o.node()->name(), Graph::kControlSlot}; +} + +void VLogGraphIfAsked(const Graph& graph) { + if (VLOG_IS_ON(3)) { + GraphDef graph_def; + graph.ToGraphDef(&graph_def); + string serialized; + ::tensorflow::protobuf::TextFormat::PrintToString(graph_def, &serialized); + 
LOG(INFO) << serialized; + } +} + +struct InductionVarInfo { + Output induction_var; + Output loop_cond; +}; + +// Creates an induction variable with the following structure (simplified for +// brevity): +// +// +---------------+ +// | initial_value | +// +---------------+ +// | +// | +// v +// +---------------+ +// | Enter | +// +---------------+ +// | +// | +// v +// +---------------+ +// +> | Merge | -+ +// | +---------------+ | +// | | | +// | | | +// | v | +// | +---------------+ | +// | | LessThan10 | | +// | +---------------+ | +// | | | +// | | | +// | v | +// | +---------------+ | +// +----+- | Switch | <+ +// | | +---------------+ +// | | | +// | | | +// | | v +// | | +---------------+ +// | +- | AddOne | +// | +---------------+ +// | +---------------+ +// +-----> | Exit | +// +---------------+ +InductionVarInfo CreateInductionVariable(const Scope& root, + const string& prefix, + const string& frame_name, + const Output& initial_value) { + Output enter_initial_value = ops::internal::Enter( + root.WithOpName(prefix + "/enter"), initial_value, frame_name); + + ops::Merge iv(root.WithOpName(prefix + "/iv"), + {enter_initial_value, enter_initial_value}); + Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1); + Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10); + Output loop_cond_expr = + ops::Less(root.WithOpName(prefix + "/less"), iv.output, final_value); + Output loop_cond = + ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr); + ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"), + latch.output_true, increment_by); + Output next_iteration = + ops::NextIteration(root.WithOpName(prefix + "/next_iteration"), iv_next); + + CHECK(root.graph() + ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1) + .ok()); + 
root.graph()->AddControlEdge(iv.output.node(), increment_by.node()); + root.graph()->AddControlEdge(iv.output.node(), final_value.node()); + + return {iv.output, loop_cond}; +} + +InductionVarInfo CreateInductionVariable(const Scope& root, + const string& prefix, + const string& frame_name, int32 init) { + return CreateInductionVariable( + root, prefix, frame_name, + ops::Const(root.WithOpName(prefix + "/init"), init)); +} + +// Creates an induction variable with the following structure: +// +// +---------------+ +// | initial_value | +// +---------------+ +// | +// | +// v +// +---------------+ +// | Enter | +// +---------------+ +// | +// | +// v +// +---------------+ +// | Merge | <+ +// +---------------+ | +// | | +// | | +// v | +// +-----------+ +---------------+ | +// | loop_cond | --> | Switch | -+ +// +-----------+ +---------------+ +// | +// | +// v +// +---------------+ +// | Exit | +// +---------------+ +struct DependentInductionVar { + Output induction_var; + ops::Switch latch; +}; + +DependentInductionVar CreateDependentLoopInvariantValue( + const Scope& root, const string& prefix, const string& frame_name, + const Output& loop_cond, const Output& value) { + Output enter_value = ops::internal::Enter(root.WithOpName(prefix + "/enter"), + value, frame_name); + ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value}); + ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + Output next_iteration = ops::NextIteration( + root.WithOpName(prefix + "/next_iteration"), latch.output_true); + CHECK(root.graph() + ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1) + .ok()); + return {iv.output, latch}; +} + +DependentInductionVar CreateDependentLoopInvariantValue( + const Scope& root, const string& prefix, const string& frame_name, + const Output& loop_cond, int32 value) { + return CreateDependentLoopInvariantValue( + root, prefix, 
frame_name, loop_cond, + ops::Const(root.WithOpName(prefix + "/init"), value)); +} + +TEST(DeadnessAnalysisTest, BasicPositive) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw = CreateSwitch(root, "0"); + Output add = + ops::Add(root.WithOpName("add"), sw.output_true, sw.output_false); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, BasicNegative) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT); + Output b = ops::Placeholder(root.WithOpName("b"), DT_FLOAT); + Output add = ops::Add(root.WithOpName("add"), a, b); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, AndIsCommutative) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + + Output a0 = + ops::Add(root.WithOpName("a0"), sw_0.output_false, sw_1.output_false); + Output a1 = + ops::Add(root.WithOpName("a1"), sw_1.output_false, sw_0.output_false); + + Output b0 = + ops::Add(root.WithOpName("b0"), sw_0.output_false, sw_1.output_true); + Output b1 = + ops::Add(root.WithOpName("b1"), sw_1.output_true, sw_0.output_false); + + Output live0 = ops::Add(root.WithOpName("live0"), a0, a1); + Output live1 = ops::Add(root.WithOpName("live1"), b0, b1); + + Output halfdead0 = ops::Add(root.WithOpName("halfdead0"), a0, b0); + Output halfdead1 = ops::Add(root.WithOpName("halfdead1"), a1, b1); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*live0.node())); + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*live1.node())); + + 
EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*halfdead0.node())); + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*halfdead1.node())); +} + +TEST(DeadnessAnalysisTest, AndIsAssociative) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + ops::Switch sw_2 = CreateSwitch(root, "2"); + + Output a0 = + ops::Add(root.WithOpName("a0"), sw_0.output_false, sw_1.output_false); + Output a1 = ops::Add(root.WithOpName("a1"), a0, sw_2.output_false); + + Output b0 = + ops::Add(root.WithOpName("b0"), sw_1.output_false, sw_2.output_false); + Output b1 = ops::Add(root.WithOpName("b1"), sw_0.output_false, b0); + + Output add = ops::Add(root.WithOpName("add"), a1, b1); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, OrIsCommutative) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + + ops::Merge m0(root.WithOpName("m0"), {sw_0.output_false, sw_1.output_false}); + ops::Merge m1(root.WithOpName("m1"), {sw_1.output_false, sw_0.output_false}); + ops::Merge m2(root.WithOpName("m2"), {sw_0.output_false, sw_1.output_true}); + ops::Merge m3(root.WithOpName("m3"), {sw_1.output_true, sw_0.output_false}); + + Output live0 = ops::Add(root.WithOpName("live0"), m0.output, m1.output); + Output live1 = ops::Add(root.WithOpName("live1"), m2.output, m3.output); + + Output halfdead0 = + ops::Add(root.WithOpName("halfdead0"), m0.output, m2.output); + Output halfdead1 = + ops::Add(root.WithOpName("halfdead1"), m1.output, m3.output); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*live0.node())); + 
EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*live1.node())); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*halfdead0.node())); + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*halfdead1.node())); +} + +TEST(DeadnessAnalysisTest, OrIsAssociative) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + ops::Switch sw_2 = CreateSwitch(root, "2"); + + ops::Merge m0(root.WithOpName("m0"), {sw_0.output_false, sw_1.output_false}); + ops::Merge m1(root.WithOpName("m1"), {m0.output, sw_2.output_false}); + ops::Merge m2(root.WithOpName("m2"), {sw_1.output_false, sw_2.output_false}); + ops::Merge m3(root.WithOpName("m3"), {sw_0.output_false, m2.output}); + + Output add = ops::Add(root.WithOpName("add"), m1.output, m3.output); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, AndOfOr) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + ops::Switch sw_2 = CreateSwitch(root, "2"); + ops::Switch sw_3 = CreateSwitch(root, "3"); + + ops::Merge m0(root.WithOpName("m0"), {sw_0.output_false, sw_1.output_false}); + ops::Merge m1(root.WithOpName("m1"), {sw_2.output_false, sw_3.output_false}); + + Output add0 = ops::Add(root.WithOpName("add0"), m0.output, m1.output); + Output add1 = ops::Add(root.WithOpName("add1"), m0.output, m1.output); + + Output add2 = ops::Add(root.WithOpName("add2"), add0, add1); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add2.node())); +} + +TEST(DeadnessAnalysisTest, OrOfAnd) { + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = 
CreateSwitch(root, "1"); + ops::Switch sw_2 = CreateSwitch(root, "2"); + ops::Switch sw_3 = CreateSwitch(root, "3"); + + Output add0 = + ops::Add(root.WithOpName("add0"), sw_0.output_false, sw_1.output_false); + Output add1 = + ops::Add(root.WithOpName("add1"), sw_2.output_false, sw_3.output_false); + + ops::Merge m0(root.WithOpName("m0"), {add0, add1}); + ops::Merge m1(root.WithOpName("m1"), {add0, add1}); + + Output add2 = ops::Add(root.WithOpName("add2"), m0.output, m1.output); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add2.node())); +} + +TEST(DeadnessAnalysisTest, NEGATIVE_AndOrDistributive) { + // This demonstrates one of the weaknesses in the current approach -- since we + // only do some basic simplifications we can't see that "(A|B)&C" == + // "(A&C)|(B&C)". + Scope root = Scope::NewRootScope().ExitOnError(); + + ops::Switch sw_0 = CreateSwitch(root, "0"); + ops::Switch sw_1 = CreateSwitch(root, "1"); + ops::Switch sw_2 = CreateSwitch(root, "2"); + + ops::Merge m0(root.WithOpName("m0"), {sw_0.output_false, sw_1.output_false}); + Output add0 = ops::Add(root.WithOpName("add0"), m0.output, sw_2.output_false); + + Output add1 = + ops::Add(root.WithOpName("add1"), sw_0.output_false, sw_2.output_false); + Output add2 = + ops::Add(root.WithOpName("add2"), sw_1.output_false, sw_2.output_false); + ops::Merge m1(root.WithOpName("m1"), {add1, add2}); + + Output add3 = ops::Add(root.WithOpName("add3"), add0, m1.output); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add2.node())); +} + +TEST(DeadnessAnalysisTest, Ternary) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output predicate = ops::Placeholder(root.WithOpName("predicate"), DT_BOOL); + Output true_value = ops::Placeholder(root.WithOpName("true_value"), DT_FLOAT); + Output false_value = + 
ops::Placeholder(root.WithOpName("false_value"), DT_FLOAT); + + ops::Switch predicated_true(root.WithOpName("predicated_true"), true_value, + predicate); + + ops::Switch predicated_false(root.WithOpName("predicated_false"), true_value, + predicate); + ops::Merge merge(root.WithOpName("ternary"), {predicated_true.output_true, + predicated_false.output_false}); + Output addend = ops::Placeholder(root.WithOpName("addend"), DT_FLOAT); + Output add = ops::Add(root.WithOpName("add"), merge.output, addend); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, Recv) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv_a = ops::_Recv(root.WithOpName("recv_a"), DT_FLOAT, "tensor_a", + "sender", 0, "receiver"); + Output recv_b = ops::_Recv(root.WithOpName("recv_b"), DT_FLOAT, "tensor_b", + "sender", 0, "receiver"); + Output add = ops::Add(root.WithOpName("add"), recv_a, recv_b); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, HostRecv) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv_a = ops::_HostRecv(root.WithOpName("recv_a"), DT_FLOAT, + "tensor_a", "sender", 0, "receiver"); + Output recv_b = ops::_HostRecv(root.WithOpName("recv_b"), DT_FLOAT, + "tensor_b", "sender", 0, "receiver"); + Output add = ops::Add(root.WithOpName("add"), recv_a, recv_b); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, Loop) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output iv0 = CreateInductionVariable(root, "iv0", "fr0", 0).induction_var; + Output iv1 = CreateInductionVariable(root, "iv1", "fr0", 0).induction_var; + Output iv2 = 
CreateInductionVariable(root, "iv2", "fr0", 1).induction_var; + Output add0 = ops::Add(root.WithOpName("add0"), iv0, iv1); + Output add1 = ops::Add(root.WithOpName("add1"), iv1, iv2); + + // NB! iv0 and iv1 are equivalent and a smarter deadness analysis would have + // noticed that. Today we are pessimistic here because we assign an + // uninterpreted symbol to merges with backedges. + + VLogGraphIfAsked(*root.graph()); + + { + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node())); + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add1.node())); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0 + // produce the same deadness. But we're not that smart today. + EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})"); + EXPECT_EQ(predicate_map[ControlOutputFor(add1)], + "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})"); + } +} + +TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) { + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0); + Output dependent_iv0 = + CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0) + .induction_var; + Output dependent_iv1 = + CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0) + .induction_var; + Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1); + + VLogGraphIfAsked(*root.graph()); + + { + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + 
EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node())); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)], + "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)], + "{#true,&,(*iv0/cond:0 & iv0/iv:0)}"); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)], + "{#true,&,(*iv0/cond:0 & iv0/iv:0)}"); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + "{#true,&,(*iv0/cond:0 & iv0/iv:0)}"); + } +} + +TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) { + // Create a merge that "looks like" a loop but isn't really. It has a value + // that does not depend on the merge on its backedge. + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0); + DependentInductionVar dependent_iv = + CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0); + FixupSourceAndSinkEdges(root.graph()); + + // To make deadness analysis think that dependent_iv is a loop we need an RPO + // that visits the merge before the backedge. This is a legal RPO for + // deadness analysis since it ignores NextIteration->Merge edges during RPO. + // Right now dependent_iv has an edge from Merge to NextIteration so do the + // RPO with this edge in place. Then remove this edge to get our test case. 
+ std::vector rpo; + GetReversePostOrder(*root.graph(), &rpo, /*stable_comparator=*/{}, + /*edge_filter=*/[](const Edge& edge) { + return !edge.src()->IsNextIteration(); + }); + TF_ASSERT_OK(root.graph()->UpdateEdge( + iv.induction_var.node(), 0, dependent_iv.latch.output_true.node(), 0)); + + VLogGraphIfAsked(*root.graph()); + + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), rpo, &predicate_map)); + + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv.induction_var)], + "div0/iv:0"); + } +} + +TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) { + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv_outer = + CreateInductionVariable(root, "iv_outer", "frame", 0); + ops::Switch inner_value(root.WithOpName("outer_is_live"), + ops::Const(root.WithOpName("constant"), 5), + iv_outer.loop_cond); + InductionVarInfo iv_inner = CreateInductionVariable( + root, "iv_inner", "frame", + ops::internal::Enter(root.WithOpName("iv_inner/enter"), + inner_value.output_true, "frame_inner")); + + Output dependent_outer_iv0 = + CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame", + iv_outer.loop_cond, 0) + .induction_var; + Output dependent_outer_iv1 = + CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame", + iv_outer.loop_cond, 0) + .induction_var; + + Output dependent_inner_iv0 = + CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame", + iv_inner.loop_cond, dependent_outer_iv0) + .induction_var; + Output dependent_inner_iv1 = + CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame", + iv_inner.loop_cond, dependent_outer_iv1) + .induction_var; + + Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0, + dependent_inner_iv1); + + VLogGraphIfAsked(*root.graph()); + + { + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node())); + } + { + 
PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)], + "{#true,&,*iv_outer/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)], + "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&," + "*iv_inner/cond:0}"); + + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)], + "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&," + "(*iv_inner/cond:0 & iv_inner/iv:0)}"); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)], + "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&," + "(*iv_inner/cond:0 & iv_inner/iv:0)}"); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&," + "(*iv_inner/cond:0 & iv_inner/iv:0)}"); + } +} + +TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) { + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv_outer_0 = + CreateInductionVariable(root, "iv_outer_0", "frame", 0); + ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"), + ops::Const(root.WithOpName("constant"), 5), + iv_outer_0.loop_cond); + InductionVarInfo iv_inner_0 = CreateInductionVariable( + root, "iv_inner_0", "frame", + ops::internal::Enter(root.WithOpName("iv_inner_0/enter"), + inner_value_0.output_true, "frame_inner")); + + InductionVarInfo iv_outer_1 = + CreateInductionVariable(root, "iv_outer_1", "frame", 1); + ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"), + ops::Const(root.WithOpName("constant"), 5), + iv_outer_1.loop_cond); + InductionVarInfo iv_inner_1 = CreateInductionVariable( + root, "iv_inner_1", "frame", + ops::internal::Enter(root.WithOpName("iv_inner_1/enter"), + inner_init_value_1.output_true, "frame_inner")); + Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var, + iv_inner_1.induction_var); + + VLogGraphIfAsked(*root.graph()); + + { + std::unique_ptr result; + 
TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node())); + } + + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)], + "{#true,&,*iv_outer_0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)], + "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&," + "*iv_inner_0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)], + "{#true,&,*iv_outer_1/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)], + "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&," + "*iv_inner_1/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&," + "*iv_inner_1/cond:0} & " + "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&," + "*iv_inner_0/cond:0})"); + } +} + +TEST(DeadnessAnalysisTest, ControlInputs) { + Scope root = Scope::NewRootScope().ExitOnError(); + ops::Switch sw = CreateSwitch(root, "0"); + + Output id0 = ops::Identity(root.WithOpName("id0"), sw.output_false); + Output id1 = ops::Identity(root.WithOpName("id1"), sw.output_true); + + Output const0 = ops::Const(root.WithOpName("const0"), 1); + Output const1 = ops::Const(root.WithOpName("const1"), 2); + + Output add = ops::Add(root.WithOpName("add"), const0, const1); + + root.graph()->AddControlEdge(id0.node(), const0.node()); + root.graph()->AddControlEdge(id1.node(), const1.node()); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, ControlTrigger) { + Scope root = Scope::NewRootScope().ExitOnError(); + ops::Switch sw = CreateSwitch(root, "0"); + + Output id0 = ops::Identity(root.WithOpName("id0"), sw.output_false); + Output id1 = 
ops::Identity(root.WithOpName("id1"), sw.output_true); + + ops::ControlTrigger ctrl_trigger0(root.WithOpName("ctrl_trigger0")); + ops::ControlTrigger ctrl_trigger1(root.WithOpName("ctrl_trigger1")); + + Output const0 = ops::Const(root.WithOpName("const0"), 1); + Output const1 = ops::Const(root.WithOpName("const1"), 2); + + Output add = ops::Add(root.WithOpName("add"), const0, const1); + + root.graph()->AddControlEdge(id0.node(), ctrl_trigger0.operation.node()); + root.graph()->AddControlEdge(ctrl_trigger0.operation.node(), const0.node()); + + root.graph()->AddControlEdge(id1.node(), ctrl_trigger1.operation.node()); + root.graph()->AddControlEdge(ctrl_trigger1.operation.node(), const1.node()); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, ControlInputsToMerge) { + Scope root = Scope::NewRootScope().ExitOnError(); + ops::Switch sw = CreateSwitch(root, "0"); + + Output id0 = ops::Identity(root.WithOpName("id0"), sw.output_false); + Output id1 = ops::Identity(root.WithOpName("id1"), sw.output_true); + + Output constant = ops::Const(root.WithOpName("constant"), 5); + ops::Merge m0(root.WithOpName("m0"), {constant}); + ops::Merge m1(root.WithOpName("m0"), {constant}); + Output add = ops::Add(root.WithOpName("add"), m0.output, m1.output); + + root.graph()->AddControlEdge(id0.node(), m0.output.node()); + root.graph()->AddControlEdge(id1.node(), m1.output.node()); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add.node())); +} + +TEST(DeadnessAnalysisTest, RecvVsSwitch) { + // Demonstrates why we need the must_be_true bit on SymbolP. 
+ Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv = ops::_Recv(root.WithOpName("recv"), DT_BOOL, "tensor", "sender", + 0, "receiver"); + Output value = ops::Placeholder(root.WithOpName("value"), DT_BOOL); + ops::Switch sw(root.WithOpName("switch"), value, recv); + Output logical_and = + ops::LogicalAnd(root.WithOpName("and"), recv, sw.output_true); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*logical_and.node())); +} + +TEST(DeadnessAnalysisTest, RecvVsSwitchText) { + // Demonstrates why we need the must_be_true bit on SymbolP. + Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv = ops::_Recv(root.WithOpName("recv"), DT_BOOL, "tensor", "sender", + 0, "receiver"); + Output value = ops::Placeholder(root.WithOpName("value"), DT_BOOL); + ops::Switch sw(root.WithOpName("switch"), value, recv); + Output logical_and = + ops::LogicalAnd(root.WithOpName("and"), recv, sw.output_true); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + TensorId logical_and_output_0 = {logical_and.node()->name(), + Graph::kControlSlot}; + EXPECT_EQ(predicate_map[logical_and_output_0], "(recv:0 & *recv:0)"); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 6d1e3325ebd35b9608ea273fb7de39bad381e60d..2788102620546d8eab657c519f078c5b03e265cc 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -23,7 +23,6 @@ limitations under the License. 
#include #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" -#include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" @@ -37,6 +36,7 @@ limitations under the License. #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/graph/graph.h" @@ -45,7 +45,6 @@ limitations under the License. #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" @@ -61,9 +60,9 @@ const char* const kXlaHostTransferSequencerAttr = namespace { -bool AreAllParentsConst(const Node& n, - const gtl::FlatSet& runtime_const_nodes) { - if (n.type_string() == "GuaranteeConst" || n.type_string() == "Const") { +bool AreAllParentsGuaranteedConst( + const Node& n, const gtl::FlatSet& runtime_const_nodes) { + if (n.type_string() == "GuaranteeConst") { // If the current node is itself a cast-to-const, no need // to look at the incoming edges. return true; @@ -94,7 +93,8 @@ void MarkGuaranteedConstants( ReverseDFSFrom(graph, srcs, /*enter=*/nullptr, /*leave=*/[&guaranteed_const_nodes](const Node* n) { // TODO(vinuraja): Doesn't work in the presence of loops. 
- if (AreAllParentsConst(*n, guaranteed_const_nodes)) { + if (AreAllParentsGuaranteedConst(*n, + guaranteed_const_nodes)) { guaranteed_const_nodes.insert(n); } }); @@ -107,41 +107,11 @@ void MarkGuaranteedConstants( } } -// A node/slot pair. -// TODO(phawkins): is there a common definition of this? -struct NodeSlot { - NodeSlot() : node(nullptr), slot(-1), dtype(DT_INVALID) {} - NodeSlot(const Node* node, int slot) - : node(node), slot(slot), dtype(DT_INVALID) {} - NodeSlot(const Node* node, int slot, DataType dtype) - : node(node), slot(slot), dtype(dtype) {} - - const Node* node; - int slot; - - // Optional: used to record the destination type of a source NodeSlot in case - // the source output is a Ref type that is cast to a Tensor at the - // destination. - DataType dtype; - - bool operator==(const NodeSlot& other) const { - return node == other.node && slot == other.slot && dtype == other.dtype; - } - - // Leave dtype out of the hash since there are never two NodeSlots with the - // same node and slot and different dtypes. - struct Hasher { - uint64 operator()(NodeSlot const& s) const { - return Hash64Combine(std::hash()(s.node), - std::hash()(s.slot)); - } - }; - - struct PairHasher { - uint64 operator()(std::pair const& s) const { - return Hash64Combine(Hasher()(s.first), Hasher()(s.second)); - } - }; +struct OutputInputTensorPairHasher { + uint64 operator()(std::pair const& s) const { + return Hash64Combine(OutputTensor::Hash()(s.first), + InputTensor::Hash()(s.second)); + } }; // TODO(phawkins) add a canonical copy of these operator names and refactor @@ -168,7 +138,7 @@ class Encapsulator { // Find subgraphs marked with 'group_attribute', and build a new // subgraph, one for each value of 'group_attribute'. - Status SplitIntoSubgraphs(); + Status SplitIntoSubgraphs(FunctionLibraryDefinition* library); // Build a FunctionDef for each subgraph, and add it 'library'. The values of // the 'group_attribute' annotations become the function names. 
@@ -182,8 +152,7 @@ class Encapsulator { // Write a copy of the input graph to 'graph_out', where the subgraphs are // replaced with calls to the new functions. - Status BuildOutputGraph(bool parallel_checking, Graph* graph_out, - FunctionLibraryDefinition* library); + Status BuildOutputGraph(Graph* graph_out, FunctionLibraryDefinition* library); private: // A subgraph of the input, all marked with a common 'group_attribute' @@ -271,7 +240,7 @@ class Encapsulator { // Adds the function call node to graph_out. Status AddFunctionCallNode( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out); + Graph* graph_out); // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out. Status AddOutsideCompilationHostIONodes( @@ -284,11 +253,9 @@ class Encapsulator { // Subgraph. void GetOutsideCompilationSubgraphNames(std::vector* names) const; - // Returns the Node that inputs to the function should be wired up to. - Node* GetCallNodeForInputs() const; - - // Returns the Node that outputs to the function should be wired up to. - Node* GetCallNodeForOutputs() const; + // Returns the Node that the inputs and outputs of the function should be + // wired up to. + Node* GetCallNode() const; // Returns the index of the arg that the dst of edge should connect to. int GetArgIndexForEdge(const Edge* edge) const; @@ -380,7 +347,7 @@ class Encapsulator { // Map from source (producer node/slot) tensors in the original graph to // input index (slot number in the HostCompute/RecvAtHost nodes that will // be created) for the outside_compilation subgraph. 
- std::unordered_map inputs; + std::unordered_map inputs; // Set of nodes in the original graph that are the source of control edges // that cross from the containing compiled subgraph into the @@ -396,8 +363,15 @@ class Encapsulator { // node/slot) tensors in the original graph to output index (slot number // in the SendFromHost/HostCompute nodes that will be created) for the // outside_compilation subgraph. - std::unordered_map outputs_by_src; - std::unordered_map outputs_by_dst; + struct ArgNumAndType { + int index; + DataType dtype; + + ArgNumAndType(int i, DataType t) : index(i), dtype(t) {} + }; + std::unordered_map + outputs_by_src; + std::unordered_map outputs_by_dst; // Set of nodes in the original graph that are the destination of control // edges that cross from the outside_compilation subgraph into the @@ -425,12 +399,6 @@ class Encapsulator { OutsideCompilationSubgraph* LookupOrCreateOutsideCompilationSubgraph( const string& outside_compilation_id); - // Builds a ParallelCheck op that compares the output of the original - // subgraph with the encapsulated subgraph. - Status BuildParallelCheckOp( - const std::unordered_map& node_images, - Graph* graph_out); - // Builds a placeholder node used to provide the key input to a RecvAtHost // or SendFromHost node. This placeholder node will be removed by a later // pass. @@ -482,26 +450,21 @@ class Encapsulator { // Not owned. Node* host_compute_key_placeholder_ = nullptr; - // Function call node(s) in the output graph. Not owned. - // If parallel_checking is enabled, 'call_node_inputs' is the function call - // node to which inputs should be fed, and 'call_node_outputs' is the - // parallel check op from which outputs should be read. If parallel checking - // is disabled, both point to the function call node. - Node* call_node_inputs_; - Node* call_node_outputs_; + // Function call node in the output graph. Not owned. 
+ Node* call_node_; // Maps from source (producer node/slot) and destination // (consumer node/slot) tensors in the input graph to _Arg numbers in // the subgraph. The source map is one-to-one, whereas the dest map may be // many-to-one. - std::unordered_map args_by_src_; - std::unordered_map args_by_dst_; + std::unordered_map args_by_src_; + std::unordered_map args_by_dst_; - // The _Arg nodes in the subgraph, in order by argument number. + // The arguments to the subgraph, in order. std::vector args_; // Map from source tensor in the input graph to result #. - std::unordered_map results_; + std::unordered_map results_; // The outside_compilation clusters in this subgraph. std::unordered_map @@ -541,13 +504,12 @@ class Encapsulator { // Copies all nodes that aren't in a compiled subgraph to the output graph. Status CopyNodesToOutputGraph( - bool parallel_checking, Graph* graph_out, - std::unordered_map* node_images); + Graph* graph_out, std::unordered_map* node_images); // Adds function call nodes for each compiled subgraph. Status AddFunctionCallNodes( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out); + Graph* graph_out); // Adds _RecvAtHost and _SendFromHost nodes, where needed, for all // outside_compilation subgraphs. @@ -598,9 +560,9 @@ class Encapsulator { const string& src_outside_compilation_id, const string& dst_func_id, const string& dst_outside_compilation_id, const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out, - std::unordered_set, NodeSlot::PairHasher>* - edges_added); + Graph* graph_out, + std::unordered_set, + OutputInputTensorPairHasher>* edges_added); // Adds control dependencies between subgraph call nodes that have // dependencies via outside_compilation edges. @@ -609,7 +571,7 @@ class Encapsulator { // Adds all edges to the output graph. 
Status AddEdgesToOutputGraph( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out); + Graph* graph_out); // Constructs a minimal shape inference graph that can be used to determine // the shape of send_node at the time that the subgraph is compiled. @@ -729,20 +691,14 @@ void TopologicalClusterSort( } // namespace -Node* Encapsulator::Subgraph::GetCallNodeForInputs() const { - return call_node_inputs_; -} - -Node* Encapsulator::Subgraph::GetCallNodeForOutputs() const { - return call_node_outputs_; -} +Node* Encapsulator::Subgraph::GetCallNode() const { return call_node_; } int Encapsulator::Subgraph::GetArgIndexForEdge(const Edge* edge) const { - return args_by_dst_.at(NodeSlot(edge->dst(), edge->dst_input())); + return args_by_dst_.at(InputTensor(edge->dst(), edge->dst_input())); } int Encapsulator::Subgraph::GetResultIndexForEdge(const Edge* edge) const { - return results_.at(NodeSlot(edge->src(), edge->src_output())); + return results_.at(OutputTensor(edge->src(), edge->src_output())); } Node* Encapsulator::Subgraph::GetRecvAtHostNode( @@ -754,7 +710,7 @@ Node* Encapsulator::Subgraph::GetRecvAtHostNode( int Encapsulator::Subgraph::GetRecvAtHostSlot( const string& outside_compilation_subgraph_name, const Edge* edge) const { return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .inputs.at(NodeSlot(edge->src(), edge->src_output())); + .inputs.at(OutputTensor(edge->src(), edge->src_output())); } Node* Encapsulator::Subgraph::GetSendFromHostNode( @@ -766,7 +722,7 @@ Node* Encapsulator::Subgraph::GetSendFromHostNode( int Encapsulator::Subgraph::GetSendFromHostSlot( const string& outside_compilation_subgraph_name, const Edge* edge) const { return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .outputs_by_dst.at(NodeSlot(edge->dst(), edge->dst_input())); + .outputs_by_dst.at(InputTensor(edge->dst(), edge->dst_input())); } Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* 
node) { @@ -791,10 +747,10 @@ Status Encapsulator::Subgraph::RecordArg( std::vector>* src_arg_pairs) { Node* src_node = edge->src(); int src_slot = edge->src_output(); - std::unordered_map::iterator iter; + std::unordered_map::iterator iter; bool inserted; - std::tie(iter, inserted) = - args_by_src_.emplace(NodeSlot(src_node, src_slot), args_by_src_.size()); + std::tie(iter, inserted) = args_by_src_.emplace( + OutputTensor(src_node, src_slot), args_by_src_.size()); int arg_index = iter->second; if (inserted) { NodeDef arg_def; @@ -815,7 +771,7 @@ Status Encapsulator::Subgraph::RecordArg( Node* dst_node = edge->dst(); Node* dst_image = node_images.at(dst_node); int dst_slot = edge->dst_input(); - args_by_dst_[NodeSlot(dst_node, dst_slot)] = arg_index; + args_by_dst_[InputTensor(dst_node, dst_slot)] = arg_index; graph_->AddEdge(args_[arg_index], 0, dst_image, dst_slot); return Status::OK(); } @@ -826,10 +782,10 @@ Status Encapsulator::Subgraph::RecordResult( Node* src_node = edge->src(); Node* src_image = node_images.at(src_node); int src_slot = edge->src_output(); - std::unordered_map::iterator iter; + std::unordered_map::iterator iter; bool inserted; std::tie(iter, inserted) = - results_.emplace(NodeSlot(src_node, src_slot), results_.size()); + results_.emplace(OutputTensor(src_node, src_slot), results_.size()); int ret_index = iter->second; if (inserted) { NodeDef ret_def; @@ -867,8 +823,8 @@ void Encapsulator::Subgraph::RecordOutsideCompilationInputOrControl( outside_subgraph->control_inputs.insert(edge->src()); } else { int input_index = outside_subgraph->inputs.size(); - outside_subgraph->inputs.emplace(NodeSlot(edge->src(), edge->src_output()), - input_index); + outside_subgraph->inputs.emplace( + OutputTensor(edge->src(), edge->src_output()), input_index); } } @@ -882,11 +838,13 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( DataType dtype = edge->dst()->input_type(edge->dst_input()); auto output_iter = 
outside_subgraph->outputs_by_src - .emplace(NodeSlot(edge->src(), edge->src_output(), dtype), - outside_subgraph->outputs_by_src.size()) + .emplace(OutputTensor(edge->src(), edge->src_output()), + OutsideCompilationSubgraph::ArgNumAndType( + outside_subgraph->outputs_by_src.size(), dtype)) .first; - int output_index = output_iter->second; - outside_subgraph->outputs_by_dst[NodeSlot(edge->dst(), edge->dst_input())] = + const int output_index = output_iter->second.index; + outside_subgraph + ->outputs_by_dst[InputTensor(edge->dst(), edge->dst_input())] = output_index; } } @@ -968,7 +926,7 @@ Status Encapsulator::Subgraph::AddHostComputes( for (const auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; Node* src_image = node_images.at(src_node); - int src_slot = input_src.first.slot; + int src_slot = input_src.first.index; int input_index = input_src.second; DataType dtype = src_node->output_type(src_slot); @@ -976,8 +934,8 @@ Status Encapsulator::Subgraph::AddHostComputes( input_dtypes[input_index] = dtype; } for (const auto& output : oc_subgraph.outputs_by_src) { - DataType dtype = output.first.dtype; - int output_index = output.second; + DataType dtype = output.second.dtype; + int output_index = output.second.index; output_dtypes[output_index] = dtype; } @@ -1015,7 +973,7 @@ Status Encapsulator::Subgraph::AddHostComputes( for (auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; Node* src_image = node_images.at(src_node); - int src_slot = input_src.first.slot; + int src_slot = input_src.first.index; int input_index = input_src.second; graph_->AddEdge(src_image, src_slot, host_compute, input_index); } @@ -1037,7 +995,7 @@ Status Encapsulator::Subgraph::AddHostComputes( for (const auto& output : oc_subgraph.outputs_by_dst) { const Node* dst_node = output.first.node; Node* dst_image = node_images.at(dst_node); - int dst_slot = output.first.slot; + int dst_slot = output.first.index; int output_index = 
output.second; graph_->AddEdge(host_compute, output_index, dst_image, dst_slot); @@ -1075,7 +1033,7 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name, void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) { if (sequencer_ != nullptr) { VLOG(2) << "ConnectSequencerToCallNode"; - graph_out->AddControlEdge(sequencer_, call_node_inputs_); + graph_out->AddControlEdge(sequencer_, call_node_); } } @@ -1090,14 +1048,19 @@ Status Encapsulator::Subgraph::BuildFunctionDef( call_node_def_.set_device(device_); if (rewrite_subgraph_fn) { + std::vector arg_source_tensors(args_by_src_.size()); + for (const auto& arg : args_by_src_) { + arg_source_tensors.at(arg.second) = arg.first; + } // Initialize the input and output permutations to the identity. std::vector input_permutation(args_by_src_.size()); std::iota(input_permutation.begin(), input_permutation.end(), 0); std::vector output_permutation(results_.size()); std::iota(output_permutation.begin(), output_permutation.end(), 0); - TF_RETURN_IF_ERROR(rewrite_subgraph_fn( - &graph_, &input_permutation, &output_permutation, &call_node_def_)); + TF_RETURN_IF_ERROR( + rewrite_subgraph_fn(arg_source_tensors, &graph_, &input_permutation, + &output_permutation, &call_node_def_)); // Apply the input/output permutations to the 'args_by_...' and 'results_' // mappings, so when we build edges in BuildOutputGraph() we @@ -1174,7 +1137,10 @@ Status Encapsulator::Subgraph::AddShapeInferenceInfo( GraphToFunctionDef(*inference_graph, inference_graph_name, &fdef)); host_compute->AddAttr("shape_inference_graph", inference_graph_name); host_compute->AddAttr("shapes", std::vector()); - TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + // TODO(sibyl-Aix6ihai): Understand why there are multiple calls to Encapsulator. 
+ if (library->Find(inference_graph_name) == nullptr) { + TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + } } return Status::OK(); } @@ -1195,88 +1161,20 @@ Status Encapsulator::Subgraph::ReplaceFunctionDef( strings::StrCat("replace_encapsulate_fdef_", name), fdef); } - TF_RETURN_IF_ERROR(library->RemoveFunction(name)); - TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); - return Status::OK(); -} - -Status Encapsulator::Subgraph::BuildParallelCheckOp( - const std::unordered_map& node_images, - Graph* graph_out) { - // Build an index mapping output positions to node/slot pairs in the - // original graph. - std::vector results_by_num(results_.size()); - for (const auto& entry : results_) { - results_by_num[entry.second] = entry.first; - } - - // Build a parallel check NodeDef. - int num_results = results_by_num.size(); - std::vector result_dtypes(num_results); - std::vector expected_outputs(num_results); - std::vector actual_outputs(num_results); - for (int i = 0; i < num_results; ++i) { - const NodeSlot& node_slot = results_by_num[i]; - result_dtypes[i] = node_slot.node->output_type(node_slot.slot); - expected_outputs[i] = - NodeDefBuilder::NodeOut(node_images.at(node_slot.node)->name(), - node_slot.slot, result_dtypes[i]); - actual_outputs[i] = - NodeDefBuilder::NodeOut(call_node_def_.name(), i, result_dtypes[i]); - } - // Assign the parallel check op to a CPU on the same task as the cluster it is - // checking. 
- string device, dummy; - if (!DeviceNameUtils::SplitDeviceName( - call_node_inputs_->assigned_device_name(), &device, &dummy)) { - return errors::InvalidArgument("Could not parse device name"); - } - strings::StrAppend(&device, "/cpu:0"); - - NodeDef check_def; - TF_RETURN_IF_ERROR( - NodeDefBuilder(graph_out->NewName(strings::StrCat(call_node_def_.name(), - "_parallel_check")), - "ParallelCheck") - .Device(device) - .Attr("T", result_dtypes) - .Input(expected_outputs) - .Input(actual_outputs) - .Finalize(&check_def)); - - Status s; - Node* check_op = graph_out->AddNode(check_def, &s); - if (!s.ok()) return s; - check_op->set_assigned_device_name(device); - - // TODO(phawkins): it seems redundant to call AddEdge as well as - // pass Inputs to the NodeDefBuilder, but I have been unable to find a - // way to avoid it. - for (int i = 0; i < num_results; ++i) { - const NodeSlot& node_slot = results_by_num[i]; - graph_out->AddEdge(node_images.at(node_slot.node), node_slot.slot, check_op, - i); - graph_out->AddEdge(call_node_inputs_, i, check_op, num_results + i); - } - - call_node_outputs_ = check_op; + TF_RETURN_IF_ERROR(library->ReplaceFunction(name, fdef)); return Status::OK(); } Status Encapsulator::Subgraph::AddFunctionCallNode( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out) { + Graph* graph_out) { Status s; - call_node_inputs_ = graph_out->AddNode(call_node_def_, &s); + call_node_ = graph_out->AddNode(call_node_def_, &s); if (!s.ok()) return s; // Copy the assigned device and the key_annotation over. 
- call_node_inputs_->set_assigned_device_name(device_); - call_node_outputs_ = call_node_inputs_; + call_node_->set_assigned_device_name(device_); - if (parallel_checking) { - TF_RETURN_IF_ERROR(BuildParallelCheckOp(node_images, graph_out)); - } return Status::OK(); } @@ -1315,7 +1213,7 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode( for (const auto& input : oc_subgraph->inputs) { const Node* src_node = input.first.node; - int src_slot = input.first.slot; + int src_slot = input.first.index; int input_index = input.second; DataType dtype = src_node->output_type(src_slot); @@ -1369,8 +1267,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode( for (const auto& output : oc_subgraph->outputs_by_src) { const Node* src_node = output.first.node; Node* src_image = node_images.at(src_node); - int src_slot = output.first.slot; - int output_index = output.second; + int src_slot = output.first.index; + int output_index = output.second.index; DataType dtype = src_node->output_type(src_slot); dtypes[output_index] = dtype; @@ -1579,7 +1477,7 @@ Status Encapsulator::CopySubgraphEdges( return Status::OK(); } -Status Encapsulator::SplitIntoSubgraphs() { +Status Encapsulator::SplitIntoSubgraphs(FunctionLibraryDefinition* library) { Status s; // Map from input graph nodes to subgraph nodes. @@ -1609,6 +1507,18 @@ Status Encapsulator::SplitIntoSubgraphs() { for (auto& entry : subgraphs_) { Subgraph& subgraph = entry.second; FixupSourceAndSinkEdges(subgraph.GetGraph()); + // Verify that the graph has well-formed control flow structure. + std::vector dummy; + TF_RETURN_IF_ERROR(BuildControlFlowInfo(subgraph.GetGraph(), &dummy)); + } + + if (VLOG_IS_ON(1)) { + // Dump subgraphs. 
+ for (auto& entry : subgraphs_) { + dump_graph::DumpGraphToFile( + strings::StrCat("encapsulate_subgraphs_subgraph_", entry.first), + *entry.second.GetGraph(), library); + } } return s; @@ -1627,27 +1537,17 @@ Status Encapsulator::BuildFunctionDefs( } Status Encapsulator::CopyNodesToOutputGraph( - bool parallel_checking, Graph* graph_out, - std::unordered_map* node_images) { + Graph* graph_out, std::unordered_map* node_images) { for (Node* node : graph_in_->op_nodes()) { string func_id; string outside_compilation_id; TF_RETURN_IF_ERROR( GetFunctionNameAttr(node, &func_id, &outside_compilation_id)); - // Don't copy nodes that going to be encapsulated, unless parallel checking - // is enabled. - if (IsInSubgraph(func_id, outside_compilation_id) && !parallel_checking) - continue; + // Don't copy nodes that are going to be encapsulated. + if (IsInSubgraph(func_id, outside_compilation_id)) continue; Node* image = graph_out->CopyNode(node); - if (!outside_compilation_id.empty()) { - if (parallel_checking) { - return errors::InvalidArgument( - "Parallel checking is not supported when outside_compilation " - "clusters are present."); - } - } (*node_images)[node] = image; } (*node_images)[graph_in_->source_node()] = graph_out->source_node(); @@ -1657,10 +1557,10 @@ Status Encapsulator::CopyNodesToOutputGraph( Status Encapsulator::AddFunctionCallNodes( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out) { + Graph* graph_out) { for (auto& subgraph_entry : subgraphs_) { - TF_RETURN_IF_ERROR(subgraph_entry.second.AddFunctionCallNode( - node_images, parallel_checking, graph_out)); + TF_RETURN_IF_ERROR( + subgraph_entry.second.AddFunctionCallNode(node_images, graph_out)); } return Status::OK(); } @@ -1694,7 +1594,7 @@ Status Encapsulator::FindOutputImageOfEdgeSrc( } else { // The edge is from a subgraph to a regular node in the output graph so // use the subgraph's call node output. 
- *src_image = subgraphs_.at(src_func_id).GetCallNodeForOutputs(); + *src_image = subgraphs_.at(src_func_id).GetCallNode(); } } else { // The source of the edge is in the output graph so use the node image in @@ -1742,7 +1642,7 @@ Status Encapsulator::FindOutputImageOfEdgeDst( } else { // The edge is to a subgraph from a regular node in the output graph so // use the subgraph's call node input. - *dst_image = subgraphs_.at(dst_func_id).GetCallNodeForInputs(); + *dst_image = subgraphs_.at(dst_func_id).GetCallNode(); } } else { // The destination of the edge is in the output graph so use the node image @@ -1778,10 +1678,9 @@ Status Encapsulator::CopyEdgeToOutputGraph( const Edge* edge, const string& src_func_id, const string& src_outside_compilation_id, const string& dst_func_id, const string& dst_outside_compilation_id, - const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out, - std::unordered_set, NodeSlot::PairHasher>* - edges_added) { + const std::unordered_map& node_images, Graph* graph_out, + std::unordered_set, + OutputInputTensorPairHasher>* edges_added) { Node* src_image; TF_RETURN_IF_ERROR(FindOutputImageOfEdgeSrc( src_func_id, src_outside_compilation_id, dst_func_id, @@ -1796,16 +1695,12 @@ Status Encapsulator::CopyEdgeToOutputGraph( if (edge->IsControlEdge()) { // Add the control edge, if we have not already added it, using the images // determined above (potentially call operators or RecvAtHost/SendFromHost). - if (edges_added->emplace(NodeSlot(src_image, -1), NodeSlot(dst_image, -1)) + if (edges_added + ->emplace(OutputTensor(src_image, -1), InputTensor(dst_image, -1)) .second) { graph_out->AddControlEdge(src_image, dst_image); } - // If parallel checking is enabled, also add a control edge to the - // corresponding parallel check op. 
- if (parallel_checking) { - graph_out->AddControlEdge(src_image, node_images.at(edge->dst())); - } return Status::OK(); } @@ -1817,18 +1712,10 @@ Status Encapsulator::CopyEdgeToOutputGraph( FindOutputSlotOfEdgeDst(src_func_id, src_outside_compilation_id, dst_func_id, dst_outside_compilation_id, edge); - if (IsInSubgraph(dst_func_id, dst_outside_compilation_id) && - parallel_checking) { - // If we are parallel checking, also feed the tensor as an input to the - // corresponding parallel check subgraph. - graph_out->AddEdge(src_image, src_output, node_images.at(edge->dst()), - edge->dst_input()); - } - // Add the edge, if we have not already added it. if (edges_added - ->emplace(NodeSlot(src_image, src_output), - NodeSlot(dst_image, dst_input)) + ->emplace(OutputTensor(src_image, src_output), + InputTensor(dst_image, dst_input)) .second) { graph_out->AddEdge(src_image, src_output, dst_image, dst_input); } @@ -1839,8 +1726,8 @@ Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) { for (const auto& ancestors : subgraph_ancestors_) { const string& subgraph = ancestors.first; for (const string& ancestor : ancestors.second) { - graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(), - subgraphs_[subgraph].GetCallNodeForInputs()); + graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNode(), + subgraphs_[subgraph].GetCallNode()); } } return Status::OK(); @@ -1848,11 +1735,12 @@ Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) { Status Encapsulator::AddEdgesToOutputGraph( const std::unordered_map& node_images, - bool parallel_checking, Graph* graph_out) { + Graph* graph_out) { // Set of edges already added to the output graph, represented as (src, dst) // pairs. We use the set to deduplicate edges; multiple edges in the input // graph may map to one edge in the output graph. 
- std::unordered_set, NodeSlot::PairHasher> + std::unordered_set, + OutputInputTensorPairHasher> edges_added; for (const Edge* edge : graph_in_->edges()) { @@ -1870,16 +1758,6 @@ Status Encapsulator::AddEdgesToOutputGraph( if (IsInSubgraph(src_func_id, src_outside_compilation_id) && IsInSubgraph(dst_func_id, dst_outside_compilation_id) && src_func_id == dst_func_id) { - if (parallel_checking) { - Node* src_image = node_images.at(edge->src()); - Node* dst_image = node_images.at(edge->dst()); - if (edge->IsControlEdge()) { - graph_out->AddControlEdge(src_image, dst_image); - } else { - graph_out->AddEdge(src_image, edge->src_output(), dst_image, - edge->dst_input()); - } - } continue; } @@ -1887,8 +1765,7 @@ Status Encapsulator::AddEdgesToOutputGraph( // unclustered graph. TF_RETURN_IF_ERROR(CopyEdgeToOutputGraph( edge, src_func_id, src_outside_compilation_id, dst_func_id, - dst_outside_compilation_id, node_images, parallel_checking, graph_out, - &edges_added)); + dst_outside_compilation_id, node_images, graph_out, &edges_added)); } for (auto& subgraph_entry : subgraphs_) { @@ -2067,6 +1944,8 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( // continue. TensorShapeProto proto; context->ShapeHandleToProto(shape, &proto); + VLOG(2) << "Node " << src_node->name() + << " has known shape: " << proto.DebugString(); if (dummy_node_images.find(src_node) == dummy_node_images.end()) { dummy_node_images[src_node] = AddDummyShapedNode(src_node, src_port, control_flow_info, @@ -2084,6 +1963,8 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( if (VLOG_IS_ON(2)) { TensorShapeProto proto; context->ShapeHandleToProto(shape, &proto); + VLOG(2) << "Node " << src_node->name() + << " has unknown shape: " << proto.DebugString(); } stack.push_back({src_node, false}); } @@ -2326,6 +2207,23 @@ Status Encapsulator::FindClusterDependencies() { } } } + if (VLOG_IS_ON(2)) { + // Print debug information. 
+ VLOG(2) << "node_ancestors_map:"; + for (const auto& node_iter : node_ancestors_map) { + VLOG(2) << "\t" << node_iter.first->name() << ": subgraph = '" + << node_iter.second.subgraph + << "', outside_compilation_cluster = '" + << node_iter.second.outside_compilation_cluster + << "', ancestor_clusters: " + << (node_iter.second.ancestor_clusters.empty() ? "(empty)" : ""); + for (const auto& cluster_iter : node_iter.second.ancestor_clusters) { + VLOG(2) << "\t\tsubgraph = '" << cluster_iter.subgraph + << "', outside_compilation_cluster = '" + << cluster_iter.outside_compilation_cluster << "'"; + } + } + } return Status::OK(); } @@ -2504,18 +2402,15 @@ Status Encapsulator::GetShapeInfoForOutsideCompilationSends( return Status::OK(); } -Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out, +Status Encapsulator::BuildOutputGraph(Graph* graph_out, FunctionLibraryDefinition* library) { // Map from nodes in the input graph to nodes in the output graph. std::unordered_map node_images; - TF_RETURN_IF_ERROR( - CopyNodesToOutputGraph(parallel_checking, graph_out, &node_images)); - TF_RETURN_IF_ERROR( - AddFunctionCallNodes(node_images, parallel_checking, graph_out)); + TF_RETURN_IF_ERROR(CopyNodesToOutputGraph(graph_out, &node_images)); + TF_RETURN_IF_ERROR(AddFunctionCallNodes(node_images, graph_out)); TF_RETURN_IF_ERROR(AddOutsideCompilationHostIONodes(node_images, graph_out)); - TF_RETURN_IF_ERROR( - AddEdgesToOutputGraph(node_images, parallel_checking, graph_out)); + TF_RETURN_IF_ERROR(AddEdgesToOutputGraph(node_images, graph_out)); TF_RETURN_IF_ERROR( GetShapeInfoForOutsideCompilationSends(graph_out, library)); @@ -2528,23 +2423,22 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out, Status EncapsulateSubgraphsInFunctions( string group_attribute, string outside_compilation_attribute, const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn, - bool parallel_checking, bool reuse_existing_functions, - 
std::unique_ptr* graph_out, FunctionLibraryDefinition* library) { + bool reuse_existing_functions, std::unique_ptr* graph_out, + FunctionLibraryDefinition* library) { Status s; Encapsulator encapsulator(std::move(group_attribute), std::move(outside_compilation_attribute), &graph_in); TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies()); - TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs()); + TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs(library)); TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs( rewrite_subgraph_fn, reuse_existing_functions, library)); std::unique_ptr out(new Graph(library)); out->set_versions(graph_in.versions()); - TF_RETURN_IF_ERROR( - encapsulator.BuildOutputGraph(parallel_checking, out.get(), library)); + TF_RETURN_IF_ERROR(encapsulator.BuildOutputGraph(out.get(), library)); *graph_out = std::move(out); return Status::OK(); @@ -2585,10 +2479,8 @@ static Status RenumberArguments(Graph* graph, Status EncapsulateSubgraphsPass::Run( const GraphOptimizationPassOptions& options) { VLOG(1) << "EncapsulateSubgraphsPass::Run"; - legacy_flags::EncapsulateSubgraphsPassFlags* flags = - legacy_flags::GetEncapsulateSubgraphsPassFlags(); if (VLOG_IS_ON(1)) { - dump_graph::DumpGraphToFile("before_encapsulate_subgraphs", **options.graph, + dump_graph::DumpGraphToFile("encapsulate_subgraphs_before", **options.graph, options.flib_def); } @@ -2602,72 +2494,77 @@ Status EncapsulateSubgraphsPass::Run( FunctionLibraryRuntime* flr = pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); - auto rewrite_subgraph = [flr](std::unique_ptr* subgraph, - std::vector* input_permutation, - std::vector* output_permutation, - NodeDef* node) { - // Optimize the subgraph. 
- OptimizeGraph(flr, subgraph); - - const int num_args = input_permutation->size(); - std::vector const_args(num_args); - TF_RETURN_IF_ERROR(BackwardsConstAnalysis(**subgraph, &const_args)); - - DataTypeVector arg_types(num_args); - TF_RETURN_IF_ERROR(GetArgTypes(**subgraph, &arg_types)); - - // Compute a permutation of the arguments such that the constant arguments - // are first. - const int num_consts = - std::count(const_args.begin(), const_args.end(), true); - - const int num_resources = - std::count(arg_types.begin(), arg_types.end(), DT_RESOURCE); - const int num_nonconsts = num_args - num_resources - num_consts; - if (num_nonconsts < 0) { - return errors::Internal("num_nonconsts should be >= 0, was ", - num_nonconsts); - } + auto rewrite_subgraph = + [flr](const std::vector& arg_source_tensors, + std::unique_ptr* subgraph, + std::vector* input_permutation, + std::vector* output_permutation, NodeDef* node) { + // Optimize the subgraph. + OptimizeGraph(flr, subgraph); + + const int num_args = input_permutation->size(); + std::vector const_args(num_args); + TF_RETURN_IF_ERROR(BackwardsConstAnalysis( + **subgraph, &const_args, /*compile_time_const_nodes=*/nullptr)); + + DataTypeVector arg_types(num_args); + TF_RETURN_IF_ERROR(GetArgTypes(**subgraph, &arg_types)); + + // Compute a permutation of the arguments such that the constant + // arguments are first. 
+ const int num_consts = + std::count(const_args.begin(), const_args.end(), true); + + const int num_resources = + std::count(arg_types.begin(), arg_types.end(), DT_RESOURCE); + const int num_nonconsts = num_args - num_resources - num_consts; + if (num_nonconsts < 0) { + return errors::Internal("num_nonconsts should be >= 0, was ", + num_nonconsts); + } - int const_pos = 0; - int arg_pos = num_consts; - int resource_pos = num_consts + num_nonconsts; - for (int i = 0; i < num_args; ++i) { - if (const_args[i]) { - if (arg_types[i] == DT_RESOURCE) { - return errors::Internal( - "Resource arguments cannot be constant (argument ", i, ")"); + int const_pos = 0; + int arg_pos = num_consts; + int resource_pos = num_consts + num_nonconsts; + for (int i = 0; i < num_args; ++i) { + if (const_args[i]) { + if (arg_types[i] == DT_RESOURCE) { + return errors::Internal( + "Resource arguments cannot be constant (argument ", i, ")"); + } + (*input_permutation)[i] = const_pos; + ++const_pos; + } else if (arg_types[i] == DT_RESOURCE) { + (*input_permutation)[i] = resource_pos; + ++resource_pos; + } else { + (*input_permutation)[i] = arg_pos; + ++arg_pos; + } } - (*input_permutation)[i] = const_pos; - ++const_pos; - } else if (arg_types[i] == DT_RESOURCE) { - (*input_permutation)[i] = resource_pos; - ++resource_pos; - } else { - (*input_permutation)[i] = arg_pos; - ++arg_pos; - } - } - // Renumber argument nodes in the graph. - TF_RETURN_IF_ERROR(RenumberArguments(subgraph->get(), *input_permutation)); + // Renumber argument nodes in the graph. + TF_RETURN_IF_ERROR( + RenumberArguments(subgraph->get(), *input_permutation)); - // TODO(phawkins): add a forward is-constant analysis, similarly split - // outputs into host-memory constants and device-memory non-constants. + // TODO(phawkins): add a forward is-constant analysis, similarly split + // outputs into host-memory constants and device-memory non-constants. 
- AddNodeAttr(kXlaCompiledKernelAttr, true, node); - AddNodeAttr(kXlaNumConstantArgsAttr, num_consts, node); - AddNodeAttr(kXlaNumResourceArgsAttr, num_resources, node); - return Status::OK(); - }; + AddNodeAttr(kXlaCompiledKernelAttr, true, node); + AddNodeAttr(kXlaNumConstantArgsAttr, num_consts, node); + AddNodeAttr(kXlaNumResourceArgsAttr, num_resources, node); + return Status::OK(); + }; - TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions( - kXlaClusterAttr, kXlaOutsideCompilationAttr, **options.graph, - rewrite_subgraph, flags->tf_xla_parallel_checking, - /*reuse_existing_functions=*/false, &graph_out, library)); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + EncapsulateSubgraphsInFunctions( + kXlaClusterAttr, kXlaOutsideCompilationAttr, **options.graph, + rewrite_subgraph, /*reuse_existing_functions=*/false, &graph_out, + library), + "EncapsulateSubgraphsPass failed"); if (VLOG_IS_ON(1)) { - dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out, + dump_graph::DumpGraphToFile("encapsulate_subgraphs_after", *graph_out, options.flib_def); } diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index 5fee36f022a7515504cb6faa5cca658481b784c5..926589546fec72048485d30966f31b24e44b1245 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -28,6 +28,9 @@ limitations under the License. namespace tensorflow { // A rewriting function to apply to each subgraph during encapsulation. +// 'arg_source_tensors' are the tensors corresponding to the arguments in the +// original source graph (*not* 'graph'). +// // 'graph' is the subgraph. The rewriting may renumber the inputs and outputs; // 'input_permutation' is a mapping from old argument numbers to new argument // numbers, whereas 'output_permutation' is the same for outputs. 
Both @@ -37,6 +40,7 @@ namespace tensorflow { // The rewrite may also change the NodeDef's operator name, and that // name will be used as the name of the generated function. typedef std::function& arg_source_tensors, std::unique_ptr* graph, std::vector* input_permutation, std::vector* output_permutation, NodeDef* node_def)> RewriteSubgraphFn; @@ -61,10 +65,6 @@ typedef std::function* graph_out, FunctionLibraryDefinition* library); + bool reuse_existing_functions, std::unique_ptr* graph_out, + FunctionLibraryDefinition* library); // The attribute that marks function calls produced by the encapsulate // subgraphs pass and that should in turn be compiled via XlaLaunch operators. diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 5ec24d39a2c40a766dbb0ec51ebe798de620e24b..7bc0ef030302dc6495e3e6d1151f458b450ed2c3 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" +#include "absl/strings/match.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/function_testlib.h" @@ -25,7 +26,6 @@ limitations under the License. 
#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/util/equal_graph_def.h" @@ -124,8 +124,8 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, std::unordered_set control_input_a; std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { - if (str_util::StartsWith(a.input(i), "^")) { - if (!str_util::StartsWith(b.input(i), "^")) { + if (absl::StartsWith(a.input(i), "^")) { + if (!absl::StartsWith(b.input(i), "^")) { if (diff) { *diff = strings::StrCat( diff_preamble, " mismatch for node ", a.name(), " input ", i, @@ -379,7 +379,7 @@ Node* InputShaped(const GraphDefBuilder::Options& opts) { return ops::SourceOp("InputTestShaped", opts); } -Node* KnownShapeBase(DataType dtype, const gtl::ArraySlice& shape, +Node* KnownShapeBase(DataType dtype, absl::Span shape, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const", @@ -394,7 +394,7 @@ Node* KnownShapeBase(DataType dtype, const gtl::ArraySlice& shape, .FinalizeBuilder(&node_builder); } -Node* KnownShape(const gtl::ArraySlice& shape, +Node* KnownShape(absl::Span shape, const GraphDefBuilder::Options& opts) { return KnownShapeBase(DT_FLOAT, shape, opts); } @@ -417,8 +417,7 @@ Node* KeyPlaceholder(const string& call_node, } Node* RecvAtHost(ops::NodeOut key_input, const string& cluster, - const string& oc_cluster, - const gtl::ArraySlice& dtypes, + const string& oc_cluster, absl::Span dtypes, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; string key = @@ -511,7 +510,6 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) { std::unique_ptr graph_out; s = EncapsulateSubgraphsInFunctions("_encapsulate", "_outside", *graph, 
/*rewrite_subgraph_fn=*/{}, - /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph_out, lib_def.get()); if (!s.ok()) return s; @@ -560,8 +558,9 @@ TEST(EncapsulateSubgraphsTest, OneFunction) { Node* b = Input(b1.opts().WithName("B")); // Give nodes 'c' and 'd' names that collide after lowercasing. Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); - Node* d = Binary(b, c, b1.opts().WithName("c").WithControlInput(c).WithAttr( - "_encapsulate", "F1")); + Node* d = Binary(b, c, + b1.opts().WithName("c").WithControlInput(c).WithAttr( + "_encapsulate", "F1")); Binary(a, d, b1.opts().WithName("E")); TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); } @@ -614,8 +613,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) { Node* c = Unary(a, b1.opts().WithName("C").WithControlInput(control).WithAttr( "_encapsulate", "F1")); - Node* d = - Binary(b, c, b1.opts().WithName("D").WithControlInput(control).WithAttr( + Node* d = Binary(b, c, + b1.opts().WithName("D").WithControlInput(control).WithAttr( "_encapsulate", "F2")); Binary(a, d, b1.opts().WithName("E")); TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); @@ -707,7 +706,7 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { std::unique_ptr graph; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( "_cluster", "_outside", graph_before_encapsulation, - /*rewrite_subgraph_fn=*/{}, /*parallel_checking=*/false, + /*rewrite_subgraph_fn=*/{}, /*reuse_existing_functions=*/false, &graph, &library)); std::vector expected_nodes = {"cluster1", "cluster2", "mul", "x"}; @@ -721,47 +720,6 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { EXPECT_EQ(expected_edges, GraphEdges(*graph)); } -TEST(EncapsulateSubgraphsTest, ParallelChecking) { - Scope root = Scope::NewRootScope().ExitOnError().WithDevice( - "/job:localhost/replica:0/task:0/cpu:0"); - auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT); - auto x2 = ops::Placeholder(root.WithOpName("x2"), DT_FLOAT); - auto add1 = ops::Add(root.WithOpName("add1"), x1, 
x2); - add1.node()->AddAttr("_cluster", "cluster1"); - auto add2 = ops::Add(root.WithOpName("add2"), add1, x2); - add2.node()->AddAttr("_cluster", "cluster1"); - auto out = ops::Mul(root.WithOpName("mul"), x1, add2); - - Graph graph_before_encapsulation(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&graph_before_encapsulation)); - - FunctionLibraryDefinition library(OpRegistry::Global(), {}); - std::unique_ptr graph; - TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( - "_cluster", "_outside", graph_before_encapsulation, - /*rewrite_subgraph_fn=*/{}, /*parallel_checking=*/true, - /*reuse_existing_functions=*/false, &graph, &library)); - - std::vector expected_nodes = { - "add1", "add2", "cluster1", "cluster1_parallel_check/_0", - "mul", "x1", "x2"}; - EXPECT_EQ(expected_nodes, GraphNodes(*graph)); - - std::vector> expected_edges = { - {"add1:0", "add2:0"}, - {"add2:0", "cluster1_parallel_check/_0:0"}, - {"cluster1:0", "cluster1_parallel_check/_0:1"}, - {"cluster1_parallel_check/_0:0", "mul:1"}, - {"x1:0", "add1:0"}, - {"x1:0", "cluster1:0"}, - {"x1:0", "mul:0"}, - {"x2:0", "add1:1"}, - {"x2:0", "add2:1"}, - {"x2:0", "cluster1:1"}, - }; - EXPECT_EQ(expected_edges, GraphEdges(*graph)); -} - const Node* FindNodeByName(const Graph& graph, const string& name) { for (const Node* node : graph.nodes()) { if (node->name() == name) return node; @@ -783,10 +741,13 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) { Scope root = Scope::NewRootScope().ExitOnError().WithDevice( "/job:localhost/replica:0/task:0/cpu:0"); auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT); - auto const_x2 = ops::Const(root.WithOpName("const_x2"), 10.0f); + auto x2 = ops::Placeholder(root.WithOpName("x2"), DT_FLOAT); + auto const_guarantee_x2 = + ops::GuaranteeConst(root.WithOpName("const_guarantee_x2"), x2); auto const_guarantee_x1 = ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1); - auto add1 = ops::Add(root.WithOpName("add1"), const_guarantee_x1, const_x2); + 
auto add1 = + ops::Add(root.WithOpName("add1"), const_guarantee_x1, const_guarantee_x2); add1.node()->AddAttr("_encapsulate", "encapsulate1"); Graph graph_before(OpRegistry::Global()); @@ -798,14 +759,15 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) { TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( "_encapsulate", "_outside", graph_before, /*rewrite_subgraph_fn=*/ - [&guaranteed_consts](std::unique_ptr* graph_ptr, + [&guaranteed_consts](const std::vector& arg_source_tensors, + std::unique_ptr* graph_ptr, std::vector* input_permutation, std::vector* output_permutation, NodeDef* call_def) { Graph* graph = graph_ptr->get(); for (const Node* n : graph->nodes()) { if (n->type_string() == "_Arg" && - str_util::StartsWith(n->name(), "const")) { + absl::StartsWith(n->name(), "const")) { ++guaranteed_consts; EXPECT_TRUE(HasGuaranteeConstAttr(*n)); } else { @@ -814,7 +776,6 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) { } return Status::OK(); }, - /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph_after, &library)); EXPECT_EQ(2, guaranteed_consts); } @@ -843,14 +804,15 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) { TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( "_encapsulate", "_outside", graph_before, /*rewrite_subgraph_fn=*/ - [&guaranteed_consts](std::unique_ptr* graph_ptr, + [&guaranteed_consts](const std::vector& arg_source_tensors, + std::unique_ptr* graph_ptr, std::vector* input_permutation, std::vector* output_permutation, NodeDef* call_def) { Graph* graph = graph_ptr->get(); for (const Node* n : graph->nodes()) { if (n->type_string() == "_Arg" && - str_util::StartsWith(n->name(), "const")) { + absl::StartsWith(n->name(), "const")) { ++guaranteed_consts; EXPECT_TRUE(HasGuaranteeConstAttr(*n)); } else { @@ -859,7 +821,6 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) { } return Status::OK(); }, - /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph_after, &library)); // Only 
1 runtime const, which is const_guarantee_add1. Add2 has one const // and another non-const, so overall non-const. @@ -930,13 +891,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "c:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}, {"c"}}, }, @@ -1050,7 +1011,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { .WithAttr("_outside", "O1")); Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT}, shape2.opts()); - Node* h = Binary(ops::NodeOut(recv2, 0), e, + Node* h = Binary(ops::NodeOut(recv2, 1), e, shape2.opts() .WithName("H") .WithAttr("_encapsulate", "F1") @@ -1075,27 +1036,27 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"outside_compilation_O1_host_compute"}}, {{"outside_compilation_O2_host_compute"}, "XlaHostCompute", - {"D:o:0", "F:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"F:o:0", "D:o:0"}, + {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, {"ancestors", - gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, + absl::Span({"outside_compilation_O1_host_compute"})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O2"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}}, {"F", "outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, 
"XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}, {"D"}}, }, @@ -1123,13 +1084,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT}, b2.opts()); - Node* g = Binary(e, ops::NodeOut(recv2, 1), + Node* g = Binary(e, ops::NodeOut(recv2, 0), b2.opts() .WithName("G") .WithControlInputs({recv2, e}) .WithAttr("_encapsulate", "F1") .WithAttr("_outside", "O2")); - Node* h = Binary(ops::NodeOut(recv2, 0), e, + Node* h = Binary(ops::NodeOut(recv2, 1), e, b2.opts() .WithName("H") .WithAttr("_encapsulate", "F1") @@ -1228,13 +1189,13 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}, {"D"}}, }, @@ -1251,13 +1212,13 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"G:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", 
gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_O1"}, {"shape_inference_graph", ""}, {"shapes", - gtl::ArraySlice({shape_proto_expected})}, + absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}}); @@ -1402,13 +1363,13 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}, {"D"}}, }, @@ -1424,13 +1385,13 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"G:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F2_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F2_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"i_0_retval", "I:o:0"}}); @@ -1533,13 +1494,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {}, - {{"Tinputs", gtl::ArraySlice({})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", 
gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", - gtl::ArraySlice({shape_proto_expected})}, + absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1617,13 +1578,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {}, - {{"Tinputs", gtl::ArraySlice({})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", - gtl::ArraySlice({shape_proto_expected})}, + absl::Span({shape_proto_expected})}, {"_outside_compilation_subgraph", "O1"}}, {"D"}}, }, @@ -1699,12 +1660,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1780,12 +1741,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, 
{"shape_inference_graph", ""}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1884,13 +1845,13 @@ TEST(EncapsulateSubgraphsTest, {{"outside_compilation_O2_host_compute"}, "XlaHostCompute", {"F:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O2"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}}}, }, {{"h_0_retval", "H:o:0"}}); @@ -1993,13 +1954,13 @@ TEST(EncapsulateSubgraphsTest, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}}, }, {{"h_0_retval", "H:o:0"}}); @@ -2104,37 +2065,37 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, 
{"_outside_compilation_subgraph", "O1"}}}, {{"outside_compilation_O2_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({})}, {"ancestors", - gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, + absl::Span({"outside_compilation_O1_host_compute"})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", ""}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O2"}}, {"outside_compilation_O1_host_compute"}}, {{"outside_compilation_O3_host_compute"}, "XlaHostCompute", {"D:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({})}, {"ancestors", - gtl::ArraySlice({"outside_compilation_O1_host_compute", - "outside_compilation_O2_host_compute"})}, + absl::Span({"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, {"key", "host_compute_channel_F1_O3"}, {"shape_inference_graph", ""}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O3"}}, {"outside_compilation_O1_host_compute", "outside_compilation_O2_host_compute"}}}, @@ -2310,13 +2271,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"c:o:0"}, - {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, - {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, - {"ancestors", gtl::ArraySlice({})}, + {{"Tinputs", absl::Span({DT_FLOAT})}, + {"Toutputs", absl::Span({DT_FLOAT})}, + {"ancestors", absl::Span({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, - {"shapes", gtl::ArraySlice({})}, + {"shapes", absl::Span({})}, {"_outside_compilation_subgraph", "O1"}}, {"c"}}, }, diff --git 
a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc index 4d49a14b24d53bbcb434560d59b8c97a17e18f86..c37b6112cc8a92047d495d057f59e2281710e678 100644 --- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc +++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/jit/build_xla_launch_ops_pass.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/jit/partially_decluster_pass.h" #include "tensorflow/core/common_runtime/optimization_registry.h" namespace tensorflow { @@ -23,15 +24,18 @@ namespace tensorflow { REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10, MarkForCompilationPass); +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20, + PartiallyDeclusterPass); + // The EncapsulateSubgraphs pass must run after the MarkForCompilationPass. We // also need to run it after the graph been rewritten to have _Send nodes added // for fetches. Before the _Send nodes are added, fetch nodes are identified by // name, and encapsulation might remove that node from the graph. -REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20, +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30, EncapsulateSubgraphsPass); // Must run after EncapsulateSubgraphsPass. 
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30, +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 40, BuildXlaLaunchOpsPass); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 00a6f4075f9a18efc3895b033eb6d08e36088a53..253a5d254792a19d98b75310ea6848f42597c0c7 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -16,6 +16,7 @@ cc_library( "//tensorflow/compiler/jit:xla_device", "//tensorflow/compiler/jit:xla_launch_util", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", @@ -28,16 +29,3 @@ cc_library( ], alwayslink = 1, ) - -cc_library( - name = "parallel_check_op", - srcs = ["parallel_check_op.cc"], - visibility = ["//tensorflow/compiler/jit:friends"], - deps = [ - "//tensorflow/compiler/jit/legacy_flags:parallel_check_op_flags", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/jit/kernels/parallel_check_op.cc b/tensorflow/compiler/jit/kernels/parallel_check_op.cc deleted file mode 100644 index bd4eefbc0bb960f8ddc1d238057e73a29a098f26..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/jit/kernels/parallel_check_op.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/jit/legacy_flags/parallel_check_op_flags.h" -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/macros.h" - -namespace tensorflow { -namespace { - -// Inputs 2*N tensors, outputs the first N inputs. -// Logs errors if input tensor i and i + N are not (near) identical -// in any position. 
-class ParallelCheckOp : public OpKernel { - public: - explicit ParallelCheckOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - - template - int CompareTensors(DataType dtype, const char* v0, const char* v1, - int64 num_elts, int input_idx) { - int failed = 0; - const T* p0 = reinterpret_cast(v0); - const T* p1 = reinterpret_cast(v1); - double rtol; - legacy_flags::ParallelCheckOpFlags* flags = - legacy_flags::GetParallelCheckOpFlags(); - if (!tensorflow::strings::safe_strtod(flags->parallel_check_rtol.c_str(), - &rtol)) { - LOG(ERROR) << "can't convert parallel_check_rtol " - << flags->parallel_check_rtol << " to double"; - } - double atol; - if (!tensorflow::strings::safe_strtod(flags->parallel_check_atol.c_str(), - &atol)) { - LOG(ERROR) << "can't convert parallel_check_atol " - << flags->parallel_check_atol << " to double"; - } - for (int i = 0; i < num_elts; ++i) { - bool ok = (p0[i] == p1[i]); - VLOG(2) << "output " << input_idx << " element " << i << ": " << p0[i]; - if (!ok) { - if (std::is_same::value || std::is_same::value) { - float tolerance = - std::max(atol, std::max(fabs(rtol * p0[i]), fabs(rtol * p1[i]))); - T diff = p0[i] - p1[i]; - if (diff < 0) diff = 0 - diff; - ok = (diff <= tolerance); - } - if (ok) continue; - LOG(ERROR) << "Op " << name() << " fails equality at output " - << input_idx << " type " << DataTypeString(dtype) - << " element " << i << ": std_val=" << p0[i] - << " test_val=" << p1[i] << " diff=" << (p0[i] - p1[i]); - if (++failed > 10) break; - } - } - return failed; - } - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "Compute " << name(); - const int num_pairs = ctx->num_inputs() / 2; - for (int i = 0; i < num_pairs; ++i) { - CHECK_EQ(ctx->input_dtype(i), ctx->input_dtype(i + num_pairs)); - Tensor t0 = ctx->input(i); - Tensor t1 = ctx->input(i + num_pairs); - int64 num_elts = t0.NumElements(); - CHECK_EQ(num_elts, t1.NumElements()); - - // Compare inputs elementwise for near-exact equality. 
- const char* v0 = t0.tensor_data().data(); - const char* v1 = t1.tensor_data().data(); - int failed = 0; - switch (ctx->input_dtype(i)) { - case DT_INT32: - failed = - CompareTensors(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_INT64: - failed = - CompareTensors(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_FLOAT: - failed = - CompareTensors(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_DOUBLE: - failed = - CompareTensors(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - case DT_BOOL: - failed = - CompareTensors(ctx->input_dtype(i), v0, v1, num_elts, i); - break; - default: - LOG(FATAL) << "unimpl: " << ctx->input_dtype(i); - } - if (failed > 0) { - LOG(ERROR) << "check failed for " << name() << " output " << i - << " num_elts: " << num_elts; - legacy_flags::ParallelCheckOpFlags* flags = - legacy_flags::GetParallelCheckOpFlags(); - if (flags->parallel_check_failfast) { - LOG(QFATAL) << "failfast on first parallel-check failure"; - } - } else { - VLOG(1) << "check passed for " << name() << " output " << i - << " num_elts: " << num_elts; - } - - // Propagate the std value. - if (IsRefType(ctx->input_dtype(i))) { - ctx->forward_ref_input_to_ref_output(i, i); - } else { - ctx->set_output(i, ctx->input(i)); - } - } - } - - TF_DISALLOW_COPY_AND_ASSIGN(ParallelCheckOp); -}; - -REGISTER_KERNEL_BUILDER(Name("ParallelCheck").Device(DEVICE_CPU), - ParallelCheckOp); - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index 27287e0f9637929b2e04c6a76de19c2785ec357e..b6f2f632f7155234c87a0ea16fdc1910a09ed139 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -16,9 +16,9 @@ limitations under the License. 
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h" #include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -51,19 +51,22 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, if (device_type_ == DeviceType(DEVICE_CPU)) { platform_id_ = se::host::kHostPlatformId; } else if (device_type_ == DeviceType(DEVICE_GPU)) { - platform_id_ = se::cuda::kCudaPlatformId; - } else { - platform_id_ = nullptr; + platform_id_ = ctx->device() + ->tensorflow_gpu_device_info() + ->stream->parent() + ->platform() + ->id(); + } else if (XlaDevice::GetMetadata(ctx, &xla_device_metadata_).ok()) { + use_multiple_streams_ = xla_device_metadata_->UseMultipleStreams(); + platform_id_ = xla_device_metadata_->platform()->id(); } } Status XlaLocalLaunchBase::BuildCompilationCache(OpKernelContext* ctx, XlaCompilationCache** cache) { - const XlaDevice::Metadata* metadata; - Status s = XlaDevice::GetMetadata(ctx, &metadata); - if (s.ok()) { - *cache = new XlaCompilationCache(metadata->client(), - metadata->jit_device_type()); + if (xla_device_metadata_) { + *cache = new XlaCompilationCache(xla_device_metadata_->client(), + xla_device_metadata_->jit_device_type()); return Status::OK(); } @@ -112,17 +115,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { // this is more obviously correct.) core::ScopedUnref cache_ref(cache); - const XlaDevice::Metadata* metadata = nullptr; - Status s = XlaDevice::GetMetadata(ctx, &metadata); - bool allocate_xla_tensors = s.ok(); - - // Get the platform_id_ for XLA_* devices. 
- if (platform_id_ == nullptr) { - if (s.ok()) { - platform_id_ = metadata->platform()->id(); - } - } - std::map variables = SnapshotResourceVariables(ctx, resources_); @@ -140,7 +132,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { // (which local_xla_allocator above uses) as on an XlaDevice, this is a // dummy allocator that returns XlaTensor objects. The XlaCompiler needs a // real allocator to allocate real buffers. - if (allocate_xla_tensors) { + if (xla_device_metadata_) { xla_allocator = client->backend().memory_allocator(); } else { xla_allocator = &local_xla_allocator; @@ -148,13 +140,18 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { XlaCompiler::Options options; options.client = client; - options.device_type = &cache->device_type(); + if (ctx->op_device_context() != nullptr) { + options.device_ordinal = + ctx->op_device_context()->stream()->parent()->device_ordinal(); + } + options.device_type = cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId); options.device_allocator = xla_allocator; - if (metadata) { - options.shape_representation_fn = metadata->shape_representation_fn(); + if (xla_device_metadata_) { + options.shape_representation_fn = + xla_device_metadata_->shape_representation_fn(); } const XlaCompiler::CompilationResult* kernel; @@ -166,14 +163,25 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { } XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; + // If we resolve constants we never emit them on the device, meaning that if + // they are needed by a following computation the host has to transfer + // them. Not resolving constants is expected to be faster than resolving + // constants. 
+ compile_options.resolve_compile_time_constants = true; + // Optimization: where possible, have the computation return a naked array + // rather than a one-element tuple. + compile_options.always_return_tuple = false; + OP_REQUIRES_OK( ctx, cache->Compile(options, function_, constant_args, variables, ctx, - &kernel, &executable, &compile_options)); + &kernel, &executable, compile_options)); VLOG(1) << "Executing XLA Computation..."; - XlaComputationLaunchContext launch_context(client, xla_allocator, - allocate_xla_tensors); + XlaComputationLaunchContext launch_context( + client, xla_allocator, + /*allocate_xla_tensors=*/xla_device_metadata_ != nullptr, + use_multiple_streams_); launch_context.PopulateInputs(ctx, kernel, variables); // Execute the computation. @@ -182,7 +190,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { run_options.set_stream(stream); run_options.set_allocator(xla_allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); - run_options.set_rng_seed(ctx->step_id()); + run_options.set_rng_seed(GetXLARandomSeed()); Env* env = Env::Default(); auto start_time = env->NowMicros(); @@ -192,7 +200,8 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - launch_context.PopulateOutputs(ctx, kernel, run_result.ConsumeValueOrDie()); + OP_REQUIRES_OK(ctx, launch_context.PopulateOutputs( + ctx, kernel, run_result.ConsumeValueOrDie())); VLOG(1) << "Done"; } diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h index 8dfc4b382d51151b6383fe7dd75429f3124d39be..e0f10e981737ad60e2b785a235dcb7fe7d21a053 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.h +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ -#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ +#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_ +#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_ #include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -58,7 +59,9 @@ class XlaLocalLaunchBase : public OpKernel { DeviceType device_type_; NameAttrList function_; - se::Platform::Id platform_id_; + se::Platform::Id platform_id_ = nullptr; + bool use_multiple_streams_ = false; + const XlaDevice::Metadata* xla_device_metadata_ = nullptr; }; // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph @@ -81,4 +84,4 @@ class XlaLocalLaunchOp : public XlaLocalLaunchBase { } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_ +#endif // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_ diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD index 5d211f4d733d8d807426e62dd116092799184f35..5b6692f523658749f7ef48f9d7d89e97d4ce8b09 100644 --- a/tensorflow/compiler/jit/legacy_flags/BUILD +++ b/tensorflow/compiler/jit/legacy_flags/BUILD @@ -16,18 +16,6 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//tensorflow:internal"]) -cc_library( - name = "encapsulate_subgraphs_pass_flags", - srcs = ["encapsulate_subgraphs_pass_flags.cc"], - hdrs = ["encapsulate_subgraphs_pass_flags.h"], - deps = - [ - "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - ], -) - cc_library( name = "mark_for_compilation_pass_flags", srcs = ["mark_for_compilation_pass_flags.cc"], diff --git 
a/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.cc deleted file mode 100644 index 856475f12c8a411cd80c1c1859323304ca4029e0..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Legacy flags for the XLA bridge's encapsulate_subgraphs_pass module. - -#include -#include - -#include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" -#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Pointers to the parsed value of the flags and flag descriptors, initialized -// via flags_init. -static EncapsulateSubgraphsPassFlags* flags; -static std::vector* flag_list; -static std::once_flag flags_init; - -// Allocate *flags. Called via call_once(&flags_init,...). -static void AllocateFlags() { - flags = new EncapsulateSubgraphsPassFlags; - flags->tf_xla_parallel_checking = false; - flag_list = new std::vector({ - Flag("tf_xla_parallel_checking", &flags->tf_xla_parallel_checking, - "Debug tool. 
Runs both JIT-compiled and interpreted graphs in " - "parallel and verifies they produce the same outputs."), - }); - xla::legacy_flags::ParseFlagsFromEnv(*flag_list); -} - -// Append to *append_to flag definitions associated with the XLA bridge's -// encapsulate_subgraphs_pass module. -void AppendEncapsulateSubgraphsPassFlags(std::vector* append_to) { - std::call_once(flags_init, &AllocateFlags); - append_to->insert(append_to->end(), flag_list->begin(), flag_list->end()); -} - -// Return a pointer to the EncapsulateSubgraphsPassFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -EncapsulateSubgraphsPassFlags* GetEncapsulateSubgraphsPassFlags() { - std::call_once(flags_init, &AllocateFlags); - return flags; -} - -} // namespace legacy_flags -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h deleted file mode 100644 index d371bd269dbdfbf737d81490fb877fcf88661a8f..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_ENCAPSULATE_SUBGRAPHS_PASS_FLAGS_H_ -#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_ENCAPSULATE_SUBGRAPHS_PASS_FLAGS_H_ - -// Legacy flags for the XLA bridge's encapsulate_subgraphs_pass module. - -#include - -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace tensorflow { -namespace legacy_flags { - -// Append to *flag_list flag definitions associated with the XLA bridge's -// encapsulate_subgraphs_pass module. -void AppendEncapsulateSubgraphsPassFlags( - std::vector* flag_list); - -// The values of flags associated with the XLA bridge's -// encapsulate_subgraphs_pass module. -typedef struct { - bool tf_xla_parallel_checking; // Debug tool. Runs both JIT-compiled and - // interpreted graphs in parallel and verifies - // they produce the same outputs. -} EncapsulateSubgraphsPassFlags; - -// Return a pointer to the EncapsulateSubgraphsPassFlags struct; -// repeated calls return the same pointer. -// This should be called only after Flags::Parse() has returned. -EncapsulateSubgraphsPassFlags* GetEncapsulateSubgraphsPassFlags(); - -} // namespace legacy_flags -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_ENCAPSULATE_SUBGRAPHS_PASS_FLAGS_H_ diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 8e2ee0f1d71bc17b4c12c792c38002af4f9eb5eb..4e4abade3278089a1c7f8fdee46a34b8ce503651 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -21,12 +21,17 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/jit/deadness_analysis.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" #include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/memory_types.h" @@ -36,14 +41,14 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/public/version.h" namespace tensorflow { -const char* const kXlaClusterAttr = "_XlaCluster"; -const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation"; - namespace { bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { @@ -60,54 +65,88 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { return false; } } + + // XLA does not offer guaranteed aliasing between the input and output of the + // XLA cluster so it can't implement the forward-tensor-ref semantic. Leave + // such nodes out of XLA clusters. 
+ if (HasForwardedRefInput(node)) { + VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast."; + return false; + } + return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok(); } +bool HasResourceOutput(const Node& node) { + return std::find(node.output_types().begin(), node.output_types().end(), + DT_RESOURCE) != node.output_types().end(); +} + +bool HasResourceInput(const Node& node) { + return std::find(node.input_types().begin(), node.input_types().end(), + DT_RESOURCE) != node.input_types().end(); +} + +// Returns true if `node` is a resource operation recognized by tf2xla that +// operates on something other than resource variables. +bool IsNonResourceVarResourceOp(const Node& node) { + // TODO(b/112837194): We can't cluster these because we only support + // snapshotting resource variables (and we can't e.g. snapshot stacks). This + // limitation may be fixable with some work. + const XlaResourceOpInfo* op_info = GetResourceOpInfoForOp(node.type_string()); + return op_info && op_info->resource_kind() != XlaResourceKind::kVariable; +} + // Make sure we don't recurse infinitely on recursive functions. const int kMaxRecursionDepth = 10; bool IsCompilableCall(const NodeDef& call_def, - const DeviceType& jit_device_type, int depth, + const DeviceType& jit_device_type, + bool allow_resource_ops, int depth, FunctionLibraryRuntime* lib_runtime); // Tests whether 'while_node' is a completely compilable loop. // Every operator in the condition and body functions must be compilable for a // while loop to be compilable. 
bool IsCompilableWhile(const Node& while_node, - const DeviceType& jit_device_type, int depth, + const DeviceType& jit_device_type, + bool allow_resource_ops, int depth, FunctionLibraryRuntime* lib_runtime) { - VLOG(2) << "Loop marking: " << while_node.type_string(); - const NameAttrList* name_attr; NodeDef call; Status status; status = GetNodeAttr(while_node.attrs(), "cond", &name_attr); if (!status.ok()) { - VLOG(2) << "Missing 'cond' attribute on While node."; + VLOG(2) << "Rejecting While " << while_node.name() + << ": missing 'cond' attribute on While node."; return false; } const string cond_func = name_attr->name(); call.set_name("while_cond"); call.set_op(cond_func); *call.mutable_attr() = name_attr->attr(); - if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) { - VLOG(2) << "Can't compile loop condition: " << cond_func; + if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1, + lib_runtime)) { + VLOG(2) << "Rejecting While " << while_node.name() + << ": can't compile loop condition: " << cond_func; return false; } status = GetNodeAttr(while_node.attrs(), "body", &name_attr); if (!status.ok()) { - VLOG(2) << "Missing 'body' attribute on While node."; + VLOG(2) << "Rejecting While " << while_node.name() + << ": missing 'body' attribute on While node."; return false; } const string body_func = name_attr->name(); call.set_name("while_body"); call.set_op(body_func); *call.mutable_attr() = name_attr->attr(); - if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) { - VLOG(2) << "Can't compile loop body: " << body_func; + if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1, + lib_runtime)) { + VLOG(2) << "Rejecting While " << while_node.name() + << ": can't compile loop body: " << body_func; return false; } - VLOG(2) << "Loop is compilable."; return true; } @@ -115,12 +154,12 @@ bool IsCompilableWhile(const Node& while_node, // Every operator in the function must be compilable for a 
function to be // compilable. bool IsCompilableCall(const NodeDef& call_def, - const DeviceType& jit_device_type, int depth, + const DeviceType& jit_device_type, + bool allow_resource_ops, int depth, FunctionLibraryRuntime* lib_runtime) { - VLOG(2) << "Function marking: " << call_def.op(); - if (depth > kMaxRecursionDepth) { - VLOG(2) << "Function depth limit exceeded"; + VLOG(2) << "Rejecting " << call_def.op() + << ": function depth limit exceeded."; return false; } @@ -128,9 +167,14 @@ bool IsCompilableCall(const NodeDef& call_def, Status status = lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle); if (!status.ok()) { - VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status; + VLOG(2) << "Rejecting " << call_def.op() + << ": could not instantiate: " << status; return false; } + + auto release_handle_on_return = gtl::MakeCleanup( + [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); }); + const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); CHECK(fbody); const FunctionDef& fdef = fbody->fdef; @@ -142,7 +186,8 @@ bool IsCompilableCall(const NodeDef& call_def, // tf2xla to translate the TF graph into XLA. So we avoid this for now. // // TODO(b/36139787): Create a mechanism to set inlining hints. - VLOG(2) << "Can't compile noinline function: " << fdef.DebugString(); + VLOG(2) << "Rejecting " << call_def.op() + << ": can't compile noinline function."; return false; } @@ -150,51 +195,30 @@ bool IsCompilableCall(const NodeDef& call_def, if (node->type_string() == "_Arg" || node->type_string() == "_Retval") continue; if (node->type_string() == "While") { - // Handle functional While loop (not in open source build). - return IsCompilableWhile(*node, jit_device_type, depth + 1, lib_runtime); + // Handle functional While loop. 
+ return IsCompilableWhile(*node, jit_device_type, allow_resource_ops, + depth + 1, lib_runtime); + } + if (!allow_resource_ops && + (HasResourceInput(*node) || HasResourceOutput(*node))) { + return false; } if (!HasXLAKernel(*node, jit_device_type) && - !IsCompilableCall(node->def(), jit_device_type, depth + 1, - lib_runtime)) { - VLOG(2) << "Function marking failed: unsupported op " << node->name() - << ": " << node->def().ShortDebugString(); + !IsCompilableCall(node->def(), jit_device_type, allow_resource_ops, + depth + 1, lib_runtime)) { + VLOG(2) << "Rejecting " << call_def.op() << ": unsupported op " + << node->name() << ": " << node->def().ShortDebugString(); return false; } } - VLOG(2) << "Function is compilable: " << call_def.op(); return true; } -// Returns the DeviceType corresponding to 'device'. -Status DeviceTypeOfDevice(const string& device, DeviceType* device_type) { - DeviceNameUtils::ParsedName parsed; - if (!DeviceNameUtils::ParseFullName(device, &parsed)) { - return errors::Internal("Malformed assigned device '", device, "'"); - } - *device_type = DeviceType(parsed.type); - return Status::OK(); -} - -// Tests whether `node` has a DT_RESOURCE typed input or output. -bool HasResourceInputOrOutput(const Node& node) { - return std::find(node.input_types().begin(), node.input_types().end(), - DT_RESOURCE) != node.input_types().end() || - std::find(node.output_types().begin(), node.output_types().end(), - DT_RESOURCE) != node.output_types().end(); -} - -struct NodeCompare { - bool operator()(const Node* a, const Node* b) const { - return a->id() < b->id(); - } -}; -using OrderedNodeSet = std::set; - // Returns true if the op can be decomposed into XLA ops for which // there are fusable elemental implementations. // -// TODO(hpucha): Consider a black list instead of a white list as -// implemented below. +// TODO(hpucha): Remove this code since this functionality is subsumed by +// Grappler XlaFusionOptimizer. 
bool IsXlaFusable(const NodeDef& node) { static const std::unordered_set* elementwise_ops = new std::unordered_set( @@ -352,6 +376,10 @@ Status FindCompilationCandidates( flib_def, opts)); FunctionLibraryRuntime* lib_runtime = pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); + std::vector compile_time_const_nodes(graph.num_node_ids(), false); + TF_RETURN_IF_ERROR( + BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr, + &compile_time_const_nodes)); int64& fuel = legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel; @@ -364,57 +392,87 @@ Status FindCompilationCandidates( for (Node* node : graph.op_nodes()) { sorted_nodes.push_back(node); } - std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeCompare()); + std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeComparatorID()); + + if (fuel >= std::numeric_limits::max() / 2) { + // The assumption is that if fuel started out as INT64_MAX, it will forever + // stay greater than INT64_MAX / 2. + VLOG(2) << "Starting fuel: infinity"; + } else { + VLOG(2) << "Starting fuel: " << fuel; + } for (Node* node : sorted_nodes) { - VLOG(2) << "Fuel: " << fuel; if (fuel <= 0) { - VLOG(2) + VLOG(1) << "Hit fuel limit; not marking any remaining ops as clusterable."; break; } - VLOG(2) << "FindCompilationCandidates(): Processing " - << node->DebugString(); - DeviceType device_type(""); TF_RETURN_IF_ERROR( - DeviceTypeOfDevice(node->assigned_device_name(), &device_type)); + DeviceToDeviceType(node->assigned_device_name(), &device_type)); - if (is_compilable_fn && !is_compilable_fn(node, device_type)) continue; + if (is_compilable_fn && !is_compilable_fn(node, device_type)) { + // is_compilable_fn has already logged the reason if it returned false. 
+ continue; + } const XlaOpRegistry::DeviceRegistration* registration; CHECK( XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)); DeviceType jit_device_type(registration->compilation_device_name); if (!HasXLAKernel(*node, jit_device_type) && - !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime)) { - VLOG(2) << "Compilation rejected node: unsupported op " << node->name() - << ": " << node->type_string(); + !IsCompilableCall(node->def(), jit_device_type, + registration->compile_resource_ops, 0, lib_runtime)) { + VLOG(2) << "Rejecting " << node->name() << ": unsupported op " + << node->type_string(); continue; } if (!registration->compile_resource_ops && - HasResourceInputOrOutput(*node)) { - VLOG(2) << "Compilation rejected node: resource input/output " - << node->name() << ": " << node->type_string(); + (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) { + // We don't have a way of returning values of type DT_RESOURCE from XLA + // computations so we avoid auto-clustering nodes producing DT_RESOURCE. + // XlaLaunchOp also cannot snapshot resources that are not resource + // variables so we avoid clustering resource operations that operate on + // non-resource variables. + VLOG(2) << "Rejecting: " << node->name() << ": resource output " + << node->type_string(); continue; } + if (compile_time_const_nodes[node->id()] && + !registration->requires_compilation) { + const OpDef* op_def; + TF_RETURN_IF_ERROR( + OpRegistry::Global()->LookUpOpDef(node->type_string(), &op_def)); + if (op_def->is_stateful()) { + // We need to be able to constant fold the nodes in + // compile_time_const_nodes given constant inputs (required by XLA) and + // therefore can't auto-cluster stateful ops since these can never be + // constant folded.
+ VLOG(2) << "Rejecting " << node->name() + << ": must-be-constant stateful op"; + continue; + } + } + // We don't auto-cluster functional control flow nodes containing resource + // operations because safety checks are trickier in this case. + // registration->compile_resource_ops is true for XLA_CPU/XLA_GPU but not + // for CPU/GPU. if (node->type_string() == "While" && - !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) { + !IsCompilableWhile(*node, jit_device_type, + registration->compile_resource_ops, 0, + lib_runtime)) { continue; } // _Arg nodes in a top-level function represent feeds. // Do not compile them. if (node->type_string() == "_Arg") { - VLOG(2) << "Skipping jit compilation for '_Arg'-typed node " - << node->DebugString(); continue; } // _Retval nodes in a top-level function represent fetches. // Do not compile them. if (node->type_string() == "_Retval") { - VLOG(2) << "Compilation rejected node: return value " << node->name() - << ": " << node->type_string(); continue; } candidates->insert(node); @@ -424,52 +482,37 @@ Status FindCompilationCandidates( return Status::OK(); } +// Determine the global jit level which is ON if either the +// GraphOptimizationPassOptions has the jit ON, or if the --tf_xla_auto_jit flag +// is true. +OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( + const GraphOptimizationPassOptions& options) { + OptimizerOptions::GlobalJitLevel global_jit_level = + options.session_options->config.graph_options() + .optimizer_options() + .global_jit_level(); + if (global_jit_level == OptimizerOptions::DEFAULT) { + // To set compilation to be on by default, change the following line. 
+ global_jit_level = OptimizerOptions::OFF; + } + legacy_flags::MarkForCompilationPassFlags* flags = + legacy_flags::GetMarkForCompilationPassFlags(); + if (flags->tf_xla_auto_jit == -1 || + (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) { + // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides + // the setting in ConfigProto. + global_jit_level = + static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit); + } + return global_jit_level; +} + struct Cluster { // Identifies the node that represents this cluster in the cycle detection // graph. int representative = -1; }; -// Returns a string describing how an edge from src to dst would -// create a cycle. -string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src, - int dst) { - int32 max_path_size = graph.num_node_ids() + 1; - std::vector<int32> path(max_path_size); - int32 path_size = cycles.FindPath(dst, src, max_path_size, path.data()); - if (path_size == 0) { - return ""; - } - - auto node_name = [&cycles, &graph](int node_id) { - if (!FastBoundsCheck(node_id, graph.num_node_ids())) { - return string("(null)"); - } - auto* node = graph.FindNodeId(node_id); - if (node == nullptr) { - return string("(null)"); - } - return node->name(); - }; - - string description; - strings::StrAppend(&description, "Edge from ", node_name(src), " to ", - node_name(dst), " would create a cycle.\n"); - path.resize(path_size); - for (int32 node_id : path) { - string ascii_art; - if (node_id == dst) { - ascii_art = "+-> "; - } else if (node_id != src) { - ascii_art = "| "; - } else { - ascii_art = "+-- "; - } - strings::StrAppend(&description, ascii_art, node_name(node_id), "\n"); - } - return description; -} - } // anonymous namespace bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { @@ -478,7 +521,11 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), &registration)); DeviceType
jit_device_type(registration->compilation_device_name); - return IsCompilableCall(ndef, jit_device_type, 0, flr); + + // We can always *compile* resource operations, even if we are sometimes + // unable to auto-cluster them. + const bool compile_resource_ops = true; + return IsCompilableCall(ndef, jit_device_type, compile_resource_ops, 0, flr); } Status MarkForCompilationPass::Run( @@ -486,54 +533,67 @@ Status MarkForCompilationPass::Run( // TODO(phawkins): precompute the "GetCompilationDevice" properties of each // device ahead of time. OptimizerOptions::GlobalJitLevel global_jit_level = - options.session_options->config.graph_options() - .optimizer_options() - .global_jit_level(); - if (global_jit_level == OptimizerOptions::DEFAULT) { - // To set compilation to be on by default, change the following line. - global_jit_level = OptimizerOptions::OFF; - } + GetGlobalJitLevel(options); legacy_flags::MarkForCompilationPassFlags* flags = legacy_flags::GetMarkForCompilationPassFlags(); - if (flags->tf_xla_auto_jit == -1 || - (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) { - // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides - // the setting in ConfigProto. 
- global_jit_level = - static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit); - } bool cpu_global_jit = flags->tf_xla_cpu_global_jit; bool fusion_only = flags->tf_xla_fusion_only; VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; + VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit; const FunctionLibraryDefinition* fld = options.flib_def; - auto is_compilable = [global_jit_level, cpu_global_jit, fusion_only, fld]( - const Node* node, const DeviceType& device_type) { + std::unique_ptr<DeadnessAnalysis> deadness; + { + XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1); + TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness)); + } + + auto is_compilable = [&](const Node* node, const DeviceType& device_type) { const XlaOpRegistry::DeviceRegistration* registration; if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) { + VLOG(2) << "Rejecting " << node->name() << ": could not find JIT device."; return false; } - // Don't compile control trigger nodes. We won't preserve their deadness - // semantics correctly, so it's safest not to compile them. - if (node->IsControlTrigger()) return false; - // If this device requires a JIT, we must say yes. if (registration->requires_compilation) return true; // If there is a _XlaCompile annotation, use its value.
bool compile = false; Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); - if (status.ok()) return compile; + if (status.ok()) { + if (!compile) { + VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr(" + << kXlaCompileAttr << ") is false."; + } + return compile; + } status = fld->GetAttr(*node, kXlaCompileAttr, &compile); - if (status.ok()) return compile; + if (status.ok()) { + if (!compile) { + VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr(" + << kXlaCompileAttr << ") on callee is false."; + } + return compile; + } + + // If inputs to `node` can have conflicting deadness (i.e. some are alive + // and some are dead) then don't compile it. XLA cannot represent the + // deadness semantics of these nodes correctly and auto-clustering these + // nodes can cause deadness to propagate to nodes that should be live. + if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) { + VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness."; + return false; + } // Check for fusable ops only if requested. 
if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) { + VLOG(2) << "Rejecting " << node->name() + << ": not fusable op but fusion_only enabled."; return false; } @@ -541,12 +601,151 @@ Status MarkForCompilationPass::Run( // Ignore enable_jit_by_default if global jit compilation for CPU // is explicitly requested via tf_xla_cpu_global_jit flag bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU; - return (ignore_registration || registration->enable_jit_by_default) && - global_jit_level > 0; + bool should_compile = + (ignore_registration || registration->enable_jit_by_default) && + global_jit_level != OptimizerOptions::OFF; + if (!should_compile) { + if (global_jit_level == OptimizerOptions::OFF) { + VLOG(2) << "Rejecting " << node->name() << ": global jit disabled."; + } else { + VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled."; + } + } + return should_compile; }; return RunImpl(options, is_compilable); } +static string RatioToString(int numerator, int denominator) { + return strings::Printf("%d / %d (%.2f%%)", numerator, denominator, + (100.0 * numerator) / denominator); +} + +static void VLogClusteringSummary(const Graph& g) { + if (!VLOG_IS_ON(2)) { + return; + } + + std::map cluster_name_to_size; + std::map> + cluster_name_to_op_histogram; + std::map unclustered_op_histogram; + int clustered_node_count = 0; + + for (Node* n : g.nodes()) { + absl::optional cluster_name = GetXlaClusterForNode(*n); + if (cluster_name) { + clustered_node_count++; + cluster_name_to_size[*cluster_name]++; + cluster_name_to_op_histogram[*cluster_name][n->type_string()]++; + } else { + unclustered_op_histogram[n->type_string()]++; + } + } + + int unclustered_node_count = g.num_nodes() - clustered_node_count; + + VLOG(2) << "*** Clustering info for graph of size " << g.num_nodes(); + VLOG(2) << " Built " << cluster_name_to_size.size() << " clusters, size " + << RatioToString(clustered_node_count, g.num_nodes()); + + for (const 
auto& cluster_name_size_pair : cluster_name_to_size) { + StringPiece cluster_name = cluster_name_size_pair.first; + int size = cluster_name_size_pair.second; + VLOG(2) << " " << cluster_name << " " + << RatioToString(size, g.num_nodes()); + for (const auto& op_count_pair : + cluster_name_to_op_histogram[cluster_name]) { + VLOG(3) << " " << op_count_pair.first << ": " << op_count_pair.second + << " instances"; + } + } + + if (!unclustered_op_histogram.empty()) { + VLOG(2) << " Unclustered nodes: " + << RatioToString(unclustered_node_count, g.num_nodes()); + for (const auto& pair : unclustered_op_histogram) { + VLOG(3) << " " << pair.first << ": " << pair.second << " instances"; + } + } + + struct EdgeInfo { + StringPiece node_name; + absl::optional cluster_name; + + StringPiece GetClusterName() const { + return cluster_name ? *cluster_name : "[none]"; + } + + std::pair> AsPair() const { + return {node_name, cluster_name}; + } + + bool operator<(const EdgeInfo& other) const { + return AsPair() < other.AsPair(); + } + }; + + using EdgeInfoMap = std::map>; + + EdgeInfoMap incoming_edge_infos; + EdgeInfoMap outgoing_edge_infos; + + std::set cluster_names_to_print; + + for (const Edge* e : g.edges()) { + const Node* from = e->src(); + absl::optional from_cluster_name = GetXlaClusterForNode(*from); + + const Node* to = e->dst(); + absl::optional to_cluster_name = GetXlaClusterForNode(*to); + + if (to_cluster_name == from_cluster_name) { + continue; + } + + if (to_cluster_name) { + incoming_edge_infos[*to_cluster_name] + [EdgeInfo{from->name(), from_cluster_name}]++; + cluster_names_to_print.insert(*to_cluster_name); + } + + if (from_cluster_name) { + outgoing_edge_infos[*from_cluster_name][{to->name(), to_cluster_name}]++; + cluster_names_to_print.insert(*from_cluster_name); + } + } + + VLOG(2) << "*** Inter-Cluster edges:"; + if (cluster_names_to_print.empty()) { + VLOG(2) << " [none]"; + } + + auto print_edge_info_set_for_cluster = [&](StringPiece cluster_name, + const 
EdgeInfoMap& edge_info_map, + StringPiece desc) { + auto it = edge_info_map.find(cluster_name); + if (it != edge_info_map.end()) { + VLOG(2) << " " << it->second.size() << " " << desc << " edges"; + for (const auto& edge_info_count_pair : it->second) { + VLOG(2) << " " << edge_info_count_pair.first.GetClusterName() << " " + << edge_info_count_pair.first.node_name << " # " + << edge_info_count_pair.second; + } + } else { + VLOG(2) << " No " << desc << " edges."; + } + }; + + for (StringPiece cluster_name : cluster_names_to_print) { + VLOG(2) << " ** Cluster " << cluster_name; + print_edge_info_set_for_cluster(cluster_name, incoming_edge_infos, + "incoming"); + print_edge_info_set_for_cluster(cluster_name, outgoing_edge_infos, + "outgoing"); + } +} + // Is 'node' an operator that consumes only the shape of its input, not the // data itself? static bool IsShapeConsumerOp(const Node& node) { @@ -554,6 +753,43 @@ static bool IsShapeConsumerOp(const Node& node) { node.type_string() == "Size"; } +static Status IgnoreResourceOpForSafetyAnalysis(const Node& n, bool* ignore) { + // If a resource operation is assigned to XLA_CPU or XLA_GPU explicitly then + // ignore it during resource operation safety analysis. We need this hack + // because of two reasons: + // + // 1. Operations assigned to XLA_CPU and XLA_GPU have to always be compiled. + // 2. We don't support live-out values of type DT_RESOURCE and live-in values + // of type DT_RESOURCE that are not resource variables. + // + // Together these imply we cannot let resource variable safety analysis + // constrain e.g. a TensorArrayV3->TensorArrayAssignV3 edge to be in different + // clusters: both of them will have to be clustered because of (1) and we + // won't be able to keep the edge between the two as neither the input to the + // second XLA cluster nor the output from the first XLA cluster are supported + // because of (2). 
+ // + // TODO(b/113100872): This can be fixed if the TensorFlow representation for + // TensorArray and Stack on the XLA_{C|G}PU devices were the same in XLA; then + // (2) would no longer hold. + + if (n.assigned_device_name().empty()) { + *ignore = false; + return Status::OK(); + } + DeviceType device_type(""); + TF_RETURN_IF_ERROR( + DeviceToDeviceType(n.assigned_device_name(), &device_type)); + + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) { + *ignore = true; + } else { + *ignore = registration->compile_resource_ops; + } + return Status::OK(); +} + // Sequence number generator to ensure clusters have unique names. static std::atomic cluster_sequence_num; @@ -575,84 +811,15 @@ Status MarkForCompilationPass::RunImpl( : Env::Default(), is_compilable_fn, &compilation_candidates)); - GraphCycles cycles; - for (int i = 0; i < graph->num_node_ids(); ++i) { - // We rely on the node IDs in the cycle detection graph being consecutive - // integers starting from 0. - CHECK_EQ(i, cycles.NewNode()); + if (compilation_candidates.empty()) { + VLOG(2) << "No compilable candidates"; + return Status::OK(); } - // Compute the loop structure of the graph. - std::vector control_flow_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info)); - - // The clustering code must avoid adding cycles to the graph to prevent - // deadlock. However, the graph may contain loops, which would trigger the - // cycle detection code. To handle loops, we alter the structure of the cycle - // detection graph, disconnecting each loop from the enclosing graph. - // Specifically, we: - // * add a new "frame" node for each loop. - // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges - // to/from the corresponding frame node. In essence, we collapse the loop - // into a single node for the purpose of cycle detection in the enclosing - // graph. 
- // * the body of the loop should now be disconnected from the rest of the - // graph; we make it acyclic by breaking loop backedges (edges outgoing from - // "NextIteration" nodes. - - // Map from frame name strings to node IDs in the cycle detection graph. - std::unordered_map frame_nodes; - - // Get the cycle graph node ID for frame 'frame_name', or add one if none - // exists. - auto GetOrAddFrameNodeId = [&frame_nodes, &cycles](const string& frame_name) { - int& frame_id = frame_nodes.emplace(frame_name, -1).first->second; - if (frame_id < 0) { - // The emplace succeeded; we have not allocated a frame node yet. - frame_id = cycles.NewNode(); - } - return frame_id; - }; - - for (Edge const* edge : graph->edges()) { - if (edge->dst()->IsEnter()) { - // Lift edges to an "Enter" node to the corresponding frame node. - const string& frame_name = - control_flow_info[edge->dst()->id()].frame_name; - int dst = GetOrAddFrameNodeId(frame_name); - if (!cycles.InsertEdge(edge->src()->id(), dst)) { - return errors::Internal( - "Cycle detected when adding enter->frame edge: ", - DescribeCycle(cycles, *graph, edge->src()->id(), dst)); - } - continue; - } - if (edge->src()->IsExit()) { - // Lift edges from an "Exit" node to the corresponding frame node. - const string& frame_name = - control_flow_info[edge->src()->id()].frame_name; - int src = GetOrAddFrameNodeId(frame_name); - if (!cycles.InsertEdge(src, edge->dst()->id())) { - return errors::Internal( - "Cycle detected when adding frame->exit edge: ", - DescribeCycle(cycles, *graph, src, edge->dst()->id())); - } - // Drop the original edge. - continue; - } - if (edge->src()->IsNextIteration()) { - // Break loop back-edges. - continue; - } - if (!cycles.InsertEdge(edge->src()->id(), edge->dst()->id())) { - // This should never happen. All cycles in the graph should contain - // a control flow operator. 
- return errors::Internal( - "Found cycle in graph without control flow operator during XLA " - "compilation: ", - DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id())); - } - } + GraphCycles cycles; + TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(graph, &cycles)); + TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps( + graph, options.flib_def, IgnoreResourceOpForSafetyAnalysis, &cycles)); // Each compilation candidate belongs to a cluster. The cluster's // representative @@ -665,11 +832,16 @@ Status MarkForCompilationPass::RunImpl( worklist.push_back(&clusters[node->id()]); } + OptimizerOptions::GlobalJitLevel global_jit_level = + GetGlobalJitLevel(options); legacy_flags::MarkForCompilationPassFlags* flags = legacy_flags::GetMarkForCompilationPassFlags(); // Repeatedly contract edges between clusters that are on the same device, // provided the contraction would not create a cycle. + // + // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for + // example, from the Grappler fusion pass). while (!worklist.empty()) { int from = worklist.front()->Get().representative; worklist.pop_front(); @@ -686,7 +858,7 @@ Status MarkForCompilationPass::RunImpl( string to_scope; for (int to : cycles.Successors(from)) { if (to >= graph->num_node_ids()) { - // Node is a "frame" node that is present only in the cycle detection + // Node is a fictitious node that is present only in the cycle detection // graph. No clustering is possible. continue; } @@ -701,13 +873,15 @@ Status MarkForCompilationPass::RunImpl( } // Look for an _XlaScope on both nodes. If both nodes have a // scope and the scopes do not match, do not cluster along this - // edge. If even one of the nodes lacks an _XlaScope attribute, + // edge. This restriction is overridden if the global_jit_level is ON. If + // even one of the nodes lacks an _XlaScope attribute, // then it is treated as a "bridge" and a cluster may be created // along it. 
We may want to restrict this behavior to require // all nodes marked with _XlaCompile=true to also have a // _XlaScope property set (and raise an error otherwise); but // for now we don't do this. - if (GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() && + if (global_jit_level == OptimizerOptions::OFF && + GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() && GetNodeAttr(node_to->attrs(), kXlaScopeAttr, &to_scope).ok() && from_scope != to_scope) { continue; @@ -778,7 +952,7 @@ Status MarkForCompilationPass::RunImpl( // compilation. DeviceType device_type(""); TF_RETURN_IF_ERROR( - DeviceTypeOfDevice(n->assigned_device_name(), &device_type)); + DeviceToDeviceType(n->assigned_device_name(), &device_type)); const XlaOpRegistry::DeviceRegistration* registration; XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration); @@ -803,6 +977,9 @@ Status MarkForCompilationPass::RunImpl( dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph, options.flib_def); } + + VLogClusteringSummary(*graph); + return Status::OK(); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index e9acbfb19e42cb43cb0b986c438a569de29b2ebc..f1137af3c1e8539fda318d88d2c5b5187953ccab 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -40,20 +40,18 @@ class MarkForCompilationPass : public GraphOptimizationPass { Status Run(const GraphOptimizationPassOptions& options) override; - // Run() just calls RunImpl() if --tf_xla_auto_jit is enabled. To run the pass - // unconditionally, call RunImpl() directly. - // is_compilable_fn, if set, is a predicate that must be true for a node to - // be compiled. 
+ private: Status RunImpl(const GraphOptimizationPassOptions& options, const std::function& is_compilable_fn = {}); + + friend class MarkForCompilationPassTestHelper; }; // Returns true iff 'ndef' is a call to a function that is compilable. A // function is compilable iff every operator in the function body is // compilable. bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 703d8825d74ced8d4d69c31ccd730adc89a8bffe..807ab51fd3c133b95915ea88e0bf99dbb8661452 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -13,23 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" +#include "absl/strings/match.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/graph_def_builder_util.h" 
#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -38,27 +41,6 @@ namespace { REGISTER_OP("UncompilableNullary").Output("o: float"); REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); -Status MarkForCompilation(std::unique_ptr* graph, - FunctionLibraryDefinition* flib_def) { - // Assign all nodes to the CPU device. - static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0"; - for (Node* n : (*graph)->nodes()) { - n->set_assigned_device_name(kCpuDevice); - } - - GraphOptimizationPassOptions opt_options; - opt_options.graph = graph; - opt_options.flib_def = flib_def; - MarkForCompilationPass pass; - return pass.RunImpl(opt_options); -} - -Status MarkForCompilation(std::unique_ptr* graph) { - FunctionDefLibrary flib; - FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib); - return MarkForCompilation(graph, &flib_def); -} - std::unordered_map GetClusters(const Graph& graph) { std::unordered_map ids; for (Node* node : graph.nodes()) { @@ -68,9 +50,35 @@ std::unordered_map GetClusters(const Graph& graph) { ids[node->name()] = cluster; } } + + if (VLOG_IS_ON(2)) { + VLOG(2) << "Clusters:"; + for (const auto& p : ids) { + VLOG(2) << " " << p.first << " -> " << p.second; + } + } return ids; } +gtl::FlatMap> GetClusterSets( + const Graph& g, std::vector* cluster_names = nullptr) { + CHECK(cluster_names == nullptr || cluster_names->empty()); + gtl::FlatMap> cluster_sets; + for (const auto& p : GetClusters(g)) { + cluster_sets[p.second].push_back(p.first); + } + for (auto& p : cluster_sets) { + if (cluster_names != nullptr) { + cluster_names->push_back(p.first); + } + std::sort(p.second.begin(), p.second.end()); + } + if (cluster_names != nullptr) { + std::sort(cluster_names->begin(), cluster_names->end()); + } + return cluster_sets; +} + TEST(XlaCompilationTest, Chains) { std::unique_ptr graph(new 
Graph(OpRegistry::Global())); GraphDef graphdef; @@ -87,7 +95,7 @@ TEST(XlaCompilationTest, Chains) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(4, clusters.size()); EXPECT_EQ(clusters["B"], clusters["C"]); @@ -112,7 +120,7 @@ TEST(XlaCompilationTest, UncompilableCycles) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -132,7 +140,7 @@ TEST(XlaCompilationTest, CompilableCycles) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(3, clusters.size()); @@ -155,7 +163,7 @@ TEST(XlaCompilationTest, Complex128Unsupported) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); } @@ -176,7 +184,7 @@ TEST(XlaCompilationTest, HalfSupported) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_FALSE(clusters.empty()); } @@ -205,7 +213,7 @@ TEST(XlaCompilationTest, ConcatWithConstArg) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(3, clusters.size()); // 
Everything should be compiled. } @@ -219,7 +227,7 @@ TEST(XlaCompilationTest, FunctionCalls) { {}, {{{"n_c"}, "UncompilableUnary", {"n_a"}}}); FunctionDef noinline = compilable; noinline.mutable_signature()->set_name("NoInlineFn"); - AddAttr("_noinline", bool(true), noinline.mutable_attr()); + AddAttr("_noinline", static_cast(true), noinline.mutable_attr()); FunctionDefLibrary flib; *flib.add_function() = compilable; @@ -240,7 +248,8 @@ TEST(XlaCompilationTest, FunctionCalls) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph, &flib_def)); + TF_ASSERT_OK( + MarkForCompilationPassTestHelper::MarkForCompilation(&graph, &flib_def)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -271,7 +280,7 @@ TEST(XlaCompilationTest, MetadataOpsDontStartClusters) { ops::UnaryOp("Shape", d, builder.opts().WithName("E")); TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(0, clusters.size()); // Nothing should be compiled. } @@ -358,7 +367,7 @@ TEST(XlaCompilationTest, SymbolicGradients) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -383,7 +392,7 @@ TEST(XlaCompilationTest, Loops) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // Nothing should be compiled. 
In particular, 'd' and 'c' must not be @@ -391,6 +400,44 @@ TEST(XlaCompilationTest, Loops) { EXPECT_EQ(0, clusters.size()); } +TEST(XlaCompilationTest, CyclesWithAllDifferentScopesGlobalJitOverridden) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + GraphDef graphdef; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor()) + .WithAttr(kXlaScopeAttr, "ScopeA")); + Node* b = ops::UnaryOp( + "Relu", a, + builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB")); + ops::BinaryOp( + "MatMul", a, b, + builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC")); + TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + FunctionDefLibrary flib; + FunctionLibraryDefinition flib_def(graph->op_registry(), flib); + SessionOptions session_options; + session_options.config.mutable_graph_options() + ->mutable_optimizer_options() + ->set_global_jit_level(OptimizerOptions::ON_2); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation( + &graph, &flib_def, &session_options)); + auto clusters = GetClusters(*graph); + + // The computation is: C = A + relu(A) + // where A sits in ScopeA, relu(A) sits in ScopeB, and C sits in ScopeC. + // In this case, the GlobalJitLevel overrides the scopes to cluster while + // ignoring scopes. 
+ EXPECT_EQ(3, clusters.size()); + EXPECT_EQ(clusters["A"], clusters["B"]); + EXPECT_EQ(clusters["A"], clusters["C"]); +} + TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) { std::unique_ptr graph(new Graph(OpRegistry::Global())); GraphDef graphdef; @@ -410,7 +457,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: C = A + relu(A) @@ -441,7 +488,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: D = relu(A) + (A @ relu(A)) @@ -471,7 +518,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: C = A @ relu(A) @@ -482,38 +529,104 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) { EXPECT_EQ(clusters["B"], clusters["C"]); } -REGISTER_OP("ResourceInput").Input("a: resource").Output("o: float"); -REGISTER_OP("ResourceOutput").Input("a: float").Output("o: resource"); - namespace { +Node* MakeRead(const Scope& scope, const string& id) { + Output var_handle = + ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); + Output read = + ops::ReadVariableOp(scope.WithOpName("Read" + id), var_handle, DT_FLOAT); + return read.node(); +} -class DummyOp : public XlaOpKernel { - using XlaOpKernel::XlaOpKernel; - void Compile(XlaOpKernelContext* ctx) override {} -}; - 
-REGISTER_XLA_OP(Name("ResourceInput"), DummyOp); -REGISTER_XLA_OP(Name("ResourceOutput"), DummyOp); +Node* MakeWrite(const Scope& scope, const string& id) { + Output var_handle = + ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); + Output value_to_write = + ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f); + ops::AssignVariableOp assign_op(scope.WithOpName("Assignment" + id), + var_handle, value_to_write); + return assign_op.operation.node(); +} +Node* MakeNeutral(const Scope& scope, const string& id) { + return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); +} } // namespace -TEST(XlaCompilationTest, Resources) { +TEST(XlaCompilationTest, ResourcesClusteringAllowed) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(read, write); + + FixupSourceAndSinkEdges(root.graph()); std::unique_ptr graph(new Graph(OpRegistry::Global())); - GraphDef graphdef; - { - GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); - Node* a = - ops::SourceOp("UncompilableNullary", builder.opts().WithName("A")); - Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); - // We should not form clusters with resource ops by default. - Node* c = ops::UnaryOp("ResourceOutput", b, builder.opts().WithName("C")); - Node* d = ops::UnaryOp("ResourceInput", c, builder.opts().WithName("D")); - ops::UnaryOp("Relu", d, builder.opts().WithName("E")); - TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); - } - TF_ASSERT_OK(MarkForCompilation(&graph)); - auto clusters = GetClusters(*graph); - EXPECT_EQ(0, clusters.size()); // Nothing should be compiled. 
+ TF_EXPECT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + gtl::FlatMap> cluster_sets = + GetClusterSets(*graph); + ASSERT_EQ(cluster_sets.size(), 1); + std::vector expected_clustered_nodes = {"AssignmentW", "ReadR", + "ValueToAssignW"}; + ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes); +} + +TEST(XlaCompilationTest, ResourcesClusteringDisallowed) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(write, read); + + FixupSourceAndSinkEdges(root.graph()); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_EXPECT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + gtl::FlatMap> cluster_sets = + GetClusterSets(*graph); + ASSERT_EQ(cluster_sets.size(), 1); + std::vector expected_clustered_nodes = {"AssignmentW", + "ValueToAssignW"}; + ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes); +} + +TEST(XlaCompilationTest, ChainOfOps) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* write_0 = MakeWrite(root, "W0"); + Node* neutral_0 = MakeNeutral(root, "N0"); + Node* read_0 = MakeRead(root, "R0"); + Node* write_1 = MakeWrite(root, "W1"); + Node* neutral_1 = MakeNeutral(root, "N1"); + Node* read_1 = MakeRead(root, "R1"); + + root.graph()->AddControlEdge(write_0, neutral_0); + root.graph()->AddControlEdge(neutral_0, read_0); + root.graph()->AddControlEdge(read_0, write_1); + root.graph()->AddControlEdge(write_1, neutral_1); + root.graph()->AddControlEdge(neutral_1, read_1); + + FixupSourceAndSinkEdges(root.graph()); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_EXPECT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::vector cluster_names; + gtl::FlatMap> cluster_sets = + GetClusterSets(*graph, 
&cluster_names); + + ASSERT_EQ(cluster_sets.size(), 2); + + std::vector expected_clustered_nodes_a = {"AssignmentW0", "ConstN0", + "ValueToAssignW0"}; + ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a); + + std::vector expected_clustered_nodes_b = { + "AssignmentW1", "ConstN1", "ReadR0", "ValueToAssignW1"}; + ASSERT_EQ(cluster_sets[cluster_names[1]], expected_clustered_nodes_b); } TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) { @@ -541,13 +654,13 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) { TF_EXPECT_OK(root.ToGraph(graph.get())); - Status status = MarkForCompilation(&graph); + Status status = MarkForCompilationPassTestHelper::MarkForCompilation(&graph); EXPECT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.ToString(), - "Edge from c to a would create a cycle.\n" - "+-> a\n" - "| b\n" - "+-- c\n")); + EXPECT_TRUE(absl::StrContains(status.ToString(), + "Edge from c to a would create a cycle.\n" + "+-> a\n" + "| b\n" + "+-- c\n")); } TEST(XlaCompilationTest, Retval) { @@ -569,7 +682,7 @@ TEST(XlaCompilationTest, Retval) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -587,7 +700,7 @@ TEST(XlaCompilationTest, DontCountIdentityOps) { auto r = ops::_Retval(root.WithOpName("R"), c, 0); } TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -603,7 +716,7 @@ TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) { auto r = ops::_Retval(root.WithOpName("R"), b, 0); } TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + 
TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -617,7 +730,7 @@ TEST(XlaCompilationTest, ConstOp) { auto c = ops::Const(root.WithOpName("const"), 0.5f); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); EXPECT_EQ(1, GetClusters(*graph).size()); } @@ -628,10 +741,111 @@ TEST(XlaCompilationTest, ConstOp) { auto c = ops::Const(root.WithOpName("const"), string("string")); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); EXPECT_TRUE(GetClusters(*graph).empty()); } } +TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output variable = ops::Variable(root.WithOpName("variable"), + PartialTensorShape{}, DT_FLOAT); + Output read = ops::Identity(root.WithOpName("read"), variable); + Output neg = ops::Negate(root.WithOpName("negate"), read); + Output add = ops::Add(root.WithOpName("add"), neg, neg); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + + ASSERT_FALSE(clusters.empty()); + string cluster_name = clusters.begin()->second; + + std::unordered_map expected_clusters( + {{"negate", cluster_name}, {"add", cluster_name}}); + EXPECT_EQ(clusters, expected_clusters); +} + +TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output variable = ops::Variable(root.WithOpName("variable"), + PartialTensorShape{}, DT_FLOAT); + Output read = 
ops::Identity(root.WithOpName("read"), variable); + Output neg = ops::Negate(root.WithOpName("negate"), read); + Output identity = ops::Negate(root.WithOpName("identity"), neg); + Output add = ops::Add(root.WithOpName("add"), identity, neg); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + + ASSERT_FALSE(clusters.empty()); + string cluster_name = clusters.begin()->second; + + std::unordered_map expected_clusters( + {{"negate", cluster_name}, + {"identity", cluster_name}, + {"add", cluster_name}}); + EXPECT_EQ(clusters, expected_clusters); +} + +TEST(XlaCompilationTest, ClusterControlTrigger) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv_a = ops::_Recv(root.WithOpName("recv_a"), DT_BOOL, "tensor_a", + "sender", 0, "receiver"); + Output recv_b = ops::_Recv(root.WithOpName("recv_b"), DT_BOOL, "tensor_b", + "sender", 0, "receiver"); + Output const_a = ops::Const(root.WithOpName("const_a"), 42); + + ops::ControlTrigger ctrl_trigger_a(root.WithOpName("ctrl_trigger_a")); + ops::ControlTrigger ctrl_trigger_b(root.WithOpName("ctrl_trigger_b")); + root.graph()->AddControlEdge(recv_a.node(), ctrl_trigger_a.operation.node()); + root.graph()->AddControlEdge(recv_b.node(), ctrl_trigger_a.operation.node()); + root.graph()->AddControlEdge(ctrl_trigger_b.operation.node(), const_a.node()); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + + ASSERT_FALSE(clusters.empty()); + string cluster_name = clusters.begin()->second; + + // ctrl_trigger_a has inputs with mismatching deadness so it won't be + // clustered. ctrl_trigger_b is okay to cluster. 
+ std::unordered_map expected_clusters( + {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}}); + EXPECT_EQ(clusters, expected_clusters); +} + +TEST(XlaCompilationTest, RandomShape) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output shape_shape = ops::Const(root.WithOpName("shape_shape"), {2}, {1}); + Output shape = + ops::RandomUniformInt(root.WithOpName("shape"), shape_shape, + ops::Const(root.WithOpName("minval"), 1), + ops::Const(root.WithOpName("maxval"), 20)); + Output reshape_input = + ops::Placeholder(root.WithOpName("reshape_input"), DT_FLOAT, + ops::Placeholder::Shape(TensorShape({500, 500}))); + Output reshape = + ops::Reshape(root.WithOpName("reshape"), reshape_input, shape); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["shape"], ""); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..65669877f732bad9e145da36a3aedeba611a0fe5 --- /dev/null +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { +/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation( + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + SessionOptions* session_options) { + // Assign all nodes to the CPU device. + static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0"; + for (Node* n : (*graph)->nodes()) { + n->set_assigned_device_name(kCpuDevice); + } + + GraphOptimizationPassOptions opt_options; + opt_options.graph = graph; + opt_options.session_options = session_options; + opt_options.flib_def = flib_def; + MarkForCompilationPass pass; + return pass.RunImpl(opt_options); +} + +/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation( + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def) { + SessionOptions session_options; + return MarkForCompilation(graph, flib_def, &session_options); +} + +/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation( + std::unique_ptr* graph) { + FunctionDefLibrary flib; + FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib); + return MarkForCompilation(graph, &flib_def); +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..216baaf933dc1f7e694289eea5d23996b595f4d4 --- /dev/null +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ +#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ + +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" + +namespace tensorflow { +class MarkForCompilationPassTestHelper { + public: + // Runs the MarkForCompilation pass on `graph` after assigning all nodes in + // `graph` to the CPU device. To make testing easier, ignores device + // registration, _XlaCompile attributes, input deadness and global jit level. + static Status MarkForCompilation(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def, + SessionOptions* session_options); + + // Like `MarkForCompilation` but creates a default SessionOptions. + static Status MarkForCompilation(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def); + + // Like `MarkForCompilation` but creates `flib_def` from the op registry. 
+ static Status MarkForCompilation(std::unique_ptr* graph); +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index c9e46bc1475aed0e35a48765ad70eef4362e8281..13804c6a0575b921839f99ef7d142e0871693b5a 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -10,10 +10,3 @@ cc_library( deps = ["//tensorflow/core:framework"], alwayslink = 1, ) - -cc_library( - name = "parallel_check_op", - srcs = ["parallel_check_op.cc"], - deps = ["//tensorflow/core:framework"], - alwayslink = 1, -) diff --git a/tensorflow/compiler/jit/ops/parallel_check_op.cc b/tensorflow/compiler/jit/ops/parallel_check_op.cc deleted file mode 100644 index db5c195578869f3f72f06fe2d86f507830a4e14b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/jit/ops/parallel_check_op.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/op.h" - -namespace tensorflow { - -REGISTER_OP("ParallelCheck") - .Attr("T: list(type) >= 0") - .Input("expected: T") - .Input("actual: T") - .Output("result: T") - .Doc(R"doc( -Op that compares two sets of inputs for near-identity, and propagates the first. -Inequality is logged to ERROR log. 
-)doc"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8f09bfa5034e020fe3448d8ecfe0f70605e14d2 --- /dev/null +++ b/tensorflow/compiler/jit/partially_decluster_pass.cc @@ -0,0 +1,177 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/partially_decluster_pass.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/framework/memory_types.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace tensorflow { +namespace { +Status FindNodesToDecluster(const Graph& graph, gtl::FlatSet* result, + absl::Span post_order) { + // Find nodes that have at least one user outside their cluster that expects + // hostmem output. These nodes should be cloned to outside the cluster to + // avoid the device-host copy we'd otherwise need. + + MemoryTypeVector input_mtypes, output_mtypes; + + for (Node* n : post_order) { + absl::optional from_cluster = GetXlaClusterForNode(*n); + if (!from_cluster) { + continue; + } + + // We assume the only XLA-auto-clusterable operations with side effects are + // resource variable updates. We can't execute these twice. 
+ if (HasResourceInputOrOutput(*n)) { + continue; + } + + DeviceType device_type(""); + TF_RETURN_IF_ERROR( + DeviceToDeviceType(n->assigned_device_name(), &device_type)); + TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type, + n->def(), &input_mtypes, + &output_mtypes)); + for (const Edge* e : n->out_edges()) { + Node* dst = e->dst(); + + if (e->IsControlEdge()) { + continue; + } + + bool edge_incurs_extra_device_to_host_copy; + if (output_mtypes[e->src_output()] == DEVICE_MEMORY) { + // If the output of the *TensorFlow* operation is in DEVICE_MEMORY then + // keep the node clustered -- XLA will also produce the output in device + // memory and we will get some benefit from clustering. + edge_incurs_extra_device_to_host_copy = false; + } else { + MemoryTypeVector dst_input_mtypes, dst_output_mtypes; + DeviceType dst_device_type(""); + TF_RETURN_IF_ERROR( + DeviceToDeviceType(dst->assigned_device_name(), &dst_device_type)); + TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), dst_device_type, + dst->def(), &dst_input_mtypes, + &dst_output_mtypes)); + edge_incurs_extra_device_to_host_copy = + dst_input_mtypes[e->dst_input()] == HOST_MEMORY; + } + + if (!edge_incurs_extra_device_to_host_copy) { + continue; + } + + // Check if `dst` is in a different cluster, unclustered, or about to be + // partially declustered (here we rely on the post-order traversal order). + // If yes, decluster `n` to avoid the device-to-host memcpy. + absl::optional dst_cluster = + result->count(dst) ? 
absl::nullopt : GetXlaClusterForNode(*dst); + if (from_cluster != dst_cluster) { + CHECK(result->insert(n).second); + break; + } + } + } + return Status::OK(); +} + +Status PartiallyDeclusterNode(Graph* graph, Node* n) { + StringPiece cluster_name = *GetXlaClusterForNode(*n); + gtl::InlinedVector out_edges_to_clone; + for (const Edge* out_edge : n->out_edges()) { + if (out_edge->IsControlEdge()) { + continue; + } + + Node* dst = out_edge->dst(); + absl::optional dst_cluster_name = GetXlaClusterForNode(*dst); + if (dst_cluster_name != cluster_name) { + out_edges_to_clone.push_back(out_edge); + } + } + + CHECK(!out_edges_to_clone.empty()) << n->DebugString(); + + NodeDef ndef = n->def(); + ndef.set_name(strings::StrCat(n->name(), "/declustered")); + RemoveFromXlaCluster(&ndef); + Status s; + Node* cloned_node = graph->AddNode(ndef, &s); + TF_RETURN_IF_ERROR(s); + cloned_node->set_assigned_device_name(n->assigned_device_name()); + + for (const Edge* in_edge : n->in_edges()) { + graph->AddEdge(in_edge->src(), in_edge->src_output(), cloned_node, + in_edge->dst_input()); + } + + for (const Edge* out_edge_to_clone : out_edges_to_clone) { + graph->AddEdge(cloned_node, out_edge_to_clone->src_output(), + out_edge_to_clone->dst(), out_edge_to_clone->dst_input()); + graph->RemoveEdge(out_edge_to_clone); + } + + return Status::OK(); +} +} // namespace + +Status PartiallyDeclusterPass::Run( + const GraphOptimizationPassOptions& options) { + // NB! In this pass we assume the only XLA-auto-clusterable operations that + // may have side effects are resource variable operations so we don't cluster + // those. The pass will have to be updated if this assumption becomes + // invalid. + + Graph* graph = options.graph->get(); + + // When deciding whether to decluster a particular node, we base our decision + // on if we've decided that some of its consumers have to be declustered too. 
+ // Iterating the graph in post-order guarantees that consumers have been + // visited before producers. + std::vector post_order; + GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(), + /*edge_filter=*/[](const Edge& edge) { + return !edge.src()->IsNextIteration(); + }); + + gtl::FlatSet nodes_to_partially_decluster; + TF_RETURN_IF_ERROR(FindNodesToDecluster( + **options.graph, &nodes_to_partially_decluster, post_order)); + + if (VLOG_IS_ON(3)) { + for (Node* n : post_order) { + if (nodes_to_partially_decluster.count(n)) { + VLOG(3) << n->DebugString(); + } + } + } + + for (Node* n : post_order) { + if (nodes_to_partially_decluster.count(n)) { + TF_RETURN_IF_ERROR(PartiallyDeclusterNode(graph, n)); + } + } + + nodes_to_partially_decluster.clear(); + TF_RETURN_IF_ERROR(FindNodesToDecluster( + **options.graph, &nodes_to_partially_decluster, post_order)); + CHECK(nodes_to_partially_decluster.empty()); + + return Status::OK(); +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/partially_decluster_pass.h b/tensorflow/compiler/jit/partially_decluster_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..6949b5028ee55e182b27589f9a9711dad7839e86 --- /dev/null +++ b/tensorflow/compiler/jit/partially_decluster_pass.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ +#define TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ + +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// Clones nodes from within a cluster to outside the cluster if profitable. +// +// Today this only clones to avoid device-to-host copies, but in the future we +// may consider other reasons to clone. For instance, we convert this: +// +// ..... +// | +// v +// A_Clustered ====> C_Unclustered +// | +// v +// B_Clustered +// +// to: +// +// ..... +// | | +// | +-------------+ +// | | +// v v +// A_Clustered A_Unclustered ====> C_Unclustered +// | +// v +// B_Clustered +// +// where the ===> arrow has a hostmem source and destination and would entail a +// device to host copy if the source and destination were not in the same XLA +// cluster. +class PartiallyDeclusterPass : public GraphOptimizationPass { + public: + Status Run(const GraphOptimizationPassOptions& options) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_ diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f61a955c222dd7ce11a177cd54bb8851a5400496 --- /dev/null +++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc @@ -0,0 +1,283 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/partially_decluster_pass.h" + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/control_flow_ops_internal.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { +REGISTER_OP("FakeNullary").Output("out: float"); + +REGISTER_OP("FakeBinary") + .Input("host_in: float") + .Input("device_in: float") + .Output("host_out: float") + .Output("device_out: float"); + +REGISTER_OP("FakeResourceVar").Output("out: resource"); + +REGISTER_OP("FakeResourceUpdate") + .Input("in: resource") + .Output("out: resource") + .Output("something_else: float"); + +class FakeBinaryOp : public OpKernel { + public: + explicit FakeBinaryOp(OpKernelConstruction* context) : OpKernel(context) {} + + 
void Compute(OpKernelContext* ctx) override { CHECK(false); } +}; + +class FakeResourceVarUpdateOp : public OpKernel { + public: + explicit FakeResourceVarUpdateOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { CHECK(false); } +}; + +REGISTER_KERNEL_BUILDER(Name("FakeBinary") + .Device(DEVICE_CPU) + .HostMemory("host_in") + .HostMemory("host_out"), + FakeBinaryOp); + +REGISTER_KERNEL_BUILDER(Name("FakeResourceUpdate") + .Device(DEVICE_CPU) + .HostMemory("something_else"), + FakeResourceVarUpdateOp); + +Status PartiallyDecluster(std::unique_ptr* graph) { + FixupSourceAndSinkEdges(graph->get()); + // Assign all nodes to the CPU device. + static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0"; + for (Node* n : (*graph)->nodes()) { + n->set_assigned_device_name(kCpuDevice); + } + + GraphOptimizationPassOptions opt_options; + opt_options.graph = graph; + PartiallyDeclusterPass pass; + return pass.Run(opt_options); +} + +const Node* FindNodeByName(const Graph& graph, const string& name) { + for (const Node* node : graph.nodes()) { + if (node->name() == name) { + return node; + } + } + return nullptr; +} + +bool GetInputsForNode(const Graph& graph, const string& node_name, + std::vector* inputs) { + const Node* node = FindNodeByName(graph, node_name); + if (node == nullptr) { + return false; + } + for (const Edge* e : node->in_edges()) { + inputs->push_back(e->src()); + } + std::sort(inputs->begin(), inputs->end(), NodeComparatorName()); + return true; +} + +TEST(PartiallyDeclusterPassTest, ClusteredAndUnclustered) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* clustered_producer = + ops::BinaryOp("FakeBinary", input, input, + builder.opts().WithName("ClusteredProducer")); + ops::BinaryOp("FakeBinary", clustered_producer, input, 
+ builder.opts().WithName("UnclusteredConsumer")); + Node* clustered_consumer = + ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input, + builder.opts().WithName("ClusteredConsumer")); + clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0"); + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + std::vector unclustered_consumer_inputs; + ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer", + &unclustered_consumer_inputs)); + ASSERT_EQ(unclustered_consumer_inputs.size(), 2); + EXPECT_EQ(unclustered_consumer_inputs[0]->name(), + "ClusteredProducer/declustered"); + EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input"); + + std::vector clustered_consumer_inputs; + ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredConsumer", + &clustered_consumer_inputs)); + ASSERT_EQ(clustered_consumer_inputs.size(), 2); + EXPECT_EQ(clustered_consumer_inputs[0]->name(), "ClusteredProducer"); + EXPECT_EQ(clustered_consumer_inputs[1]->name(), "Input"); +} + +TEST(PartiallyDeclusterPassTest, DifferentClusters) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* clustered_producer = + ops::BinaryOp("FakeBinary", input, input, + builder.opts().WithName("ClusteredProducer")); + Node* consumer_in_different_cluster = + ops::BinaryOp("FakeBinary", clustered_producer, input, + builder.opts().WithName("ConsumerInDifferentCluster")); + Node* clustered_consumer = + ops::BinaryOp("FakeBinary", input, {clustered_producer, 1}, + builder.opts().WithName("ClusteredConsumer")); + clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0"); + consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1"); + 
TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + std::vector inputs; + ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs)); + ASSERT_EQ(inputs.size(), 2); + EXPECT_EQ(inputs[0]->name(), "ClusteredProducer/declustered"); + EXPECT_EQ(inputs[1]->name(), "Input"); +} + +TEST(PartiallyDeclusterPassTest, DontDeclusterIfUserIsDeviceMem) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* clustered_producer = + ops::BinaryOp("FakeBinary", input, input, + builder.opts().WithName("ClusteredProducer")); + // The first input is hostmem and the second input is devicemem. + Node* consumer_in_different_cluster = + ops::BinaryOp("FakeBinary", input, clustered_producer, + builder.opts().WithName("ConsumerInDifferentCluster")); + Node* clustered_consumer = + ops::BinaryOp("FakeBinary", input, {clustered_producer, 1}, + builder.opts().WithName("ClusteredConsumer")); + clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0"); + consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1"); + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + std::vector inputs; + ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs)); + ASSERT_EQ(inputs.size(), 2); + EXPECT_EQ(inputs[0]->name(), "ClusteredProducer"); + EXPECT_EQ(inputs[1]->name(), "Input"); +} + +TEST(PartiallyDeclusterPassTest, DontDuplicateResourceVarOps) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* resource_var = ops::SourceOp("FakeResourceVar", + 
builder.opts().WithName("ResourceVar")); + Node* clustered_producer = + ops::UnaryOp("FakeResourceUpdate", resource_var, + builder.opts().WithName("ClusteredProducer")); + Node* consumer_in_different_cluster = + ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input, + builder.opts().WithName("ConsumerInDifferentCluster")); + Node* clustered_consumer = + ops::BinaryOp("FakeBinary", input, {clustered_producer, 1}, + builder.opts().WithName("ClusteredConsumer")); + clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0"); + consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1"); + TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + std::vector inputs; + ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs)); + ASSERT_EQ(inputs.size(), 2); + EXPECT_EQ(inputs[0]->name(), "ClusteredProducer"); + EXPECT_EQ(inputs[1]->name(), "Input"); +} + +TEST(PartiallyDeclusterPassTest, DeclusterDependentNodes) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* input = + ops::SourceOp("FakeNullary", builder.opts().WithName("Input")); + Node* clustered_producer_0 = + ops::BinaryOp("FakeBinary", input, input, + builder.opts().WithName("ClusteredProducer0")); + Node* clustered_producer_1 = + ops::BinaryOp("FakeBinary", clustered_producer_0, input, + builder.opts().WithName("ClusteredProducer1")); + ops::BinaryOp("FakeBinary", clustered_producer_1, input, + builder.opts().WithName("UnclusteredConsumer")); + Node* clustered_consumer = + ops::BinaryOp("FakeBinary", {clustered_producer_1, 1}, input, + builder.opts().WithName("ClusteredConsumer")); + clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0"); + clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0"); + 
TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); + } + + TF_ASSERT_OK(PartiallyDecluster(&graph)); + std::vector unclustered_consumer_inputs, declustered_producer_1_inputs; + + ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer", + &unclustered_consumer_inputs)); + ASSERT_EQ(unclustered_consumer_inputs.size(), 2); + EXPECT_EQ(unclustered_consumer_inputs[0]->name(), + "ClusteredProducer1/declustered"); + EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input"); + + ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredProducer1/declustered", + &declustered_producer_1_inputs)); + ASSERT_EQ(declustered_producer_1_inputs.size(), 2); + EXPECT_EQ(declustered_producer_1_inputs[0]->name(), + "ClusteredProducer0/declustered"); + EXPECT_EQ(declustered_producer_1_inputs[1]->name(), "Input"); +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ba4a5ef7399111e512da8c4966f5899ed828b17 --- /dev/null +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc @@ -0,0 +1,336 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// ALGORITHM OVERVIEW +// ================== +// +// An XLA cluster hoists all resource reads to the beginning of the cluster +// execution and all the resource writes to the end. This means it cannot +// enforce arbitrary ordering dependencies (via control or data edges) between +// resource operations. Since all resource reads happen before all resource +// writes, edges constraining resource reads to happen before resource writes +// are fine, but all other kinds of edges are problematic. This analysis +// computes the set of pairs of resource operations that cannot be put in the +// same cluster because XLA cannot respect the dependencies between them in the +// TensorFlow program. +// +// TODO(b/112856632): We can, in theory, support Read->Read and Write->Write +// dependencies. +// +// Specifically the result computed by this analysis contains the edge {W, R} +// iff all of these hold true: +// +// - In the graph (g - {edges from NextIteration to Merge}) there is a path +// from W to R. +// - IsEdgeSafe(W, R) == False [defined below] +// - W != R (note: some resource operations both read from and write to +// resource variables). +// +// The result is incorrect around loops because we ignore edges from +// NextIteration to Merge, but that should be fine because we don't cluster +// these edges. For instance, in: +// +// Init -----> Merge <-------+ +// | | +// v | +// Read | +// | | +// v | +// Write | +// | | +// v | +// NextIteration --+ +// +// we won't put (Read, Write) in the returned set. This is fine if +// auto-clustering can only cluster the Read->Write edge, but it is a problem if +// it clusters the Write->NextIteration->Merge->Read edges instead. The same +// problem is present for the functional version of the loop above. We rely on +// auto-clustering to not cluster control flow edges like NextIteration->Merge. 
+// This is enough to avoid the explicit-control-flow problem shown above. One +// way to think about this is that we only care about cases where two nodes, A +// and B, would normally have been put in the same cluster but cannot legally be +// in the same cluster because of resourcevar-dependencies. If A and B would +// normally have been put in the same cluster then all paths between A and B +// would have to be clusterable (otherwise we'd have introduced a cycle). Ergo +// there could not have been a NextIteration->Merge edge between A and B since +// we don't cluster these edges. +// +// We also rely on auto-clustering to not cluster functional control flow nodes +// that contain resource operations. +// +// IMPLEMENTATION +// -------------- +// +// We traverse the graph minus backedges in reverse post order, mapping each +// node to the set of resource operations reaching that node. Since we visit +// producers before consumers, we can construct the set of reaching operations +// by taking the union of the operations reaching the input nodes. These +// "reaching resource operations" can then be used to create the pairs of +// incompatible nodes using `IsEdgeSafe`. + +#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" + +#include "absl/memory/memory.h" +#include "absl/strings/str_join.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { +namespace { +// Returns true if `n` may call a function. 
+Status MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def, + bool* out_result) { + if (flib_def->Contains(n.type_string())) { + *out_result = true; + } else { + *out_result = + std::any_of(n.def().attr().begin(), n.def().attr().end(), + [](const std::pair& name_attr_pair) { + return name_attr_pair.second.has_func(); + }); + } + + return Status::OK(); +} + +// Maps `n` to the XlaResourceOpKind corresponding to its operation. If `n` is +// not a resource operation recognized by XLA then sets `out_resource_op_kind` +// to nullopt. +Status XlaResourceOpKindForNode( + const Node& n, const FunctionLibraryDefinition* flib_def, + const std::function& resource_ops_to_ignore, + absl::optional* out_resource_op_kind) { + bool should_ignore = false; + if (resource_ops_to_ignore) { + TF_RETURN_IF_ERROR(resource_ops_to_ignore(n, &should_ignore)); + } + if (should_ignore) { + *out_resource_op_kind = absl::nullopt; + return Status::OK(); + } + + const XlaResourceOpInfo* op_info = GetResourceOpInfoForOp(n.type_string()); + if (op_info) { + *out_resource_op_kind = op_info->kind(); + return Status::OK(); + } + + // We conservatively assume that functions will both read and write resource + // variables. In the future we may consider doing some form of + // inter-procedural analysis. + bool may_call_function; + TF_RETURN_IF_ERROR(MayCallFunction(n, flib_def, &may_call_function)); + if (may_call_function) { + *out_resource_op_kind = XlaResourceOpKind::kReadWrite; + } else { + *out_resource_op_kind = absl::nullopt; + } + + return Status::OK(); +} + +// Returns true if a control or data dependence from a TensorFlow operation of +// resource op kind `from` to a TensorFlow operation of resource op kind `to` +// can be represented by an XLA cluster and needs no special handling around +// auto-jit. 
+bool IsEdgeSafe(XlaResourceOpKind from, XlaResourceOpKind to) { + // XLA clusters force all reads to happen before all writes, which means the + // kinds of edges it can faithfully represent are: Read->Write, Read->Modify, + // Modify->Write, Read->Read, Write->Write. + // Of these, only Read->Write is currently treated as safe by this function. + // TODO(b/112856632): We can, in theory, support Read->Read and Write->Write + // dependencies. + return from == XlaResourceOpKind::kRead && to == XlaResourceOpKind::kWrite; +} + +using ResourceOp = std::pair; + +string ResourceOpToString(const ResourceOp& resource_op) { + return strings::StrCat( + resource_op.first, ": ", + XlaResourceOpInfo::XlaResourceOpKindToString(resource_op.second)); +} + +// A copy-on-write set used to store the set of ResourceOps reaching a node in a +// TensorFlow graph. +// +// TODO(sanjoy): It may be useful to pull this out into its own header at some +// point. +class ResourceOpSet { + private: + using Impl = gtl::FlatSet; + + public: + ResourceOpSet() = default; + + // Adds all ResourceOp s in `other` to this set. + void Add(const ResourceOpSet& other) { + CHECK(!frozen_); + if (other.impl_ == impl_) { + other.frozen_ = true; + return; + } + + if (!impl_) { + other.frozen_ = true; + impl_ = other.impl_; + return; + } + + for (ResourceOp resource_op : other) { + Add(resource_op); + } + } + + void Add(const ResourceOp& resource_op) { + CHECK(!frozen_); + if (!IsCopy() && Contains(resource_op)) { + // We can avoid the copy if the item we want to insert already exists. + return; + } + + EnsureIsCopied(); + impl_->insert(resource_op); + } + + Impl::const_iterator begin() const { + return impl_ ? impl_->begin() : GetEmptyImpl()->begin(); + } + + Impl::const_iterator end() const { + return impl_ ? 
impl_->end() : GetEmptyImpl()->end(); + } + + bool Contains(const ResourceOp& resource_op) const { + return impl_ != nullptr && impl_->count(resource_op); + } + + private: + bool IsCopy() const { return storage_ != nullptr; } + + void EnsureIsCopied() { + if (storage_ == nullptr) { + storage_ = absl::make_unique(); + for (ResourceOp op : *this) { + storage_->insert(op); + } + impl_ = storage_.get(); + } + } + + static Impl* GetEmptyImpl() { + static Impl* empty_impl = new Impl; + return empty_impl; + } + + Impl* impl_ = nullptr; + std::unique_ptr storage_; + + // frozen_ is true if there is another set pointing to this set's impl_. We + // can no longer add elements to this set in that case since the sets pointing + // to this set expect the contents of this set to be stable. + mutable bool frozen_ = false; + + TF_DISALLOW_COPY_AND_ASSIGN(ResourceOpSet); +}; + +string ResourceOpSetToString(const ResourceOpSet& resource_op_set) { + std::vector elements_debug_string; + std::transform(resource_op_set.begin(), resource_op_set.end(), + std::back_inserter(elements_debug_string), ResourceOpToString); + return strings::StrCat("{", absl::StrJoin(elements_debug_string, ","), "}"); +} + +string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) { + return strings::StrCat( + "[", n.name(), ": ", n.type_string(), "(", + XlaResourceOpInfo::XlaResourceOpKindToString(resource_op_kind), ")", "]"); +} +} // namespace + +Status ComputeIncompatibleResourceOperationPairs( + const Graph& g, const FunctionLibraryDefinition* flib_def, + const std::function& resource_ops_to_ignore, + std::vector>* result) { + CHECK(result->empty()); + + std::vector rpo; + GetReversePostOrder(g, &rpo, /*stable_comparator=*/NodeComparatorName(), + /*edge_filter=*/[](const Edge& edge) { + return !edge.src()->IsNextIteration(); + }); + + auto resource_op_set_for_node = + absl::make_unique(g.num_node_ids()); + + const bool vlog = VLOG_IS_ON(2); + + for (Node* n : rpo) { + absl::optional op_kind; + 
TF_RETURN_IF_ERROR(XlaResourceOpKindForNode( + *n, flib_def, resource_ops_to_ignore, &op_kind)); + + ResourceOpSet* resource_op_set = &resource_op_set_for_node[n->id()]; + + // Merge the reaching resource operations for all the incoming edges to + // create the set of all possible resource ops reaching `n`. + for (const Edge* e : n->in_edges()) { + if (n->IsMerge() && e->src()->IsNextIteration()) { + // Ignore back-edges (see file comment). + continue; + } + + const ResourceOpSet& incoming_op_set = + resource_op_set_for_node[e->src()->id()]; + resource_op_set->Add(incoming_op_set); + } + + // Add to the "incompatible resource ops" set if necessary. + if (op_kind) { + for (ResourceOp incoming_op : *resource_op_set) { + if (IsEdgeSafe(incoming_op.second, *op_kind)) { + continue; + } + + if (vlog) { + VLOG(2) << "Unsafe edge: " + << NodeToString(*g.FindNodeId(incoming_op.first), + incoming_op.second) + << " -> " << NodeToString(*n, *op_kind); + } + result->push_back({incoming_op.first, n->id()}); + } + + resource_op_set->Add({n->id(), *op_kind}); + } + + if (vlog) { + VLOG(3) << n->name() << " -> " << ResourceOpSetToString(*resource_op_set); + } + } + + std::sort(result->begin(), result->end()); + CHECK(std::unique(result->begin(), result->end()) == result->end()); + + return Status::OK(); +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.h b/tensorflow/compiler/jit/resource_operation_safety_analysis.h new file mode 100644 index 0000000000000000000000000000000000000000..ae8cfeecad9b9cd631db3e9865bb3c3ff28a2e48 --- /dev/null +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.h @@ -0,0 +1,73 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ + +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// An XLA cluster hoists all resource reads to the beginning of the cluster +// execution and all the resource writes to the end. This means it cannot +// enforce arbitrary ordering dependencies (via control or data edges) between +// resource operations. Since all resource reads happen before all resource +// writes, edges constraining resource reads to happen before resource writes +// are fine, but all other kinds of edges are problematic. This analysis +// returns the set of pairs of resource operations that cannot be put in the +// same cluster because XLA cannot respect the dependencies between them in the +// TensorFlow program. +// +// The restrictions are not transitive: it is fine to put A and C in the same +// cluster even if the returned set contains (A,B) and (B,C). +// +// In other words, if these pairs are seen as edges in an undirected graph of +// the nodes in `g` then auto-clustering is at least as constrained as the graph +// coloring problem on this graph. 
+// +// +// For instance if we auto-cluster all operations in this TensorFlow graph: +// +// ReadVariableOp0 -> ReadVariableOp1 +// | +// v +// AssignVariableOp0 -> AssignVariableOp1 +// +// we will lose the ReadVariableOp0 -> ReadVariableOp1 and the +// AssignVariableOp0 -> AssignVariableOp1 dependencies. I.e. it is possible for +// XlaLaunchOp to issue ReadVariableOp1 before ReadVariableOp0 since it reads +// all the resource variables when the cluster starts executing without any +// particular ordering between them; same holds for the AssignVariableOp0 -> +// AssignVariableOp1 edge. The ReadVariableOp1 -> AssignVariableOp0 edge will +// be respected by XlaLaunchOp though because all reads happen before all +// writes. +// +// +// NB! The result computed by this analysis assumes that we don't auto-cluster +// back-edges (i.e. the edges from NextIteration to Merge). +// +// NB! The result computed by this analysis assumes that we don't auto-cluster +// functional control flow nodes containing resource operations. +// +// If `resource_ops_to_ignore` is set then nodes for which it returns true are +// ignored (we pretend these nodes are not resource operations). +Status ComputeIncompatibleResourceOperationPairs( + const Graph& g, const FunctionLibraryDefinition* flib_def, + const std::function& resource_ops_to_ignore, + std::vector>* result); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_ diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e54b547abcfea698fe79e81dce547ea7858ff829 --- /dev/null +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc @@ -0,0 +1,540 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/array_ops.h" +#include "tensorflow/cc/ops/control_flow_ops_internal.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/functional_ops.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +Node* MakeRead(const Scope& scope, const string& id) { + Output var_handle = + ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); + Output read = + ops::ReadVariableOp(scope.WithOpName("Read" + id), var_handle, DT_FLOAT); + return read.node(); +} 
+ +Node* MakeWrite(const Scope& scope, const string& id) { + Output var_handle = + ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); + Output value_to_write = + ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f); + ops::AssignVariableOp assign_op(scope.WithOpName("Assignee" + id), var_handle, + value_to_write); + return assign_op.operation.node(); +} + +Node* MakeModify(const Scope& scope, const string& id) { + Output var_handle = + ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({})); + Output value_to_write = ops::Const(scope.WithOpName("Increment" + id), 1.0f); + ops::AssignAddVariableOp assign_add_op(scope.WithOpName("Increment" + id), + var_handle, value_to_write); + return assign_add_op.operation.node(); +} + +Node* MakeNeutral(const Scope& scope, const string& id) { + return ops::Const(scope.WithOpName("Const" + id), 42.0f).node(); +} + +Status ComputeIncompatiblePairs(Graph* g, + std::vector>* result) { + FixupSourceAndSinkEdges(g); + return ComputeIncompatibleResourceOperationPairs(*g, &g->flib_def(), {}, + result); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteRead) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(write, read); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair write_read_pair = {write->id(), read->id()}; + EXPECT_EQ(incompatible_pairs[0], write_read_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, ReadWrite) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(read, write); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + EXPECT_EQ(incompatible_pairs.size(), 0); +} + 
+TEST(ResourceOperationSafetyAnalysisTest, ReadWriteNoEdges) { + Scope root = Scope::NewRootScope().ExitOnError(); + + MakeRead(root, "R"); + MakeWrite(root, "W"); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + EXPECT_EQ(incompatible_pairs.size(), 0); +} + +TEST(ResourceOperationSafetyAnalysisTest, ReadModify) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* modify = MakeModify(root, "M"); + + root.graph()->AddControlEdge(read, modify); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + EXPECT_EQ(incompatible_pairs.size(), 1); + std::pair read_modify_pair = {read->id(), modify->id()}; + EXPECT_EQ(incompatible_pairs[0], read_modify_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, ModifyRead) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* modify = MakeModify(root, "M"); + + root.graph()->AddControlEdge(modify, read); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair modify_read_pair = {modify->id(), read->id()}; + EXPECT_EQ(incompatible_pairs[0], modify_read_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, ModifyWrite) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* modify = MakeModify(root, "M"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(modify, write); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + EXPECT_EQ(incompatible_pairs.size(), 1); + std::pair modify_write_pair = {modify->id(), write->id()}; + EXPECT_EQ(incompatible_pairs[0], modify_write_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteModify) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* modify = 
MakeModify(root, "M"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(write, modify); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair write_modify_pair = {write->id(), modify->id()}; + EXPECT_EQ(incompatible_pairs[0], write_modify_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, ReadModifyWrite) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* modify = MakeModify(root, "M"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(read, modify); + root.graph()->AddControlEdge(modify, write); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + EXPECT_EQ(incompatible_pairs.size(), 2); + std::pair modify_write_pair = {modify->id(), write->id()}; + std::pair read_modify_pair = {read->id(), modify->id()}; + EXPECT_EQ(incompatible_pairs[0], read_modify_pair); + EXPECT_EQ(incompatible_pairs[1], modify_write_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteModifyRead) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* modify = MakeModify(root, "M"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(write, modify); + root.graph()->AddControlEdge(modify, read); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 3); + + std::pair write_modify_pair = {write->id(), modify->id()}; + std::pair modify_read_pair = {modify->id(), read->id()}; + std::pair write_read_pair = {write->id(), read->id()}; + EXPECT_EQ(incompatible_pairs[0], modify_read_pair); + EXPECT_EQ(incompatible_pairs[1], write_read_pair); + EXPECT_EQ(incompatible_pairs[2], write_modify_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteReadModify) { 
+ Scope root = Scope::NewRootScope().ExitOnError(); + + Node* read = MakeRead(root, "R"); + Node* modify = MakeModify(root, "M"); + Node* write = MakeWrite(root, "W"); + + root.graph()->AddControlEdge(write, read); + root.graph()->AddControlEdge(read, modify); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 3); + + std::pair write_modify_pair = {write->id(), modify->id()}; + std::pair write_read_pair = {write->id(), read->id()}; + std::pair read_modify_pair = {read->id(), modify->id()}; + EXPECT_EQ(incompatible_pairs[0], read_modify_pair); + EXPECT_EQ(incompatible_pairs[1], write_read_pair); + EXPECT_EQ(incompatible_pairs[2], write_modify_pair); +} + +FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { + FunctionDefLibrary flib_def; + FunctionDef func = FunctionDefHelper::Create( + /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"}, + /*attr_def*/ + {}, /*node_def=*/{FunctionDefHelper::Const("one", 1.0f)}, + /*ret_def=*/{{"out", "out:output:0"}}); + *flib_def.add_function() = std::move(func); + return flib_def; +} + +Node* MakeCall(Graph* graph, const string& callee_name, const string& node_name, + Status* status) { + NodeDef call_node; + call_node.set_name(node_name); + call_node.set_op(callee_name); + return graph->AddNode(call_node, status); +} + +TEST(ResourceOperationSafetyAnalysisTest, CallRead) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* read = MakeRead(root, "R"); + Status status; + Node* call = MakeCall(root.graph(), "Const_func", "C", &status); + TF_ASSERT_OK(status); + + root.graph()->AddControlEdge(call, read); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + 
ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair call_read_edge = {call->id(), read->id()}; + EXPECT_EQ(incompatible_pairs[0], call_read_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, ReadCall) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* read = MakeRead(root, "R"); + Status status; + Node* call = MakeCall(root.graph(), "Const_func", "C", &status); + TF_ASSERT_OK(status); + + root.graph()->AddControlEdge(read, call); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair read_call_edge = {read->id(), call->id()}; + EXPECT_EQ(incompatible_pairs[0], read_call_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, CallWrite) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* write = MakeWrite(root, "W"); + Status status; + Node* call = MakeCall(root.graph(), "Const_func", "C", &status); + TF_ASSERT_OK(status); + + root.graph()->AddControlEdge(call, write); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair call_write_edge = {call->id(), write->id()}; + EXPECT_EQ(incompatible_pairs[0], call_write_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteCall) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* write = MakeWrite(root, "W"); + Status status; + Node* call = MakeCall(root.graph(), "Const_func", "C", &status); + 
TF_ASSERT_OK(status); + + root.graph()->AddControlEdge(write, call); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair write_call_edge = {write->id(), call->id()}; + EXPECT_EQ(incompatible_pairs[0], write_call_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, SymbolicGradientRead) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* read = MakeRead(root, "R"); + NameAttrList fn; + fn.set_name("Const_func"); + Node* symbolic_gradient = + ops::SymbolicGradient(root, /*input=*/{ops::Const(root, 1.0f)}, + /*Tout=*/{DT_FLOAT}, fn) + .output[0] + .node(); + + root.graph()->AddControlEdge(symbolic_gradient, read); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair symbolic_gradient_read_edge = {symbolic_gradient->id(), + read->id()}; + EXPECT_EQ(incompatible_pairs[0], symbolic_gradient_read_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, WriteSymbolicGradient) { + Scope root = Scope::NewRootScope().ExitOnError(); + + FunctionDefLibrary flib_def = + CreateFunctionDefLibWithConstFunction("Const_func"); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def)); + + Node* write = MakeWrite(root, "W"); + NameAttrList fn; + fn.set_name("Const_func"); + Node* symbolic_gradient = + ops::SymbolicGradient(root, /*input=*/{ops::Const(root, 1.0f)}, + /*Tout=*/{DT_FLOAT}, fn) + .output[0] + .node(); + + root.graph()->AddControlEdge(write, symbolic_gradient); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + std::pair write_symbolic_gradient_edge = {write->id(), + 
symbolic_gradient->id()}; + EXPECT_EQ(incompatible_pairs[0], write_symbolic_gradient_edge); +} + +TEST(ResourceOperationSafetyAnalysisTest, ChainOfOps) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* write_0 = MakeWrite(root, "W0"); + Node* neutral_0 = MakeNeutral(root, "N0"); + Node* read_0 = MakeRead(root, "R0"); + Node* write_1 = MakeWrite(root, "W1"); + Node* neutral_1 = MakeNeutral(root, "N1"); + Node* read_1 = MakeRead(root, "R1"); + + root.graph()->AddControlEdge(write_0, neutral_0); + root.graph()->AddControlEdge(neutral_0, read_0); + root.graph()->AddControlEdge(read_0, write_1); + root.graph()->AddControlEdge(write_1, neutral_1); + root.graph()->AddControlEdge(neutral_1, read_1); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 5); + std::pair write_0_read_0_pair = {write_0->id(), read_0->id()}; + std::pair write_0_read_1_pair = {write_0->id(), read_1->id()}; + std::pair write_1_read_1_pair = {write_1->id(), read_1->id()}; + std::pair write_0_write_1_pair = {write_0->id(), write_1->id()}; + std::pair read_0_read_1_pair = {read_0->id(), read_1->id()}; + + EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair); + EXPECT_EQ(incompatible_pairs[1], write_0_write_1_pair); + EXPECT_EQ(incompatible_pairs[2], write_0_read_1_pair); + EXPECT_EQ(incompatible_pairs[3], read_0_read_1_pair); + EXPECT_EQ(incompatible_pairs[4], write_1_read_1_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, DagOfOps) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* write_0 = MakeWrite(root, "W0"); + Node* write_1 = MakeWrite(root, "W1"); + Node* neutral = MakeNeutral(root, "N"); + Node* read_0 = MakeRead(root, "R0"); + Node* read_1 = MakeRead(root, "R1"); + + root.graph()->AddControlEdge(write_0, neutral); + root.graph()->AddControlEdge(write_1, neutral); + root.graph()->AddControlEdge(neutral, read_0); + root.graph()->AddControlEdge(neutral, 
read_1); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 4); + std::pair write_0_read_0_pair = {write_0->id(), read_0->id()}; + std::pair write_0_read_1_pair = {write_0->id(), read_1->id()}; + std::pair write_1_read_0_pair = {write_1->id(), read_0->id()}; + std::pair write_1_read_1_pair = {write_1->id(), read_1->id()}; + + EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair); + EXPECT_EQ(incompatible_pairs[1], write_0_read_1_pair); + EXPECT_EQ(incompatible_pairs[2], write_1_read_0_pair); + EXPECT_EQ(incompatible_pairs[3], write_1_read_1_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, DagOfOpsWithRepeatedPaths) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Node* write_0 = MakeWrite(root, "W0"); + Node* write_1 = MakeWrite(root, "W1"); + Node* neutral = MakeNeutral(root, "N"); + Node* read_0 = MakeRead(root, "R0"); + Node* read_1 = MakeRead(root, "R1"); + + root.graph()->AddControlEdge(write_0, neutral); + root.graph()->AddControlEdge(write_1, neutral); + root.graph()->AddControlEdge(neutral, read_0); + root.graph()->AddControlEdge(neutral, read_1); + root.graph()->AddControlEdge(write_1, read_1); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 4); + std::pair write_0_read_0_pair = {write_0->id(), read_0->id()}; + std::pair write_0_read_1_pair = {write_0->id(), read_1->id()}; + std::pair write_1_read_0_pair = {write_1->id(), read_0->id()}; + std::pair write_1_read_1_pair = {write_1->id(), read_1->id()}; + + EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair); + EXPECT_EQ(incompatible_pairs[1], write_0_read_1_pair); + EXPECT_EQ(incompatible_pairs[2], write_1_read_0_pair); + EXPECT_EQ(incompatible_pairs[3], write_1_read_1_pair); +} + +TEST(ResourceOperationSafetyAnalysisTest, Loop) { + Scope root = Scope::NewRootScope().ExitOnError(); + 
+ Output init_value = ops::Placeholder(root.WithOpName("init"), DT_FLOAT); + Output loop_cond = ops::Placeholder(root.WithOpName("init"), DT_BOOL); + Output enter_value = + ops::internal::Enter(root.WithOpName("enter"), init_value, "fr"); + ops::Merge iv(root.WithOpName("iv"), {enter_value, enter_value}); + ops::Switch latch(root.WithOpName("latch"), iv.output, loop_cond); + ops::internal::Exit exit(root.WithOpName("exit"), iv.output); + Output next_iteration = + ops::NextIteration(root.WithOpName("next_iteration"), latch.output_true); + TF_ASSERT_OK( + root.graph()->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1)); + + Node* write = MakeWrite(root, "W"); + Node* read = MakeRead(root, "R"); + + root.graph()->AddControlEdge(iv.output.node(), write); + root.graph()->AddControlEdge(write, read); + root.graph()->AddControlEdge(read, next_iteration.node()); + + std::vector> incompatible_pairs; + TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs)); + + ASSERT_EQ(incompatible_pairs.size(), 1); + + std::pair write_read_pair = {write->id(), read->id()}; + EXPECT_EQ(incompatible_pairs[0], write_read_pair); +} + +bool IsResourceArgDef(const OpDef::ArgDef& arg_def) { + return arg_def.type() == DT_RESOURCE; +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f2fabd658330b8ab182e13e02ed0bca41641e46 --- /dev/null +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -0,0 +1,234 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_cluster_util.h" + +#include + +#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +const char* const kXlaClusterAttr = "_XlaCluster"; +const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation"; + +namespace { +// Returns a string describing how an edge from src to dst would +// create a cycle. 
+string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src, + int dst) { + int32 max_path_size = graph.num_node_ids() + 1; + std::vector path(max_path_size); + int32 path_size = cycles->FindPath(dst, src, max_path_size, path.data()); + if (path_size == 0) { + return ""; + } + + auto node_name = [cycles, &graph](int node_id) { + if (!FastBoundsCheck(node_id, graph.num_node_ids())) { + return string("(null)"); + } + auto* node = graph.FindNodeId(node_id); + if (node == nullptr) { + return string("(null)"); + } + return node->name(); + }; + + string description; + strings::StrAppend(&description, "Edge from ", node_name(src), " to ", + node_name(dst), " would create a cycle.\n"); + path.resize(path_size); + for (int32 node_id : path) { + string ascii_art; + if (node_id == dst) { + ascii_art = "+-> "; + } else if (node_id != src) { + ascii_art = "| "; + } else { + ascii_art = "+-- "; + } + strings::StrAppend(&description, ascii_art, node_name(node_id), "\n"); + } + return description; +} + +bool AlwaysForwardsRefInput(const Node& node) { return node.IsIdentity(); } + +} // namespace + +Status DeviceToDeviceType(const string& device, DeviceType* device_type) { + DeviceNameUtils::ParsedName parsed; + if (!DeviceNameUtils::ParseFullName(device, &parsed)) { + return errors::Internal("Malformed assigned device '", device, "'"); + } + *device_type = DeviceType(parsed.type); + return Status::OK(); +} + +bool HasForwardedRefInput(const Node& node) { + if (AlwaysForwardsRefInput(node)) { + for (const Edge* incoming_edge : node.in_edges()) { + if (incoming_edge->IsControlEdge()) { + continue; + } + + Node* incoming_node = incoming_edge->src(); + if (IsRefType(incoming_node->output_type(incoming_edge->src_output()))) { + VLOG(2) << "Node " << node.def().ShortDebugString() << " has ref input " + << incoming_node->name() << " " << incoming_node->type_string(); + return true; + } + } + } + return false; +} + +Status CreateCycleDetectionGraph(const Graph* graph, 
GraphCycles* cycles) { + for (int i = 0; i < graph->num_node_ids(); ++i) { + // We rely on the node IDs in the cycle detection graph being consecutive + // integers starting from 0. + CHECK_EQ(i, cycles->NewNode()); + } + + // Compute the loop structure of the graph. + std::vector control_flow_info; + TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info)); + + // The clustering code must avoid adding cycles to the graph to prevent + // deadlock. However, the graph may contain loops, which would trigger the + // cycle detection code. To handle loops, we alter the structure of the cycle + // detection graph, disconnecting each loop from the enclosing graph. + // Specifically, we: + // * add a new "frame" node for each loop. + // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges + // to/from the corresponding frame node. In essence, we collapse the loop + // into a single node for the purpose of cycle detection in the enclosing + // graph. + // * the body of the loop should now be disconnected from the rest of the + // graph; we make it acyclic by breaking loop backedges (edges outgoing from + // "NextIteration" nodes. + + // Map from frame name strings to node IDs in the cycle detection graph. + std::unordered_map frame_nodes; + + // Get the cycle graph node ID for frame 'frame_name', or add one if none + // exists. + auto GetOrAddFrameNodeId = [&frame_nodes, cycles](const string& frame_name) { + int& frame_id = frame_nodes.emplace(frame_name, -1).first->second; + if (frame_id < 0) { + // The emplace succeeded; we have not allocated a frame node yet. 
+ frame_id = cycles->NewNode(); + } + return frame_id; + }; + + for (Edge const* edge : graph->edges()) { + if (edge->dst()->IsEnter() || edge->src()->IsExit()) { + const char* src_type = "pre-enter"; + const char* dst_type = "post-exit"; + int src = edge->src()->id(); + int dst = edge->dst()->id(); + + if (edge->dst()->IsEnter()) { + // Lift edges to an "Enter" node to the corresponding frame node. + const string& frame_name = + control_flow_info[edge->dst()->id()].frame_name; + dst = GetOrAddFrameNodeId(frame_name); + dst_type = "frame"; + } + + if (edge->src()->IsExit()) { + // Lift edges from an "Exit" node to the corresponding frame node. + const string& frame_name = + control_flow_info[edge->src()->id()].frame_name; + src = GetOrAddFrameNodeId(frame_name); + src_type = "frame"; + } + + if (!cycles->InsertEdge(src, dst)) { + return errors::Internal( + "Cycle detected when adding ", src_type, "->", dst_type, + " edge: ", DescribeCycle(cycles, *graph, src, dst)); + } + // Drop the original edge. + continue; + } + if (edge->src()->IsNextIteration()) { + // Break loop back-edges. + continue; + } + if (!cycles->InsertEdge(edge->src()->id(), edge->dst()->id())) { + // This should never happen. All cycles in the graph should contain + // a control flow operator. 
+ return errors::Internal( + "Found cycle in graph without control flow operator during XLA " + "compilation: ", + DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id())); + } + } + return Status::OK(); +} + +absl::optional GetXlaClusterForNode(const Node& node) { + const AttrValue* attr_value = node.attrs().Find(kXlaClusterAttr); + if (attr_value == nullptr) { + return absl::nullopt; + } + Status s = AttrValueHasType(*attr_value, "string"); + if (!s.ok()) { + return absl::nullopt; + } + return attr_value->s(); +} + +bool HasResourceInputOrOutput(const Node& node) { + return std::find(node.input_types().begin(), node.input_types().end(), + DT_RESOURCE) != node.input_types().end() || + std::find(node.output_types().begin(), node.output_types().end(), + DT_RESOURCE) != node.output_types().end(); +} + +void RemoveFromXlaCluster(NodeDef* node_def) { + node_def->mutable_attr()->erase(kXlaClusterAttr); +} + +Status AdjustCycleDetectionGraphForResourceOps( + const Graph* graph, const FunctionLibraryDefinition* flib_def, + const std::function& resource_ops_to_ignore, + GraphCycles* cycles) { + std::vector> unsafe_deps; + TF_RETURN_IF_ERROR(ComputeIncompatibleResourceOperationPairs( + *graph, flib_def, resource_ops_to_ignore, &unsafe_deps)); + + // An edge {P,Q} in `unsafe_deps` denotes that P and Q, both of which are + // operations that interact with resource variables, must not be put in the + // same cluster. We enforce this constraint by creating a phantom node, X, + // and adding edges P->X and X->Q. MarkForCompilation then cannot cluster P + // and Q together since that would create a cycle with X. 
+ + for (std::pair unsafe_dep : unsafe_deps) { + int phantom_node_id = cycles->NewNode(); + CHECK(cycles->InsertEdge(unsafe_dep.first, phantom_node_id)); + CHECK(cycles->InsertEdge(phantom_node_id, unsafe_dep.second)); + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h new file mode 100644 index 0000000000000000000000000000000000000000..b0439a63ca6476b6b1d63e65308712270381dd9f --- /dev/null +++ b/tensorflow/compiler/jit/xla_cluster_util.h @@ -0,0 +1,67 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains utilities for clustering compilable graph nodes via XLA. + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/core/graph/algorithm.h" + +namespace tensorflow { + +// The attribute that marks nodes to be grouped into functions by the +// encapsulate subgraphs pass. +extern const char* const kXlaClusterAttr; + +// The attribute that marks nodes in a cluster to be placed outside the xla +// compilation by the encapsulate subgraphs pass. 
+extern const char* const kXlaOutsideCompilationAttr; + +using OrderedNodeSet = std::set; + +// Returns the DeviceType corresponding to 'device'. +Status DeviceToDeviceType(const string& device, DeviceType* device_type); + +// Returns true if `node` has a ref tensor input that it forwards to its output. +bool HasForwardedRefInput(const Node& node); + +// Creates a graph representation to enable cycle detection when clustering. +// This representation handles loops in graph by disconnecting each loop from +// the enclosing graph. +Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles); + +// Returns the XLA cluster in which `node` is placed if it is in an XLA cluster, +// otherwise returns nullopt. +absl::optional GetXlaClusterForNode(const Node& node); + +// Removes `node_def` its XLA cluster (by clearing its _XlaCluster attribute). +void RemoveFromXlaCluster(NodeDef* node_def); + +// Returns true if `node` has a DT_RESOURCE typed input or output. +bool HasResourceInputOrOutput(const Node& node); + +// Adds edges to `cycles` to prevent clustering resource operations that cannot +// be legally clustered. +Status AdjustCycleDetectionGraphForResourceOps( + const Graph* graph, const FunctionLibraryDefinition* flib_def, + const std::function& resource_ops_to_ignore, + GraphCycles* cycles); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..65bbf3efe85ba30f44531ff6d54b041786dca0a5 --- /dev/null +++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_cluster_util.h" + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/control_flow_ops_internal.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(CreateCycleDetectionGraph, ConnectivityThroughEnterExitRegion) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::Const(root.WithOpName("a"), Input::Initializer(0.0)); + Output enter = + ops::internal::Enter(root.WithOpName("enter"), a, "only_frame"); + Output exit = ops::internal::Exit(root.WithOpName("exit"), enter); + Output b = ops::Add(root.WithOpName("b"), a, exit); + + FixupSourceAndSinkEdges(root.graph()); + + GraphCycles cycles; + TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles)); + EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id())); +} + +TEST(CreateCycleDetectionGraph, ConnectivityThroughMultipleEnterExitRegions) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::Const(root.WithOpName("a"), Input::Initializer(0.0)); + Output enter_0 = + 
ops::internal::Enter(root.WithOpName("enter_0"), a, "frame_0"); + Output exit_0 = ops::internal::Exit(root.WithOpName("exit_0"), enter_0); + Output enter_1 = + ops::internal::Enter(root.WithOpName("enter_1"), a, "frame_1"); + Output exit_1 = ops::internal::Exit(root.WithOpName("exit_1"), enter_1); + Output b = ops::Add(root.WithOpName("b"), a, exit_1); + + FixupSourceAndSinkEdges(root.graph()); + + GraphCycles cycles; + TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles)); + EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id())); +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 7ed609c43748062656b631243c01d790519c54fd..ef6b0e67d3c4007f86dc7eef89cacb4cea98fc15 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -40,7 +40,23 @@ namespace tensorflow { XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client, DeviceType device_type) : client_(client), device_type_(std::move(device_type)) {} -XlaCompilationCache::~XlaCompilationCache() = default; +XlaCompilationCache::~XlaCompilationCache() { + // Ensure any use of our programs have completed by waiting for all stream + // executors to complete. + for (auto* executor : client_->backend().stream_executors()) { + bool ok = executor->SynchronizeAllActivity(); + if (!ok) { + LOG(ERROR) << "Error synchronizing activity while waiting for all " + "programs to complete"; + } + } + // TODO(b/110813685): Think about the program ownership model. Programs are + // currently owned by the compilation cache which means we must wait for + // program completion in the destructor. There are multiple compilation caches + // around, which complicates things a little. Perhaps having programs be + // shared_ptrs (an invasive change) would make the model easier to reason + // about? 
+} string XlaCompilationCache::DebugString() { return "XLA JIT compilation cache"; @@ -193,7 +209,9 @@ Status XlaCompilationCache::BuildExecutable( argument_layouts[i] = &result.xla_input_shapes[i]; } xla::ExecutableBuildOptions build_options; - build_options.set_device_ordinal(client_->default_device_ordinal()); + build_options.set_device_ordinal(options.device_ordinal != -1 + ? options.device_ordinal + : client_->default_device_ordinal()); build_options.set_result_layout(result.xla_output_shape); build_options.set_device_allocator(options.device_allocator); @@ -212,7 +230,7 @@ Status XlaCompilationCache::Compile( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options) { + const XlaCompiler::CompileOptions& compile_options) { return CompileImpl(options, function, constant_args, variable_args, ctx, compilation_result, executable, compile_options, false); } @@ -223,7 +241,7 @@ Status XlaCompilationCache::CompileSingleOp( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options) { + const XlaCompiler::CompileOptions& compile_options) { const NodeDef& def = ctx->op_kernel().def(); NameAttrList name; name.set_name(def.op()); @@ -238,8 +256,9 @@ Status XlaCompilationCache::CompileImpl( const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options, + const XlaCompiler::CompileOptions& compile_options, bool compile_single_op) { + CHECK_NE(executable, nullptr); VLOG(1) << "XlaCompilationCache::Compile " << DebugString(); if (VLOG_IS_ON(2)) { @@ -277,7 +296,7 @@ Status XlaCompilationCache::CompileImpl( // protect the contents of the cache entry. 
Entry* entry; { - mutex_lock lock(mu_); + mutex_lock lock(compile_cache_mu_); // Find or create a cache entry. std::unique_ptr& e = cache_[signature]; if (!e) { @@ -293,6 +312,8 @@ Status XlaCompilationCache::CompileImpl( if (!entry->compiled) { VLOG(1) << "Compilation cache miss for signature: " << SignatureDebugString(signature); + tensorflow::Env* env = tensorflow::Env::Default(); + const uint64 compile_start_us = env->NowMicros(); // Do the actual JIT compilation without holding the lock (it can take // a long time.) std::vector args; @@ -303,26 +324,42 @@ Status XlaCompilationCache::CompileImpl( entry->compiled = true; if (compile_single_op) { - entry->compilation_status = compiler.CompileSingleOp( - compile_options ? *compile_options : XlaCompiler::CompileOptions(), - signature.name, ctx, args, &entry->compilation_result); + entry->compilation_status = + compiler.CompileSingleOp(compile_options, signature.name, ctx, args, + &entry->compilation_result); } else { entry->compilation_status = compiler.CompileFunction( - compile_options ? 
*compile_options : XlaCompiler::CompileOptions(), - function, args, &entry->compilation_result); + compile_options, function, args, &entry->compilation_result); } - } - *compilation_result = &entry->compilation_result; - if (entry->compilation_status.ok() && executable) { - if (entry->executable == nullptr) { - entry->compilation_status = BuildExecutable( - options, entry->compilation_result, &entry->executable); + TF_RETURN_IF_ERROR(entry->compilation_status); + CHECK_EQ(entry->executable.get(), nullptr); + entry->compilation_status = + BuildExecutable(options, entry->compilation_result, &entry->executable); + + const uint64 compile_end_us = env->NowMicros(); + const uint64 compile_time_us = compile_end_us - compile_start_us; + { + mutex_lock lock(compile_stats_mu_); + auto it = compile_stats_.emplace(function.name(), CompileStats{}).first; + it->second.compile_count++; + it->second.cumulative_compile_time_us += compile_time_us; + VLOG(1) << "compiled " << function.name() << " " + << it->second.compile_count + << " times, compile time: " << compile_time_us + << " us, cumulative: " << it->second.cumulative_compile_time_us + << " us (" + << tensorflow::strings::HumanReadableElapsedTime(compile_time_us / + 1.0e6) + << " / " + << tensorflow::strings::HumanReadableElapsedTime( + it->second.cumulative_compile_time_us / 1.0e6) + << ")"; } - *executable = entry->executable.get(); } - - Status status = entry->compilation_status; - return status; + TF_RETURN_IF_ERROR(entry->compilation_status); + *compilation_result = &entry->compilation_result; + *executable = entry->executable.get(); + return Status::OK(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index be1043d8c3fc0573922837e541615114a6d7a1a5..10ad87e38cc4d614e869782329f84351bc3b1f0b 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -24,6 +24,7 @@ limitations 
under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -69,7 +70,7 @@ class XlaCompilationCache : public ResourceBase { OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options); + const XlaCompiler::CompileOptions& compile_options); // As above, but calls XlaCompiler::CompileSingleOp instead of // XlaCompiler::CompileFunction. @@ -79,7 +80,7 @@ class XlaCompilationCache : public ResourceBase { const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options); + const XlaCompiler::CompileOptions& compile_options); xla::LocalClient* client() const { return client_; } const DeviceType& device_type() const { return device_type_; } @@ -95,7 +96,7 @@ class XlaCompilationCache : public ResourceBase { OpKernelContext* ctx, const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable, - const XlaCompiler::CompileOptions* compile_options, + const XlaCompiler::CompileOptions& compile_options, bool compile_single_op); // Takes `result` which has been compiled from a Tensorflow subgraph to a @@ -150,9 +151,22 @@ class XlaCompilationCache : public ResourceBase { std::unique_ptr executable GUARDED_BY(mu); }; - mutex mu_; - std::unordered_map, Signature::Hash> cache_ - GUARDED_BY(mu_); + mutex compile_cache_mu_; + gtl::FlatMap, Signature::Hash> cache_ + GUARDED_BY(compile_cache_mu_); + + struct CompileStats { + // Number of times the cluster has been (re-)compiled. + int64 compile_count = 0; + + // Cumulative time spent compiling the cluster. 
+ int64 cumulative_compile_time_us = 0; + }; + mutex compile_stats_mu_; + + // Maps cluster names to compilation statistics for said cluster. + gtl::FlatMap compile_stats_ + GUARDED_BY(compile_stats_mu_); TF_DISALLOW_COPY_AND_ASSIGN(XlaCompilationCache); }; diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index ab644ff5a61c407b246b97af5328bf5cd8c1893b..3ba48e8c318f84a4691fb74434bc009fdd0d81bf 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -53,7 +54,9 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, // Builds an XLA allocator for the device. XlaComputationLaunchContext launch_context( - client, client->backend().memory_allocator(), true); + client, client->backend().memory_allocator(), + /*allocate_xla_tensors=*/true, + /*use_multiple_streams=*/metadata.UseMultipleStreams()); launch_context.PopulateInputs(ctx, result, variables); @@ -61,17 +64,22 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; TF_RET_CHECK(stream); - VLOG(2) << "Executing computation."; + VLOG(2) << "Executing computation: " << name(); + for (const xla::ShapedBuffer* arg : launch_context.arguments()) { + VLOG(2) << name() << ": " << *arg; + } xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(client->backend().memory_allocator()); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); - run_options.set_rng_seed(ctx->step_id()); + run_options.set_rng_seed(GetXLARandomSeed()); - auto run_result = executable->Run(launch_context.arguments(), run_options); + xla::StatusOr run_result = + executable->Run(launch_context.arguments(), run_options); TF_RETURN_IF_ERROR(run_result.status()); - launch_context.PopulateOutputs(ctx, result, run_result.ConsumeValueOrDie()); + TF_RETURN_IF_ERROR(launch_context.PopulateOutputs( + ctx, result, run_result.ConsumeValueOrDie())); return Status::OK(); } @@ -151,8 +159,7 @@ Status XlaCompileOnDemandOp::Compile( core::ScopedUnref cache_ref(cache); XlaCompiler::Options options; - DeviceType device_type = metadata.jit_device_type(); - options.device_type = &device_type; + options.device_type = metadata.jit_device_type(); options.client = metadata.client(); options.flib_def = new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{}); @@ -160,10 +167,17 @@ Status XlaCompileOnDemandOp::Compile( XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; + // Optimization: don't resolve constants. If we resolve constants we never + // emit them on the device, meaning that if they are needed by a following + // computation the host has to transfer them. + compile_options.resolve_compile_time_constants = false; + // Optimization: where possible, have the computation return a naked array + // rather than a one-element tuple. 
+ compile_options.always_return_tuple = false; std::map variable_args = GetVariables(ctx); return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx, - result, executable, &compile_options); + result, executable, compile_options); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 43648402f65c656b6b4eb2e83e61ce45f1c73669..7e159e3171113b0d53f03bb676ac9c21db7fe77a 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -54,6 +54,7 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options, DEVICE_CPU_XLA_JIT, options, name_prefix, registration, /*transfer_as_literal=*/false, + /*use_multiple_streams=*/false, /*shape_representation_fn=*/{}, /*padded_shape_fn=*/{}, &device)); devices->push_back(device.release()); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index ed007d603ea1b3d27dd25f00726261cdd029c20c..f31879a2bc517d8b05e129cf0777196d0ee4dc79 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device_context.h" @@ -26,6 +27,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/dma_helper.h" @@ -100,7 +102,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( } std::unique_ptr alloc = - xla::MakeUnique(); + absl::make_unique(); XlaDeviceAllocator* alloc_ptr = alloc.get(); state.allocators_[{backend, device_ordinal}] = std::move(alloc); return alloc_ptr; @@ -130,7 +132,7 @@ Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) { const string& jit_device_name, const SessionOptions& options, const string& name_prefix, const XlaOpRegistry::DeviceRegistration& registration, - bool transfer_as_literal, + bool transfer_as_literal, bool use_multiple_streams, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, const PaddedShapeFn& padded_shape_fn, std::unique_ptr* device) { VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":" @@ -151,22 +153,24 @@ Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) { DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(), strings::StrCat("device: ", device_name, " device")); - device->reset(new XlaDevice( - options, attrs, device_ordinal, DeviceType(jit_device_name), - platform.ValueOrDie(), transfer_as_literal, shape_representation_fn, - padded_shape_fn ? padded_shape_fn : DefaultPaddedShapeFn)); + device->reset( + new XlaDevice(options, attrs, device_ordinal, DeviceType(jit_device_name), + platform.ValueOrDie(), transfer_as_literal, + use_multiple_streams, shape_representation_fn, + padded_shape_fn ? 
padded_shape_fn : DefaultPaddedShapeFn)); return Status::OK(); } XlaDevice::Metadata::Metadata( int device_ordinal, se::Platform* platform, const DeviceType& device_type, XlaCompiler::ShapeRepresentationFn shape_representation_fn, - PaddedShapeFn padded_shape_fn) + PaddedShapeFn padded_shape_fn, bool use_multiple_streams) : device_ordinal_(device_ordinal), device_type_(device_type), platform_(platform), shape_representation_fn_(std::move(shape_representation_fn)), - padded_shape_fn_(std::move(padded_shape_fn)) {} + padded_shape_fn_(std::move(padded_shape_fn)), + use_multiple_streams_(use_multiple_streams) {} int XlaDevice::Metadata::device_ordinal() const { return device_ordinal_; } @@ -181,14 +185,13 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const { return device_type_; } -/* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx, - const Metadata** metadata) { +/*static*/ Status XlaDevice::GetMetadataFromDevice( + DeviceBase* device, const XlaDevice::Metadata** metadata) { *metadata = nullptr; - XlaDevice* xla_device = - dynamic_cast(ctx->device()->UnderlyingDevice()); + XlaDevice* xla_device = dynamic_cast(device->UnderlyingDevice()); if (xla_device == nullptr) { return errors::Internal( - "Cannot get XLA metadata from non-XLA device \"", ctx->device()->name(), + "Cannot get XLA metadata from non-XLA device \"", device->name(), "\". GetMetadata must only be called on an XLA device. 
Either an " "internal bug has been triggered, or an XLA-specific op has been " "placed on the wrong device."); @@ -197,27 +200,42 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const { return Status::OK(); } +/* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx, + const Metadata** metadata) { + return GetMetadataFromDevice(ctx->device(), metadata); +} + +/* static */ Status XlaDevice::GetMetadata(OpKernelConstruction* ctx, + const Metadata** metadata) { + return GetMetadataFromDevice(ctx->device(), metadata); +} + XlaDevice::XlaDevice( const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, - se::Platform* platform, bool transfer_as_literal, + se::Platform* platform, bool transfer_as_literal, bool use_multiple_streams, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, const PaddedShapeFn& padded_shape_fn) : LocalDevice(options, attrs), xla_metadata_(device_ordinal, platform, jit_device_name, - shape_representation_fn, padded_shape_fn), + shape_representation_fn, padded_shape_fn, + use_multiple_streams), device_ordinal_(device_ordinal), jit_device_name_(jit_device_name), - xla_allocator_(nullptr), platform_(platform), + use_multiple_streams_(use_multiple_streams), transfer_as_literal_(transfer_as_literal), shape_representation_fn_(shape_representation_fn) { - VLOG(1) << "Created XLA device " << jit_device_name; + VLOG(1) << "Created XLA device " << jit_device_name << " " << this; + thread_pool_.reset(new thread::ThreadPool(options.env, "xla_device", + /*num_threads=*/1)); } XlaDevice::~XlaDevice() { - if (gpu_device_info_ != nullptr) { - gpu_device_info_->default_context->Unref(); + VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this; + mutex_lock lock(mu_); + if (device_context_) { + device_context_->Unref(); } } @@ -233,6 +251,11 @@ xla::LocalClient* XlaDevice::client() const { } Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) { + 
mutex_lock lock(mu_); + return GetAllocatorLocked(attr); +} + +Allocator* XlaDevice::GetAllocatorLocked(AllocatorAttributes attr) { if (attr.on_host()) { return cpu_allocator(); } @@ -245,69 +268,140 @@ Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) { return xla_allocator_; } -xla::StatusOr XlaDevice::GetStream() { - if (!stream_) { - xla::Backend* backend = client()->mutable_backend(); - TF_ASSIGN_OR_RETURN(stream_, backend->BorrowStream(device_ordinal_)); +Status XlaDevice::EnsureDeviceContextOk() { + mutex_lock lock(mu_); + return GetDeviceContextLocked().status(); +} + +Status XlaDevice::EnsureStreamOkLocked(xla::Backend* backend, + const string& name, + std::shared_ptr* stream, + bool* stream_was_changed) { + if (!(*stream) || !(*stream)->ok()) { + xla::StreamPool::Ptr ptr; + TF_ASSIGN_OR_RETURN(ptr, backend->BorrowStream(device_ordinal_)); + *stream = std::shared_ptr(std::move(ptr)); + VLOG(1) << "XlaDevice " << this << " new " << name << " " + << (*stream)->DebugStreamPointers(); + *stream_was_changed = true; } - return stream_.get(); + return Status::OK(); } -Status XlaDevice::CreateAndSetGpuDeviceInfo() { - if (gpu_device_info_ == nullptr) { - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - // Call GetAllocator for the side-effect of ensuring the allocator - // is created. - GetAllocator({}); - // XlaDevice owns both gpu_device_info_ and - // gpu_device_info_->default_context. - gpu_device_info_ = MakeUnique(); - gpu_device_info_->stream = stream; - gpu_device_info_->default_context = new XlaDeviceContext( - stream, client(), transfer_as_literal_, shape_representation_fn_); - set_tensorflow_gpu_device_info(gpu_device_info_.get()); +xla::StatusOr XlaDevice::GetDeviceContextLocked() { + xla::Backend* backend = client()->mutable_backend(); + + // Ensure all our streams are valid, borrowing new streams if necessary. 
+ bool need_new_device_context = !device_context_; + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "stream", &stream_, + &need_new_device_context)); + + std::shared_ptr host_to_device_stream = stream_; + std::shared_ptr device_to_host_stream = stream_; + if (use_multiple_streams_) { + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "host_to_device_stream", + &host_to_device_stream_, + &need_new_device_context)); + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "device_to_host_stream", + &device_to_host_stream_, + &need_new_device_context)); + host_to_device_stream = host_to_device_stream_; + device_to_host_stream = device_to_host_stream_; } - return Status::OK(); + if (!need_new_device_context) { + return device_context_; + } + + // At this point we know we need a new device context. + // Call GetAllocator for the side-effect of ensuring the allocator is created. + GetAllocatorLocked({}); + if (device_context_) { + device_context_->Unref(); + } + // The XlaDeviceContext keeps a reference count to the streams, and the + // XlaDeviceContext remains live for the duration of a Executor run. This + // ensures that the streams remain live for the duration of a run, even if + // an error is encountered and the streams are replaced with new ones. + device_context_ = new XlaDeviceContext( + stream_, host_to_device_stream, device_to_host_stream, client(), + transfer_as_literal_, shape_representation_fn_, thread_pool_.get()); + VLOG(1) << "XlaDevice " << this << " new XlaDeviceContext " + << device_context_; + + // Create and set a new GpuDeviceInfo, if necessary. + // + // TODO(b/78232898): This isn't thread-safe; there is a race between the call + // to set_tensorflow_gpu_device_info() with ops that call the getter + // tensorflow_gpu_device_info(). This isn't trivially fixed by adding locking + // to those methods; see the bug for details. Our only saving grace at the + // moment is that this race doesn't seem to occur in practice. 
+ if (use_gpu_device_info_) { + auto gpu_device_info = absl::make_unique(); + gpu_device_info->stream = stream_.get(); + gpu_device_info->default_context = device_context_; + set_tensorflow_gpu_device_info(gpu_device_info.get()); + gpu_device_info_ = std::move(gpu_device_info); + VLOG(1) << "XlaDevice " << this << " new GpuDeviceInfo " + << gpu_device_info_.get(); + } + + return device_context_; +} + +Status XlaDevice::UseGpuDeviceInfo() { + mutex_lock lock(mu_); + use_gpu_device_info_ = true; + return GetDeviceContextLocked().status(); } Status XlaDevice::FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) { VLOG(1) << "XlaDevice::FillContextMap"; + mutex_lock lock(mu_); + TF_ASSIGN_OR_RETURN(XlaDeviceContext * device_context, + GetDeviceContextLocked()); + device_context_map->resize(graph->num_node_ids()); - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - // Call GetAllocator for the side-effect of ensuring the allocator is created. - GetAllocator({}); - auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_, - shape_representation_fn_); for (Node* n : graph->nodes()) { VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name(); - ctx->Ref(); - (*device_context_map)[n->id()] = ctx; + device_context->Ref(); + (*device_context_map)[n->id()] = device_context; } - ctx->Unref(); return Status::OK(); } void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { - VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":" + VLOG(2) << "XlaDevice::Compute " << op_kernel->name() << ":" << op_kernel->type_string(); - // When Xprof profiling is off (which is the default), constructing the - // activity is simple enough that its overhead is negligible. 
- tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), - op_kernel->IsExpensive()); - op_kernel->Compute(context); + TracingDevice::Compute(op_kernel, context); } void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) { - VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" + VLOG(2) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" << op_kernel->type_string(); tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), op_kernel->IsExpensive()); op_kernel->ComputeAsync(context, done); } +Status XlaDevice::Sync() { + VLOG(1) << "XlaDevice::Sync"; + std::shared_ptr stream; + { + mutex_lock lock(mu_); + stream = stream_; + } + if (!stream) return Status::OK(); + + if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) { + return errors::Internal("XlaDevice::Sync() failed."); + } + VLOG(1) << "XlaDevice::Sync completed"; + return Status::OK(); +} + Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { @@ -323,16 +417,17 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, if (alloc_attrs.on_host()) { *tensor = parsed; } else { - Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape()); + mutex_lock lock(mu_); + TF_ASSIGN_OR_RETURN(XlaDeviceContext * device_context, + GetDeviceContextLocked()); + Allocator* allocator = GetAllocatorLocked(alloc_attrs); + Tensor copy(allocator, parsed.dtype(), parsed.shape()); Notification n; - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - XlaTransferManager manager(stream, client(), transfer_as_literal_, - shape_representation_fn_); - manager.CopyCPUTensorToDevice(&parsed, this, ©, - [&n, &status](const Status& s) { - status = s; - n.Notify(); - }); + device_context->CopyCPUTensorToDevice(&parsed, this, ©, + [&n, &status](const Status& s) { + status = s; + n.Notify(); + }); 
n.WaitForNotification(); *tensor = copy; } diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 02e88ee6793e984a7b782790f8011cbcbc5a5026..92891ffa8c6e4a19623172574b17d90fd344c570 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -25,6 +25,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ +#include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -39,6 +40,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace tensorflow { @@ -57,7 +59,7 @@ class XlaDevice : public LocalDevice { Metadata(int device_ordinal, se::Platform* platform, const DeviceType& device_type, XlaCompiler::ShapeRepresentationFn shape_representation_fn, - PaddedShapeFn padded_shape_fn); + PaddedShapeFn padded_shape_fn, bool use_multiple_streams); // The index of the device on this host. int device_ordinal() const; @@ -70,12 +72,15 @@ class XlaDevice : public LocalDevice { } const PaddedShapeFn& padded_shape_fn() const { return padded_shape_fn_; } + bool UseMultipleStreams() const { return use_multiple_streams_; } + private: const int device_ordinal_; const DeviceType device_type_; se::Platform* platform_; // Not owned. XlaCompiler::ShapeRepresentationFn shape_representation_fn_; PaddedShapeFn padded_shape_fn_; + const bool use_multiple_streams_; TF_DISALLOW_COPY_AND_ASSIGN(Metadata); }; @@ -83,12 +88,18 @@ class XlaDevice : public LocalDevice { // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by `ctx`. 
static Status GetMetadata(OpKernelContext* ctx, const Metadata** metadata); + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by `ctx`. + static Status GetMetadata(OpKernelConstruction* ctx, + const Metadata** metadata); + // Factory function. 'platform_name' is the name of the XLA platform. // 'device_name' is the name of the Tensorflow device to create. // 'jit_device_name' is the name of the corresponding JIT device. // 'transfer_as_literal' is true if device<->host transfers must be done using // XLA's TransferLiteral{To,From}Device interface. If false, we can use // ThenMemcpy instead. + // If 'use_multiple_streams' is true, we create separate streams for + // host-to-device and device-to-host communication. // If padded_shape_fn is empty, a default implementation that returns // the on-host shape is used. static Status Create( @@ -96,7 +107,7 @@ class XlaDevice : public LocalDevice { int device_ordinal, const string& jit_device_name, const SessionOptions& options, const string& name_prefix, const XlaOpRegistry::DeviceRegistration& registration, - bool transfer_as_literal, + bool transfer_as_literal, bool use_multiple_streams, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, const PaddedShapeFn& padded_shape_fn, std::unique_ptr* device); @@ -106,54 +117,96 @@ class XlaDevice : public LocalDevice { XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, se::Platform* platform, bool transfer_as_literal, + bool use_multiple_streams, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, const PaddedShapeFn& padded_shape_fn); ~XlaDevice() override; - Allocator* GetAllocator(AllocatorAttributes attr) override; + Allocator* GetAllocator(AllocatorAttributes attr) override + LOCKS_EXCLUDED(mu_); void Compute(OpKernel* op_kernel, OpKernelContext* context) override; void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, 
AsyncOpKernel::DoneCallback done) override; - Status Sync() override { return Status::OK(); } + Status Sync() override; Status FillContextMap(const Graph* graph, - DeviceContextMap* device_context_map) override; + DeviceContextMap* device_context_map) override + LOCKS_EXCLUDED(mu_); Status MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, - Tensor* tensor) override; + Tensor* tensor) override LOCKS_EXCLUDED(mu_); - xla::LocalClient* client() const; const Metadata& metadata() { return xla_metadata_; } - xla::StatusOr GetStream(); - // If not already set, create and set GpuDeviceInfo. - // Not thread-safe - Status CreateAndSetGpuDeviceInfo(); + // Ensures the DeviceContext associated with this XlaDevice is created and + // valid (i.e. all streams are ok). If any state is not valid, a new + // DeviceContext will be created. + // + // TODO(b/111859745): The Eager context needs to call this method to recover + // from failures. + Status EnsureDeviceContextOk() LOCKS_EXCLUDED(mu_); + + // Instructs this XlaDevice to set a GpuDeviceInfo, which holds extra + // information for GPU and TPU devices. + Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_); private: + xla::LocalClient* client() const; + Allocator* GetAllocatorLocked(AllocatorAttributes attr) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + Status EnsureStreamOkLocked(xla::Backend* backend, const string& name, + std::shared_ptr* stream, + bool* stream_was_changed) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + xla::StatusOr GetDeviceContextLocked() + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + static Status GetMetadataFromDevice(DeviceBase* device, + const XlaDevice::Metadata** metadata); + + mutex mu_; // The metadata of this XlaDevice. const Metadata xla_metadata_; // Which hardware device in the client's platform this XlaDevice controls. const int device_ordinal_; // The name of the device that is used to compile Ops for this XlaDevice. 
- DeviceType jit_device_name_; + const DeviceType jit_device_name_; + // The platform for this device. + se::Platform* const platform_; // Not owned. // Memory allocator associated with this device. - Allocator* xla_allocator_; // Not owned. - se::Platform* platform_; // Not owned. + Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and // computations enqueued by XLA. - xla::Backend::StreamPtr stream_; + std::shared_ptr stream_ GUARDED_BY(mu_); + // If false, only stream_ is valid and all computation and transfers use + // stream_. If true, computation is performed by stream_ and transfers are + // performed by host_to_device/device_to_host_stream. + const bool use_multiple_streams_; + // If use_multiple_streams_, host to device transfers are performed using this + // stream. + std::shared_ptr host_to_device_stream_ GUARDED_BY(mu_); + // If use_multiple_streams_, device to host transfers are performed using this + // stream. + std::shared_ptr device_to_host_stream_ GUARDED_BY(mu_); // Must we use XLA's transfer manager for correct host<->device transfers? if // false, we can use ThenMemcpy() instead. - bool transfer_as_literal_; - XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + const bool transfer_as_literal_; + const XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + + // The device context accessed by all users of the XlaDevice, set by calls to + // EnsureDeviceContextOk. If gpu_device_info_ is non-null, this pointer is + // also filled in to that struct. XlaDeviceContext is a ref-counted object. + XlaDeviceContext* device_context_ GUARDED_BY(mu_) = nullptr; + + // Holds extra information for GPU and TPU devices, e.g. the device context. 
+ bool use_gpu_device_info_ GUARDED_BY(mu_) = false; + std::unique_ptr gpu_device_info_ GUARDED_BY(mu_); - // If set, holds default device context (that we must Unref) - // and its stream. - std::unique_ptr gpu_device_info_; + // Thread pool used for running closures + std::unique_ptr thread_pool_; }; // Builds OpKernel registrations on 'device' for the JIT operators diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 71e63b110b3b132a57fc291e53a165954c72a03c..ee07c5c9643ef1119b9077326c1cf7c83930e90c 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device_context.h" +#include + +#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -48,17 +51,27 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) { void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); } XlaTransferManager::XlaTransferManager( - se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn) - : stream_(stream), + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, xla::LocalClient* client, + bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool) + : stream_(std::move(compute_stream)), + host_to_device_stream_(std::move(host_to_device_stream)), + device_to_host_stream_(std::move(device_to_host_stream)), client_(client), transfer_manager_(client->backend().transfer_manager()), transfer_as_literal_(transfer_as_literal), - shape_representation_fn_(std::move(shape_representation_fn)) { + 
shape_representation_fn_(std::move(shape_representation_fn)), + thread_pool_(thread_pool) { + CHECK(host_to_device_stream_ != nullptr); + CHECK(device_to_host_stream_ != nullptr); + CHECK(stream_ != nullptr); if (!shape_representation_fn_) { - shape_representation_fn_ = [](const TensorShape& shape, DataType dtype) { - return shape; - }; + shape_representation_fn_ = + [](const TensorShape& shape, + DataType dtype) -> xla::StatusOr { return shape; }; } } @@ -67,99 +80,126 @@ Status XlaTransferManager::TransferLiteralToDevice( xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(), host_tensor.shape(), &xla_shape)); - xla::BorrowingLiteral literal( + // Create a reference to hold onto host_tensor until after the literal has + // been transferred. Also make sure the literal exists until the function + // asynchronously completes, as it will be wrapped in an xla::LiteralSlice. + TensorReference ref(host_tensor); + auto literal = std::make_shared( static_cast(DMAHelper::base(&host_tensor)), xla_shape); - const xla::ShapedBuffer& shaped_buffer = - XlaTensor::FromTensor(device_tensor)->shaped_buffer(); - VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " " + XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); + const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer(); + VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " " << shaped_buffer.ToString(); - return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal, - shaped_buffer); + if (UseMultipleStreams() && !transfer_manager_->CanShapedBufferBeAccessedNow( + stream_->parent(), shaped_buffer)) { + // Initially wait for the compute stream so that memory allocations are + // synchronized. 
+ host_to_device_stream_->ThenWaitFor(stream_.get()); + } + TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync( + host_to_device_stream_.get(), *literal, shaped_buffer)); + if (UseMultipleStreams()) { + auto event = std::make_shared(stream_->parent()); + TF_RET_CHECK(event->Init()) << "Event failed to initialize!"; + host_to_device_stream_->ThenRecordEvent(event.get()); + xla_tensor->SetDefinedOn(host_to_device_stream_.get(), std::move(event)); + } + // Unref the host tensor, and capture the literal shared_ptr too so it goes + // out of scope when the lambda completes. + host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); }); + + return Status::OK(); } -Status XlaTransferManager::TransferLiteralFromDevice( - Tensor* host_tensor, const Tensor& device_tensor) const { +void XlaTransferManager::TransferLiteralFromDevice( + Tensor* host_tensor, const Tensor& device_tensor, + const StatusCallback& done) const { + xla::MutableBorrowingLiteral literal; + TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(host_tensor, &literal)); + const xla::ShapedBuffer& shaped_buffer = XlaTensor::FromTensor(&device_tensor)->shaped_buffer(); - TF_ASSIGN_OR_RETURN(std::unique_ptr literal, - transfer_manager_->TransferLiteralFromDevice( - stream_->parent(), shaped_buffer)); - VLOG(1) << "Transfer from device as literal: " << literal->ToString() << " " - << shaped_buffer.ToString(); - Tensor tensor; - TF_RETURN_IF_ERROR( - LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor)); - // Reshape the tensor back to its declared shape. 
- if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) { - return errors::Internal( - "Tensor::CopyFrom failed when copying from XLA device to CPU"); - } - return Status::OK(); + TensorReference ref(device_tensor); + transfer_manager_->TransferLiteralFromDevice( + device_to_host_stream_.get(), shaped_buffer, literal, + [=, &shaped_buffer](xla::Status status) { + ref.Unref(); + done([&]() -> Status { + VLOG(1) << "Transfer from device as literal: " + << shaped_buffer.ToString(); + return status; + }()); + }); } void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, StatusCallback done) const { - if (cpu_tensor->NumElements() > 0) { - VLOG(2) << "CopyCPUTensorToDevice " - << reinterpret_cast(cpu_tensor->tensor_data().data()) - << " " - << reinterpret_cast( - device_tensor->tensor_data().data()) - << " " << cpu_tensor->NumElements() << " " - << cpu_tensor->shape().DebugString() << " " - << device_tensor->shape().DebugString(); - - void* src_ptr = const_cast(DMAHelper::base(cpu_tensor)); - const int64 total_bytes = cpu_tensor->TotalBytes(); - - XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); - CHECK(xla_tensor); - - TensorShape shape = shape_representation_fn_(device_tensor->shape(), - device_tensor->dtype()); - if (!xla_tensor->has_shaped_buffer()) { - Status s = xla_tensor->AllocateShapedBuffer( - device_tensor->dtype(), shape, client_, - stream_->parent()->device_ordinal()); - if (!s.ok()) { - done(s); - return; - } - } + if (cpu_tensor->NumElements() == 0) { + VLOG(2) << "CopyCPUTensorToDevice empty tensor"; + done(Status::OK()); + return; + } - Status status; - if (transfer_as_literal_) { - Tensor reshaped_cpu_tensor; - if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) { - done(errors::Internal( - "Tensor::CopyFrom failed when copying from CPU to XLA device")); - return; - } - status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor); - } else { - se::DeviceMemoryBase 
dev_dst_ptr = - XlaTensor::DeviceMemoryFromTensor(*device_tensor); - stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes); - // TODO(hpucha): Make this asynchronous. - Status block_status = stream_->BlockHostUntilDone(); - if (!block_status.ok()) { - status = xla::InternalError( - "Failed to complete data transfer on stream %p: %s", stream_, - block_status.error_message().c_str()); - } - } - xla_tensor->set_host_tensor(*cpu_tensor); + VLOG(2) << "CopyCPUTensorToDevice " + << reinterpret_cast(cpu_tensor->tensor_data().data()) + << " " + << reinterpret_cast(device_tensor->tensor_data().data()) + << " " << cpu_tensor->NumElements() << " " + << cpu_tensor->shape().DebugString() << " " + << device_tensor->shape().DebugString(); + + void* src_ptr = const_cast(DMAHelper::base(cpu_tensor)); + const int64 total_bytes = cpu_tensor->TotalBytes(); + + XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); + CHECK(xla_tensor); - done(status); + xla::StatusOr shape_or_status = + shape_representation_fn_(device_tensor->shape(), device_tensor->dtype()); + if (!shape_or_status.ok()) { + done(shape_or_status.status()); return; } + TensorShape shape = shape_or_status.ValueOrDie(); + if (!xla_tensor->has_shaped_buffer()) { + Status s = + xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_, + stream_->parent()->device_ordinal()); + if (!s.ok()) { + done(s); + return; + } + } - VLOG(2) << "CopyCPUTensorToDevice empty tensor"; - done(Status::OK()); + Status status; + if (transfer_as_literal_) { + Tensor reshaped_cpu_tensor; + if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) { + done(errors::Internal( + "Tensor::CopyFrom failed when copying from CPU to XLA device")); + return; + } + status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor); + } else { + se::DeviceMemoryBase dev_dst_ptr = + XlaTensor::DeviceMemoryFromTensor(*device_tensor); + host_to_device_stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes); + // TODO(hpucha): Make this 
asynchronous. + Status block_status = host_to_device_stream_->BlockHostUntilDone(); + if (!block_status.ok()) { + status = xla::InternalError( + "Failed to complete data transfer on stream %p: %s", + host_to_device_stream_.get(), block_status.error_message().c_str()); + } + } + if (status.ok()) { + xla_tensor->set_host_tensor(*cpu_tensor); + } + done(status); } void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, @@ -167,84 +207,129 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, Device* device, Tensor* cpu_tensor, StatusCallback done) { - if (device_tensor->NumElements() > 0) { - VLOG(2) << "CopyDeviceTensorToCPU " - << reinterpret_cast( - device_tensor->tensor_data().data()) - << " " - << reinterpret_cast(cpu_tensor->tensor_data().data()) - << " " << device_tensor->NumElements() << " " - << cpu_tensor->shape().DebugString() << " " - << device_tensor->shape().DebugString(); - - const int64 total_bytes = cpu_tensor->TotalBytes(); - se::DeviceMemoryBase dev_src_ptr = - XlaTensor::DeviceMemoryFromTensor(*device_tensor); - void* dst_ptr = DMAHelper::base(cpu_tensor); - - Status status; - if (transfer_as_literal_) { - status = TransferLiteralFromDevice(cpu_tensor, *device_tensor); - } else { - stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes); - // TODO(hpucha): Make this asynchronous. 
- Status block_status = stream_->BlockHostUntilDone(); - if (!block_status.ok()) { - status = xla::InternalError( - "Failed to complete data transfer on stream %p: %s", stream_, - block_status.error_message().c_str()); - } - } + if (device_tensor->NumElements() == 0) { + VLOG(2) << "CopyDeviceTensorToCPU empty tensor"; + done(Status::OK()); + return; + } + VLOG(2) << "CopyDeviceTensorToCPU " + << reinterpret_cast(device_tensor->tensor_data().data()) + << " " + << reinterpret_cast(cpu_tensor->tensor_data().data()) + << " " << device_tensor->NumElements() << " " + << cpu_tensor->shape().DebugString() << " " + << device_tensor->shape().DebugString(); + + const int64 total_bytes = cpu_tensor->TotalBytes(); + se::DeviceMemoryBase dev_src_ptr = + XlaTensor::DeviceMemoryFromTensor(*device_tensor); + void* dst_ptr = DMAHelper::base(cpu_tensor); + XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); + + if (se::Event* event = + xla_tensor->GetDefinitionEvent(device_to_host_stream_.get())) { + device_to_host_stream_->ThenWaitFor(event); + xla_tensor->SetDefinedOn(device_to_host_stream_.get()); + } - done(status); + Status status; + if (transfer_as_literal_) { + TransferLiteralFromDevice(cpu_tensor, *device_tensor, done); return; + } else { + device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes); + // TODO(hpucha): Make this asynchronous. + Status block_status = device_to_host_stream_->BlockHostUntilDone(); + if (!block_status.ok()) { + status = xla::InternalError( + "Failed to complete data transfer on stream %p: %s", stream_.get(), + block_status.error_message().c_str()); + } } - VLOG(2) << "CopyDeviceTensorToCPU empty tensor"; - done(Status::OK()); + done(status); } void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor, const StatusCallback& done) { - // TODO(phawkins): replace this code with an asynchronous implementation. 
- auto body = [&]() { + VLOG(2) << "CopyDeviceTensorToDevice " + << reinterpret_cast(src_tensor.tensor_data().data()) + << " " + << reinterpret_cast(dst_tensor->tensor_data().data()); + // Perform memory allocation now, and enqueue the device-to-device transfer. + Status status = [&]() -> Status { if (src_tensor.NumElements() == 0) { return Status::OK(); } + // TODO(jmolloy): We co-opt the device_to_host stream for device to device + // transfers; perhaps we should have a dedicated device to device stream? or + // one per device? + auto device_to_device_stream = stream_; XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor); XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor); CHECK(xla_src && xla_dst) << "Missing destination tensor for device-to-device copy"; if (!xla_dst->has_shaped_buffer()) { - TensorShape shape = - shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()); + TF_ASSIGN_OR_RETURN( + TensorShape shape, + shape_representation_fn_(src_tensor.shape(), src_tensor.dtype())); TF_RETURN_IF_ERROR( xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_, stream_->parent()->device_ordinal())); + if (stream_ != device_to_device_stream) { + // Initially wait for the compute stream so that memory allocations are + // synchronized. 
+ device_to_device_stream->ThenWaitFor(stream_.get()); + } + } + + if (se::Event* event = + xla_src->GetDefinitionEvent(device_to_device_stream.get())) { + device_to_device_stream->ThenWaitFor(event); + xla_src->SetDefinedOn(device_to_device_stream.get()); + } + + auto from_iter = xla_src->shaped_buffer().buffers().begin(); + auto to_iter = xla_dst->shaped_buffer().buffers().begin(); + for (auto end_iter = xla_src->shaped_buffer().buffers().end(); + from_iter != end_iter; ++from_iter, ++to_iter) { + device_to_device_stream->ThenMemcpyD2D( + &to_iter->second, from_iter->second, to_iter->second.size()); + } + + if (UseMultipleStreams()) { + auto event = std::make_shared(stream_->parent()); + TF_RET_CHECK(event->Init()) << "Event failed to initialize"; + device_to_device_stream->ThenRecordEvent(event.get()); + xla_dst->SetDefinedOn(device_to_device_stream.get(), std::move(event)); } - TF_RETURN_IF_ERROR( - xla_dst->shaped_buffer().buffers().ForEachMutableElementWithStatus( - [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { - const se::DeviceMemoryBase& from_buffer = - xla_src->shaped_buffer().buffers().element(index); - CHECK_EQ(buffer->size(), from_buffer.size()); - if (!stream_->parent()->SynchronousMemcpy(buffer, from_buffer, - buffer->size())) { - return errors::Internal("Device to device memcpy failed"); - } - return Status::OK(); - })); return Status::OK(); - }; - done(body()); + }(); + if (!status.ok()) { + return done(status); + } else { + stream_->ThenDoHostCallback([this, done]() { + // We must not call the done closure directly from DoHostCallback to avoid + // a deadlock. If done() is the callback that ends an Executor's run, the + // Executor may call XlaDevice::Sync() inside the callback. This + // deadlocks, because XlaDevice::Sync() waits for all stream activity to + // complete. 
+ thread_pool_->Schedule([done]() { done(Status::OK()); }); + }); + } } XlaDeviceContext::XlaDeviceContext( - se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn) - : manager_(stream, client, transfer_as_literal, - std::move(shape_representation_fn)) {} + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, xla::LocalClient* client, + bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool) + : manager_(std::move(compute_stream), std::move(host_to_device_stream), + std::move(device_to_host_stream), client, transfer_as_literal, + std::move(shape_representation_fn), thread_pool) {} void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index ee346e5653bbf9f393df202572c2150b4989506f..2e7445340cbaf788bfd06260f4376596895231c1 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -47,8 +47,12 @@ class XlaDeviceAllocator : public Allocator { class XlaTransferManager { public: explicit XlaTransferManager( - se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn); + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, + xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, StatusCallback done) const; @@ -59,17 +63,25 @@ class XlaTransferManager { void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor, const StatusCallback& done); - 
se::Stream* stream() const { return stream_; } + se::Stream* stream() const { return stream_.get(); } private: Status TransferLiteralToDevice(const Tensor& host_tensor, Tensor* device_tensor) const; - Status TransferLiteralFromDevice(Tensor* host_tensor, - const Tensor& device_tensor) const; - - // Stream obtained from a Device, used to transfer tensors between - // CPU and device. - se::Stream* stream_; + void TransferLiteralFromDevice(Tensor* host_tensor, + const Tensor& device_tensor, + const StatusCallback& done) const; + bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; } + + // The main compute stream of the device, used to synchronize the transfer + // streams if they are set. + std::shared_ptr stream_; + // The stream to use for transferring data from host to device. Can be + // idential to stream_, but must not be nullptr. + std::shared_ptr host_to_device_stream_; + // The stream to use for transferring data from device to host. Can be + // idential to stream_, but must not be nullptr. + std::shared_ptr device_to_host_stream_; // For the underlying memory allocator and XLA's TransferManager. xla::LocalClient* client_; // Transfer manager, for marshalling data to and from the device. @@ -77,6 +89,9 @@ class XlaTransferManager { // True if we must use XLA's TransferManager for correct device transfers. const bool transfer_as_literal_; XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + + // Thread pool used for running closures + thread::ThreadPool* thread_pool_; }; // DeviceContext for operators assigned to XlaDevice devices. 
The @@ -85,8 +100,12 @@ class XlaTransferManager { class XlaDeviceContext : public DeviceContext { public: explicit XlaDeviceContext( - se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn); + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, + xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index b27c32e9bcca5eb10ff6fc1d44760eff6e75678c..13da5d2f948df671df6d0d80687321eaaa923943 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -23,11 +23,18 @@ limitations under the License. #include "tensorflow/core/kernels/cast_op.h" #include "tensorflow/core/kernels/constant_op.h" #include "tensorflow/core/kernels/control_flow_ops.h" +#include "tensorflow/core/kernels/data/generator_dataset_op.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/kernels/data/prefetch_dataset_op.h" +#include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/function_ops.h" #include "tensorflow/core/kernels/identity_n_op.h" #include "tensorflow/core/kernels/identity_op.h" #include "tensorflow/core/kernels/no_op.h" +#include "tensorflow/core/kernels/queue_op.h" #include "tensorflow/core/kernels/resource_variable_ops.h" #include "tensorflow/core/kernels/sendrecv_ops.h" +#include "tensorflow/core/kernels/shape_ops.h" #include "tensorflow/core/kernels/variable_ops.h" namespace tensorflow { @@ -74,9 +81,7 @@ class XlaAssignVariableOp : public AsyncOpKernel { ConstantOp); \ REGISTER_KERNEL_BUILDER( \ Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), 
IdentityOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("IdentityN").Device(DEVICE).TypeConstraint("T", TYPES), \ - IdentityNOp); \ + REGISTER_KERNEL_BUILDER(Name("IdentityN").Device(DEVICE), IdentityNOp); \ REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp); \ REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE), \ PlaceholderOp); \ @@ -87,6 +92,49 @@ class XlaAssignVariableOp : public AsyncOpKernel { REGISTER_KERNEL_BUILDER( \ Name("ReadVariableOp").Device(DEVICE).HostMemory("resource"), \ ReadVariableOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("DestroyResourceOp").Device(DEVICE).HostMemory("resource"), \ + DestroyResourceOp); \ + REGISTER_KERNEL_BUILDER(Name("Shape") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("Shape") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeOp); \ + REGISTER_KERNEL_BUILDER(Name("ShapeN") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeNOp); \ + REGISTER_KERNEL_BUILDER(Name("ShapeN") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + ShapeNOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("out_type") \ + .TypeConstraint("T", TYPES), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Rank").Device(DEVICE).HostMemory("output").TypeConstraint("T", \ + TYPES), \ + RankOp); \ REGISTER_KERNEL_BUILDER( \ Name("AssignVariableOp").Device(DEVICE).HostMemory("resource"), \ XlaAssignVariableOp); \ @@ -95,7 +143,105 @@ class XlaAssignVariableOp : public AsyncOpKernel { 
REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"), \ SwitchOp); \ REGISTER_KERNEL_BUILDER( \ - Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp); + Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp); \ + REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp); \ + REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp); \ + REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER(Name("LoopCond") \ + .Device(DEVICE) \ + .HostMemory("input") \ + .HostMemory("output"), \ + LoopCondOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("QueueEnqueueV2").Device(DEVICE).HostMemory("handle"), EnqueueOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("QueueDequeueV2").Device(DEVICE).HostMemory("handle"), DequeueOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("QueueCloseV2").Device(DEVICE).HostMemory("handle"), QueueCloseOp); \ + REGISTER_KERNEL_BUILDER(Name("QueueSizeV2") \ + .Device(DEVICE) \ + .HostMemory("size") \ + .HostMemory("handle"), \ + QueueSizeOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("QueueIsClosedV2").Device(DEVICE).HostMemory("handle"), \ + QueueIsClosedOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name(kArgOp).Device(DEVICE).HostMemory("output").TypeConstraint("T", \ + TYPES), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + \ + REGISTER_KERNEL_BUILDER(Name(kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T", TYPES) \ + .HostMemory("input"), \ + RetvalOp); \ + REGISTER_KERNEL_BUILDER(Name(kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("RemoteCall").Device(DEVICE).HostMemory("target"), RemoteCallOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + 
Name("GeneratorDataset").Device(DEVICE).HostMemory("handle"), \ + GeneratorDatasetOp); \ + REGISTER_KERNEL_BUILDER(Name("PrefetchDataset") \ + .Device(DEVICE) \ + .HostMemory("buffer_size") \ + .HostMemory("input_dataset") \ + .HostMemory("handle"), \ + PrefetchDatasetOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE), \ + IteratorHandleOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("MakeIterator").Device(DEVICE).HostMemory("dataset"), \ + MakeIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ + AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ + IteratorGetNextOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE), \ + IteratorGetNextSyncOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + IteratorToStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + IteratorFromStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); + +// TODO(phawkins): currently we do not register the QueueEnqueueMany, +// QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read +// and write the tensors they access in order to concatenate them into a batch. +// We would need either to call out to an XLA computation to perform the +// concatenation, or we would need to refactor those kernels so the splitting +// or merging is done in a separate operator that can be compiled. 
} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..07cfab615157650aea0e15cdafa8c9b0925f9e5f --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc @@ -0,0 +1,342 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_fusion_optimizer.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/jit/deadness_analysis.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" + +namespace tensorflow { + +// Is 'node' an operator that consumes only the shape of its input, not the +// data itself? 
+static bool IsShapeConsumerOp(const Node& node) { + return node.type_string() == "Shape" || node.type_string() == "ShapeN" || + node.type_string() == "Rank" || node.type_string() == "Size"; +} + +// Returns true if the op can be decomposed into XLA ops for which +// there are fusible elemental implementations. +static bool IsXlaFusible(const NodeDef& node) { + static const std::unordered_set* elementwise_ops = + new std::unordered_set( + {// tf2xla/kernels/aggregate_ops.cc + "AddN", + // tf2xla/kernels/binary_ops.cc + "Add", "Sub", "Mul", "Div", "Atan2", "Complex", "FloorDiv", + "FloorMod", "BitwiseAnd", "BitwiseOr", "LeftShift", "RightShift", + "LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv", + "ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "SquaredDifference", + "TruncateDiv", "TruncateMod", "Equal", "NotEqual", "Greater", + "GreaterEqual", "Less", "LessEqual", "SigmoidGrad", "SoftplusGrad", + "SoftsignGrad", "TanhGrad", "Pow", "ApproximateEqual", + // tf2xla/kernels/unary_ops.cc + "ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin", + "Asinh", "Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp", + "Expm1", "Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal", + "Log", "Log1p", "Invert", "LogicalNot", "Neg", "Rint", "Round", + "Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt", + "Square", "Tan", "Tanh", "Real", "Imag", + // tf2xla/kernels/bcast_ops.cc + "BroadcastArgs", "BroadcastGradientArgs", + // tf2xla/kernels/bias_ops.cc + "BiasAdd", "BiasAddV1", "BiasAddGrad" /*(Reduce)*/, + // tf2xla/kernels/cast_op.cc + "Cast", + // tf2xla/kernels/concat_op.cc + "Concat", "ConcatV2", "ConcatOffset", + // tf2xla/kernels/const_op.cc + "Const", + // tf2xla/kernels/elu_op.cc + "Elu", "EluGrad", "Selu", "SeluGrad", + // tf2xla/kernels/fill_op.cc + "Fill", + // tf2xla/kernels/identity_op.cc + "Identity", "IdentityN", "PreventGradient", + "StopGradient", /*"Snapshot",*/ + // tf2xla/kernels/index_ops.cc + "ArgMax", "ArgMin", + // 
tf2xla/kernels/mirror_pad_op.cc + "MirrorPad", + // tf2xla/kernels/one_hot_op.cc + "OneHot", + // tf2xla/kernels/pack_op.cc + "Pack", + // tf2xla/kernels/pad_op.cc + "Pad", "PadV2", + // tf2xla/kernels/relu_op.cc + "Relu", "Relu6", "ReluGrad", "Relu6Grad", + // tf2xla/kernels/reshape_op.cc + "Reshape", + // tf2xla/kernels/reverse_op.cc + "Reverse", "ReverseV2", + // tf2xla/kernels/reverse_sequence_op.cc + "ReverseSequence", + // tf2xla/kernels/shape_op.cc + "Shape", "ShapeN", "Rank", "Size", "ExpandDims", "Squeeze", + "ZerosLike", "OnesLike", + // tf2xla/kernels/slice_op.cc + "Slice", + // tf2xla/kernels/split_op.cc + "Split", "SplitV", + // tf2xla/kernels/strided_slice_op.cc + "StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign", + // tf2xla/kernels/tile_ops.cc + "Tile", + // tf2xla/kernels/transpose_op.cc + "Transpose", "InvertPermutation", + // tf2xla/kernels/unpack_op.cc + "Unpack"}); + + return elementwise_ops->count(node.op()) > 0; +} + +Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* output) { + VLOG(2) << "Here at fusion optimizer"; + + // TODO(hpucha): Implement encapsulation and replacing with XlaLaunch op. + // Once that happens, the expected interaction between this optimizer and when + // the global_jit_level is set is as follows: Fusion optimizer will replace + // appropriate fusion clusters with XlaLaunch nodes. The remaining graph can + // be further compiled where possible via mark_for_compilation_pass. Note that + // this might lead to inefficient clustering, and it is best to use either the + // fusion optimizer or the global_jit flag, and not combine the two. + + // Create a Graph out of GraphDef. This is required currently because the + // helpers around clustering, encapsulation etc work on graphs. 
+ FunctionLibraryDefinition function_library(OpRegistry::Global(), + item.graph.library()); + Graph graph(function_library); + ShapeRefiner shape_refiner(graph.versions(), graph.op_registry()); + shape_refiner.set_require_shape_inference_fns(false); + shape_refiner.set_disable_constant_propagation(true); + ImportGraphDefOptions options; + // Graph optimization happens at the late stage of graph execution, when + // colocation constraints are already validated previously and the device + // placement of nodes has also completed, so there is no need to validate + // colocation constraints again. + options.validate_colocation_constraints = false; + options.validate_shape = false; + TF_RETURN_IF_ERROR( + ImportGraphDef(options, item.graph, &graph, &shape_refiner)); + + std::unique_ptr<DeadnessAnalysis> deadness; + TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(graph, &deadness)); + + // Collect nodes that can be fused via XLA, while ignoring those that + // explicitly ask for XLA: (*) nodes that are marked to be compiled + // explicitly. (*) nodes assigned to XLA device. + OrderedNodeSet compilation_candidates; + for (Node* node : graph.op_nodes()) { + // If there is a _XlaCompile annotation, ignore the node if it is + // true. Nodes are marked with this attr via experimental_jit_scope, and + // will be handled by the mark_for_compilation pass. + bool compile = false; + Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); + if (status.ok() && compile) { + continue; + } + // If there is already a _XlaCluster annotation, ignore the node: it already + // belongs to a cluster. The attr holds the cluster name (a string), so it + // must be read as a string; GetNodeAttr type-checks and rejects a bool. + string cluster_name; + if (GetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster_name).ok()) { + continue; + } + + // If there is an explicit XLA device placement, ignore the node. 
+ DeviceType device_type(""); + TF_RETURN_IF_ERROR(DeviceToDeviceType(node->def().device(), &device_type)); + if (device_type.type_string().find("XLA") != string::npos) continue; + + // Assume all fusible ops are registered. + // TODO(hpucha): Check for registration if possible. + if (!IsXlaFusible(node->def())) { + continue; + } + + // XLA does not offer guaranteed aliasing between the input and output of + // the XLA cluster so it can't implement the forward-tensor-ref semantic. + // Leave such nodes out of XLA clusters. + if (HasForwardedRefInput(*node)) { + continue; + } + + // If inputs to `node` can have conflicting deadness (i.e. some are alive + // and some are dead) then don't compile it. XLA cannot represent the + // deadness semantics of these nodes correctly and auto-clustering these + // nodes can cause deadness to propagate to nodes that should be live. + if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) { + continue; + } + + compilation_candidates.insert(node); + } + + if (compilation_candidates.empty()) { + VLOG(2) << "No compilable candidates"; + *output = item.graph; + return Status::OK(); + } + + GraphCycles cycles; + TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph, &cycles)); + TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps( + &graph, &graph.flib_def(), /*resource_ops_to_ignore=*/{}, &cycles)); + + // TODO(hpucha): Make clustering more robust. There are two known issues that + // we need to mitigate: (a) Non-resource variables can cause deadlocks + // when clustering changes order of execution. See b/77263461 for a specific + // example. (b) Queue operations can also cause deadlocks. See b/77261498 for + // example. + + struct Cluster { + // Identifies the node that represents this cluster in the cycle detection + // graph. + int representative = -1; + }; + + // Each compilation candidate belongs to a cluster. 
The cluster's + // representative names the node in the 'cycles' graph that represents the + // cluster. + std::vector> clusters(graph.num_node_ids()); + std::deque*> worklist; + for (Node* node : compilation_candidates) { + Cluster& cluster = clusters[node->id()].Get(); + cluster.representative = node->id(); + worklist.push_back(&clusters[node->id()]); + } + + // Repeatedly contract edges between clusters that are on the same device, + // provided the contraction would not create a cycle. This is a simplified + // version of the clustering in mark_for_compilation_pass that also deals with + // nodes that are explicitly tagged to be compiled/clustered. + while (!worklist.empty()) { + int from = worklist.front()->Get().representative; + worklist.pop_front(); + + Node* node_from = graph.FindNodeId(from); + if (node_from->IsControlFlow()) { + // Control flow nodes aren't compilation candidates and should never + // appear. + return errors::Internal( + "Found control flow node in clustering worklist: ", + node_from->type_string()); + } + for (int to : cycles.Successors(from)) { + if (to >= graph.num_node_ids()) { + // Node is a "frame" node that is present only in the cycle detection + // graph. No clustering is possible. + continue; + } + Node* node_to = graph.FindNodeId(to); + if (compilation_candidates.find(node_to) == + compilation_candidates.cend()) { + continue; + } + + // Do not cluster across devices. + if (node_from->def().device() != node_to->def().device()) { + VLOG(2) << "Devices " << node_from->def().device() << " " + << node_to->def().device(); + VLOG(2) << "Device names " << node_from->assigned_device_name() << " " + << node_to->assigned_device_name(); + continue; + } + + // Ops that consume shapes cannot be the root of a cluster. This is an + // optimization. + if (clusters[from].Size() == 1 && IsShapeConsumerOp(*node_from)) { + continue; + } + + // If contracting the edge would create a cycle, bail out. 
+ // However, just because we can't merge the clusters now does not mean + // we won't be able to merge them in the future. + // e.g., if we have edges 1->2, 2->3 and 1->3, we cannot contract edge + // 1->3. But if we first contract 1->2 then we can later contract 1->3. + if (!cycles.ContractEdge(from, to)) continue; + + // Merge the clusters. ContractEdge uses 'from' as the number of the + // merged node, so make sure 'from' is the chosen representative. + clusters[from].Merge(&clusters[to]); + + worklist.push_back(&clusters[from]); + break; + } + } + + // Count the number of non-trivial elements in each cluster. + std::vector effective_cluster_sizes(graph.num_node_ids()); + for (const Node* n : compilation_candidates) { + int cluster = clusters[n->id()].Get().representative; + // Identity nodes will be removed if the node gets marked for compilation. + // Therefore we don't want to count them towards the effective cluster size. + if (n->def().op() != "Identity") { + effective_cluster_sizes[cluster]++; + } + } + + const int min_cluster_size = 2; + int num_clusters = 0; + for (auto size : effective_cluster_sizes) { + if (size >= min_cluster_size) { + VLOG(3) << "Cluster " << num_clusters << " " << size; + num_clusters++; + } + } + + // Names for each cluster. + std::unordered_map cluster_names; + // Sequence number generator to ensure clusters have unique names. + static std::atomic cluster_sequence_num; + + for (Node* n : compilation_candidates) { + int cluster = clusters[n->id()].Get().representative; + + // Compile if this is a cluster of >= min_cluster_size compilable operators. 
+ if (effective_cluster_sizes[cluster] >= min_cluster_size) { + string& name = cluster_names[cluster]; + + if (name.empty()) { + name = strings::StrCat("cluster_", cluster_sequence_num++); + } + n->AddAttr(kXlaClusterAttr, name); + VLOG(3) << "Assigning node " << n->name() << " to cluster " << name; + } + } + + graph.ToGraphDef(output); + return Status::OK(); +} + +REGISTER_GRAPH_OPTIMIZER_AS(XlaFusionOptimizer, "xla-fusion"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.h b/tensorflow/compiler/jit/xla_fusion_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..3d2309e782d38725f8db025fbfda0bf0f63d18be --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { + +// Optimizes graphs by fusing ops where possible, resulting in more efficient +// execution. 
+class XlaFusionOptimizer : public grappler::CustomGraphOptimizer { + public: + XlaFusionOptimizer() {} + ~XlaFusionOptimizer() override {} + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override { + return Status::OK(); + } + + string name() const override { return "xla-fusion"; }; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* output) override; + + void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item, + const GraphDef& optimize_output, double result) override { + // Nothing to do for XlaFusionOptimizer. + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..68e19c8a135735a79fcabf121e619157fa22b4d8 --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc @@ -0,0 +1,208 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_fusion_optimizer.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/grappler/utils/grappler_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +REGISTER_OP("UncompilableNullary").Output("o: float"); +REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); + +class XlaFusionOptimizerTest : public grappler::GrapplerTest { + protected: + std::unordered_map GetClusters(const GraphDef& graph) { + std::unordered_map ids; + for (const NodeDef& node : graph.node()) { + string cluster; + if (GetNodeAttr(AttrSlice(node), kXlaClusterAttr, &cluster).ok()) { + CHECK(!cluster.empty()); + ids[node.name()] = cluster; + } + } + return ids; + } +}; + +TEST_F(XlaFusionOptimizerTest, Chains) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = + ops::SourceOp("UncompilableNullary", builder.opts().WithName("A")); + Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); + Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C")); + Node* d = + ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D")); + Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E")); + ops::UnaryOp("Relu", e, builder.opts().WithName("F")); + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(4, clusters.size()); + EXPECT_EQ(clusters["B"], clusters["C"]); 
+ EXPECT_EQ(clusters["E"], clusters["F"]); + EXPECT_NE(clusters["B"], clusters["E"]); + EXPECT_TRUE(clusters.find("A") == clusters.cend()); + EXPECT_TRUE(clusters.find("D") == clusters.cend()); +} + +TEST_F(XlaFusionOptimizerTest, FusibleOps) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp( + "Placeholder", + builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT)); + Node* b = ops::SourceOp( + "Placeholder", + builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT)); + + Node* c = ops::BinaryOp("Add", a, b, builder.opts().WithName("C")); + ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D")); + ops::UnaryOp("Abs", c, builder.opts().WithName("E")); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(2, clusters.size()); + EXPECT_EQ(clusters["C"], clusters["E"]); + EXPECT_TRUE(clusters.find("D") == clusters.cend()); +} + +TEST_F(XlaFusionOptimizerTest, IgnoreExplicitXLAAttrs) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp( + "Placeholder", + builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT)); + Node* b = ops::SourceOp( + "Placeholder", + builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT)); + + Node* c = ops::BinaryOp( + "Add", a, b, + builder.opts().WithName("C").WithDevice("/device:XLA_CPU")); + ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D")); + Node* e = ops::UnaryOp("Abs", c, builder.opts().WithName("E")); + ops::UnaryOp("Cos", e, + builder.opts().WithName("F").WithAttr(kXlaCompileAttr, true)); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef 
output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_TRUE(clusters.empty()); +} + +TEST_F(XlaFusionOptimizerTest, UncompilableCycles) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = + ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B")); + ops::BinaryOp("Mul", a, b, builder.opts().WithName("C")); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_TRUE(clusters.empty()); +} + +TEST_F(XlaFusionOptimizerTest, CompilableCycles) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); + ops::BinaryOp("Mul", a, b, builder.opts().WithName("C")); + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(3, clusters.size()); + EXPECT_EQ(clusters["A"], clusters["B"]); + EXPECT_EQ(clusters["A"], clusters["C"]); +} + +TEST_F(XlaFusionOptimizerTest, ResourcesClusteringDisallowed) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output var_handle = + ops::VarHandleOp(root.WithOpName("Var"), DT_FLOAT, TensorShape({})); + Output to_assign = ops::Const(root.WithOpName("Const"), 10.0f); + Output begin = ops::Const(root.WithOpName("begin"), 0); + Output end = 
ops::Const(root.WithOpName("end"), 1); + Output strides = ops::Const(root.WithOpName("strides"), 1); + ops::ResourceStridedSliceAssign assign_1( + root.WithOpName("assign_1"), var_handle, begin, end, strides, to_assign); + ops::ResourceStridedSliceAssign assign_2( + root.WithOpName("assign_2"), var_handle, begin, end, strides, to_assign); + root.graph()->AddControlEdge(assign_1.operation.node(), + assign_2.operation.node()); + grappler::GrapplerItem item; + root.graph()->ToGraphDef(&item.graph); + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_NE(clusters["assign_1"], clusters["assign_2"]); +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index c0d86a28c7698c302e28bab972bb2f847cc00ca4..ef4466f0056ea98adc1ae6774105466af0d14293 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -49,6 +49,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options, name_prefix, registration, /*transfer_as_literal=*/false, + /*use_multiple_streams=*/false, /*shape_representation_fn=*/{}, /*padded_shape_fn=*/{}, &device); if (!status.ok()) { @@ -58,7 +59,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, } // TODO(b/78468222): Uncomment after fixing this bug - // status = device->CreateAndSetGpuDeviceInfo(); + // status = device->UseGpuDeviceInfo(); // if (!status.ok()) { // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, // " device"); diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index 661187f4a873b03b8d013aa74cb6b6315bb4e2eb..45745596749207189c60ee1e3dcf19b6ecb7eb5b 100644 --- 
a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -52,6 +52,7 @@ Status XlaInterpreterDeviceFactory::CreateDevices( DEVICE_INTERPRETER_XLA_JIT, options, name_prefix, registration, /*transfer_as_literal=*/false, + /*use_multiple_streams=*/false, /*shape_representation_fn=*/{}, /*padded_shape_fn=*/{}, &device)); devices->push_back(device.release()); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index d0c7a9365125708b2af43f87c7617d8d84050a61..affeab4a8c43b63ac0e2b8ef40de5223ce39d410 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_launch_util.h" +#include + +#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -64,11 +67,13 @@ xla::StatusOr XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { AllocationAttributes attrs; attrs.no_retry_on_failure = !retry_on_failure; - void* data = - wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs); - if (data == nullptr) { - return errors::ResourceExhausted("Out of memory while trying to allocate ", - size, " bytes."); + void* data = nullptr; + if (size != 0) { + data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs); + if (data == nullptr) { + return errors::ResourceExhausted( + "Out of memory while trying to allocate ", size, " bytes."); + } } return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size), device_ordinal, this); @@ -115,14 +120,22 @@ using internal::ExtractSubShapedBuffer; XlaComputationLaunchContext::XlaComputationLaunchContext( xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, - bool allocate_xla_tensors) + bool allocate_xla_tensors, bool 
use_multiple_streams) : client_(client), xla_allocator_(xla_allocator), - allocate_xla_tensors_(allocate_xla_tensors) {} + allocate_xla_tensors_(allocate_xla_tensors), + use_multiple_streams_(use_multiple_streams) { + if (use_multiple_streams_) { + CHECK(allocate_xla_tensors_) << "To use multiple streams correctly we must " + "be allocating XLA tensors!"; + } +} void XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, const std::map& variables) { + se::Stream* stream = + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Build ShapedBuffers that point directly to the Tensor buffers. arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); arg_buffers_.resize(kernel->xla_input_shapes.size()); @@ -140,6 +153,16 @@ void XlaComputationLaunchContext::PopulateInputs( t = &(ctx->input(arg_num)); } + if (use_multiple_streams_) { + CHECK(stream) << "Must have a stream available when using XLA tensors!"; + XlaTensor* xla_tensor = XlaTensor::FromTensor(t); + CHECK(xla_tensor); + if (se::Event* event = xla_tensor->GetDefinitionEvent(stream)) { + stream->ThenWaitFor(event); + xla_tensor->SetDefinedOn(stream); + } + } + const xla::Shape on_device_shape = client_->backend().transfer_manager()->HostShapeToDeviceShape(shape); if (xla::ShapeUtil::IsTuple(on_device_shape)) { @@ -153,7 +176,7 @@ void XlaComputationLaunchContext::PopulateInputs( << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = xla::MakeUnique( + arg_buffers_[i] = absl::make_unique( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); @@ -162,7 +185,7 @@ void XlaComputationLaunchContext::PopulateInputs( } } -void XlaComputationLaunchContext::PopulateOutputs( +Status 
XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, ScopedShapedBuffer output) { se::Stream* stream = @@ -176,6 +199,30 @@ void XlaComputationLaunchContext::PopulateOutputs( } CHECK_EQ(ctx->num_outputs(), kernel->outputs.size()); + // If the on-host-shape isn't a tuple, create a new single-element tuple + // buffer with a nullptr root index table. This allows the code below to treat + // output as a tuple unconditionally. + if (!xla::ShapeUtil::IsTuple(output.on_host_shape())) { + ShapedBuffer nontuple_buffer = output.release(); + ShapedBuffer buffer( + xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_host_shape()}), + xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_device_shape()}), + output.platform(), output.device_ordinal()); + buffer.buffers().CopySubtreeFrom(nontuple_buffer.buffers(), + /*source_base_index=*/{}, + /*target_base_index=*/{0}); + output = ScopedShapedBuffer(std::move(buffer), output.memory_allocator()); + } + + std::shared_ptr definition_event; + if (use_multiple_streams_) { + definition_event = std::make_shared(stream->parent()); + if (!definition_event->Init()) { + return errors::Internal("Failed to initialize tensor definition event."); + } + stream->ThenRecordEvent(definition_event.get()); + } + // Copy XLA results to the OpOutputList. int output_num = 0; for (int i = 0; i < ctx->num_outputs(); ++i) { @@ -193,12 +240,13 @@ void XlaComputationLaunchContext::PopulateOutputs( // reallocate the device buffer later. 
VLOG(1) << "Constant output tensor on device"; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); + TF_RETURN_IF_ERROR( + ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); Device* device = dynamic_cast(ctx->device()); - OP_REQUIRES(ctx, device != nullptr, - errors::Internal("DeviceBase was not a Device.")); + if (device == nullptr) { + return errors::Internal("DeviceBase was not a Device."); + } ctx->op_device_context()->CopyCPUTensorToDevice( &const_tensor, device, output_tensor, [&](Status status) { TF_CHECK_OK(status); }); @@ -223,23 +271,36 @@ void XlaComputationLaunchContext::PopulateOutputs( } } else { const TensorShape& shape = kernel->outputs[i].shape; - VLOG(2) << "Retval " << i << " shape " << shape.DebugString(); - - se::DeviceMemoryBase buffer = output.buffer({output_num}); - if (allocate_xla_tensors_) { - Tensor* output_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor)); - XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); - CHECK(xla_tensor); - xla_tensor->set_shaped_buffer(ScopedShapedBuffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); + const DataType& type = kernel->outputs[i].type; + VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " + << DataTypeString(type); + if (type == DT_RESOURCE) { + ctx->set_output(i, ctx->input(kernel->outputs[i].input_index)); } else { - Tensor output_tensor = XlaTensorBuffer::MakeTensor( - ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(xla::OwningDeviceMemory(), {output_num}); - ctx->set_output(i, output_tensor); + se::DeviceMemoryBase buffer = output.buffer({output_num}); + if (allocate_xla_tensors_) { + Tensor* output_tensor; + TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); + XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); + if (xla_tensor) { + xla_tensor->set_shaped_buffer(ScopedShapedBuffer( + 
ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); + if (use_multiple_streams_) { + xla_tensor->SetDefinedOn(stream, definition_event); + } + } else { + // xla_tensor wasn't valid, which must mean this is a zero-element + // tensor. + CHECK_EQ(output_tensor->TotalBytes(), 0); + } + } else { + Tensor output_tensor = XlaTensorBuffer::MakeTensor( + ctx->expected_output_dtype(i), shape, buffer, allocator); + output.set_buffer(xla::OwningDeviceMemory(), {output_num}); + ctx->set_output(i, output_tensor); + } + ++output_num; } - ++output_num; } if (VLOG_IS_ON(3)) { @@ -252,36 +313,40 @@ void XlaComputationLaunchContext::PopulateOutputs( for (int i = 0; i < kernel->resource_updates.size(); ++i) { Allocator* allocator = ctx->device()->GetAllocator({}); const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i]; - OP_REQUIRES(ctx, - write.input_index >= 0 && write.input_index < ctx->num_inputs(), - errors::Internal("Invalid input index for variable write.")); + if (write.input_index < 0 || write.input_index >= ctx->num_inputs()) { + return errors::Internal("Invalid input index for variable write."); + } se::DeviceMemoryBase buffer = output.buffer({output_num}); Var* variable = nullptr; // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, // not a Tensor. 
- OP_REQUIRES_OK(ctx, LookupOrCreateResource( - ctx, HandleFromInput(ctx, write.input_index), - &variable, [this, ctx, &write](Var** ptr) { - *ptr = new Var(write.type); - return Status::OK(); - })); + TF_RETURN_IF_ERROR(LookupOrCreateResource( + ctx, HandleFromInput(ctx, write.input_index), &variable, + [&write](Var** ptr) { + *ptr = new Var(write.type); + return Status::OK(); + })); core::ScopedUnref s(variable); mutex_lock ml(*variable->mu()); - OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type, - errors::Internal("Mismatched type in variable write")); + if (variable->tensor()->dtype() != write.type) { + return errors::Internal("Mismatched type in variable write"); + } if (allocate_xla_tensors_) { Tensor output_tensor; - OP_REQUIRES_OK( - ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor)); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(write.type, write.shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); CHECK(xla_tensor); xla_tensor->set_shaped_buffer( ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); + if (use_multiple_streams_) { + xla_tensor->SetDefinedOn(stream, definition_event); + } *variable->tensor() = output_tensor; } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( @@ -291,6 +356,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } ++output_num; } + return Status::OK(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 4390701ccbd0bc3971413ddcd917c11019990087..7ac275fab833400b90ced0180192845c9be30534 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -76,9 +76,15 @@ class XlaComputationLaunchContext { // Create a new launch context. 'allocate_xla_tensors' is true if allocated // output tensors and variables are always XlaTensors. If false they are // assumed to be "normal" device pointers. 
+ // If 'use_multiple_streams' is true, tensors may be defined and used on + // multiple streams and so se::Events must be defined and waited for. If + // 'use_multiple_streams' is true, 'allocate_xla_tensors' must also be true + // because we track inter-stream dependencies through events inside XlaTensor + // objects. XlaComputationLaunchContext(xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, - bool allocate_xla_tensors); + bool allocate_xla_tensors, + bool use_multiple_streams); // Add all inputs within `ctx` as XLA arguments (returned by arguments()). // `variables` is a map from TensorFlow argument number to resource variable. @@ -87,9 +93,9 @@ class XlaComputationLaunchContext { const std::map& variables); // Given the XLA output in `output`, populate all outputs of `ctx`. - void PopulateOutputs(OpKernelContext* ctx, - const XlaCompiler::CompilationResult* kernel, - xla::ScopedShapedBuffer output); + Status PopulateOutputs(OpKernelContext* ctx, + const XlaCompiler::CompilationResult* kernel, + xla::ScopedShapedBuffer output); // Return the argument list. Only valid after PopulateInputs() has been // called. 
@@ -99,6 +105,7 @@ class XlaComputationLaunchContext { xla::LocalClient* client_; xla::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; + bool use_multiple_streams_; std::vector> arg_buffers_; std::vector arg_ptrs_; }; @@ -115,7 +122,11 @@ class XlaTensorBuffer : public TensorBuffer { data_ = const_cast(ptr); } - ~XlaTensorBuffer() override { allocator_->DeallocateRaw(data_); } + ~XlaTensorBuffer() override { + if (data_) { + allocator_->DeallocateRaw(data_); + } + } void* data() const override { return data_; } size_t size() const override { return expected_size_; } @@ -156,4 +167,4 @@ xla::ScopedShapedBuffer ExtractSubShapedBuffer( } // namespace tensorflow -#endif +#endif // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 3c44c4ae6df7f3e2d60d8933561c0c71888e8c3f..92ba7de1b7d32fcf693cd12a380d7a1e0d861d71 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -73,6 +73,35 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, return Status::OK(); } +se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) { + mutex_lock lock(mu_); + if (!definition_event_) { + return nullptr; + } + + // The set of defined streams is expected to be very small indeed (usually + // 1-2), so a simple linear scan should be fast enough. + if (std::find(streams_defined_on_.begin(), streams_defined_on_.end(), + stream) != streams_defined_on_.end()) { + // stream is in streams_defined_on_; it doesn't need to be waited on. 
+ return nullptr; + } + + return definition_event_.get(); +} + +void XlaTensor::SetDefinedOn(se::Stream* stream, + std::shared_ptr event) { + mutex_lock lock(mu_); + definition_event_ = std::move(event); + streams_defined_on_ = {stream}; +} + +void XlaTensor::SetDefinedOn(se::Stream* stream) { + mutex_lock lock(mu_); + streams_defined_on_.push_back(stream); +} + // The pointer tag, OR-ed into the XlaTensor's address to distinguish it from // device-side tensors, which are either CPU or GPU memory pointers. This works // because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits. diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index c54001a999998f45c0cdacd752ca4036f0792857..4c9bb2e27b0ca3c83848be7fdf189fdbad89cee5 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ #define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ +#include + +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/core/framework/allocator.h" @@ -68,7 +71,7 @@ class XlaTensor { // Mutates the XlaTensor to set the ShapedBuffer. void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { shaped_buffer_ = - xla::MakeUnique(std::move(shaped_buffer)); + absl::make_unique(std::move(shaped_buffer)); } // Some tensors on the device may have known values on the host. We use these @@ -85,6 +88,24 @@ class XlaTensor { host_tensor_.reset(new Tensor(tensor)); } + // If the tensor's content is not yet defined on 'stream', and there exists an + // se::Event declaring when the tensor's content is defined, return it. + // Otherwise, return nullptr. If this function returns nullptr then the + // tensor's content can be read on 'stream' without additional + // synchronization. 
+ se::Event* GetDefinitionEvent(se::Stream* stream); + + // Assert that the tensor's content is defined on 'stream' by the time 'event' + // triggers. + void SetDefinedOn(se::Stream* stream, std::shared_ptr event); + + // Assert that the tensor's content is defined on 'stream'. This version does + // not provide an event, and must be called *after* SetDefinedOn(Stream, + // Event). This call can be read as an assertion that the definition event has + // been waited on by 'stream', so further calls to GetDefinitionEvent(stream) + // do not need to also wait on the event. + void SetDefinedOn(se::Stream* stream); + // Convert from a raw pointer to an XlaTensor, removing the pointer tag. static XlaTensor* FromOpaquePointer(void* ptr); // Convert to a raw pointer from an XlaTensor, adding the pointer tag. @@ -95,8 +116,16 @@ class XlaTensor { std::unique_ptr shaped_buffer_; // An optional host tensor value. std::unique_ptr host_tensor_; + // An optional event that is triggered when the tensor's content has been + // defined. If this event is nullptr, it is assumed that the tensor's content + // is always defined. + std::shared_ptr definition_event_; + // A list of all streams for which the tensor's content is defined for any + // newly enqueued command. 
+ gtl::InlinedVector streams_defined_on_ GUARDED_BY(mu_); + mutex mu_; }; } // namespace tensorflow -#endif +#endif // TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index b51c11bf6e9b952d9e282b498101ec4f73f87885..34defe1c7ade687a7524390cee78657e1a27f5b4 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -51,6 +51,38 @@ py_library( ], ) +py_library( + name = "test_utils", + testonly = 1, + srcs = ["test_utils.py"], + srcs_version = "PY2AND3", + deps = [ + "//third_party/py/numpy", + ], +) + +py_test( + name = "xla_test_test", + size = "small", + srcs = ["xla_test_test.py"], + deps = [ + ":xla_test", + ], +) + +tf_xla_py_test( + name = "adadelta_test", + size = "large", + srcs = ["adadelta_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "adagrad_test", size = "small", @@ -65,6 +97,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "adagrad_da_test", + size = "small", + srcs = ["adagrad_da_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "adam_test", size = "small", @@ -79,6 +124,48 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "adamax_test", + size = "small", + srcs = ["adamax_test.py"], + deps = [ + ":xla_test", + "//tensorflow/contrib/opt:opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:training", + ], +) + +tf_xla_py_test( + name = "addsign_test", + size = "small", + srcs = ["addsign_test.py"], + deps = [ + ":xla_test", + "//tensorflow/contrib/opt:opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + 
"//tensorflow/python:framework", + "//tensorflow/python:training", + ], +) + +tf_xla_py_test( + name = "powersign_test", + size = "small", + srcs = ["powersign_test.py"], + deps = [ + ":xla_test", + "//tensorflow/contrib/opt:opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "argminmax_test", size = "small", @@ -148,7 +235,7 @@ tf_xla_py_test( tf_xla_py_test( name = "cholesky_op_test", - size = "small", + size = "medium", srcs = ["cholesky_op_test.py"], tags = ["optonly"], deps = [ @@ -164,6 +251,7 @@ tf_xla_py_test( tf_xla_py_test( name = "matrix_triangular_solve_op_test", size = "small", + timeout = "moderate", srcs = ["matrix_triangular_solve_op_test.py"], tags = ["optonly"], deps = [ @@ -238,6 +326,7 @@ tf_xla_py_test( srcs = ["conv2d_test.py"], shard_count = 10, deps = [ + ":test_utils", ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:framework", @@ -245,6 +334,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "@absl_py//absl/testing:parameterized", ], ) @@ -298,6 +388,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "reshape_op_test", + size = "small", + srcs = ["reshape_op_test.py"], + deps = [ + "//tensorflow/compiler/tests:xla_test", + "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + "@absl_py//absl/testing:parameterized", + ], +) + tf_xla_py_test( name = "dynamic_stitch_test", size = "small", @@ -329,7 +432,7 @@ tf_xla_py_test( tf_xla_py_test( name = "eager_test", - size = "small", + size = "large", srcs = ["eager_test.py"], disabled_backends = [ # TODO(b/78199195) Support XLA CPU devices in eager runtime @@ -350,6 +453,20 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "fifo_queue_test", + size = "medium", + srcs = ["fifo_queue_test.py"], + deps = [ + 
":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:data_flow_ops", + "//tensorflow/python:extra_py_tests_deps", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "fft_test", size = "medium", @@ -456,6 +573,7 @@ tf_xla_py_test( tf_xla_py_test( name = "matrix_band_part_test", size = "medium", + timeout = "long", srcs = ["matrix_band_part_test.py"], tags = ["optonly"], deps = [ @@ -535,17 +653,66 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "proximal_adagrad_test", + size = "medium", + srcs = ["proximal_adagrad_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:training", + ], +) + +tf_xla_py_test( + name = "proximal_gradient_descent_test", + size = "medium", + srcs = ["proximal_gradient_descent_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:training", + ], +) + +tf_xla_py_test( + name = "qr_op_test", + size = "medium", + srcs = ["qr_op_test.py"], + disabled_backends = [ + # Test is very slow on CPU. + "cpu", + "cpu_ondemand", + ], + shard_count = 5, + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + "@absl_py//absl/testing:parameterized", + ], +) + tf_xla_py_test( name = "random_ops_test", size = "small", srcs = ["random_ops_test.py"], - # TODO(b/31361304): enable RNG ops on GPU when parallelized. 
disabled_backends = [ - "gpu", + "cpu_ondemand", ], deps = [ ":xla_test", + "//tensorflow/python:array_ops", "//tensorflow/python:framework", + "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:random_ops", ], @@ -563,6 +730,7 @@ tf_xla_py_test( "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "@absl_py//absl/testing:parameterized", ], ) @@ -662,6 +830,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "sparse_to_dense_op_test", + size = "small", + srcs = ["sparse_to_dense_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + "//tensorflow/python:sparse_ops", + ], +) + tf_xla_py_test( name = "stack_ops_test", size = "small", @@ -741,9 +922,10 @@ tf_xla_py_test( tf_xla_py_test( name = "fused_batchnorm_test", - size = "small", + size = "medium", srcs = ["fused_batchnorm_test.py"], deps = [ + ":test_utils", ":xla_test", "//tensorflow/python:framework", "//tensorflow/python:math_ops", @@ -753,6 +935,7 @@ tf_xla_py_test( "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", "//tensorflow/python:training", + "@absl_py//absl/testing:parameterized", ], ) @@ -828,6 +1011,21 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "sort_ops_test", + size = "medium", + srcs = ["sort_ops_test.py"], + shard_count = 5, + # Times out in fastbuild mode. 
+ tags = ["optonly"], + deps = [ + "//tensorflow/compiler/tests:xla_test", + "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + ], +) + tf_xla_py_test( name = "xla_device_test", size = "small", @@ -995,3 +1193,19 @@ tf_xla_py_test( "//tensorflow/python:platform_test", ], ) + +tf_xla_py_test( + name = "xla_ops_test", + size = "small", + srcs = ["xla_ops_test.py"], + disabled_backends = ["cpu_ondemand"], + deps = [ + ":xla_test", + "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/python:array_ops", + "//tensorflow/python:errors", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + "@absl_py//absl/testing:parameterized", + ], +) diff --git a/tensorflow/compiler/tests/adadelta_test.py b/tensorflow/compiler/tests/adadelta_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b7b7fda293b69d6f0cec61d0d234277636a3670d --- /dev/null +++ b/tensorflow/compiler/tests/adadelta_test.py @@ -0,0 +1,134 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Adadelta Optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adadelta + + +class AdadeltaOptimizerTest(xla_test.XLATestCase): + + def testBasic(self): + num_updates = 4 # number of ADADELTA steps to perform + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + for grad in [0.2, 0.1, 0.01]: + for lr in [1.0, 0.5, 0.1]: + var0_init = [1.0, 2.0] + var1_init = [3.0, 4.0] + var0 = resource_variable_ops.ResourceVariable( + var0_init, dtype=dtype) + var1 = resource_variable_ops.ResourceVariable( + var1_init, dtype=dtype) + + grads = constant_op.constant([grad, grad], dtype=dtype) + + accum = 0.0 + accum_update = 0.0 + + # ADADELTA gradient optimizer + rho = 0.95 + epsilon = 1e-8 + adadelta_opt = adadelta.AdadeltaOptimizer( + learning_rate=lr, rho=rho, epsilon=epsilon) + adadelta_update = adadelta_opt.apply_gradients( + zip([grads, grads], [var0, var1])) + self.evaluate(variables.global_variables_initializer()) + opt_vars = adadelta_opt.variables() + self.assertStartsWith(opt_vars[0].name, var0._shared_name) + self.assertStartsWith(opt_vars[1].name, var0._shared_name) + self.assertStartsWith(opt_vars[2].name, var1._shared_name) + self.assertStartsWith(opt_vars[3].name, var1._shared_name) + self.assertEqual(4, len(opt_vars)) + # Assign slots + slot = [None] * 2 + slot_update = [None] * 2 + self.assertEqual(["accum", "accum_update"], + adadelta_opt.get_slot_names()) + slot[0] = adadelta_opt.get_slot(var0, "accum") + self.assertEquals(slot[0].get_shape(), 
var0.get_shape()) + self.assertFalse(slot[0] in variables.trainable_variables()) + + slot_update[0] = adadelta_opt.get_slot(var0, "accum_update") + self.assertEquals(slot_update[0].get_shape(), var0.get_shape()) + self.assertFalse(slot_update[0] in variables.trainable_variables()) + + slot[1] = adadelta_opt.get_slot(var1, "accum") + self.assertEquals(slot[1].get_shape(), var1.get_shape()) + self.assertFalse(slot[1] in variables.trainable_variables()) + + slot_update[1] = adadelta_opt.get_slot(var1, "accum_update") + self.assertEquals(slot_update[1].get_shape(), var1.get_shape()) + self.assertFalse(slot_update[1] in variables.trainable_variables()) + + # Fetch params to validate initial values + self.assertAllClose(var0_init, self.evaluate(var0)) + self.assertAllClose(var1_init, self.evaluate(var1)) + + update = [None] * num_updates + tot_update = 0 + for step in range(num_updates): + # Run adadelta update for comparison + self.evaluate(adadelta_update) + + # Perform initial update without previous accum values + accum = accum * rho + (grad**2) * (1 - rho) + update[step] = ( + np.sqrt(accum_update + epsilon) * + (1. 
/ np.sqrt(accum + epsilon)) * grad) + accum_update = ( + accum_update * rho + (update[step]**2) * (1.0 - rho)) + tot_update += update[step] * lr + + # Check that the accumulators have been updated + for slot_idx in range(2): + self.assertAllCloseAccordingToType( + np.array([accum, accum], dtype=dtype), + self.evaluate(slot[slot_idx]), + rtol=1e-5) + + self.assertAllCloseAccordingToType( + np.array([accum_update, accum_update], dtype=dtype), + self.evaluate(slot_update[slot_idx]), + rtol=1e-5) + + # Check that the parameters have been updated + self.assertAllCloseAccordingToType( + np.array( + [var0_init[0] - tot_update, var0_init[1] - tot_update], + dtype=dtype), + self.evaluate(var0), + rtol=1e-5) + + self.assertAllCloseAccordingToType( + np.array( + [var1_init[0] - tot_update, var1_init[1] - tot_update], + dtype=dtype), + self.evaluate(var1), + rtol=1e-5) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py new file mode 100644 index 0000000000000000000000000000000000000000..69fb3ec2964a09508e612515b9e291fc14121d68 --- /dev/null +++ b/tensorflow/compiler/tests/adagrad_da_test.py @@ -0,0 +1,165 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for AdagradDA optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adagrad_da + + +class AdagradDAOptimizerTest(xla_test.XLATestCase): + + def testAdagradDAWithoutRegularizationBasic1(self): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + global_step = resource_variable_ops.ResourceVariable( + 0, dtype=dtypes.int64) + var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + opt = adagrad_da.AdagradDAOptimizer( + 3.0, + global_step, + initial_gradient_squared_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]), global_step=global_step) + variables.global_variables_initializer().run() + + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run a step of AdagradDA + update.run() + + # Let g to be gradient accumulator, gg to be gradient squared + # accumulator, T be the global step, lr is the learning rate, and k the + # initial gradient squared accumulator value. + # w = \dfrac{sign(-g)*lr*|g - l1*T|_{+}}{l2*T*lr + \sqrt{k+gg})} + # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534 + # similarly for others. 
+ self.assertAllCloseAccordingToType( + np.array([-0.904534, -1.603567]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([-0.094821, -0.189358]), var1.eval()) + + def testAdagradDAwithoutRegularizationBasic2(self): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + global_step = resource_variable_ops.ResourceVariable( + 0, dtype=dtypes.int64) + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + + opt = adagrad_da.AdagradDAOptimizer( + 3.0, + global_step, + initial_gradient_squared_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]), global_step=global_step) + variables.global_variables_initializer().run() + + self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) + self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + + # Run a step of AdagradDA + update.run() + + self.assertAllCloseAccordingToType( + np.array([-0.904534, -1.603567]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([-0.094821, -0.189358]), var1.eval()) + + def testAdagradDAWithL1(self): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + global_step = resource_variable_ops.ResourceVariable( + 0, dtype=dtypes.int64) + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + + opt = adagrad_da.AdagradDAOptimizer( + 3.0, + global_step, + initial_gradient_squared_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=0.0) + update = 
opt.apply_gradients( + zip([grads0, grads1], [var0, var1]), global_step=global_step) + variables.global_variables_initializer().run() + + self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) + self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + + # Run a step of AdagradDA + update.run() + + self.assertAllCloseAccordingToType( + np.array([-0.895489, -1.59555]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([-0.085339, -0.17989]), var1.eval()) + + def testAdagradDAWithL1_L2(self): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + global_step = resource_variable_ops.ResourceVariable( + 0, dtype=dtypes.int64) + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.02], dtype=dtype) + + opt = adagrad_da.AdagradDAOptimizer( + 3.0, + global_step, + initial_gradient_squared_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1]), global_step=global_step) + variables.global_variables_initializer().run() + + self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) + self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + + # Run a step of AdagradDA + update.run() + + self.assertAllCloseAccordingToType( + np.array([-0.046907, -0.093659]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([-0.004275, -0.009023]), var1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py index 9a93b3216404d8ed21fd6c57757bec1730c119b4..ab69319c59fb07e7ce56c3c287a50a6290effdfd 100644 --- a/tensorflow/compiler/tests/adagrad_test.py +++ b/tensorflow/compiler/tests/adagrad_test.py @@ -20,7 +20,7 @@ from __future__ import 
print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables @@ -28,11 +28,11 @@ from tensorflow.python.platform import test from tensorflow.python.training import adagrad -class AdagradOptimizerTest(XLATestCase): +class AdagradOptimizerTest(xla_test.XLATestCase): def testBasic(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) @@ -57,7 +57,7 @@ class AdagradOptimizerTest(XLATestCase): def testTensorLearningRate(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) @@ -83,7 +83,7 @@ class AdagradOptimizerTest(XLATestCase): def testSharing(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py index 3215dc36e5b2d517aa951db1b0d41188185ef93a..df0f21471a1c67e69e037f6409bcab1297d3399d 100644 --- a/tensorflow/compiler/tests/adam_test.py +++ b/tensorflow/compiler/tests/adam_test.py @@ -20,8 +20,9 @@ from __future__ import 
print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope @@ -48,10 +49,13 @@ def adam_update_numpy(param, return param_t, m_t, v_t -class AdamOptimizerTest(XLATestCase): +class AdamOptimizerTest(xla_test.XLATestCase): def testBasic(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype in [np.float16, dtypes.bfloat16.as_numpy_dtype]: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) @@ -91,6 +95,9 @@ class AdamOptimizerTest(XLATestCase): def testTensorLearningRate(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype in [np.float16, dtypes.bfloat16.as_numpy_dtype]: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) @@ -130,6 +137,9 @@ class AdamOptimizerTest(XLATestCase): def testSharing(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype in [np.float16, dtypes.bfloat16.as_numpy_dtype]: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed1d41b7121f44dd7470f61180f7a7055369174 --- /dev/null +++ b/tensorflow/compiler/tests/adamax_test.py @@ -0,0 +1,139 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for AdaMax optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.contrib.opt.python.training import adamax +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +def adamax_update_numpy(param, + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): + m_t = beta1 * m + (1 - beta1) * g_t + v_t = np.maximum(beta2 * v, np.abs(g_t)) + param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon)) + return param_t, m_t, v_t + + +class AdaMaxOptimizerTest(xla_test.XLATestCase): + + def testBasic(self): + for i, dtype in enumerate(self.float_types): + with self.cached_session(), self.test_scope(): + variable_scope.get_variable_scope().set_use_resource(True) + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable( + var0_np, name="var0_%d" % i) + var1 = resource_variable_ops.ResourceVariable( + var1_np, name="var1_%d" % i) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = adamax.AdaMaxOptimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + opt_variables = opt.variables() + beta1_power = opt._get_beta_accumulators() + self.assertTrue(beta1_power is not None) + self.assertIn(beta1_power, opt_variables) + + with ops.Graph().as_default(): + # Shouldn't return non-slot variables from other graphs. + self.assertEqual(0, len(opt.variables())) + + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power = opt._get_beta_accumulators() + + # Run 3 steps of AdaMax + for t in range(1, 4): + update.run() + + self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval()) + + var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2) + self.assertEqual("var0_%d/AdaMax:0" % (i,), + opt.get_slot(var=var0, name="m").name) + + def testTensorLearningRate(self): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + variable_scope.get_variable_scope().set_use_resource(True) + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001)) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power = opt._get_beta_accumulators() + + # Run 3 steps of AdaMax + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + update.run() + + var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc07ace23ccdc83103abe71ee11b72994c75a6d --- /dev/null +++ b/tensorflow/compiler/tests/addsign_test.py @@ -0,0 +1,142 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for AddSign.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.contrib.opt.python.training import addsign +from tensorflow.contrib.opt.python.training import sign_decay +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +def py_linear_decay_fn(decay_steps): + def linear_decay(step): + step = min(step, decay_steps) + return float(decay_steps - step) / decay_steps + return linear_decay + + +def addsign_update_numpy(params, + g_t, + m, + lr, + alpha=1.0, + beta=0.9, + py_sign_decay_fn=None, + t=None): + m_t = beta * m + (1 - beta) * g_t + if py_sign_decay_fn is None: + sign_decayed = 1.0 + else: + sign_decayed = py_sign_decay_fn(t-1) + multiplier = alpha + sign_decayed * np.sign(g_t) * np.sign(m_t) + params_t = params - lr * multiplier * g_t + return params_t, m_t + + +class AddSignTest(xla_test.XLATestCase): + + def _testDense(self, + learning_rate=0.1, + sign_decay_fn=None, + py_sign_decay_fn=None, + alpha=1.0, + beta=0.9): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + # Initialize variables for numpy implementation. 
+ m0, m1 = 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + global_step = resource_variable_ops.ResourceVariable(0, trainable=False) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = addsign.AddSignOptimizer( + learning_rate=learning_rate, + alpha=alpha, + beta=beta, + sign_decay_fn=sign_decay_fn, + ) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]), + global_step=global_step) + neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]), + global_step=global_step) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 7 steps of AddSign + # first 4 steps with positive gradient + # last 3 steps with negative gradient (sign(gm) should be -1) + for t in range(1, 8): + if t < 5: + update.run() + else: + neg_update.run() + + var0_np, m0 = addsign_update_numpy( + var0_np, + grads0_np if t < 5 else -grads0_np, + m0, + learning_rate, + alpha=alpha, + beta=beta, + py_sign_decay_fn=py_sign_decay_fn, + t=t, + ) + var1_np, m1 = addsign_update_numpy( + var1_np, + grads1_np if t < 5 else -grads1_np, + m1, + learning_rate, + alpha=alpha, + beta=beta, + py_sign_decay_fn=py_sign_decay_fn, + t=t, + ) + + # Validate updated params + self.assertAllCloseAccordingToType( + var0_np, var0.eval(), half_rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testDense(self): + decay_steps = 10 + sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps) + py_sign_decay_fn = py_linear_decay_fn(decay_steps) + self._testDense() + self._testDense(learning_rate=0.01, alpha=0.1, beta=0.8) + 
self._testDense( + sign_decay_fn=sign_decay_fn, py_sign_decay_fn=py_sign_decay_fn) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/tests/argminmax_test.py b/tensorflow/compiler/tests/argminmax_test.py index 9d3a889b1f54c813e881bb03b5275f809af1b3c8..4155342787fbbdeaf5c5958c44d007b1ea0660ed 100644 --- a/tensorflow/compiler/tests/argminmax_test.py +++ b/tensorflow/compiler/tests/argminmax_test.py @@ -40,7 +40,7 @@ class ArgMinMaxTest(xla_test.XLATestCase): op_input: numpy input array to use as input to 'op'. expected: numpy array representing the expected output of 'op'. """ - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): pinp = array_ops.placeholder( dtypes.as_dtype(op_input.dtype), op_input.shape, name="a") diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 1e4dd32916c3a40282735fb8f75670b0e9ef0dc9..17280e445b329d1541aaed78ec106f8f282cbc74 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops @@ -32,11 +32,11 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.platform import googletest -class BinaryOpsTest(XLATestCase): +class BinaryOpsTest(xla_test.XLATestCase): """Test cases for binary operators.""" def _testBinary(self, op, a, b, expected, equality_test=None): - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): pa = array_ops.placeholder(dtypes.as_dtype(a.dtype), a.shape, name="a") pb = array_ops.placeholder(dtypes.as_dtype(b.dtype), b.shape, name="b") @@ -226,6 +226,11 @@ class 
BinaryOpsTest(XLATestCase): np.array([0b1, 0b101, 0b1000], dtype=dtype), np.array([0b0, 0b101, 0b1001], dtype=dtype), expected=np.array([0b1, 0b101, 0b1001], dtype=dtype)) + self._testSymmetricBinary( + bitwise_ops.bitwise_xor, + np.array([0b1, 0b111, 0b1100], dtype=dtype), + np.array([0b0, 0b101, 0b1001], dtype=dtype), + expected=np.array([0b1, 0b010, 0b0101], dtype=dtype)) lhs = np.array([0, 5, 3, 14], dtype=dtype) rhs = np.array([5, 0, 7, 11], dtype=dtype) @@ -686,11 +691,13 @@ class BinaryOpsTest(XLATestCase): np.array([[10], [7], [2]], dtype=np.float32), np.float32(7), expected=np.array([[False], [False], [True]], dtype=np.bool)) - self._testBinary( - less_op, - np.array([[10], [7], [2], [-1]], dtype=np.int64), - np.int64(7), - expected=np.array([[False], [False], [True], [True]], dtype=np.bool)) + if np.int64 in self.numeric_types: + self._testBinary( + less_op, + np.array([[10], [7], [2], [-1]], dtype=np.int64), + np.int64(7), + expected=np.array( + [[False], [False], [True], [True]], dtype=np.bool)) for less_equal_op in [math_ops.less_equal, (lambda x, y: x <= y)]: self._testBinary( @@ -1003,7 +1010,38 @@ class BinaryOpsTest(XLATestCase): [7, 7, 7, 7, 7, 7]], dtype=dtype)) - def testMirrorPad(self): + def testSymmetricMirrorPad(self): + mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "SYMMETRIC") + for dtype in self.numeric_types: + self._testBinary( + mirror_pad, + np.array( + [ + [1, 2, 3], # + [4, 5, 6], # + ], + dtype=dtype), + np.array([[ + 2, + 2, + ], [3, 3]], dtype=np.int32), + expected=np.array( + [ + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [6, 5, 4, 4, 5, 6, 6, 5, 4], # + [3, 2, 1, 1, 2, 3, 3, 2, 1], # + ], + dtype=dtype)) + self._testBinary( + mirror_pad, + np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype), + np.array([[0, 0], [0, 0]], dtype=np.int32), + expected=np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) + + def testReflectMirrorPad(self): 
mirror_pad = lambda t, paddings: array_ops.pad(t, paddings, "REFLECT") for dtype in self.numeric_types: self._testBinary( @@ -1158,6 +1196,16 @@ class BinaryOpsTest(XLATestCase): def testTile(self): for dtype in self.numeric_types: + self._testBinary( + array_ops.tile, + np.array([[6], [3], [4]], dtype=dtype), + np.array([2, 0], dtype=np.int32), + expected=np.empty([6, 0], dtype=dtype)) + self._testBinary( + array_ops.tile, + np.array([[6, 3, 4]], dtype=dtype), + np.array([2, 0], dtype=np.int32), + expected=np.empty([2, 0], dtype=dtype)) self._testBinary( array_ops.tile, np.array([[6]], dtype=dtype), @@ -1216,6 +1264,24 @@ class BinaryOpsTest(XLATestCase): np.array([1, 0], dtype=np.int32), expected=np.array([[1, 3], [2, 4]], dtype=dtype)) + def testConjugateTranspose(self): + for dtype in self.complex_types: + self._testBinary( + array_ops.conjugate_transpose, + np.zeros(shape=[1, 0, 4], dtype=dtype), + np.array([1, 2, 0], dtype=np.int32), + expected=np.zeros(shape=[0, 4, 1], dtype=dtype)) + self._testBinary( + array_ops.conjugate_transpose, + np.array([[1 - 1j, 2 + 2j], [3 - 3j, 4 + 4j]], dtype=dtype), + np.array([0, 1], dtype=np.int32), + expected=np.array([[1 + 1j, 2 - 2j], [3 + 3j, 4 - 4j]], dtype=dtype)) + self._testBinary( + array_ops.conjugate_transpose, + np.array([[1 - 1j, 2 + 2j], [3 - 3j, 4 + 4j]], dtype=dtype), + np.array([1, 0], dtype=np.int32), + expected=np.array([[1 + 1j, 3 + 3j], [2 - 2j, 4 - 4j]], dtype=dtype)) + def testCross(self): for dtype in self.float_types: self._testBinary( @@ -1337,5 +1403,40 @@ class BinaryOpsTest(XLATestCase): [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]], dtype=dtype)) + def testBroadcastTo(self): + for dtype in self.all_types: + x = np.random.randint(0, high=100, size=[2, 3]) + self._testBinary( + array_ops.broadcast_to, + x, + np.array([2, 3], dtype=np.int32), + expected=x) + self._testBinary( + array_ops.broadcast_to, + x, + np.array([6, 6], dtype=np.int32), + expected=np.tile(x, [3, 2])) + self._testBinary( + 
array_ops.broadcast_to, + x, + np.array([7, 4, 3], dtype=np.int32), + expected=np.tile(x, [7, 2, 1])) + self._testBinary( + array_ops.broadcast_to, + x, + np.array([7, 0, 3], dtype=np.int32), + expected=np.zeros([7, 0, 3], dtype=dtype)) + self._testBinary( + array_ops.broadcast_to, + x, + np.array([7, 1, 2, 9], dtype=np.int32), + expected=np.tile(x, [7, 1, 1, 3])) + self._testBinary( + array_ops.broadcast_to, + np.zeros([2, 0], dtype=dtype), + np.array([4, 0], dtype=np.int32), + expected=np.zeros([4, 0], dtype=dtype)) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tests/bucketize_op_test.py b/tensorflow/compiler/tests/bucketize_op_test.py index fde9759a1c209844caac99d5f303cd3e406e5370..5c24db539bce5df701d8229290ddb4c20997d40a 100644 --- a/tensorflow/compiler/tests/bucketize_op_test.py +++ b/tensorflow/compiler/tests/bucketize_op_test.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.ops import array_ops @@ -26,10 +26,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class BucketizationOpTest(XLATestCase): +class BucketizationOpTest(xla_test.XLATestCase): def testInt(self): - with self.test_session() as sess: + with self.cached_session() as sess: p = array_ops.placeholder(dtypes.int32) with self.test_scope(): op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11]) @@ -38,7 +38,7 @@ class BucketizationOpTest(XLATestCase): sess.run(op, {p: [-5, 0, 2, 3, 5, 8, 10, 11, 12]})) def testFloat(self): - with self.test_session() as sess: + with self.cached_session() as sess: p = array_ops.placeholder(dtypes.float32) with self.test_scope(): op = math_ops._bucketize(p, boundaries=[0., 3., 8., 11.]) 
@@ -48,7 +48,7 @@ class BucketizationOpTest(XLATestCase): sess.run(op, {p: [-5., 0., 2., 3., 5., 8., 10., 11., 12.]})) def test2DInput(self): - with self.test_session() as sess: + with self.cached_session() as sess: p = array_ops.placeholder(dtypes.float32) with self.test_scope(): op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11]) @@ -58,7 +58,7 @@ class BucketizationOpTest(XLATestCase): {p: [[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]})) def testInvalidBoundariesOrder(self): - with self.test_session() as sess: + with self.cached_session() as sess: p = array_ops.placeholder(dtypes.int32) with self.test_scope(): op = math_ops._bucketize(p, boundaries=[0, 8, 3, 11]) @@ -67,7 +67,7 @@ class BucketizationOpTest(XLATestCase): sess.run(op, {p: [-5, 0]}) def testBoundariesNotList(self): - with self.test_session(): + with self.cached_session(): with self.assertRaisesRegexp(TypeError, "Expected list.*"): p = array_ops.placeholder(dtypes.int32) with self.test_scope(): diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py index 035cdea1786d39f3d21bb63be5c8ccffe1608bdf..a57d1dc81ea2c9c188b0a3005904738aa8156bf3 100644 --- a/tensorflow/compiler/tests/categorical_op_test.py +++ b/tensorflow/compiler/tests/categorical_op_test.py @@ -22,7 +22,7 @@ import collections import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops @@ -32,7 +32,7 @@ from tensorflow.python.platform import googletest # TODO(srvasude): Merge this with # third_party/tensorflow/python/kernel_tests/random/multinomial_op_test.py. 
-class CategoricalTest(XLATestCase): +class CategoricalTest(xla_test.XLATestCase): """Test cases for random-number generating operators.""" def output_dtypes(self): @@ -56,7 +56,7 @@ class CategoricalTest(XLATestCase): Returns: Frequencies from sampled classes; shape [batch_size, num_classes]. """ - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): random_seed.set_random_seed(1618) op = random_ops.multinomial(logits, num_samples, output_dtype=dtypes.int32) @@ -79,7 +79,7 @@ class CategoricalTest(XLATestCase): def _testRngIsNotConstant(self, rng, dtype, output_dtype): # Tests that 'rng' does not always return the same value. - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): x = rng(dtype, output_dtype) @@ -107,7 +107,7 @@ class CategoricalTest(XLATestCase): def testCategoricalIsInRange(self): for dtype in self.float_types: for output_dtype in self.output_dtypes(): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): x = random_ops.multinomial( array_ops.ones(shape=[1, 20], dtype=dtype), 1000, diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py index 1a8989d7c2f617525c301f30fd899a01362310bf..d1896a50f7037f2972cba8a4fa16cc1e2cd4fe3e 100644 --- a/tensorflow/compiler/tests/cholesky_op_test.py +++ b/tensorflow/compiler/tests/cholesky_op_test.py @@ -18,12 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import unittest - import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops @@ -32,7 +30,7 @@ from tensorflow.python.ops 
import math_ops from tensorflow.python.platform import test -class CholeskyOpTest(XLATestCase): +class CholeskyOpTest(xla_test.XLATestCase): # Cholesky defined for float64, float32, complex64, complex128 # (https://www.tensorflow.org/api_docs/python/tf/cholesky) @@ -56,7 +54,7 @@ class CholeskyOpTest(XLATestCase): def _verifyCholesky(self, x, atol=1e-6): # Verify that LL^T == x. - with self.test_session() as sess: + with self.cached_session() as sess: placeholder = array_ops.placeholder( dtypes.as_dtype(x.dtype), shape=x.shape) with self.test_scope(): @@ -103,9 +101,8 @@ class CholeskyOpTest(XLATestCase): with self.assertRaises(ValueError): linalg_ops.cholesky(tensor3) - @unittest.skip("Test is slow") - def testLarge(self): - n = 200 + def testLarge2000x2000(self): + n = 2000 shape = (n, n) data = np.ones(shape).astype(np.float32) / (2.0 * n) + np.diag( np.ones(n).astype(np.float32)) @@ -128,6 +125,5 @@ class CholeskyOpTest(XLATestCase): matrix = np.dot(np.dot(w, np.diag(v)), w.T).astype(dtype) self._verifyCholesky(matrix, atol=1e-4) - if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py index 574f82fc717818334ac5d72ebef2191f1c18e669..88bd58b2da6b2892f898ad10f3467d8ce39d6388 100644 --- a/tensorflow/compiler/tests/clustering_test.py +++ b/tensorflow/compiler/tests/clustering_test.py @@ -21,7 +21,7 @@ from __future__ import print_function import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -32,13 +32,13 @@ from tensorflow.python.platform import googletest CPU_DEVICE = "/job:localhost/replica:0/task:0/cpu:0" -class ClusteringTest(XLATestCase): +class ClusteringTest(xla_test.XLATestCase): def 
testAdd(self): val1 = np.array([4, 3, 2, 1], dtype=np.float32) val2 = np.array([5, 6, 7, 8], dtype=np.float32) expected = val1 + val2 - with self.test_session(): + with self.cached_session(): with self.test_scope(): input1 = constant_op.constant(val1, name="const1") input2 = constant_op.constant(val2, name="const2") @@ -50,7 +50,7 @@ class ClusteringTest(XLATestCase): val1 = np.array([4, 3, 2, 1]).astype(np.float32) val2 = np.array([5, 6, 7, 8]).astype(np.float32) expected = val1 + val2 - with self.test_session(): + with self.cached_session(): with ops.device(CPU_DEVICE): input1 = constant_op.constant(val1, name="const1") input2 = constant_op.constant(val2, name="const2") @@ -68,7 +68,7 @@ class ClusteringTest(XLATestCase): # where x and z are placed on the CPU and y and w are placed on the XLA # device. If y and w are clustered for compilation, then the graph will # deadlock since the clustered graph will contain a self-loop. - with self.test_session() as sess: + with self.cached_session() as sess: with ops.device(CPU_DEVICE): x = array_ops.placeholder(dtypes.float32, [2]) with self.test_scope(): @@ -81,7 +81,7 @@ class ClusteringTest(XLATestCase): self.assertAllClose(result, [12., 2.], rtol=1e-3) def testHostMemory(self): - with self.test_session() as sess: + with self.cached_session() as sess: x = array_ops.placeholder(dtypes.int32) with self.test_scope(): y = x + 1 diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py index f10973e19f1945515b776cf86349445ed7334629..37e5318bb54c5d8ecdedc7bb346e89765f2adf35 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from 
tensorflow.python.ops import array_ops @@ -30,10 +30,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest -class ConcatTest(XLATestCase): +class ConcatTest(xla_test.XLATestCase): def testHStack(self): - with self.test_session(): + with self.cached_session(): p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4]) p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4]) with self.test_scope(): @@ -49,7 +49,7 @@ class ConcatTest(XLATestCase): self.assertAllEqual(result[4:, :], params[p2]) def testVStack(self): - with self.test_session(): + with self.cached_session(): p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4]) p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4]) with self.test_scope(): @@ -65,7 +65,7 @@ class ConcatTest(XLATestCase): self.assertAllEqual(result[:, 4:], params[p2]) def testInt32(self): - with self.test_session(): + with self.cached_session(): p1 = np.random.rand(2, 3).astype("i") p2 = np.random.rand(2, 3).astype("i") x1 = constant_op.constant(p1) @@ -88,7 +88,7 @@ class ConcatTest(XLATestCase): dtype_feed = dtypes.float32 else: dtype_feed = dtype - with self.test_session(): + with self.cached_session(): p = [] for i in np.arange(num_tensors): input_shape = shape @@ -130,7 +130,7 @@ class ConcatTest(XLATestCase): self._testRandom(dtypes.int32) def _testGradientsSimple(self): - with self.test_session(): + with self.cached_session(): inp = [] inp_tensors = [] with self.test_scope(): @@ -157,7 +157,7 @@ class ConcatTest(XLATestCase): self._testGradientsSimple() def _testGradientsFirstDim(self): - with self.test_session(): + with self.cached_session(): inp = [] inp_tensors = [] with self.test_scope(): @@ -185,7 +185,7 @@ class ConcatTest(XLATestCase): self._testGradientsFirstDim() def _testGradientsLastDim(self): - with self.test_session(): + with self.cached_session(): inp = [] inp_tensors = [] with self.test_scope(): @@ -220,7 +220,7 @@ class ConcatTest(XLATestCase): # Random dim to concat on 
concat_dim = np.random.randint(5) concat_dim_sizes = np.random.randint(1, 5, size=num_tensors) - with self.test_session(): + with self.cached_session(): inp = [] inp_tensors = [] with self.test_scope(): @@ -254,7 +254,7 @@ class ConcatTest(XLATestCase): def DISABLED_testZeroSize(self): # Verify that concat doesn't crash and burn for zero size inputs np.random.seed(7) - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): for shape0 in (), (2,): axis = len(shape0) @@ -276,14 +276,14 @@ class ConcatTest(XLATestCase): def testConcatTuple(self): c1 = np.random.rand(4, 4).astype(np.float32) c2 = np.random.rand(4, 4).astype(np.float32) - with self.test_session(): + with self.cached_session(): with self.test_scope(): concat_list_t = array_ops.concat([c1, c2], 0) concat_tuple_t = array_ops.concat((c1, c2), 0) self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval()) def testConcatNoScalars(self): - with self.test_session(): + with self.cached_session(): with self.test_scope(): scalar = constant_op.constant(7) dim = array_ops.placeholder(dtypes.int32) @@ -292,10 +292,10 @@ class ConcatTest(XLATestCase): array_ops.concat([scalar, scalar, scalar], dim) -class ConcatOffsetTest(XLATestCase): +class ConcatOffsetTest(xla_test.XLATestCase): def testBasic(self): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): cdim = constant_op.constant(1, dtypes.int32) s0 = constant_op.constant([2, 3, 5], dtypes.int32) @@ -306,10 +306,10 @@ class ConcatOffsetTest(XLATestCase): self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]]) -class PackTest(XLATestCase): +class PackTest(xla_test.XLATestCase): def testBasic(self): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): s0 = constant_op.constant([2, 3, 5], dtypes.int32) s1 = constant_op.constant([2, 7, 5], dtypes.int32) @@ -319,7 +319,7 @@ class PackTest(XLATestCase): self.assertAllEqual(ans, [[2, 
3, 5], [2, 7, 5], [2, 20, 5]]) def testScalars(self): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): s0 = constant_op.constant(2, dtypes.int32) s1 = constant_op.constant(3, dtypes.int32) @@ -329,7 +329,7 @@ class PackTest(XLATestCase): self.assertAllEqual(ans, [2, 3, 5]) def testEmpty(self): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): s0 = constant_op.constant([[]], dtypes.int32) s1 = constant_op.constant([[]], dtypes.int32) diff --git a/tensorflow/compiler/tests/conv2d_test.py b/tensorflow/compiler/tests/conv2d_test.py index 62577b70ce96e220d79978f01614b2d9a3647680..af00ff287d43a8542b5a3d14eedc00c3d7aef1b7 100644 --- a/tensorflow/compiler/tests/conv2d_test.py +++ b/tensorflow/compiler/tests/conv2d_test.py @@ -22,17 +22,24 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import test_utils +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import nn_ops from tensorflow.python.platform import googletest +DATA_FORMATS = ( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), +) -class Conv2DTest(XLATestCase): + +class Conv2DTest(xla_test.XLATestCase, parameterized.TestCase): def _VerifyValues(self, input_sizes=None, @@ -40,6 +47,8 @@ class Conv2DTest(XLATestCase): strides=None, dilations=None, padding=None, + data_format_src="NHWC", + data_format_dst="NHWC", expected=None): """Tests that tf.nn.conv2d produces the expected value. @@ -51,8 +60,12 @@ class Conv2DTest(XLATestCase): strides: Strides. dilations: RHS dilations. padding: Padding type. + data_format_src: Data format input is in. 
+ data_format_dst: Data format verification will run and input is converted + to. expected: Expected output. """ + total_size_1 = np.prod(input_sizes) total_size_2 = np.prod(filter_sizes) x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes) @@ -62,7 +75,19 @@ class Conv2DTest(XLATestCase): dilations = [1, 1] dilations = [1] + dilations + [1] - with self.test_session() as sess: + # Convert between data formats. + expected = test_utils.ConvertBetweenDataFormats(expected, data_format_src, + data_format_dst) + x1 = test_utils.ConvertBetweenDataFormats(x1, data_format_src, + data_format_dst) + input_sizes = test_utils.PermuteDimsBetweenDataFormats( + input_sizes, data_format_src, data_format_dst) + strides = test_utils.PermuteDimsBetweenDataFormats(strides, data_format_src, + data_format_dst) + dilations = test_utils.PermuteDimsBetweenDataFormats( + dilations, data_format_src, data_format_dst) + + with self.cached_session() as sess: t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes) t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes) with self.test_scope(): @@ -71,12 +96,14 @@ class Conv2DTest(XLATestCase): t2, strides=strides, padding=padding, - data_format="NHWC", + data_format=data_format_dst, dilations=dilations) + value = sess.run(out, {t1: x1, t2: x2}) self.assertAllClose(expected, value, 1e-3) - def testConv2D1x1Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x1Filter(self, data_format): expected_output = np.reshape([ 30.0, 36.0, 42.0, 66.0, 81.0, 96.0, 102.0, 126.0, 150.0, 138.0, 171.0, 204.0, 174.0, 216.0, 258.0, 210.0, 261.0, 312.0 @@ -86,9 +113,12 @@ class Conv2DTest(XLATestCase): filter_sizes=[1, 1, 3, 3], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Filter(self, data_format): expected_output = np.reshape( [2271.0, 
2367.0, 2463.0, 2901.0, 3033.0, 3165.0], [1, 1, 2, 3]) self._VerifyValues( @@ -96,9 +126,12 @@ class Conv2DTest(XLATestCase): filter_sizes=[2, 2, 3, 3], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Filter2x1Dilation(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Filter2x1Dilation(self, data_format): expected_output = np.array([[[[72], [82], [92]], [[112], [122], [132]]]]) self._VerifyValues( input_sizes=[1, 4, 4, 1], @@ -106,9 +139,12 @@ class Conv2DTest(XLATestCase): strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2Filter(self, data_format): expected_output = np.reshape([ 231.0, 252.0, 273.0, 384.0, 423.0, 462.0, 690.0, 765.0, 840.0, 843.0, 936.0, 1029.0 @@ -118,18 +154,24 @@ class Conv2DTest(XLATestCase): filter_sizes=[1, 2, 3, 3], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterStride2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2(self, data_format): expected_output = np.reshape([2271.0, 2367.0, 2463.0], [1, 1, 1, 3]) self._VerifyValues( input_sizes=[1, 2, 3, 3], filter_sizes=[2, 2, 3, 3], strides=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterStride2Same(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2Same(self, data_format): expected_output = np.reshape( [2271.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0], [1, 1, 2, 3]) self._VerifyValues( @@ -137,47 +179,61 @@ class Conv2DTest(XLATestCase): filter_sizes=[2, 2, 3, 3], strides=[2, 2], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, 
expected=expected_output) - def testConv2DEmptyDilation(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2DEmptyDilation(self, data_format): self._VerifyValues( input_sizes=[0, 2, 3, 3], filter_sizes=[1, 1, 3, 3], strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.zeros([0, 2, 3, 3])) - def testConv2D2x2FilterDilation(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterDilation(self, data_format): self._VerifyValues( input_sizes=[1, 2, 3, 3], filter_sizes=[2, 2, 3, 3], strides=[1, 1], dilations=[1, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.reshape([2667, 2781, 2895], [1, 1, 1, 3])) - def testConv2D1x2FilterDilation(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterDilation(self, data_format): self._VerifyValues( input_sizes=[1, 2, 3, 3], filter_sizes=[1, 2, 3, 3], strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.array([[[[231, 252, 273], [384, 423, 462]], [[690, 765, 840], [843, 936, 1029]]]])) - def testConv2DKernelSizeMatchesInputSizeDilation(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2DKernelSizeMatchesInputSizeDilation(self, data_format): self._VerifyValues( input_sizes=[1, 3, 3, 1], filter_sizes=[2, 2, 1, 2], strides=[1, 1], dilations=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.reshape([108, 128], [1, 1, 1, 2])) -class Conv2DBackpropInputTest(XLATestCase): +class Conv2DBackpropInputTest(xla_test.XLATestCase, parameterized.TestCase): def _VerifyValues(self, input_sizes=None, @@ -186,6 +242,8 @@ class Conv2DBackpropInputTest(XLATestCase): strides=None, dilations=None, padding=None, + data_format_src="NHWC", + data_format_dst="NHWC", expected=None): """Tests that gen_nn_ops.conv2d_backprop_input produces the expected 
output. @@ -198,8 +256,12 @@ class Conv2DBackpropInputTest(XLATestCase): strides: Strides. dilations: Dilations. padding: Padding type. + data_format_src: Data format input is in. + data_format_dst: Data format verification will run and input is converted + to. expected: Expected output. """ + total_size_1 = np.prod(filter_sizes) total_size_2 = np.prod(out_backprop_sizes) x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(filter_sizes) @@ -209,7 +271,24 @@ class Conv2DBackpropInputTest(XLATestCase): if dilations is not None: dilations = [1] + dilations + [1] - with self.test_session() as sess: + expected = np.reshape(expected, input_sizes) + + # Convert between data formats. + expected = test_utils.ConvertBetweenDataFormats(expected, data_format_src, + data_format_dst) + x2 = test_utils.ConvertBetweenDataFormats(x2, data_format_src, + data_format_dst) + input_sizes = test_utils.PermuteDimsBetweenDataFormats( + input_sizes, data_format_src, data_format_dst) + out_backprop_sizes = test_utils.PermuteDimsBetweenDataFormats( + out_backprop_sizes, data_format_src, data_format_dst) + strides = test_utils.PermuteDimsBetweenDataFormats(strides, data_format_src, + data_format_dst) + if dilations is not None: + dilations = test_utils.PermuteDimsBetweenDataFormats( + dilations, data_format_src, data_format_dst) + + with self.cached_session() as sess: t1 = array_ops.placeholder(dtypes.float32, shape=filter_sizes) t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes) with self.test_scope(): @@ -220,12 +299,14 @@ class Conv2DBackpropInputTest(XLATestCase): strides=strides, dilations=dilations, padding=padding, - data_format="NHWC") + data_format=data_format_dst) + value = sess.run(out, {t1: x1, t2: x2}) self.assertAllEqual(input_sizes, value.shape) - self.assertAllClose(expected, np.ravel(value), 1e-3) + self.assertAllClose(expected, value, 1e-3) - def testConv2D1x1Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def 
testConv2D1x1Filter(self, data_format): expected_output = [ 5, 11, 17, 11, 25, 39, 17, 39, 61, 23, 53, 83, 29, 67, 105, 35, 81, 127, 41, 95, 149, 47, 109, 171, 53, 123, 193, 59, 137, 215, 65, 151, 237, 71, @@ -237,9 +318,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 4, 4, 2], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width5(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width5(self, data_format): expected_output = [1, 2, 0, 2, 4] self._VerifyValues( input_sizes=[1, 1, 5, 1], @@ -247,9 +331,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width6(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width6(self, data_format): expected_output = [1, 2, 0, 2, 4, 0] self._VerifyValues( input_sizes=[1, 1, 6, 1], @@ -257,9 +344,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width7(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width7(self, data_format): expected_output = [1, 2, 0, 2, 4, 0, 0] self._VerifyValues( input_sizes=[1, 1, 7, 1], @@ -267,9 +357,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterC1Same(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterC1Same(self, data_format): expected_output = [1, 4, 7, 7, 23, 33] self._VerifyValues( input_sizes=[1, 2, 3, 1], 
@@ -277,9 +370,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 2, 3, 1], strides=[1, 1], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Filter(self, data_format): expected_output = [ 14, 32, 50, 100, 163, 226, 167, 212, 257, 122, 140, 158, 478, 541, 604, 437, 482, 527 @@ -290,9 +386,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 3], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterSame(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterSame(self, data_format): expected_output = [ 14, 32, 50, 100, 163, 226, 217, 334, 451, 190, 307, 424, 929, 1217, 1505, 1487, 1883, 2279 @@ -303,9 +402,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 2, 3, 3], strides=[1, 1], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2Filter(self, data_format): expected_output = [1, 4, 4, 3, 10, 8, 5, 16, 12] self._VerifyValues( input_sizes=[1, 3, 3, 1], @@ -313,9 +415,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 3, 2, 1], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterSame(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterSame(self, data_format): expected_output = [1, 4, 7, 4, 13, 16, 7, 22, 25] self._VerifyValues( input_sizes=[1, 3, 3, 1], @@ -323,9 +428,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 3, 3, 1], strides=[1, 1], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, 
expected=expected_output) - def testConv2D2x2FilterStride2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2(self, data_format): expected_output = [1, 2, 5, 4, 6, 0, 0, 0, 0, 0, 3, 6, 13, 8, 12] self._VerifyValues( input_sizes=[1, 3, 5, 1], @@ -333,9 +441,12 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 2, 2, 1], strides=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterStride2Same(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2Same(self, data_format): expected_output = [1, 2, 2, 3, 4, 6] self._VerifyValues( input_sizes=[1, 2, 3, 1], @@ -343,9 +454,13 @@ class Conv2DBackpropInputTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[2, 2], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1( + self, data_format): self._VerifyValues( input_sizes=[1, 3, 6, 1], filter_sizes=[2, 2, 1, 1], @@ -353,9 +468,12 @@ class Conv2DBackpropInputTest(XLATestCase): strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[1, 4, 7, 10, 13, 10, 0, 0, 0, 0, 0, 0, 3, 10, 17, 24, 31, 20]) - def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self, data_format): self._VerifyValues( input_sizes=[1, 2, 3, 1], filter_sizes=[2, 2, 1, 1], @@ -363,9 +481,12 @@ class Conv2DBackpropInputTest(XLATestCase): strides=[1, 1], dilations=[1, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[1, 0, 2, 3, 0, 4]) - def testConv2DEmptyBackpropInputDilation1x2(self): + 
@parameterized.named_parameters(*DATA_FORMATS) + def testConv2DEmptyBackpropInputDilation1x2(self, data_format): self._VerifyValues( input_sizes=[0, 2, 3, 1], filter_sizes=[2, 2, 1, 1], @@ -373,9 +494,12 @@ class Conv2DBackpropInputTest(XLATestCase): strides=[1, 1], dilations=[1, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.zeros([0])) - def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self, data_format): # The GPU version of this test is not very stable. So adjusting the # error threshold to 1e-4. self._VerifyValues( @@ -385,12 +509,16 @@ class Conv2DBackpropInputTest(XLATestCase): strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[ 14, 32, 50, 68, 86, 104, 0, 0, 0, 0, 0, 0, 122, 140, 158, 176, 194, 212 ]) - def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2( + self, data_format): self._VerifyValues( input_sizes=[1, 3, 3, 1], filter_sizes=[2, 2, 1, 2], @@ -398,10 +526,12 @@ class Conv2DBackpropInputTest(XLATestCase): strides=[1, 1], dilations=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[5, 0, 11, 0, 0, 0, 17, 0, 23]) -class Conv2DBackpropFilterTest(XLATestCase): +class Conv2DBackpropFilterTest(xla_test.XLATestCase, parameterized.TestCase): def _VerifyValues(self, input_sizes=None, @@ -410,6 +540,8 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=None, dilations=None, padding=None, + data_format_src="NHWC", + data_format_dst="NHWC", expected=None): """Tests that gen_nn_ops.conv2d_backprop_filter produces the right output. @@ -422,6 +554,9 @@ class Conv2DBackpropFilterTest(XLATestCase): strides: Stride. dilations: Dilations. padding: Padding type. 
+ data_format_src: Data format input is in. + data_format_dst: Data format verification will run and input is converted + to. expected: Expected output. """ @@ -434,7 +569,24 @@ class Conv2DBackpropFilterTest(XLATestCase): if dilations is not None: dilations = [1] + dilations + [1] - with self.test_session() as sess: + expected = np.reshape(expected, filter_sizes) + + # Convert between data formats. + x1 = test_utils.ConvertBetweenDataFormats(x1, data_format_src, + data_format_dst) + x2 = test_utils.ConvertBetweenDataFormats(x2, data_format_src, + data_format_dst) + input_sizes = test_utils.PermuteDimsBetweenDataFormats( + input_sizes, data_format_src, data_format_dst) + out_backprop_sizes = test_utils.PermuteDimsBetweenDataFormats( + out_backprop_sizes, data_format_src, data_format_dst) + strides = test_utils.PermuteDimsBetweenDataFormats(strides, data_format_src, + data_format_dst) + if dilations is not None: + dilations = test_utils.PermuteDimsBetweenDataFormats( + dilations, data_format_src, data_format_dst) + + with self.cached_session() as sess: t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes) t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes) with self.test_scope(): @@ -445,13 +597,14 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=strides, dilations=dilations, padding=padding, - data_format="NHWC") + data_format=data_format_dst) value = sess.run(tensor, {t1: x1, t2: x2}) self.assertAllEqual(filter_sizes, value.shape) - self.assertAllClose(expected, np.ravel(value), 1e-3) + self.assertAllClose(expected, value, 1e-3) - def testConv2D1x1Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x1Filter(self, data_format): expected_output = [8056, 8432, 8312, 8704, 8568, 8976] self._VerifyValues( input_sizes=[1, 4, 4, 3], @@ -459,9 +612,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 4, 4, 2], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + 
data_format_dst=data_format, expected=expected_output) - def testConv2D1x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2Filter(self, data_format): expected_output = [120, 141] self._VerifyValues( input_sizes=[1, 3, 3, 1], @@ -469,9 +625,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 3, 2, 1], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterDepth1(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterDepth1(self, data_format): expected_output = [5, 8, 14, 17] self._VerifyValues( input_sizes=[1, 2, 3, 1], @@ -479,9 +638,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Filter(self, data_format): expected_output = [ 17, 22, 27, 22, 29, 36, 27, 36, 45, 32, 43, 54, 37, 50, 63, 42, 57, 72, 62, 85, 108, 67, 92, 117, 72, 99, 126, 77, 106, 135, 82, 113, 144, 87, @@ -493,9 +655,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 3], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width5(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width5(self, data_format): expected_output = [9, 12] self._VerifyValues( input_sizes=[1, 1, 5, 1], @@ -503,9 +668,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width6(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width6(self, 
data_format): expected_output = [9, 12] self._VerifyValues( input_sizes=[1, 1, 6, 1], @@ -513,9 +681,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x2FilterStride3Width7(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x2FilterStride3Width7(self, data_format): expected_output = [9, 12] self._VerifyValues( input_sizes=[1, 1, 7, 1], @@ -523,9 +694,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[3, 3], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x3Filter(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x3Filter(self, data_format): expected_output = [5, 8, 11] self._VerifyValues( input_sizes=[1, 1, 4, 1], @@ -533,9 +707,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[1, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x3FilterSame(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x3FilterSame(self, data_format): expected_output = [20, 30, 20] self._VerifyValues( input_sizes=[1, 1, 4, 1], @@ -543,9 +720,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 4, 1], strides=[1, 1], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D1x3FilterSameOutbackprop2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D1x3FilterSameOutbackprop2(self, data_format): expected_output = [7, 10, 3] self._VerifyValues( input_sizes=[1, 1, 4, 1], @@ -553,9 +733,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[2, 2], padding="SAME", + data_format_src="NHWC", + 
data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterC1Same(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterC1Same(self, data_format): expected_output = [91, 58, 32, 17] self._VerifyValues( input_sizes=[1, 2, 3, 1], @@ -563,9 +746,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 2, 3, 1], strides=[1, 1], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterStride2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2(self, data_format): expected_output = [92, 102, 112] self._VerifyValues( input_sizes=[1, 3, 5, 1], @@ -573,9 +759,12 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 2, 2, 1], strides=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2FilterStride2Same(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2FilterStride2Same(self, data_format): expected_output = [7, 2, 16, 5] self._VerifyValues( input_sizes=[1, 2, 3, 1], @@ -583,9 +772,13 @@ class Conv2DBackpropFilterTest(XLATestCase): out_backprop_sizes=[1, 1, 2, 1], strides=[2, 2], padding="SAME", + data_format_src="NHWC", + data_format_dst=data_format, expected=expected_output) - def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1( + self, data_format): self._VerifyValues( input_sizes=[1, 3, 6, 1], filter_sizes=[2, 2, 1, 1], @@ -593,9 +786,12 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=[1, 1], dilations=[2, 1], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[55, 70, 235, 250]) - def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def 
testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self, data_format): self._VerifyValues( input_sizes=[1, 2, 3, 1], filter_sizes=[2, 2, 1, 1], @@ -603,9 +799,12 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=[1, 1], dilations=[1, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[1, 3, 4, 6]) - def testConv2DEmptyBackpropFilterDilation1x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2DEmptyBackpropFilterDilation1x2(self, data_format): self._VerifyValues( input_sizes=[1, 2, 3, 1], filter_sizes=[2, 2, 1, 0], @@ -613,9 +812,12 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=[1, 1], dilations=[1, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=np.zeros([0])) - def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self, data_format): self._VerifyValues( input_sizes=[1, 3, 4, 3], filter_sizes=[2, 2, 3, 3], @@ -623,13 +825,17 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=[1, 1], dilations=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[ 17, 22, 27, 22, 29, 36, 27, 36, 45, 47, 64, 81, 52, 71, 90, 57, 78, 99, 137, 190, 243, 142, 197, 252, 147, 204, 261, 167, 232, 297, 172, 239, 306, 177, 246, 315 ]) - def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self): + @parameterized.named_parameters(*DATA_FORMATS) + def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2( + self, data_format): self._VerifyValues( input_sizes=[1, 3, 3, 1], filter_sizes=[2, 2, 1, 2], @@ -637,6 +843,8 @@ class Conv2DBackpropFilterTest(XLATestCase): strides=[1, 1], dilations=[2, 2], padding="VALID", + data_format_src="NHWC", + data_format_dst=data_format, expected=[1, 2, 3, 6, 7, 14, 9, 18]) diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py index 
3bebf46511cbc471d3fbbbe92d28511fcc717387..33fd983b5485e503c2fcc96db2dfdecfc41e309f 100644 --- a/tensorflow/compiler/tests/conv3d_test.py +++ b/tensorflow/compiler/tests/conv3d_test.py @@ -21,7 +21,7 @@ from __future__ import print_function import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops @@ -33,10 +33,10 @@ from tensorflow.python.platform import googletest # Test cloned from # tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py -class Conv3DBackpropFilterV2GradTest(XLATestCase): +class Conv3DBackpropFilterV2GradTest(xla_test.XLATestCase): def testGradient(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): for padding in ["SAME", "VALID"]: for stride in [1, 2]: np.random.seed(1) @@ -66,10 +66,10 @@ class Conv3DBackpropFilterV2GradTest(XLATestCase): # Test cloned from tensorflow/python/kernel_tests/conv3d_transpose_test.py -class Conv3DTransposeTest(XLATestCase): +class Conv3DTransposeTest(xla_test.XLATestCase): def testConv3DTransposeSingleStride(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): strides = [1, 1, 1, 1, 1] # Input, output: [batch, depth, height, width, channel] @@ -119,7 +119,7 @@ class Conv3DTransposeTest(XLATestCase): self.assertAllClose(target, value[n, d, h, w, k]) def testConv3DTransposeSame(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): strides = [1, 2, 2, 2, 1] # Input, output: [batch, depth, height, width, depth] @@ -157,7 +157,7 @@ class Conv3DTransposeTest(XLATestCase): self.assertAllClose(target, value[n, d, h, w, k]) def testConv3DTransposeValid(self): - with self.test_session(), 
self.test_scope(): + with self.cached_session(), self.test_scope(): strides = [1, 2, 2, 2, 1] # Input, output: [batch, depth, height, width, depth] @@ -217,7 +217,7 @@ class Conv3DTransposeTest(XLATestCase): np.random.seed(1) # Make it reproducible. x_val = np.random.random_sample(x_shape).astype(np.float64) f_val = np.random.random_sample(f_shape).astype(np.float64) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): x = constant_op.constant(x_val, name="x", dtype=dtypes.float32) f = constant_op.constant(f_val, name="f", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py index 865f60ccab46ec6829e49409508303052944e13b..04f3b3ef4905984b0432a536c3b1c275738ede17 100644 --- a/tensorflow/compiler/tests/dense_layer_test.py +++ b/tensorflow/compiler/tests/dense_layer_test.py @@ -86,7 +86,7 @@ class DenseLayerTest(test.TestCase): XlaLaunch op by XLA. """ - with self.test_session() as sess: + with self.cached_session() as sess: x = array_ops.placeholder(shape=[2, 2, 3], dtype=np.float32) with jit_scope(): y = layers.dense(x, 3) @@ -113,7 +113,7 @@ class DenseLayerTest(test.TestCase): cluster, causing dense layer to be split into TWO XlaLaunch ops. 
""" - with self.test_session() as sess: + with self.cached_session() as sess: x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32) with jit_scope(): y = layers.dense(x, 3) diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py index 03d96a2cd8ab22a472a67f092e36224820405fa8..6ef8a68ca5d35d3d2f78f0cb491e7bb98ff97ac9 100644 --- a/tensorflow/compiler/tests/depthwise_conv_op_test.py +++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py @@ -21,7 +21,7 @@ from __future__ import print_function import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -114,7 +114,7 @@ def CheckGradConfigsToTest(): yield i, f, o, s, p -class DepthwiseConv2DTest(XLATestCase): +class DepthwiseConv2DTest(xla_test.XLATestCase): # This is testing that depthwise_conv2d and depthwise_conv2d_native # produce the same results. 
It also tests that NCHW and NWHC @@ -151,7 +151,7 @@ class DepthwiseConv2DTest(XLATestCase): dtype=data_type).reshape(tensor_in_sizes) x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)], dtype=data_type).reshape(filter_in_sizes) - with self.test_session() as sess: + with self.cached_session() as sess: if data_type == np.float32: tolerance = 1e-4 else: @@ -247,7 +247,7 @@ class DepthwiseConv2DTest(XLATestCase): dtype=np.float32).reshape(tensor_in_sizes) x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)], dtype=np.float32).reshape(filter_in_sizes) - with self.test_session() as sess: + with self.cached_session() as sess: t1 = array_ops.placeholder(shape=tensor_in_sizes, dtype=np.float32) t2 = array_ops.placeholder(shape=filter_in_sizes, dtype=np.float32) with self.test_scope(): @@ -321,7 +321,7 @@ class DepthwiseConv2DTest(XLATestCase): x2 = np.random.rand(*output_sizes).astype(np.float32) def _GetVal(use_xla): - with self.test_session(): + with self.cached_session(): t0 = constant_op.constant(input_sizes, shape=[len(input_sizes)]) t1 = array_ops.placeholder(np.float32, shape=filter_sizes) t2 = array_ops.placeholder(np.float32, shape=output_sizes) @@ -356,7 +356,7 @@ class DepthwiseConv2DTest(XLATestCase): x2 = np.random.rand(*output_sizes).astype(np.float32) def _GetVal(use_xla): - with self.test_session(): + with self.cached_session(): t0 = array_ops.placeholder(np.float32, shape=input_sizes) t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)]) t2 = array_ops.placeholder(np.float32, shape=output_sizes) diff --git a/tensorflow/compiler/tests/dynamic_slice_ops_test.py b/tensorflow/compiler/tests/dynamic_slice_ops_test.py index 6a46d2ec3e7aee3a4ecfbf1ab9f622d8eb659e3c..5f01e128f0b0fa725d99b00ba3406bd50a1b8962 100644 --- a/tensorflow/compiler/tests/dynamic_slice_ops_test.py +++ b/tensorflow/compiler/tests/dynamic_slice_ops_test.py @@ -20,17 +20,17 @@ from __future__ import print_function import numpy as np -from 
tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.compiler.tf2xla.python import xla from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class DynamicUpdateSliceOpsTest(XLATestCase): +class DynamicUpdateSliceOpsTest(xla_test.XLATestCase): def _assertOpOutputMatchesExpected(self, op, args, expected): - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): placeholders = [ array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape) diff --git a/tensorflow/compiler/tests/dynamic_stitch_test.py b/tensorflow/compiler/tests/dynamic_stitch_test.py index c109c27abe2f145685f83251e1d21ec8ddad563a..50b04daa6b9f4159a3c4bdeecaf900a5b35a833c 100644 --- a/tensorflow/compiler/tests/dynamic_stitch_test.py +++ b/tensorflow/compiler/tests/dynamic_stitch_test.py @@ -20,17 +20,17 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.platform import googletest -class DynamicStitchTest(XLATestCase): +class DynamicStitchTest(xla_test.XLATestCase): def _AssertDynamicStitchResultIs(self, indices, data, expected): - with self.test_session() as session: + with self.cached_session() as session: index_placeholders = [ array_ops.placeholder(dtypes.as_dtype(arg.dtype)) for arg in indices ] diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index 4dff5f0f405fb1d936ab2e6bcd82e05e926172c7..63cee550fde9d9d4314b1541fba191df776a4da2 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import 
numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.core.protobuf import config_pb2 from tensorflow.python.eager import backprop from tensorflow.python.eager import context @@ -31,14 +31,17 @@ from tensorflow.python.framework import ops from tensorflow.python.layers import convolutional from tensorflow.python.layers import pooling from tensorflow.python.ops import array_ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import googletest +from tensorflow.python.training import adam -class EagerTest(XLATestCase): +class EagerTest(xla_test.XLATestCase): def testBasic(self): with self.test_scope(): @@ -47,6 +50,21 @@ class EagerTest(XLATestCase): product = three * five self.assertAllEqual(15, product) + def testGradientTape(self): + with self.test_scope(): + + x = constant_op.constant(1.0) + y = constant_op.constant(10.0) + with backprop.GradientTape(persistent=True) as tape: + tape.watch(x) + tape.watch(y) + a = x + y + x * y + da_dx = tape.gradient(a, x) + da_dy = tape.gradient(a, y) + + self.assertEqual(11.0, da_dx.numpy()) + self.assertEqual(2.0, da_dy.numpy()) + def testExecuteListOutputLen0(self): with self.test_scope(): empty = constant_op.constant([], dtype=dtypes.float32) @@ -83,7 +101,7 @@ class EagerTest(XLATestCase): self.assertAllEqual(15, product) # Run some ops graphly - with context.graph_mode(), self.test_session() as sess: + with context.graph_mode(), self.cached_session() as sess: with self.test_scope(): three = constant_op.constant(3) five = constant_op.constant(5) @@ -105,6 +123,14 @@ class EagerTest(XLATestCase): with self.test_scope(): self.assertAllEqual(2, array_ops.identity(2)) + def 
testRandomOps(self): + with self.test_scope(): + tensor = gen_random_ops.random_uniform((2, 2), dtypes.float32) + row0 = tensor[0].numpy() + row1 = tensor[1].numpy() + # It should be very unlikely to rng to generate two equal rows. + self.assertFalse((row0 == row1).all()) + def testIdentityOnVariable(self): with self.test_scope(): v = resource_variable_ops.ResourceVariable(True) @@ -160,12 +186,120 @@ class EagerTest(XLATestCase): for _ in range(100): values.append(var.value()) + # The shape, shape_n, size, and rank are tested here because their + # execution kernels (as opposed to compilation only tf2xla kernels) + # are distincts from tf2xla kernels. + + def testShape(self): + def const(value): + return array_ops.shape( + constant_op.constant(value)).numpy() -class EagerFunctionTest(XLATestCase): + def ones(value): + return array_ops.shape( + array_ops.ones(value)).numpy() + + with self.test_scope(): + # Shapes of directly constructed tensors + self.assertAllEqual([], const(3)) + self.assertAllEqual([3], const([1.0, 2.0, 3.0])) + self.assertAllEqual([2, 2], const([[1.0, 2.0], [3.0, 4.0]])) + self.assertAllEqual([2, 1, 2], const([[[1.0, 2.0]], [[3.0, 4.0]]])) + + # Shapes of tensors created by op running on device + # We make this distinction because directly constructed tensors + # are treated differently in a few places that can influence shape: + # - they always have on_host_tensor + # - they and their shapes can be cached + # - they end up on device via a copy, instead of as program output + self.assertAllEqual([], ones([])) + self.assertAllEqual([3], ones([3])) + self.assertAllEqual([2, 2], ones([2, 2])) + self.assertAllEqual([2, 1, 2], ones([2, 1, 2])) + + def testShapeN(self): + with self.test_scope(): + # Shapes of directly constructed tensors + shapes = array_ops.shape_n([ + constant_op.constant(1.0), + constant_op.constant([1.0, 2.0, 3.0]), + constant_op.constant([[1.0, 2.0], [3.0, 4.0]])]) + self.assertAllEqual( + [[], [3], [2, 2]], + 
[x.numpy().tolist() for x in shapes]) + + # Shapes of tensors created by op running on device + shapes = array_ops.shape_n([ + array_ops.ones([]), + array_ops.ones([3]), + array_ops.ones([2, 2])]) + self.assertAllEqual( + [[], [3], [2, 2]], + [x.numpy().tolist() for x in shapes]) + + def testSize(self): + with self.test_scope(): + self.assertEqual( + 1, array_ops.size(constant_op.constant(1.0)).numpy()) + self.assertEqual( + 3, array_ops.size(constant_op.constant([1.0, 2.0, 3.0])).numpy()) + self.assertEqual( + 4, array_ops.size( + constant_op.constant([[1.0, 2.0], [3.0, 4.0]])).numpy()) + + def testRank(self): + with self.test_scope(): + self.assertEqual( + 0, array_ops.rank(constant_op.constant(1.0)).numpy()) + self.assertEqual( + 1, array_ops.rank(constant_op.constant([1.0, 2.0, 3.0])).numpy()) + self.assertEqual( + 2, array_ops.rank( + constant_op.constant([[1.0, 2.0], [3.0, 4.0]])).numpy()) + + def testAdam(self): + with self.test_scope(): + optimizer = adam.AdamOptimizer(0.1) + x = resource_variable_ops.ResourceVariable(10.0) + with backprop.GradientTape() as tape: + y = x * x + dy_dx = tape.gradient(y, x) + optimizer.apply_gradients([(dy_dx, x)]) + self.assertAlmostEqual(9.9, x.numpy(), places=3) + + def testAdamSparse(self): + with ops.device('/cpu:0'): + # Create 2-D embedding for 3 objects on CPU because sparse/sliced updates + # are not implemented on TPU. + embedding_matrix = resource_variable_ops.ResourceVariable( + array_ops.ones([3, 2])) + + with self.test_scope(): + with backprop.GradientTape() as tape: + embedding = embedding_ops.embedding_lookup(embedding_matrix, [1]) + y = math_ops.reduce_sum(embedding) + dy_dx = tape.gradient(y, embedding_matrix) + self.assertIsInstance(dy_dx, ops.IndexedSlices) + optimizer = adam.AdamOptimizer(0.1) + # The gradient application operations will run on CPU because optimizer + # updates are always collocated with the variable. 
+ optimizer.apply_gradients([(dy_dx, embedding_matrix)]) + + # This assign_add will run on CPU because when an input to an + # operation is a resource, this operation is placed on the resource's + # device by the eager runtime. + embedding_matrix.assign_add(array_ops.ones([3, 2])) + + self.assertAllClose([[2.0, 2.0], + [1.9, 1.9], + [2.0, 2.0]], embedding_matrix.numpy()) + + +class EagerFunctionTest(xla_test.XLATestCase): def testBasic(self): with self.test_scope(): - matmul = function.defun(math_ops.matmul, compiled=True) + matmul = function.defun(math_ops.matmul) t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) sq = matmul(t, t, transpose_a=True) self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20]) @@ -187,7 +321,7 @@ class EagerFunctionTest(XLATestCase): def model(x): x = conv(x) return pool(x) - model = function.defun(model, compiled=True) + model = function.defun(model) x = array_ops.ones([1, 4, 4, 1]) y = model(x) @@ -197,7 +331,7 @@ class EagerFunctionTest(XLATestCase): with self.test_scope(): v = resource_variable_ops.ResourceVariable(1.0) - @function.defun(compiled=True) + @function.defun def f(): return v.read_value() @@ -212,11 +346,43 @@ class EagerFunctionTest(XLATestCase): v.assign_add(1.0) return v - f = function.defun(f, compiled=True) + f = function.defun(f) var = f(v) self.assertEqual(2.0, var.numpy()) + def testReturnResourceHandle(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable([[1.0, 2.0], [3.0, 4.0]]) + + def f(v): + return v.handle + + f = function.defun(f) + handle = f(v) + self.assertAllEqual(v.numpy(), + resource_variable_ops.read_variable_op( + handle, dtypes.float32).numpy()) + + def testReturnMultipleResourceHandles(self): + with self.test_scope(): + v1 = resource_variable_ops.ResourceVariable(1.25) + v2 = resource_variable_ops.ResourceVariable(2.0) + + def f(v): + return v.handle, 3.0 * v, v2.handle, v + v2 + + f = function.defun(f) + v1_handle, v1_times_3, v2_handle, variable_sum = f(v1) + 
self.assertAllEqual(v1.numpy(), + resource_variable_ops.read_variable_op( + v1_handle, dtypes.float32).numpy()) + self.assertEqual(3.75, v1_times_3.numpy()) + self.assertAllEqual(v2.numpy(), + resource_variable_ops.read_variable_op( + v2_handle, dtypes.float32).numpy()) + self.assertEqual(3.25, variable_sum.numpy()) + def testAllArgumentKinds(self): """Test a complex function that takes different argument kinds. @@ -240,7 +406,7 @@ class EagerFunctionTest(XLATestCase): d = r2 * v2 return a, b, c, d - foo = function.defun(foo, compiled=True) + foo = function.defun(foo) c1 = [0, 0] c2 = array_ops.ones([2], dtype=dtypes.int32) @@ -262,7 +428,7 @@ class EagerFunctionTest(XLATestCase): with self.test_scope(): v0 = resource_variable_ops.ResourceVariable(5.0) - @function.defun(compiled=True) + @function.defun def f(x): x = v0 * v0 * x return x @@ -275,8 +441,122 @@ class EagerFunctionTest(XLATestCase): self.assertEqual(75, y.numpy()) self.assertEqual(30, dy.numpy()) + def testGradientTapeInDefun(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def f(): + x = constant_op.constant(1.0) + with backprop.GradientTape() as tape: + y = v0 * x + dy = tape.gradient(y, v0) + return dy + + dy = f() + self.assertEqual(1.0, dy.numpy()) + + def testSliceInDefun(self): + with self.test_scope(): + + @function.defun + def f(x, y): + return x[0::2, y:, ...] 
+ + x = array_ops.ones([2, 3, 4]) + y = array_ops.ones([], dtype=dtypes.int32) + with backprop.GradientTape() as tape: + tape.watch(x) + tape.watch(y) + z = f(x, y) + dz = tape.gradient(z, x) + + self.assertAllEqual(np.ones([1, 2, 4]), z.numpy()) + self.assertAllEqual((2, 3, 4), dz.shape.as_list()) + + def testNestedDefun(self): + with self.test_scope(): + + @function.defun + def times_two(x): + return 2 * x + + @function.defun + def two_x_plus_1(x): + return times_two(x) + 1 + + x = constant_op.constant([2, 3, 4]) + y = two_x_plus_1(x) + self.assertAllEqual([5, 7, 9], y.numpy()) + + def testNestedDefunWithVariable(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def g(x): + x = v0 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + y = f(x) + + self.assertEqual(75, y.numpy()) + + def testNestedDefunInGradientTape(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def g(x): + x = v0 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + with backprop.GradientTape() as tape: + y = f(x) + dy = tape.gradient(y, v0) + + self.assertEqual(75, y.numpy()) + self.assertEqual(30, dy.numpy()) + + def testNestedDefunInGradientTapeDifferentVars(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + v1 = resource_variable_ops.ResourceVariable(3.0) + + @function.defun + def g(x): + x = v1 * x + return x + + @function.defun + def f(x): + x = g(v0 * x) + return x + + x = constant_op.constant(3.0) + with backprop.GradientTape(persistent=True) as tape: + y = f(x) + dy_v0 = tape.gradient(y, v0) + dy_v1 = tape.gradient(y, v1) + + self.assertEqual(45, y.numpy()) + self.assertEqual(9, dy_v0.numpy()) + self.assertEqual(15, dy_v1.numpy()) -class ExcessivePaddingTest(XLATestCase): + +class ExcessivePaddingTest(xla_test.XLATestCase): 
"""Test that eager execution works with TPU flattened tensors. Tensors that would normally be excessively padded when written @@ -307,7 +587,7 @@ class ExcessivePaddingTest(XLATestCase): def testAsFunctionInput(self): with self.test_scope(): - @function.defun(compiled=True) + @function.defun def f(x): return math_ops.reduce_sum(x, axis=2) @@ -318,7 +598,7 @@ class ExcessivePaddingTest(XLATestCase): def testAsFunctionOutput(self): with self.test_scope(): - @function.defun(compiled=True) + @function.defun def f(x): return x * constant_op.constant(100 * [[[10.0, 2.0]]]) @@ -327,6 +607,36 @@ class ExcessivePaddingTest(XLATestCase): self.assertAllEqual(100 * [[36.0]], reduced) +def multiple_tpus(): + devices = context.context().devices() + return len([d for d in devices if 'device:TPU:' in d]) > 1 + + +class MultiDeviceTest(xla_test.XLATestCase): + """Test running TPU computation on more than one core.""" + + def testBasic(self): + if not multiple_tpus(): + self.skipTest('MultiDeviceTest requires multiple TPU devices.') + + # Compute 10 on TPU core 0 + with ops.device('device:TPU:0'): + two = constant_op.constant(2) + five = constant_op.constant(5) + ten = two * five + self.assertAllEqual(10, ten) + + # Compute 6 on TPU core 1 + with ops.device('device:TPU:1'): + two = constant_op.constant(2) + three = constant_op.constant(3) + six = two * three + self.assertAllEqual(6, six) + + # Copy 10 and 6 to CPU and sum them + self.assertAllEqual(16, ten + six) + + if __name__ == '__main__': ops.enable_eager_execution( config=config_pb2.ConfigProto(log_device_placement=True)) diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py index 0361702e7af778176daed941d64e61198090daf2..37061e91d161db352b388a965eb72c9c32d3d752 100644 --- a/tensorflow/compiler/tests/extract_image_patches_op_test.py +++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py @@ -20,13 +20,13 @@ from __future__ import 
print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class ExtractImagePatches(XLATestCase): +class ExtractImagePatches(xla_test.XLATestCase): """Functional tests for ExtractImagePatches op.""" def _VerifyValues(self, image, ksizes, strides, rates, padding, patches): @@ -44,7 +44,7 @@ class ExtractImagePatches(XLATestCase): strides = [1] + strides + [1] rates = [1] + rates + [1] - with self.test_session(): + with self.cached_session(): image_placeholder = array_ops.placeholder(dtypes.float32) with self.test_scope(): out_tensor = array_ops.extract_image_patches( diff --git a/tensorflow/compiler/tests/fake_quant_ops_test.py b/tensorflow/compiler/tests/fake_quant_ops_test.py index dfe9400ef0f55ca011d4e23ba5d735899ca2e054..2178c4455609550226c89ceb185837768be1f622 100644 --- a/tensorflow/compiler/tests/fake_quant_ops_test.py +++ b/tensorflow/compiler/tests/fake_quant_ops_test.py @@ -17,14 +17,14 @@ from __future__ import division from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.platform import googletest -class FakeQuantWithMinMaxArgsTest(XLATestCase): +class FakeQuantWithMinMaxArgsTest(xla_test.XLATestCase): """Test cases for FakeQuantWithMinMaxArgs operation.""" # 8 bits, wide range. 
@@ -107,7 +107,7 @@ class FakeQuantWithMinMaxArgsTest(XLATestCase): ], dtype=np.float32) - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): input_placeholder = array_ops.placeholder( dtypes.float32, inputs.shape, name="inputs") @@ -122,7 +122,7 @@ class FakeQuantWithMinMaxArgsTest(XLATestCase): result, expected, rtol=1e-3, atol=1e-5, bfloat16_rtol=0.03) -class FakeQuantWithMinMaxArgsGradientTest(XLATestCase): +class FakeQuantWithMinMaxArgsGradientTest(xla_test.XLATestCase): """Test cases for FakeQuantWithMinMaxArgsGradient operation.""" # 8 bits, wide range. @@ -198,7 +198,7 @@ class FakeQuantWithMinMaxArgsGradientTest(XLATestCase): [0.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0], dtype=np.float32) - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): gradient_placeholder = array_ops.placeholder( dtypes.float32, gradients.shape, name="gradients") @@ -223,7 +223,7 @@ class FakeQuantWithMinMaxArgsGradientTest(XLATestCase): bfloat16_rtol=0.03) -class FakeQuantWithMinMaxVarsTest(XLATestCase): +class FakeQuantWithMinMaxVarsTest(xla_test.XLATestCase): """Test cases for FakeQuantWithMinMaxVars operation.""" # 8 bits, wide range. @@ -306,7 +306,7 @@ class FakeQuantWithMinMaxVarsTest(XLATestCase): ], dtype=np.float32) - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): input_placeholder = array_ops.placeholder( dtypes.float32, inputs.shape, name="inputs") @@ -328,7 +328,7 @@ class FakeQuantWithMinMaxVarsTest(XLATestCase): result, expected, rtol=1e-3, atol=1e-5, bfloat16_rtol=0.03) -class FakeQuantWithMinMaxVarsGradientTest(XLATestCase): +class FakeQuantWithMinMaxVarsGradientTest(xla_test.XLATestCase): """Test cases for FakeQuantWithMinMaxVarsGradient operation.""" # 8 bits, wide range. 
@@ -406,7 +406,7 @@ class FakeQuantWithMinMaxVarsGradientTest(XLATestCase): expected_backprops_wrt_min = 1.0 + 2.0 expected_backprops_wrt_max = 10.0 + 11.0 - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): gradient_placeholder = array_ops.placeholder( dtypes.float32, gradients.shape, name="gradients") diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py index afb5fa4bb4fefe5bc2ecded826143ffc83c2b559..b3e13fbaa6b33bdaa1be123be558059e96de282e 100644 --- a/tensorflow/compiler/tests/fft_test.py +++ b/tensorflow/compiler/tests/fft_test.py @@ -23,10 +23,11 @@ import itertools import numpy as np import scipy.signal as sps -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.contrib.signal.python.ops import spectral_ops as signal from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import spectral_ops from tensorflow.python.platform import googletest @@ -57,7 +58,7 @@ INNER_DIMS_2D = pick_10(itertools.product(POWS_OF_2, POWS_OF_2)) INNER_DIMS_3D = pick_10(itertools.product(POWS_OF_2, POWS_OF_2, POWS_OF_2)) -class FFTTest(XLATestCase): +class FFTTest(xla_test.XLATestCase): def _VerifyFftMethod(self, inner_dims, complex_to_input, input_to_expected, tf_method): @@ -70,7 +71,7 @@ class FFTTest(XLATestCase): data = np.reshape(data.astype(np.float32).view(np.complex64), shape) data = to_32bit(complex_to_input(data)) expected = to_32bit(input_to_expected(data)) - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): ph = array_ops.placeholder( dtypes.as_dtype(data.dtype), shape=data.shape) @@ -92,13 +93,16 @@ class FFTTest(XLATestCase): data, nperseg=ws, noverlap=ws - hs, boundary=None, window=window)[2] expected = np.swapaxes(expected, -1, -2) expected *= 
window.sum() # scipy divides by window sum - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): ph = array_ops.placeholder( dtypes.as_dtype(data.dtype), shape=data.shape) out = signal.stft(ph, ws, hs) + grad = gradients_impl.gradients(out, ph, + grad_ys=array_ops.ones_like(out)) - value = sess.run(out, {ph: data}) + # For gradients, we simply verify that they compile & execute. + value, _ = sess.run([out, grad], {ph: data}) self.assertAllClose(expected, value, rtol=RTOL, atol=ATOL) def testFFT(self): diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8c7edfd277c992c35a81dd5f261256a86352254e --- /dev/null +++ b/tensorflow/compiler/tests/fifo_queue_test.py @@ -0,0 +1,201 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.ops.data_flow_ops.FIFOQueue.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes as dtypes_lib +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.platform import test + + +class FIFOQueueTest(xla_test.XLATestCase): + + def testEnqueue(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + enqueue_op = q.enqueue((10.0,)) + enqueue_op.run() + + def testEnqueueWithShape(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32, shapes=(3, 2)) + enqueue_correct_op = q.enqueue(([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],)) + enqueue_correct_op.run() + with self.assertRaises(ValueError): + q.enqueue(([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],)) + self.assertEqual(1, q.size().eval()) + + def testMultipleDequeues(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()]) + self.evaluate(q.enqueue([1])) + self.evaluate(q.enqueue([2])) + self.evaluate(q.enqueue([3])) + a, b, c = self.evaluate([q.dequeue(), q.dequeue(), q.dequeue()]) + self.assertAllEqual(set([1, 2, 3]), set([a, b, c])) + + def testQueuesDontShare(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()]) + self.evaluate(q.enqueue(1)) + q2 = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()]) + self.evaluate(q2.enqueue(2)) + self.assertAllEqual(self.evaluate(q2.dequeue()), 2) + self.assertAllEqual(self.evaluate(q.dequeue()), 1) + + def testEnqueueDictWithoutNames(self): + with self.cached_session(), 
self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + with self.assertRaisesRegexp(ValueError, "must have names"): + q.enqueue({"a": 12.0}) + + def testParallelEnqueue(self): + with self.cached_session() as sess, self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] + enqueue_ops = [q.enqueue((x,)) for x in elems] + dequeued_t = q.dequeue() + + # Run one producer thread for each element in elems. + def enqueue(enqueue_op): + sess.run(enqueue_op) + + threads = [ + self.checkedThread(target=enqueue, args=(e,)) for e in enqueue_ops + ] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # Dequeue every element using a single thread. + results = [] + for _ in xrange(len(elems)): + results.append(dequeued_t.eval()) + self.assertItemsEqual(elems, results) + + def testParallelDequeue(self): + with self.cached_session() as sess, self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] + enqueue_ops = [q.enqueue((x,)) for x in elems] + dequeued_t = q.dequeue() + + # Enqueue every element using a single thread. + for enqueue_op in enqueue_ops: + enqueue_op.run() + + # Run one consumer thread for each element in elems. 
+ results = [] + + def dequeue(): + results.append(sess.run(dequeued_t)) + + threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + self.assertItemsEqual(elems, results) + + def testDequeue(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + elems = [10.0, 20.0, 30.0] + enqueue_ops = [q.enqueue((x,)) for x in elems] + dequeued_t = q.dequeue() + + for enqueue_op in enqueue_ops: + enqueue_op.run() + + for i in xrange(len(elems)): + vals = dequeued_t.eval() + self.assertEqual([elems[i]], vals) + + def testEnqueueAndBlockingDequeue(self): + with self.cached_session() as sess, self.test_scope(): + q = data_flow_ops.FIFOQueue(3, dtypes_lib.float32) + elems = [10.0, 20.0, 30.0] + enqueue_ops = [q.enqueue((x,)) for x in elems] + dequeued_t = q.dequeue() + + def enqueue(): + # The enqueue_ops should run after the dequeue op has blocked. + # TODO(mrry): Figure out how to do this without sleeping. 
+ time.sleep(0.1) + for enqueue_op in enqueue_ops: + sess.run(enqueue_op) + + results = [] + + def dequeue(): + for _ in xrange(len(elems)): + results.append(sess.run(dequeued_t)) + + enqueue_thread = self.checkedThread(target=enqueue) + dequeue_thread = self.checkedThread(target=dequeue) + enqueue_thread.start() + dequeue_thread.start() + enqueue_thread.join() + dequeue_thread.join() + + for elem, result in zip(elems, results): + self.assertEqual([elem], result) + + def testMultiEnqueueAndDequeue(self): + with self.cached_session() as sess, self.test_scope(): + q = data_flow_ops.FIFOQueue(10, (dtypes_lib.int32, dtypes_lib.float32)) + elems = [(5, 10.0), (10, 20.0), (15, 30.0)] + enqueue_ops = [q.enqueue((x, y)) for x, y in elems] + dequeued_t = q.dequeue() + + for enqueue_op in enqueue_ops: + enqueue_op.run() + + for i in xrange(len(elems)): + x_val, y_val = sess.run(dequeued_t) + x, y = elems[i] + self.assertEqual([x], x_val) + self.assertEqual([y], y_val) + + def testQueueSizeEmpty(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + self.assertEqual([0], q.size().eval()) + + def testQueueSizeAfterEnqueueAndDequeue(self): + with self.cached_session(), self.test_scope(): + q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32) + enqueue_op = q.enqueue((10.0,)) + dequeued_t = q.dequeue() + size = q.size() + self.assertEqual([], size.get_shape()) + + enqueue_op.run() + self.assertEqual(1, size.eval()) + dequeued_t.op.run() + self.assertEqual(0, size.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py index 8e6407dffdac3adbcda8cbca2109ef9196defa8c..f1b87a5ffb73bed62a80abaa152d335f64d970c5 100644 --- a/tensorflow/compiler/tests/ftrl_test.py +++ b/tensorflow/compiler/tests/ftrl_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase 
+from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables @@ -29,8 +29,7 @@ from tensorflow.python.training import adagrad from tensorflow.python.training import ftrl from tensorflow.python.training import gradient_descent - -class FtrlOptimizerTest(XLATestCase): +class FtrlOptimizerTest(xla_test.XLATestCase): def initVariableAndGradient(self, dtype): var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) @@ -112,7 +111,7 @@ class FtrlOptimizerTest(XLATestCase): def testFtrlwithoutRegularization(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) @@ -146,7 +145,7 @@ class FtrlOptimizerTest(XLATestCase): def testFtrlwithoutRegularization2(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) @@ -174,7 +173,7 @@ class FtrlOptimizerTest(XLATestCase): def testFtrlWithL1(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) @@ -196,13 +195,17 @@ class FtrlOptimizerTest(XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-7.66718769, -10.91273689]), var0.eval(), 
rtol=1e-4) + np.array([-7.66718769, -10.91273689]), + var0.eval(), + rtol=1e-4, + bfloat16_rtol=1e-1, + bfloat16_atol=1e-1) self.assertAllCloseAccordingToType( np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4) def testFtrlWithL1_L2(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) @@ -236,7 +239,7 @@ class FtrlOptimizerTest(XLATestCase): weights will tend to have smaller magnitudes with this parameter set. """ for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) @@ -259,9 +262,49 @@ class FtrlOptimizerTest(XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-0.21931979, -0.40642974]), var0.eval(), rtol=1e-4) + np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4) self.assertAllCloseAccordingToType( - np.array([-0.0282721, -0.07188385]), var1.eval(), rtol=1e-4) + np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4) + + def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self): + """Verifies that l2 shrinkage in FTRL does not change lr schedule.""" + for dtype in self.float_types: + with self.test_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.2], dtype=dtype) + grads1 = constant_op.constant([0.1, 0.2], dtype=dtype) + + opt0 = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + 
l1_regularization_strength=0.001, + l2_regularization_strength=2.0, + l2_shrinkage_regularization_strength=0.1) + opt1 = ftrl.FtrlOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + update0 = opt0.apply_gradients([(grads0, var0)]) + update1 = opt1.apply_gradients([(grads1, var1)]) + variables.global_variables_initializer().run() + + self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval()) + + # Run 10 steps FTRL + for _ in range(10): + update0.run() + update1.run() + + # var0 is experiencing L2 shrinkage so it should be smaller than var1 + # in magnitude. + self.assertTrue((var0.eval()**2 < var1.eval()**2).all()) + accum0 = list(opt0._slots["accum"].values())[0].eval() + accum1 = list(opt1._slots["accum"].values())[0].eval() + # L2 shrinkage should not change how we update grad accumulator. + self.assertAllCloseAccordingToType(accum0, accum1) # When variables are initialized with Zero, FTRL-Proximal has two properties: # 1. 
Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical @@ -273,9 +316,9 @@ class FtrlOptimizerTest(XLATestCase): def testEquivAdagradwithoutRegularization(self): steps = 5 for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): val0, val1 = self.equivAdagradTest_FtrlPart(steps, dtype) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype) self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4, half_rtol=1e-2) @@ -284,9 +327,9 @@ class FtrlOptimizerTest(XLATestCase): def testEquivGradientDescentwithoutRegularization(self): steps = 5 for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): val0, val1 = self.equivGradientDescentTest_FtrlPart(steps, dtype) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): val2, val3 = self.equivGradientDescentTest_GradientDescentPart( steps, dtype) diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py index 8a3f4b0bdc7a61d6cfa2ba7474ce8579e293a5c7..b1891b918c6584abce9da382088ed0037f5319fb 100644 --- a/tensorflow/compiler/tests/function_test.py +++ b/tensorflow/compiler/tests/function_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import function @@ -28,7 +28,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest -class FunctionTest(XLATestCase): +class FunctionTest(xla_test.XLATestCase): def testFunction(self): """Executes a simple TensorFlow function.""" @@ -40,7 +40,7 @@ 
class FunctionTest(XLATestCase): bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32) expected = APlus2B(aval, bval) - with self.test_session() as sess: + with self.cached_session() as sess: @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -66,7 +66,7 @@ class FunctionTest(XLATestCase): bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32) expected = APlus2B(aval, bval) - with self.test_session() as sess: + with self.cached_session() as sess: @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -90,7 +90,7 @@ class FunctionTest(XLATestCase): bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32) expected = Func(aval, bval) - with self.test_session() as sess: + with self.cached_session() as sess: @function.Defun(dtypes.float32, dtypes.float32) def Foo(a, b): @@ -105,7 +105,7 @@ class FunctionTest(XLATestCase): def testCompileTimeConstantsInDefun(self): """Tests that XLA handles compile-time constants in defuns.""" - with self.test_session() as sess: + with self.cached_session() as sess: @function.Defun(dtypes.float32, dtypes.int32, dtypes.int32) def Foo(a, c, d): @@ -140,7 +140,7 @@ class FunctionTest(XLATestCase): bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32) expected = aval + bval * 2 - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): a = array_ops.placeholder(dtypes.float32, name="a") b = array_ops.placeholder(dtypes.float32, name="b") diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py index a80d69fa5f5099b8a8b67df0da9c92b957e9d194..8c018cccb83a05babb0b7f73b80b4f9de7267c98 100644 --- a/tensorflow/compiler/tests/fused_batchnorm_test.py +++ b/tensorflow/compiler/tests/fused_batchnorm_test.py @@ -18,9 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np 
-from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import test_utils +from tensorflow.compiler.tests import xla_test from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gradient_checker @@ -28,7 +30,7 @@ from tensorflow.python.ops import nn from tensorflow.python.platform import test -class FusedBatchNormTest(XLATestCase): +class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase): def _reference_training(self, x, scale, offset, epsilon, data_format): if data_format != "NHWC": @@ -63,24 +65,36 @@ class FusedBatchNormTest(XLATestCase): grad_offset = np.sum(grad_y, axis=(0, 1, 2)) return grad_x, grad_scale, grad_offset - def testInference(self): + @parameterized.named_parameters( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), + ("_data_format_HWNC", "HWNC"), + ("_data_format_HWCN", "HWCN"), + ) + def testInference(self, data_format): channel = 3 x_shape = [2, 2, 6, channel] scale_shape = [channel] x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) - offset_val = np.random.random_sample(scale_shape).astype(np.float32) - data_format = "NHWC" - with self.test_session() as sess, self.test_scope(): + epsilon = 0.001 + data_format_src = "NHWC" + y_ref, mean_ref, var_ref = self._reference_training( + x_val, scale_val, offset_val, epsilon, data_format_src) + + with self.cached_session() as sess, self.test_scope(): # To avoid constant folding - t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x") + x_val_converted = test_utils.ConvertBetweenDataFormats( + x_val, data_format_src, data_format) + y_ref_converted = test_utils.ConvertBetweenDataFormats( + y_ref, data_format_src, data_format) + + t_val = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="x") scale = array_ops.placeholder(np.float32, shape=scale_shape, 
name="scale") offset = array_ops.placeholder( np.float32, shape=scale_shape, name="offset") - epsilon = 0.001 - y_ref, mean_ref, var_ref = self._reference_training( - x_val, scale_val, offset_val, epsilon, data_format) y, mean, variance = nn.fused_batch_norm( t_val, scale, @@ -91,31 +105,39 @@ class FusedBatchNormTest(XLATestCase): data_format=data_format, is_training=False) - y_val, _, _ = sess.run( - [y, mean, - variance], {t_val: x_val, - scale: scale_val, - offset: offset_val}) - self.assertAllClose(y_val, y_ref, atol=1e-3) + y_val, _, _ = sess.run([y, mean, variance], { + t_val: x_val_converted, + scale: scale_val, + offset: offset_val + }) + self.assertAllClose(y_val, y_ref_converted, atol=1e-3) - def _testLearning(self, use_gradient_checker): + def _testLearning(self, use_gradient_checker, data_format): channel = 3 x_shape = [2, 2, 6, channel] scale_shape = [channel] x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) - offset_val = np.random.random_sample(scale_shape).astype(np.float32) mean_val = np.random.random_sample(scale_shape).astype(np.float32) var_val = np.random.random_sample(scale_shape).astype(np.float32) - data_format = "NHWC" - with self.test_session() as sess, self.test_scope(): + epsilon = 0.001 + data_format_src = "NHWC" + y_ref, mean_ref, var_ref = self._reference_training( + x_val, scale_val, offset_val, epsilon, data_format_src) + + with self.cached_session() as sess, self.test_scope(): # To avoid constant folding - t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x") + x_val_converted = test_utils.ConvertBetweenDataFormats( + x_val, data_format_src, data_format) + y_ref_converted = test_utils.ConvertBetweenDataFormats( + y_ref, data_format_src, data_format) + + t_val = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="x") scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale") offset = array_ops.placeholder( 
np.float32, shape=scale_shape, name="offset") - epsilon = 0.001 y, mean, var = nn.fused_batch_norm( t_val, scale, @@ -129,33 +151,50 @@ class FusedBatchNormTest(XLATestCase): if use_gradient_checker: err = gradient_checker.compute_gradient_error( t_val, - x_shape, + x_val_converted.shape, y, - x_shape, + x_val_converted.shape, extra_feed_dict={ - t_val: x_val, + t_val: x_val_converted, scale: scale_val, offset: offset_val }) self.assertLess(err, 1e-3) - y_val, mean_val, var_val = sess.run( - [y, mean, var], {t_val: x_val, - scale: scale_val, - offset: offset_val}) - y_ref, mean_ref, var_ref = self._reference_training( - x_val, scale_val, offset_val, epsilon, data_format) + y_val, mean_val, var_val = sess.run([y, mean, var], { + t_val: x_val_converted, + scale: scale_val, + offset: offset_val + }) self.assertAllClose(mean_val, mean_ref, atol=1e-3) - self.assertAllClose(y_val, y_ref, atol=1e-3) + self.assertAllClose(y_val, y_ref_converted, atol=1e-3) self.assertAllClose(var_val, var_ref, atol=1e-3) - def testLearning(self): - self._testLearning(False) + @parameterized.named_parameters( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), + ("_data_format_HWNC", "HWNC"), + ("_data_format_HWCN", "HWCN"), + ) + def testLearning(self, data_format): + self._testLearning(False, data_format) - def testLearningWithGradientChecker(self): - self._testLearning(True) + @parameterized.named_parameters( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), + ("_data_format_HWNC", "HWNC"), + ("_data_format_HWCN", "HWCN"), + ) + def testLearningWithGradientChecker(self, data_format): + self._testLearning(True, data_format) - def testGradientTraining(self): + @parameterized.named_parameters( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), + ("_data_format_HWNC", "HWNC"), + ("_data_format_HWCN", "HWCN"), + ) + def testGradientTraining(self, data_format): # TODO(b/64270657): Use gradient_checker here in addition to comparing with # this 
reference implementation. channel = 3 @@ -167,33 +206,48 @@ class FusedBatchNormTest(XLATestCase): mean_val = np.random.random_sample(scale_shape).astype(np.float32) var_val = np.random.random_sample(scale_shape).astype(np.float32) epsilon = 0.001 + data_format_src = "NHWC" + grad_x_ref, grad_scale_ref, grad_offset_ref = self._reference_grad( + x_val, grad_val, scale_val, mean_val, var_val, epsilon, data_format_src) + + with self.cached_session() as sess, self.test_scope(): + grad_val_converted = test_utils.ConvertBetweenDataFormats( + grad_val, data_format_src, data_format) + x_val_converted = test_utils.ConvertBetweenDataFormats( + x_val, data_format_src, data_format) + grad_x_ref_converted = test_utils.ConvertBetweenDataFormats( + grad_x_ref, data_format_src, data_format) - with self.test_session() as sess, self.test_scope(): - grad = array_ops.placeholder(np.float32, shape=x_shape, name="grad") - x = array_ops.placeholder(np.float32, shape=x_shape, name="x") + grad = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="grad") + x = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="x") mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean") var = array_ops.placeholder(np.float32, shape=scale_shape, name="var") scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale") grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad( - grad, x, scale, mean, var, data_format="NHWC", is_training=True) + grad, x, scale, mean, var, data_format=data_format, is_training=True) grad_x_val, grad_scale_val, grad_offset_val = sess.run( [grad_x, grad_scale, grad_offset], { - grad: grad_val, - x: x_val, + grad: grad_val_converted, + x: x_val_converted, mean: mean_val, var: var_val, scale: scale_val }) - grad_x_ref, grad_scale_ref, grad_offset_ref = self._reference_grad( - x_val, grad_val, scale_val, mean_val, var_val, epsilon, "NHWC") - - self.assertAllClose(grad_x_val, grad_x_ref, atol=1e-2) + 
self.assertAllClose(grad_x_val, grad_x_ref_converted, atol=1e-2) self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2) self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3) - def testGradientInference(self): + @parameterized.named_parameters( + ("_data_format_NHWC", "NHWC"), + ("_data_format_NCHW", "NCHW"), + ("_data_format_HWNC", "HWNC"), + ("_data_format_HWCN", "HWCN"), + ) + def testGradientInference(self, data_format): # TODO(b/64270657): Use gradient_checker here in addition to comparing with # this reference implementation. channel = 3 @@ -204,33 +258,47 @@ class FusedBatchNormTest(XLATestCase): scale_val = np.random.random_sample(scale_shape).astype(np.float32) mean_val = np.random.random_sample(scale_shape).astype(np.float32) var_val = np.random.random_sample(scale_shape).astype(np.float32) + data_format_src = "NHWC" + + with self.cached_session() as sess, self.test_scope(): + grad_val_converted = test_utils.ConvertBetweenDataFormats( + grad_val, data_format_src, data_format) + x_val_converted = test_utils.ConvertBetweenDataFormats( + x_val, data_format_src, data_format) - with self.test_session() as sess, self.test_scope(): - grad = array_ops.placeholder(np.float32, shape=x_shape, name="grad") - x = array_ops.placeholder(np.float32, shape=x_shape, name="x") + grad = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="grad") + x = array_ops.placeholder( + np.float32, shape=x_val_converted.shape, name="x") mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean") var = array_ops.placeholder(np.float32, shape=scale_shape, name="var") scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale") with self.test_scope(): out = gen_nn_ops.fused_batch_norm_grad( - grad, x, scale, mean, var, data_format="NHWC", is_training=False) + grad, + x, + scale, + mean, + var, + data_format=data_format, + is_training=False) grad_x, grad_scale, grad_offset, _, _ = out ref_x, ref_scale, ref_offset, _, _ = 
gen_nn_ops.fused_batch_norm_grad( - grad, x, scale, mean, var, data_format="NHWC", is_training=False) + grad, x, scale, mean, var, data_format=data_format, is_training=False) grad_x_val, grad_scale_val, grad_offset_val, = sess.run( [grad_x, grad_scale, grad_offset], { - grad: grad_val, - x: x_val, + grad: grad_val_converted, + x: x_val_converted, mean: mean_val, var: var_val, scale: scale_val }) grad_x_ref, grad_scale_ref, grad_offset_ref, = sess.run( [ref_x, ref_scale, ref_offset], { - grad: grad_val, - x: x_val, + grad: grad_val_converted, + x: x_val_converted, mean: mean_val, var: var_val, scale: scale_val diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py index 9378b1db7245c0da3e8298e7dcd972491616b0cd..7161f4ab339b6f4069dd2b02ddbc6a89973e0074 100644 --- a/tensorflow/compiler/tests/gather_nd_op_test.py +++ b/tensorflow/compiler/tests/gather_nd_op_test.py @@ -20,16 +20,16 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class GatherNdTest(XLATestCase): +class GatherNdTest(xla_test.XLATestCase): def _runGather(self, params, indices): - with self.test_session(): + with self.cached_session(): paramsp = array_ops.placeholder(params.dtype) indicesp = array_ops.placeholder(indices.dtype) with self.test_scope(): @@ -46,7 +46,7 @@ class GatherNdTest(XLATestCase): np.array([[4], [4], [0]], np.int32))) def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self): - with self.test_session(): + with self.cached_session(): params = np.ones((3, 3), dtype=np.float32) indices_empty = np.empty((0, 2), dtype=np.int32) diff --git a/tensorflow/compiler/tests/gather_test.py b/tensorflow/compiler/tests/gather_test.py index 
1a8c4519118f69ce51ca9a5eb95a9d706c7766cc..089d95daab7e502b4ba13796fadc2ba3f209759b 100644 --- a/tensorflow/compiler/tests/gather_test.py +++ b/tensorflow/compiler/tests/gather_test.py @@ -42,7 +42,7 @@ class GatherTest(xla_test.XLATestCase): return data def testScalar1D(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): data = np.array([0, 1, 2, 3, 7, 5]) for dtype in self.all_tf_types: for indices in 4, [4], [1, 2, 2, 4, 5]: @@ -55,7 +55,7 @@ class GatherTest(xla_test.XLATestCase): self.assertAllEqual(np_val, gather_val) def testScalar2D(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14]]) for dtype in self.all_tf_types: @@ -69,7 +69,7 @@ class GatherTest(xla_test.XLATestCase): self.assertAllEqual(expected, gather_val) def testSimpleTwoD32(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14]]) for dtype in self.all_tf_types: @@ -87,7 +87,7 @@ class GatherTest(xla_test.XLATestCase): if np.int64 not in self.int_types: return - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14]]) # The indices must be in bounds for any axis. 
@@ -114,7 +114,7 @@ class GatherTest(xla_test.XLATestCase): for axis in 0, 1, 2, 3, -1, -2: params = self._buildParams(np.random.randn(*shape), dtype) indices = np.random.randint(shape[axis], size=indices_shape) - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): tf_params = array_ops.placeholder(dtype=dtype) tf_indices = constant_op.constant(indices, dtype=dtypes.int32) gather = array_ops.gather(tf_params, tf_indices, axis=axis) @@ -123,7 +123,7 @@ class GatherTest(xla_test.XLATestCase): self.assertAllEqual(gather_np, gather_value) def testIndicesWithDifferentDimensions(self): - with self.test_session(): + with self.cached_session(): for dtype in self.numeric_tf_types: params = array_ops.placeholder(dtype=dtype) indices = array_ops.placeholder(dtype=np.int32) @@ -136,6 +136,20 @@ class GatherTest(xla_test.XLATestCase): self.assertAllEqual( [[7]], gather.eval(feed_dict={params: [4, 7, 2], indices: [[1]]})) + def testGatherPrecision(self): + with self.cached_session() as session, self.test_scope(): + data = np.array([[0, 0, 0, 0], [0, 2 * (1 + np.exp2(-8)), 0, 0], + [0, 0, 0, 0], [0.015789, 0.0985, 0.55789, 0.3842]]) + indices = np.array([1, 2, 3, 1]) + dtype = dtypes.float32 + params_np = self._buildParams(data, dtype) + params = array_ops.placeholder(dtype=dtype) + indices_tf = constant_op.constant(indices) + gather_t = array_ops.gather(params, indices_tf) + gather_val = session.run(gather_t, feed_dict={params: params_np}) + np_val = params_np[indices] + self.assertAllEqual(np_val, gather_val) + class GatherBenchmark(test.Benchmark): """Microbenchmarks for the gather op.""" diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 42e637734c578fcc70473060cb156e172a0a1995..6fe5a66e0e6717ec738dded9196eef6ba1e2114d 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -25,7 +25,8 @@ import numpy as np 
from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test +from tensorflow.python.compat import compat from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -41,7 +42,7 @@ def GenerateNumpyRandomRGB(shape): return np.random.randint(0, 256, shape) / 256. -class RGBToHSVTest(XLATestCase): +class RGBToHSVTest(xla_test.XLATestCase): def testBatch(self): # Build an arbitrary RGB image @@ -53,7 +54,7 @@ class RGBToHSVTest(XLATestCase): inp = GenerateNumpyRandomRGB(shape).astype(nptype) # Convert to HSV and back, as a batch and individually - with self.test_session() as sess: + with self.cached_session() as sess: batch0 = array_ops.placeholder(nptype, shape=shape) with self.test_scope(): batch1 = image_ops.rgb_to_hsv(batch0) @@ -65,9 +66,7 @@ class RGBToHSVTest(XLATestCase): join1 = array_ops.stack(split1) join2 = array_ops.stack(split2) batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2], - { - batch0: inp - }) + {batch0: inp}) # Verify that processing batch elements together is the same as separate self.assertAllClose(batch1, join1) @@ -79,7 +78,7 @@ class RGBToHSVTest(XLATestCase): data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] for nptype in self.float_types: rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255. 
- with self.test_session(): + with self.cached_session(): placeholder = array_ops.placeholder(nptype) with self.test_scope(): hsv = image_ops.rgb_to_hsv(placeholder) @@ -98,7 +97,7 @@ class RGBToHSVTest(XLATestCase): for r, g, b in rgb_flat ]) hsv_np = hsv_np.reshape(4, 4, 4, 3) - with self.test_session(): + with self.cached_session(): placeholder = array_ops.placeholder(nptype) with self.test_scope(): hsv_op = image_ops.rgb_to_hsv(placeholder) @@ -106,10 +105,10 @@ class RGBToHSVTest(XLATestCase): self.assertAllCloseAccordingToType(hsv_tf, hsv_np) -class AdjustContrastTest(XLATestCase): +class AdjustContrastTest(xla_test.XLATestCase): def _testContrast(self, x_np, y_np, contrast_factor): - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_np.shape) flt_x = image_ops.convert_image_dtype(x, dtypes.float32) with self.test_scope(): @@ -147,7 +146,7 @@ class AdjustContrastTest(XLATestCase): return y_np def _adjustContrastTf(self, x_np, contrast_factor): - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(np.float32) with self.test_scope(): y = image_ops.adjust_contrast(x, contrast_factor) @@ -170,7 +169,7 @@ class AdjustContrastTest(XLATestCase): self.assertAllClose(y_tf, y_np, rtol=1e-5, atol=1e-5) -class AdjustHueTest(XLATestCase): +class AdjustHueTest(xla_test.XLATestCase): def testAdjustNegativeHue(self): x_shape = [2, 2, 3] @@ -181,7 +180,7 @@ class AdjustHueTest(XLATestCase): y_data = [0, 13, 1, 54, 226, 59, 8, 234, 150, 255, 39, 1] y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape) - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_shape) flt_x = image_ops.convert_image_dtype(x, dtypes.float32) with self.test_scope(): @@ -199,7 +198,7 @@ class AdjustHueTest(XLATestCase): y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255] y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape) - with self.test_session(): + with 
self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_shape) flt_x = image_ops.convert_image_dtype(x, dtypes.float32) with self.test_scope(): @@ -217,7 +216,7 @@ class AdjustHueTest(XLATestCase): y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255] y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape) - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_shape) flt_x = image_ops.convert_image_dtype(x, dtypes.float32) with self.test_scope(): @@ -245,7 +244,7 @@ class AdjustHueTest(XLATestCase): return y_v.reshape(x_np.shape) def _adjustHueTf(self, x_np, delta_h): - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(dtypes.float32) with self.test_scope(): y = gen_image_ops.adjust_hue(x, delta_h) @@ -305,7 +304,7 @@ class AdjustHueTest(XLATestCase): self._adjustHueTf(x_np, delta_h) -class AdjustSaturationTest(XLATestCase): +class AdjustSaturationTest(xla_test.XLATestCase): def _adjust_saturation(self, image, saturation_factor): image = ops.convert_to_tensor(image, name="image") @@ -325,7 +324,7 @@ class AdjustSaturationTest(XLATestCase): y_rgb_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128] y_np = np.array(y_rgb_data, dtype=np.uint8).reshape(x_shape) - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_shape) y = self._adjust_saturation(x, saturation_factor) y_tf = y.eval({x: x_np}) @@ -340,7 +339,7 @@ class AdjustSaturationTest(XLATestCase): y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0] y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape) - with self.test_session(): + with self.cached_session(): x = array_ops.placeholder(x_np.dtype, shape=x_shape) y = self._adjust_saturation(x, saturation_factor) y_tf = y.eval({x: x_np}) @@ -379,7 +378,7 @@ class AdjustSaturationTest(XLATestCase): "gb_same", "rgb_same", ] - with self.test_session(): + with self.cached_session(): for x_shape in 
x_shapes: for test_style in test_styles: x_np = np.random.rand(*x_shape) * 255. @@ -401,26 +400,30 @@ class AdjustSaturationTest(XLATestCase): x = array_ops.placeholder(dtypes.float32, shape=x_shape) with self.test_scope(): y_fused = self._adjust_saturation(x, - scale).eval(feed_dict={ - x: x_np - }) + scale).eval(feed_dict={x: x_np}) self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5) -class ResizeBilinearTest(XLATestCase): +class ResizeBilinearTest(xla_test.XLATestCase): def _assertForwardOpMatchesExpected(self, image_np, target_shape, - expected=None): + expected=None, + large_tolerance=False, + align_corners=True): if expected is None: self.fail("expected must be specified") - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): image = array_ops.placeholder(image_np.dtype) resized = gen_image_ops.resize_bilinear( - image, target_shape, align_corners=True) + image, target_shape, align_corners=align_corners) out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]}) - self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out) + if large_tolerance: + self.assertAllClose( + expected[np.newaxis, :, :, np.newaxis], out, rtol=0.03, atol=0.1) + else: + self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out) def _assertBackwardOpMatchesExpected(self, grads_np, @@ -431,7 +434,7 @@ class ResizeBilinearTest(XLATestCase): self.fail("input_shape must be specified") if expected is None: self.fail("expected must be specified") - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): dtype = dtype or np.float32 grads = array_ops.placeholder(np.float32) resized = gen_image_ops.resize_bilinear_grad( @@ -555,6 +558,184 @@ class ResizeBilinearTest(XLATestCase): [[12.5, 27.5, 21.875], [42.5, 80.0, 57.5], [40.625, 72.5, 50]], dtype=np.float32)) + def testAlignCorners4x4To8x8(self): + self._assertForwardOpMatchesExpected( + 
(np.array([[0, 1, 2, 3]], dtype=np.float32) + np.array( + [[0], [1], [2], [3]], dtype=np.float32)) * 7.0, [8, 8], + expected=3 * + (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)), + large_tolerance=True) + + def testAlignCorners8x8To16x16(self): + self._assertForwardOpMatchesExpected( + (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)) * 15.0, + [16, 16], + expected=7 * (np.array( + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]], + dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], + [12], [13], [14], [15]], + dtype=np.float32)), + large_tolerance=True) + + def testNonAlignCorners3x2To6x4(self): + input_data = [[64, 32], [32, 64], [50, 100]] + expected_data = [[64.0, 48.0, 32.0, 32.0], [48.0, 48.0, 48.0, 48.0], + [32.0, 48.0, 64.0, 64.0], [41.0, 61.5, 82.0, 82.0], + [50.0, 75.0, 100.0, 100.0], [50.0, 75.0, 100.0, 100.0]] + for dtype in self.float_types: + self._assertForwardOpMatchesExpected( + np.array(input_data, dtype=dtype), [6, 4], + expected=np.array(expected_data, dtype=np.float32), + align_corners=False) + + def testNonAlignCorners6x4To3x2(self): + input_data = [[127, 127, 64, 64], [127, 127, 64, 64], [64, 64, 127, 127], + [64, 64, 127, 127], [50, 50, 100, 100], [50, 50, 100, 100]] + expected_data = [[127, 64], [64, 127], [50, 100]] + for dtype in self.float_types: + self._assertForwardOpMatchesExpected( + np.array(input_data, dtype=dtype), [3, 2], + expected=np.array(expected_data, dtype=dtype), + align_corners=False) + + +class NonMaxSuppressionTest(xla_test.XLATestCase): + + def testNMS128From1024(self): + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. 
+ if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + with compat.forward_compatibility_horizon(2018, 8, 8): + num_boxes = 1024 + boxes_np = np.random.normal(50, 10, (num_boxes, 4)).astype("f4") + scores_np = np.random.normal(0.5, 0.1, (num_boxes,)).astype("f4") + + max_output_size = 128 + iou_threshold_np = np.array(0.5, dtype=np.float32) + score_threshold_np = np.array(0.0, dtype=np.float32) + + with self.cached_session() as sess: + boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape) + scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape) + iou_threshold = array_ops.placeholder(iou_threshold_np.dtype, + iou_threshold_np.shape) + score_threshold = array_ops.placeholder(score_threshold_np.dtype, + score_threshold_np.shape) + with self.test_scope(): + selected_indices = image_ops.non_max_suppression_padded( + boxes=boxes, + scores=scores, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + pad_to_max_output_size=True) + inputs_feed = { + boxes: boxes_np, + scores: scores_np, + score_threshold: score_threshold_np, + iou_threshold: iou_threshold_np + } + (indices_tf, _) = sess.run(selected_indices, feed_dict=inputs_feed) + + self.assertEqual(indices_tf.size, max_output_size) + + def testNMS3From6Boxes(self): + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. + if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + with compat.forward_compatibility_horizon(2018, 8, 8): + # Three boxes are selected based on IOU. 
+ boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], + [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]] + boxes_np = np.array(boxes_data, dtype=np.float32) + + scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3] + scores_np = np.array(scores_data, dtype=np.float32) + + max_output_size = 3 + iou_threshold_np = np.array(0.5, dtype=np.float32) + score_threshold_np = np.array(0.0, dtype=np.float32) + + with self.cached_session() as sess: + boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape) + scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape) + iou_threshold = array_ops.placeholder(iou_threshold_np.dtype, + iou_threshold_np.shape) + score_threshold = array_ops.placeholder(score_threshold_np.dtype, + score_threshold_np.shape) + with self.test_scope(): + selected_indices = image_ops.non_max_suppression_padded( + boxes=boxes, + scores=scores, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + pad_to_max_output_size=True) + inputs_feed = { + boxes: boxes_np, + scores: scores_np, + score_threshold: score_threshold_np, + iou_threshold: iou_threshold_np + } + (indices_tf, num_valid) = sess.run( + selected_indices, feed_dict=inputs_feed) + + self.assertEqual(indices_tf.size, max_output_size) + self.assertEqual(num_valid, 3) + self.assertAllClose(indices_tf[:num_valid], [3, 0, 5]) + + def testNMS3Then2WithScoreThresh(self): + # Three boxes are selected based on IOU. + # One is filtered out by score threshold. + + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. 
+ if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + with compat.forward_compatibility_horizon(2018, 8, 8): + boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], + [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]] + boxes_np = np.array(boxes_data, dtype=np.float32) + + scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3] + scores_np = np.array(scores_data, dtype=np.float32) + max_output_size = 3 + iou_threshold_np = np.array(0.5, dtype=np.float32) + score_threshold_np = np.array(0.4, dtype=np.float32) + + with self.cached_session() as sess: + boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape) + scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape) + iou_threshold = array_ops.placeholder(iou_threshold_np.dtype, + iou_threshold_np.shape) + score_threshold = array_ops.placeholder(score_threshold_np.dtype, + score_threshold_np.shape) + with self.test_scope(): + selected_indices = image_ops.non_max_suppression_padded( + boxes=boxes, + scores=scores, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + pad_to_max_output_size=True) + inputs_feed = { + boxes: boxes_np, + scores: scores_np, + iou_threshold: iou_threshold_np, + score_threshold: score_threshold_np + } + (indices_tf, num_valid) = sess.run( + selected_indices, feed_dict=inputs_feed) + + self.assertEqual(indices_tf.size, max_output_size) + self.assertEqual(num_valid, 2) + self.assertAllClose(indices_tf[:num_valid], [3, 0]) + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py index 45a04f0cf56e88946b946bedacb25ce6da3121b4..58622114e4f552fb71db9b040a39b57d7da0037c 100644 --- a/tensorflow/compiler/tests/listdiff_op_test.py +++ b/tensorflow/compiler/tests/listdiff_op_test.py @@ -33,7 +33,7 @@ class ListDiffTest(xla_test.XLATestCase): def _testListDiff(self, x, y, out, idx): for dtype in [dtypes.int32, dtypes.int64]: for 
index_dtype in [dtypes.int32, dtypes.int64]: - with self.test_session() as sess: + with self.cached_session() as sess: x_tensor = ops.convert_to_tensor(x, dtype=dtype) y_tensor = ops.convert_to_tensor(y, dtype=dtype) with self.test_scope(): diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py index 69bd8f7230d4394c45764d02a88fb0ec097c5756..c6ad67993e8bc196a74c9a328df8c9200c92c575 100644 --- a/tensorflow/compiler/tests/lrn_ops_test.py +++ b/tensorflow/compiler/tests/lrn_ops_test.py @@ -22,7 +22,7 @@ import copy import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -36,7 +36,7 @@ CPU_DEVICE = "/job:localhost/replica:0/task:0/cpu:0" # Local response normalization tests. The forward tests are copied from # tensorflow/python/kernel_tests/lrn_op_test.py -class LRNTest(XLATestCase): +class LRNTest(xla_test.XLATestCase): def _LRN(self, input_image, lrn_depth_radius=5, bias=1.0, alpha=1.0, beta=0.5): @@ -58,7 +58,7 @@ class LRNTest(XLATestCase): return output def _RunAndVerify(self, dtype): - with self.test_session(): + with self.cached_session(): # random shape shape = np.random.randint(1, 16, size=4) # Make depth at least 2 to make it meaningful @@ -110,7 +110,7 @@ class LRNTest(XLATestCase): alpha = 1.0 * np.random.rand() beta = 1.0 * np.random.rand() - with self.test_session(): + with self.cached_session(): in_image = constant_op.constant(in_image_vals, shape=shape) out_image = constant_op.constant(out_image_vals, shape=shape) out_grads = constant_op.constant(out_grads_vals, shape=shape) diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py index 31093c65713df55390c3130b8654fdcb10fbc133..265c0b6d1412de7be3a5bf5e79129cb330ceb162 100644 --- 
a/tensorflow/compiler/tests/lstm_test.py +++ b/tensorflow/compiler/tests/lstm_test.py @@ -73,7 +73,7 @@ class LSTMTest(test.TestCase): def _RunLSTMCell(self, basename, init_weights, m_prev_scalar, c_prev_scalar, pad_scalar): - with self.test_session() as sess: + with self.cached_session() as sess: num_inputs = 1 num_nodes = 1 @@ -156,7 +156,7 @@ class LSTMTest(test.TestCase): def _RunLSTMLayer(self, basename, init_weights, m_init_scalar, c_init_scalar, pad_scalar): - with self.test_session() as sess: + with self.cached_session() as sess: num_inputs = 1 num_nodes = 1 seq_length = 3 diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py index 29394f9ea5139b30f88f53de0469b27e37d79195..9222db4b7ebf020c8cee1c0af81e05129fb33c4d 100644 --- a/tensorflow/compiler/tests/matrix_band_part_test.py +++ b/tensorflow/compiler/tests/matrix_band_part_test.py @@ -19,17 +19,17 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class MatrixBandPartTest(XLATestCase): +class MatrixBandPartTest(xla_test.XLATestCase): def _testMatrixBandPart(self, dtype, shape): - with self.test_session(): + with self.cached_session(): batch_shape = shape[:-2] mat = np.ones(shape).astype(dtype) batch_mat = np.tile(mat, batch_shape + [1, 1]) diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py index 5819b2bf2b55b9213a039c0ba82dd0bf1c738b00..94cd3eeb3179da9b920ea9f03216d602b042a639 100644 --- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -22,7 +22,7 @@ import itertools import 
numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops @@ -35,7 +35,7 @@ def MakePlaceholder(x): return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape) -class MatrixTriangularSolveOpTest(XLATestCase): +class MatrixTriangularSolveOpTest(xla_test.XLATestCase): # MatrixTriangularSolve defined for float64, float32, complex64, complex128 # (https://www.tensorflow.org/api_docs/python/tf/matrix_triangular_solve) @@ -54,7 +54,7 @@ class MatrixTriangularSolveOpTest(XLATestCase): def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol): clean_a = np.tril(a) if lower else np.triu(a) - with self.test_session() as sess: + with self.cached_session() as sess: placeholder_a = MakePlaceholder(a) placeholder_ca = MakePlaceholder(clean_a) placeholder_b = MakePlaceholder(b) diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py index af9394e7d7dc9cf7dd009420ff9c845aec8785bd..f77521a7c49dba39849869ddceb7c0e885147722 100644 --- a/tensorflow/compiler/tests/momentum_test.py +++ b/tensorflow/compiler/tests/momentum_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops @@ -30,7 +30,7 @@ from tensorflow.python.platform import test from tensorflow.python.training import momentum as momentum_lib -class MomentumOptimizerTest(XLATestCase): +class MomentumOptimizerTest(xla_test.XLATestCase): def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum): var += accum * lr * momentum @@ -41,7 +41,7 @@ class MomentumOptimizerTest(XLATestCase): def 
testBasic(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) @@ -95,7 +95,7 @@ class MomentumOptimizerTest(XLATestCase): def testNesterovMomentum(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype) var0_np = np.array([0.1, 0.2], dtype=dtype) @@ -120,7 +120,7 @@ class MomentumOptimizerTest(XLATestCase): def testTensorLearningRateAndMomentum(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py index e4843b169b943b63346b783ddc50039030988ca5..a1c07fce732d3b91a7c0550545a03fdab67644d3 100644 --- a/tensorflow/compiler/tests/nary_ops_test.py +++ b/tensorflow/compiler/tests/nary_ops_test.py @@ -22,17 +22,17 @@ import unittest import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest -class NAryOpsTest(XLATestCase): +class NAryOpsTest(xla_test.XLATestCase): def _testNAry(self, op, args, expected, equality_fn=None): - with self.test_session() as session: + with 
self.cached_session() as session: with self.test_scope(): placeholders = [ array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape) @@ -126,7 +126,7 @@ class NAryOpsTest(XLATestCase): [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], dtype=np.float32)) def testOneHot(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): indices = array_ops.constant(np.array([[2, 3], [0, 1]], dtype=np.int32)) op = array_ops.one_hot(indices, np.int32(4), @@ -148,7 +148,7 @@ class NAryOpsTest(XLATestCase): self.assertAllEqual(output, expected) def testSplitV(self): - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): output = session.run( array_ops.split(np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 0, 1, 2]], diff --git a/tensorflow/compiler/tests/nullary_ops_test.py b/tensorflow/compiler/tests/nullary_ops_test.py index 6f588d8ab562cb24f33c4c2987df22264aede027..f985c5d2d96e06fc0117f3935d61b19c9e8562b1 100644 --- a/tensorflow/compiler/tests/nullary_ops_test.py +++ b/tensorflow/compiler/tests/nullary_ops_test.py @@ -20,23 +20,23 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.ops import control_flow_ops from tensorflow.python.platform import googletest -class NullaryOpsTest(XLATestCase): +class NullaryOpsTest(xla_test.XLATestCase): def _testNullary(self, op, expected): - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): output = op() result = session.run(output) self.assertAllClose(result, expected, rtol=1e-3) def testNoOp(self): - with self.test_session(): + with self.cached_session(): with self.test_scope(): output = control_flow_ops.no_op() # This should not crash. 
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py index d68d32057a367776d5b70d5ac21d5618297c605d..7635f89249b7b71e5353e0b7cb1cea5c1f7bca1d 100644 --- a/tensorflow/compiler/tests/oom_test.py +++ b/tensorflow/compiler/tests/oom_test.py @@ -46,7 +46,7 @@ class OutOfMemoryTest(xla_test.XLATestCase): def test_loop(): size = int(2e8) while True: - with self.test_session(): + with self.cached_session(): # Force the compiled code to not be constant by feeding in a # parameter. p = array_ops.placeholder(dtypes.float32, shape=[2, 1, 1]) diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py index 5e6d1313bd0336eba71fcf3658d949bd3342ae11..77bb839409f0c323ff6ed2c8d6bd105d3003b398 100644 --- a/tensorflow/compiler/tests/placeholder_test.py +++ b/tensorflow/compiler/tests/placeholder_test.py @@ -18,17 +18,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.ops import array_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.platform import googletest -class PlaceholderTest(XLATestCase): +class PlaceholderTest(xla_test.XLATestCase): def test_placeholder_with_default_default(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): v = resource_variable_ops.ResourceVariable(4.0) ph = array_ops.placeholder_with_default(v, shape=[]) out = ph * 2 @@ -36,7 +36,7 @@ class PlaceholderTest(XLATestCase): self.assertEqual(8.0, sess.run(out)) def test_placeholder_with_default_fed(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): v = resource_variable_ops.ResourceVariable(4.0) ph = 
array_ops.placeholder_with_default(v, shape=[]) out = ph * 2 diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py index 4eed903963a34a253ea5c409782d9a89a97a4fdf..b6cdd38345b9a9f6b03e8799587e3f6ffe07b407 100644 --- a/tensorflow/compiler/tests/pooling_ops_3d_test.py +++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -41,7 +41,7 @@ def _AvgPoolGrad(inputs, outputs, output_gradients, ksize, strides, padding): padding=padding) -class Pooling3DTest(XLATestCase): +class Pooling3DTest(xla_test.XLATestCase): def _VerifyValues(self, pool_func, input_sizes, window, strides, padding, expected): @@ -62,7 +62,7 @@ class Pooling3DTest(XLATestCase): # numbers from 1. x = np.arange(1.0, total_size + 1, dtype=np.float32) x = x.reshape(input_sizes) - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): inputs = array_ops.placeholder(dtypes.float32) t = pool_func( inputs, @@ -187,8 +187,14 @@ class Pooling3DTest(XLATestCase): padding="VALID", expected=[29.5, 32.5, 50.5, 53.5, 176.5, 179.5, 197.5, 200.5]) - def _VerifyGradient(self, pool_func, pool_grad_func, input_sizes, ksize, - strides, padding): + def _VerifyGradient(self, + pool_func, + pool_grad_func, + input_sizes, + ksize, + strides, + padding, + pool_grad_grad_func=None): """Verifies the output values of the pooling gradient function. Args: @@ -198,12 +204,13 @@ class Pooling3DTest(XLATestCase): ksize: The kernel size dimensions strides: The stride dimensions padding: Padding type. + pool_grad_grad_func: Second-order gradient function, if available. 
""" ksize = [1] + ksize + [1] strides = [1] + strides + [1] total_size = np.prod(input_sizes) x = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_sizes) - with self.test_session() as sess: + with self.cached_session() as sess: # Use the forward pool function to compute some corresponding outputs # (needed for the CPU device, and we need the shape in both cases). with ops.device("CPU"): @@ -218,6 +225,8 @@ class Pooling3DTest(XLATestCase): output_gradient_vals = np.arange( 1, output_vals.size + 1, dtype=np.float32) output_gradient_vals = output_gradient_vals.reshape(output_vals.shape) + output_grad_grad_vals = np.arange(1, x.size + 1, dtype=np.float32) + output_grad_grad_vals = output_grad_grad_vals.reshape(x.shape) # Use the Tensorflow CPU pooling gradient to compute the expected input # gradients. @@ -236,6 +245,22 @@ class Pooling3DTest(XLATestCase): {inputs: x, output_gradients: output_gradient_vals}) + output_grad_gradients = array_ops.placeholder( + dtypes.float32, shape=expected_input_gradient_vals.shape) + if pool_grad_grad_func is not None: + expected_grad_gradients = pool_grad_grad_func( + inputs, + outputs, + output_grad_gradients, + ksize=ksize, + strides=strides, + padding=padding, + data_format="NDHWC") + expected_grad_gradients_vals = sess.run(expected_grad_gradients, { + inputs: x, + output_grad_gradients: output_grad_grad_vals + }) + # Run the gradient op on the XLA device with self.test_scope(): outputs = array_ops.placeholder(dtypes.float32, shape=output_vals.shape) @@ -246,6 +271,16 @@ class Pooling3DTest(XLATestCase): ksize=ksize, strides=strides, padding=padding) + if pool_grad_grad_func is not None: + actual_grad_gradients = pool_grad_grad_func( + inputs, + outputs, + output_grad_gradients, + ksize=ksize, + strides=strides, + padding=padding, + data_format="NDHWC") + actual = sess.run(actual_input_gradients, { inputs: x, outputs: output_vals, @@ -260,6 +295,22 @@ class Pooling3DTest(XLATestCase): atol=1e-6) 
self.assertShapeEqual(actual, inputs) + if pool_grad_grad_func is not None: + actual_grad_gradients_vals = sess.run( + actual_grad_gradients, { + inputs: x, + outputs: output_vals, + output_grad_gradients: output_grad_grad_vals + }) + + # Compare the Tensorflow and XLA results. + self.assertAllClose( + expected_grad_gradients_vals, + actual_grad_gradients_vals, + rtol=1e-4, + atol=1e-6) + self.assertShapeEqual(actual_grad_gradients_vals, outputs) + def testMaxPoolGradValidPadding1_1_3d(self): self._VerifyGradient( nn_ops.max_pool3d, @@ -267,7 +318,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[1, 3, 3, 3, 1], ksize=[1, 1, 1], strides=[1, 1, 1], - padding="VALID") + padding="VALID", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradValidPadding2_1_6_3d(self): self._VerifyGradient( @@ -276,9 +328,13 @@ class Pooling3DTest(XLATestCase): input_sizes=[2, 3, 3, 6, 3], ksize=[2, 2, 2], strides=[1, 1, 1], - padding="VALID") + padding="VALID", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradValidPadding2_1_7_3d(self): + # TODO(b/73062247): the bfloat16 implementation of MaxPool3DGradGrad does + # not have enough precision for this test case to pass if + # pool_grad_grad_func is passed. 
self._VerifyGradient( nn_ops.max_pool3d, gen_nn_ops.max_pool3d_grad, @@ -294,7 +350,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[2, 2, 2, 2, 3], ksize=[2, 2, 2], strides=[2, 2, 2], - padding="VALID") + padding="VALID", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradSamePadding1_1_3d(self): self._VerifyGradient( @@ -303,7 +360,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[2, 3, 2, 4, 1], ksize=[1, 1, 1], strides=[1, 1, 1], - padding="SAME") + padding="SAME", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradSamePadding2_1_3d(self): self._VerifyGradient( @@ -312,7 +370,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[2, 3, 2, 4, 1], ksize=[2, 2, 2], strides=[1, 1, 1], - padding="SAME") + padding="SAME", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradSamePadding2_2_3d(self): self._VerifyGradient( @@ -321,7 +380,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[2, 5, 2, 4, 3], ksize=[2, 2, 2], strides=[2, 2, 2], - padding="SAME") + padding="SAME", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testMaxPoolGradSamePadding3_1_3d(self): self._VerifyGradient( @@ -330,7 +390,8 @@ class Pooling3DTest(XLATestCase): input_sizes=[1, 3, 3, 7, 1], ksize=[3, 3, 3], strides=[1, 1, 1], - padding="SAME") + padding="SAME", + pool_grad_grad_func=gen_nn_ops.max_pool3d_grad_grad) def testAvgPoolGradValidPadding1_1_3d(self): self._VerifyGradient( diff --git a/tensorflow/compiler/tests/pooling_ops_test.py b/tensorflow/compiler/tests/pooling_ops_test.py index fe270af3d636c0824621f36360ce9e7d14d8fc91..d03bd4fdbb7694bc36291faf9b845ec48e26a386 100644 --- a/tensorflow/compiler/tests/pooling_ops_test.py +++ b/tensorflow/compiler/tests/pooling_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from 
tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -69,7 +69,7 @@ def GetTestConfigs(): return test_configs -class PoolingTest(XLATestCase): +class PoolingTest(xla_test.XLATestCase): def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding, data_format, expected): @@ -89,7 +89,7 @@ class PoolingTest(XLATestCase): # numbers from 1. x = np.array([f * 1.0 for f in range(1, total_size + 1)], dtype=np.float32) x = x.reshape(input_sizes) - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): inputs = array_ops.placeholder(dtypes.float32) t = inputs @@ -288,7 +288,7 @@ class PoolingTest(XLATestCase): expected=expected_output) -class PoolGradTest(XLATestCase): +class PoolGradTest(xla_test.XLATestCase): CPU_DEVICE = "/job:localhost/replica:0/task:0/cpu:0" @@ -324,7 +324,7 @@ class PoolGradTest(XLATestCase): # TODO(b/74222344): Fix nan handling for max pool grad. # x[np.random.choice(total_size)] = np.nan x = x.reshape(input_sizes) - with self.test_session() as sess: + with self.cached_session() as sess: # Use the forward pool function to compute some corresponding outputs # (needed for the CPU device, and we need the shape in both cases). with ops.device(self.CPU_DEVICE): diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py new file mode 100644 index 0000000000000000000000000000000000000000..86536da7fed0e2309beb32fee9c7c605491592ed --- /dev/null +++ b/tensorflow/compiler/tests/powersign_test.py @@ -0,0 +1,142 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for PowerSign.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.contrib.opt.python.training import powersign +from tensorflow.contrib.opt.python.training import sign_decay +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +def py_linear_decay_fn(decay_steps): + def linear_decay(step): + step = min(step, decay_steps) + return float(decay_steps - step) / decay_steps + return linear_decay + + +def powersign_update_numpy(params, + g_t, + m, + lr, + base=math.e, + beta=0.9, + py_sign_decay_fn=None, + t=None): + m_t = beta * m + (1 - beta) * g_t + if py_sign_decay_fn is None: + sign_decayed = 1.0 + else: + sign_decayed = py_sign_decay_fn(t-1) + multiplier = base ** (sign_decayed * np.sign(g_t) * np.sign(m_t)) + params_t = params - lr * multiplier * g_t + return params_t, m_t + + +class PowerSignTest(xla_test.XLATestCase): + + def _testDense(self, + learning_rate=0.1, + sign_decay_fn=None, + py_sign_decay_fn=None, + base=math.e, + beta=0.9): + for dtype in self.float_types: + with self.cached_session(), self.test_scope(): + # Initialize variables for numpy implementation. 
+ m0, m1 = 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + global_step = resource_variable_ops.ResourceVariable(0, trainable=False) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = powersign.PowerSignOptimizer( + learning_rate=learning_rate, + base=base, + beta=beta, + sign_decay_fn=sign_decay_fn, + ) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]), + global_step=global_step) + neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]), + global_step=global_step) + + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 7 steps of powersign + # first 4 steps with positive gradient + # last 3 steps with negative gradient (sign(gm) should be -1) + for t in range(1, 8): + if t < 5: + update.run() + else: + neg_update.run() + + var0_np, m0 = powersign_update_numpy( + var0_np, + grads0_np if t < 5 else -grads0_np, + m0, + learning_rate, + base=base, + beta=beta, + py_sign_decay_fn=py_sign_decay_fn, + t=t, + ) + var1_np, m1 = powersign_update_numpy( + var1_np, + grads1_np if t < 5 else -grads1_np, + m1, + learning_rate, + base=base, + beta=beta, + py_sign_decay_fn=py_sign_decay_fn, + t=t, + ) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testDense(self): + decay_steps = 10 + sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps) + py_sign_decay_fn = py_linear_decay_fn(decay_steps) + self._testDense() + self._testDense(learning_rate=0.1, base=10.0, beta=0.8) + self._testDense( + 
sign_decay_fn=sign_decay_fn, py_sign_decay_fn=py_sign_decay_fn) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c41b4171e26af4f7ad0237d7407a5b3691299595 --- /dev/null +++ b/tensorflow/compiler/tests/proximal_adagrad_test.py @@ -0,0 +1,172 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Proximal Adagrad optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adagrad +from tensorflow.python.training import proximal_adagrad + + +class ProximalAdagradOptimizerTest(xla_test.XLATestCase): + + def testResourceProximalAdagradwithoutRegularization(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([0.0, 0.0]) + var1 = resource_variable_ops.ResourceVariable([0.0, 0.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + opt = proximal_adagrad.ProximalAdagradOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run 3 steps Proximal Adagrad. 
+ for _ in range(3): + update.run() + + self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval()) + self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval()) + opt_vars = opt.variables() + self.assertStartsWith(opt_vars[0].name, var0._shared_name) + self.assertStartsWith(opt_vars[1].name, var1._shared_name) + self.assertEqual(2, len(opt_vars)) + + def testProximalAdagradwithoutRegularization2(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_adagrad.ProximalAdagradOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 3 steps Proximal Adagrad. 
+ for _ in range(3): + update.run() + self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval()) + self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval()) + + def testProximalAdagradWithL1(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_adagrad.ProximalAdagradOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps Proximal Adagrad + for _ in range(10): + update.run() + self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval()) + self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval()) + + def testProximalAdagradWithL1_L2(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_adagrad.ProximalAdagradOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.001, + l2_regularization_strength=2.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps Proximal Adagrad. 
+ for _ in range(10): + update.run() + + self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) + self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + + def applyOptimizer(self, opt, steps=5): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run ProximalAdagrad for a few steps + for _ in range(steps): + update.run() + + return var0.eval(), var1.eval() + + def testEquivAdagradwithoutRegularization(self): + with self.cached_session(), self.test_scope(): + val0, val1 = self.applyOptimizer( + proximal_adagrad.ProximalAdagradOptimizer( + 3.0, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0)) + + with self.cached_session(), self.test_scope(): + val2, val3 = self.applyOptimizer( + adagrad.AdagradOptimizer( + 3.0, initial_accumulator_value=0.1)) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py new file mode 100644 index 0000000000000000000000000000000000000000..3d808e6b8a71ef9fa60b671d07bfd907e9f58efc --- /dev/null +++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py @@ -0,0 +1,156 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Proximal Gradient Descent optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent +from tensorflow.python.training import proximal_gradient_descent + + +class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): + + def testResourceProximalGradientDescentwithoutRegularization(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([0.0, 0.0]) + var1 = resource_variable_ops.ResourceVariable([0.0, 0.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + opt = proximal_gradient_descent.ProximalGradientDescentOptimizer( + 3.0, l1_regularization_strength=0.0, l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([0.0, 0.0], var0.eval()) + self.assertAllClose([0.0, 0.0], var1.eval()) + + # Run 3 steps Proximal Gradient Descent. 
+ for _ in range(3): + update.run() + + self.assertAllClose(np.array([-0.9, -1.8]), var0.eval()) + self.assertAllClose(np.array([-0.09, -0.18]), var1.eval()) + + def testProximalGradientDescentwithoutRegularization2(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_gradient_descent.ProximalGradientDescentOptimizer( + 3.0, l1_regularization_strength=0.0, l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 3 steps Proximal Gradient Descent + for _ in range(3): + update.run() + + self.assertAllClose(np.array([0.1, 0.2]), var0.eval()) + self.assertAllClose(np.array([3.91, 2.82]), var1.eval()) + + def testProximalGradientDescentWithL1(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_gradient_descent.ProximalGradientDescentOptimizer( + 3.0, l1_regularization_strength=0.001, l2_regularization_strength=0.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps proximal gradient descent. 
+ for _ in range(10): + update.run() + + self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval()) + self.assertAllClose(np.array([3.67, 2.37]), var1.eval()) + + def testProximalGradientDescentWithL1_L2(self): + with self.cached_session(), self.test_scope(): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([4.0, 3.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + opt = proximal_gradient_descent.ProximalGradientDescentOptimizer( + 3.0, l1_regularization_strength=0.001, l2_regularization_strength=2.0) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([4.0, 3.0], var1.eval()) + + # Run 10 steps Proximal Gradient Descent + for _ in range(10): + update.run() + + self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) + self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + + def applyOptimizer(self, opt, steps=5): + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0]) + grads0 = constant_op.constant([0.1, 0.2]) + grads1 = constant_op.constant([0.01, 0.02]) + + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run the given optimizer for a few steps + for _ in range(steps): + update.run() + + return var0.eval(), var1.eval() + + def testEquivGradientDescentwithoutRegularization(self): + with self.cached_session(), self.test_scope(): + val0, val1 = self.applyOptimizer( + proximal_gradient_descent.ProximalGradientDescentOptimizer( + 3.0, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0)) + + with self.cached_session(), self.test_scope(): + val2, val3 =
self.applyOptimizer( + gradient_descent.GradientDescentOptimizer(3.0)) + + self.assertAllClose(val0, val2) + self.assertAllClose(val1, val3) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py new file mode 100644 index 0000000000000000000000000000000000000000..236b1b881dcaffc1a5b0c6395f0605c1d7ef0269 --- /dev/null +++ b/tensorflow/compiler/tests/qr_op_test.py @@ -0,0 +1,115 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.ops.linalg_ops.qr.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +from absl.testing import parameterized +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): + + def AdjustedNorm(self, x): + """Computes the norm of matrices in 'x', adjusted for dimension and type.""" + norm = np.linalg.norm(x, axis=(-2, -1)) + return norm / (max(x.shape[-2:]) * np.finfo(x.dtype).eps) + + def CompareOrthogonal(self, x, y, rank): + # We only compare the first 'rank' orthogonal vectors since the + # remainder form an arbitrary orthonormal basis for the + # (row- or column-) null space, whose exact value depends on + # implementation details. Notice that since we check that the + # Q matrices are unitary elsewhere (see CheckUnitary), we do + # implicitly test that the trailing vectors of x and y span the + # same space. + x = x[..., 0:rank] + y = y[..., 0:rank] + # Q is only unique up to sign (complex phase factor for complex matrices), + # so we normalize the sign first. + sum_of_ratios = np.sum(np.divide(y, x), -2, keepdims=True) + phases = np.divide(sum_of_ratios, np.abs(sum_of_ratios)) + x *= phases + self.assertTrue(np.all(self.AdjustedNorm(x - y) < 30.0)) + + def CheckApproximation(self, a, q, r): + # Tests that a ~= q*r. + precision = self.AdjustedNorm(a - np.matmul(q, r)) + self.assertTrue(np.all(precision < 10.0)) + + def CheckUnitary(self, x): + # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
+ xx = math_ops.matmul(x, x, adjoint_a=True) + identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) + precision = self.AdjustedNorm(xx.eval() - identity.eval()) + self.assertTrue(np.all(precision < 5.0)) + + def _test(self, dtype, shape, full_matrices): + np.random.seed(1) + x_np = np.random.uniform( + low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype) + + with self.cached_session() as sess: + x_tf = array_ops.placeholder(dtype) + with self.test_scope(): + q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices) + q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) + + q_dims = q_tf_val.shape + np_q = np.ndarray(q_dims, dtype) + np_q_reshape = np.reshape(np_q, (-1, q_dims[-2], q_dims[-1])) + new_first_dim = np_q_reshape.shape[0] + + x_reshape = np.reshape(x_np, (-1, x_np.shape[-2], x_np.shape[-1])) + for i in range(new_first_dim): + if full_matrices: + np_q_reshape[i, :, :], _ = np.linalg.qr( + x_reshape[i, :, :], mode="complete") + else: + np_q_reshape[i, :, :], _ = np.linalg.qr( + x_reshape[i, :, :], mode="reduced") + np_q = np.reshape(np_q_reshape, q_dims) + self.CompareOrthogonal(np_q, q_tf_val, min(shape[-2:])) + self.CheckApproximation(x_np, q_tf_val, r_tf_val) + self.CheckUnitary(q_tf_val) + + SIZES = [1, 2, 5, 10, 32, 100, 300] + DTYPES = [np.float32] + PARAMS = itertools.product(SIZES, SIZES, DTYPES) + + @parameterized.parameters(*PARAMS) + def testQR(self, rows, cols, dtype): + # TODO(b/111317468): Test other types. + for full_matrices in [True, False]: + # Only tests the (3, 2) case for small numbers of rows/columns. 
+ for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): + self._test(dtype, batch_dims + (rows, cols), full_matrices) + + def testLarge2000x2000(self): + self._test(np.float32, (2000, 2000), full_matrices=True) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index d6c93088d4efff7d8306e262a79ae49d3d8ac722..6e183441179ebf2e8c063b333f9328d6fa86cc88 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ -18,15 +18,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import math + import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import special_math from tensorflow.python.platform import googletest -class RandomOpsTest(XLATestCase): +class RandomOpsTest(xla_test.XLATestCase): """Test cases for random-number generating operators.""" def _random_types(self): @@ -34,7 +39,7 @@ class RandomOpsTest(XLATestCase): def _testRngIsNotConstant(self, rng, dtype): # Tests that 'rng' does not always return the same value. - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): x = rng(dtype) @@ -47,18 +52,19 @@ class RandomOpsTest(XLATestCase): # We use exact equality here. If the random-number generator is producing # deterministic output, all three outputs will be bitwise identical. 
self.assertTrue((not np.array_equal(y, z)) or - (not np.array_equal(z, w)) or - (not np.array_equal(y, w))) + (not np.array_equal(z, w)) or (not np.array_equal(y, w))) def testRandomUniformIsNotConstant(self): + def rng(dtype): - return random_ops.random_uniform(shape=[2], dtype=dtype, - maxval=1000000) + dtype = dtypes.as_dtype(dtype) + return random_ops.random_uniform(shape=[2], dtype=dtype, maxval=dtype.max) for dtype in self._random_types(): self._testRngIsNotConstant(rng, dtype) def testRandomNormalIsNotConstant(self): + def rng(dtype): return random_ops.random_normal(shape=[2], dtype=dtype) @@ -68,24 +74,100 @@ class RandomOpsTest(XLATestCase): def testRandomUniformIsInRange(self): for dtype in self._random_types(): - with self.test_session() as sess: + # TODO (b/112272078): enable bfloat16 for CPU and GPU when the bug is + # fixed. + if (self.device in ["XLA_GPU", "XLA_CPU" + ]) and (dtype in [dtypes.bfloat16, dtypes.half]): + continue + with self.cached_session() as sess: with self.test_scope(): - x = random_ops.random_uniform(shape=[1000], dtype=dtype, minval=-2, - maxval=33) + x = random_ops.random_uniform( + shape=[1000], dtype=dtype, minval=-2, maxval=33) y = sess.run(x) self.assertTrue((y >= -2).sum() == 1000) self.assertTrue((y < 33).sum() == 1000) + def testTruncatedNormalIsNotConstant(self): + + def rng(dtype): + return random_ops.truncated_normal(shape=[2], dtype=dtype) + + # TODO(b/34339814): implement inverse erf support for non-F32 types. + self._testRngIsNotConstant(rng, dtypes.float32) + def testTruncatedNormalIsInRange(self): - count = 10000 + count = 10000000 # TODO(b/34339814): implement inverse erf support for non-F32 types. 
for dtype in [dtypes.float32]: - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): - x = random_ops.truncated_normal(shape=[count], dtype=dtype, seed=42) + x = random_ops.truncated_normal(shape=[count], dtype=dtype) y = sess.run(x) - self.assertTrue((y >= -2).sum() == count) - self.assertTrue((y <= 2).sum() == count) + + def normal_cdf(x): + return .5 * math.erfc(-x / math.sqrt(2)) + + def normal_pdf(x): + return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi) + + def probit(x, sess=sess): + return sess.run(special_math.ndtri(x)) + + a = -2. + b = 2. + mu = 0. + sigma = 1. + + alpha = (a - mu) / sigma + beta = (b - mu) / sigma + z = normal_cdf(beta) - normal_cdf(alpha) + + self.assertTrue((y >= a).sum() == count) + self.assertTrue((y <= b).sum() == count) + + # For more information on these calculations, see: + # Burkardt, John. "The Truncated Normal Distribution". + # Department of Scientific Computing website. Florida State University. + expected_mean = mu + (normal_pdf(alpha) - normal_pdf(beta)) / z * sigma + actual_mean = np.mean(y) + self.assertAllClose(actual_mean, expected_mean, atol=2e-3) + + expected_median = mu + probit( + (normal_cdf(alpha) + normal_cdf(beta)) / 2.) * sigma + actual_median = np.median(y) + self.assertAllClose(actual_median, expected_median, atol=1e-2) + + expected_variance = sigma**2 * (1 + ( + (alpha * normal_pdf(alpha) - beta * normal_pdf(beta)) / z) - ( + (normal_pdf(alpha) - normal_pdf(beta)) / z)**2) + actual_variance = np.var(y) + self.assertAllClose(actual_variance, expected_variance, rtol=2*1e-3) + + def testShuffle1d(self): + # TODO(b/26783907): this test requires the CPU backend to implement sort. 
+ if self.device in ["XLA_CPU"]: + return + with self.cached_session() as sess: + with self.test_scope(): + x = math_ops.range(1 << 16) + shuffle = random_ops.random_shuffle(x) + result = sess.run(shuffle) + expected = range(1 << 16) + # Compare sets to avoid randomness behavior changes but make sure still + # have all the values. + self.assertAllEqual(set(result), set(expected)) + + def testShuffle2d(self): + with self.cached_session() as sess: + with self.test_scope(): + x = array_ops.diag(math_ops.range(20)) + shuffle = random_ops.random_shuffle(x) + result = sess.run(shuffle) + expected = np.diag(range(20)).flatten() + # Compare sets to avoid randomness behavior changes but make sure still + # have all the values. + self.assertAllEqual(len(result.flatten()), len(expected)) + self.assertAllEqual(set(result.flatten()), set(expected)) if __name__ == '__main__': diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 16f293891d56d78885dd515bb7b9899faf0690f7..0faf0fd8edf355838ccf42f1d6de20ac01faa3db 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -62,6 +62,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -101,6 +102,9 @@ class OpTestBuilder { OpTestBuilder& RandomInput(DataType type); OpTestBuilder& RandomInput(DataType type, std::vector dims); + // As RandomInput but the values are unique. + OpTestBuilder& RandomUniqueInput(DataType type, std::vector dims); + // Sets an attribute. 
template OpTestBuilder& Attr(StringPiece attr_name, T&& value); @@ -126,6 +130,7 @@ class OpTestBuilder { DataType type = DT_INVALID; bool has_dims = false; + bool needs_unique_values = false; std::vector dims; }; @@ -167,6 +172,18 @@ OpTestBuilder& OpTestBuilder::RandomInput(DataType type, return *this; } +OpTestBuilder& OpTestBuilder::RandomUniqueInput(DataType type, + std::vector dims) { + VLOG(1) << "Adding input: " << type << " " << TensorShape(dims).DebugString(); + InputDescription input; + input.type = type; + input.has_dims = true; + input.needs_unique_values = true; + input.dims = std::move(dims); + inputs_.push_back(input); + return *this; +} + template OpTestBuilder& OpTestBuilder::Attr(StringPiece attr_name, T&& value) { AddNodeAttr(attr_name, std::forward(value), &node_def_); @@ -258,13 +275,13 @@ class OpTest : public ::testing::Test { // Select a random element from 'candidates'. template - T Choose(gtl::ArraySlice candidates); + T Choose(absl::Span candidates); static constexpr int kDefaultMaxRank = 5; static constexpr int64 kDefaultMaxDimensionSize = 256LL; // Returns true if 'dims' have a size less than tf_xla_max_tensor_size. - bool TensorSizeIsOk(gtl::ArraySlice dims); + bool TensorSizeIsOk(absl::Span dims); // Returns a random dimension size, in the range [min, max). int64 RandomDim(int64 min = 0, int64 max = kDefaultMaxDimensionSize); @@ -289,11 +306,12 @@ class OpTest : public ::testing::Test { // Returns a tensor filled with random but "reasonable" values from the middle // of the type's range. If the shape is omitted, a random shape is used. // TODO(phawkins): generalize this code to a caller-supplied distribution. - Tensor RandomTensor(DataType dtype, gtl::ArraySlice shape); + Tensor RandomTensor(DataType dtype, bool needs_unique_values, + absl::Span shape); Tensor RandomTensor(DataType dtype); // Like RandomTensor, but uses values >= 0. 
- Tensor RandomNonNegativeTensor(DataType dtype, gtl::ArraySlice shape); + Tensor RandomNonNegativeTensor(DataType dtype, absl::Span shape); Tensor RandomNonNegativeTensor(DataType dtype); // Returns a random subset of the integers in the range [0, rank), suitable @@ -397,7 +415,7 @@ void OpTest::Repeatedly(const std::function& fn) { } template -T OpTest::Choose(gtl::ArraySlice candidates) { +T OpTest::Choose(absl::Span candidates) { std::uniform_int_distribution d(0, candidates.size() - 1); return candidates[d(generator())]; } @@ -407,7 +425,7 @@ int64 OpTest::RandomDim(int64 min, int64 max) { return size_distribution(generator()); } -bool OpTest::TensorSizeIsOk(gtl::ArraySlice dims) { +bool OpTest::TensorSizeIsOk(absl::Span dims) { int64 size = 1LL; for (int64 dim : dims) { size *= dim; @@ -432,49 +450,90 @@ std::vector OpTest::RandomDims(int min_rank, int max_rank, return dims; } -Tensor OpTest::RandomTensor(DataType dtype, gtl::ArraySlice shape) { +Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values, + absl::Span shape) { Tensor tensor(dtype, TensorShape(shape)); switch (dtype) { case DT_FLOAT: { + gtl::FlatSet already_generated; std::uniform_real_distribution distribution(-1.0f, 1.0f); - test::FillFn(&tensor, [this, &distribution](int i) -> float { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> float { + float generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_DOUBLE: { + gtl::FlatSet already_generated; std::uniform_real_distribution distribution(-1.0, 1.0); - test::FillFn(&tensor, [this, &distribution](int i) -> double { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> double { + double generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case 
DT_COMPLEX64: { + gtl::FlatSet> already_generated; std::uniform_real_distribution distribution(-1.0f, 1.0f); - test::FillFn(&tensor, [this, &distribution](int i) { - return complex64(distribution(generator()), distribution(generator())); + test::FillFn(&tensor, [&](int i) { + complex64 generated; + do { + generated = + complex64(distribution(generator()), distribution(generator())); + } while ( + needs_unique_values && + !already_generated + .insert(std::make_pair(generated.real(), generated.imag())) + .second); + return generated; }); break; } case DT_INT32: { + gtl::FlatSet already_generated; std::uniform_int_distribution distribution(-(1 << 20), 1 << 20); - test::FillFn(&tensor, [this, &distribution](int i) -> int32 { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> int32 { + int32 generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_INT64: { + gtl::FlatSet already_generated; std::uniform_int_distribution distribution(-(1LL << 40), 1LL << 40); - test::FillFn(&tensor, [this, &distribution](int i) -> int64 { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> int64 { + int64 generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_BOOL: { + gtl::FlatSet already_generated; std::bernoulli_distribution distribution; - test::FillFn(&tensor, [this, &distribution](int i) -> bool { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> bool { + bool generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } @@ -485,11 +544,11 @@ Tensor OpTest::RandomTensor(DataType dtype, gtl::ArraySlice shape) { } Tensor OpTest::RandomTensor(DataType dtype) { 
- return RandomTensor(dtype, RandomDims()); + return RandomTensor(dtype, /*needs_unique_values=*/false, RandomDims()); } Tensor OpTest::RandomNonNegativeTensor(DataType dtype, - gtl::ArraySlice shape) { + absl::Span shape) { Tensor tensor(dtype, TensorShape(shape)); switch (dtype) { case DT_FLOAT: { @@ -761,7 +820,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( VLOG(1) << "Ignoring oversize dims."; return kInvalid; } - input_tensors.push_back(RandomTensor(input.type, dims)); + input_tensors.push_back( + RandomTensor(input.type, input.needs_unique_values, dims)); } VLOG(1) << "Input: " << input_tensors.back().DebugString(); } @@ -960,7 +1020,7 @@ TEST_F(OpTest, ArgMax) { std::uniform_int_distribution(-num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMax") - .RandomInput(DT_FLOAT, dims) + .RandomUniqueInput(DT_FLOAT, dims) .Input(test::AsScalar(reduce_dim)) .Attr("T", DT_FLOAT) .Attr("Tidx", DT_INT32) @@ -976,7 +1036,7 @@ TEST_F(OpTest, ArgMin) { std::uniform_int_distribution(-num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMin") - .RandomInput(DT_FLOAT, dims) + .RandomUniqueInput(DT_FLOAT, dims) .Input(test::AsScalar(reduce_dim)) .Attr("T", DT_FLOAT) .Attr("Tidx", DT_INT32) @@ -1824,7 +1884,8 @@ TEST_F(OpTest, DynamicStitch) { for (int i = 0; i < n; ++i) { TensorShape shape(index_dims[i]); Tensor t = test::AsTensor( - gtl::ArraySlice(indices, pos, shape.num_elements()), shape); + absl::Span(indices).subspan(pos, shape.num_elements()), + shape); builder.Input(t); pos += t.NumElements(); } diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py index 7420724bdbeab63b39542ada59328621febad895..132c59c32c9db0c8759bdbb31f8613c3ef88b485 100644 --- a/tensorflow/compiler/tests/reduce_ops_test.py +++ b/tensorflow/compiler/tests/reduce_ops_test.py @@ -20,9 +20,10 @@ from __future__ import print_function import functools import 
itertools +from absl.testing import parameterized import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.ops import array_ops @@ -30,22 +31,24 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest -class ReduceOpsTest(XLATestCase): - +@parameterized.named_parameters(('32_bit_index', dtypes.int32), + ('64_bit_index', dtypes.int64)) +class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase): def _testReduction(self, tf_reduce_fn, np_reduce_fn, dtype, test_inputs, + index_dtype, rtol=1e-4, atol=1e-4): """Tests that the output of 'tf_reduce_fn' matches numpy's output.""" for test_input in test_inputs: - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): a = array_ops.placeholder(dtype) - index = array_ops.placeholder(dtypes.int32) + index = array_ops.placeholder(index_dtype) out = tf_reduce_fn(a, index) result = sess.run(out, {a: test_input, index: [0]}) self.assertAllClose( @@ -89,22 +92,23 @@ class ReduceOpsTest(XLATestCase): np.array([[False, True, False], [True, True, False]]), ] - def testReduceSumF32(self): - self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA) + def testReduceSumF32(self, index_dtype): + self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA, + index_dtype) - def testReduceSumC64(self): + def testReduceSumC64(self, index_dtype): self._testReduction(math_ops.reduce_sum, np.sum, np.complex64, - self.COMPLEX_DATA) + self.COMPLEX_DATA, index_dtype) - def testReduceProdF32(self): + def testReduceProdF32(self, index_dtype): self._testReduction(math_ops.reduce_prod, np.prod, np.float32, - self.REAL_DATA) + self.REAL_DATA, index_dtype) - def testReduceProdC64(self): + def testReduceProdC64(self, index_dtype): 
self._testReduction(math_ops.reduce_prod, np.prod, np.complex64, - self.COMPLEX_DATA) + self.COMPLEX_DATA, index_dtype) - def testReduceMin(self): + def testReduceMin(self, index_dtype): def reference_min(dtype, inp, axis): """Wrapper around np.amin that returns +infinity for an empty input.""" @@ -119,9 +123,9 @@ class ReduceOpsTest(XLATestCase): [np.float32, np.int32, np.int64]): self._testReduction(math_ops.reduce_min, functools.partial(reference_min, dtype), dtype, - self.REAL_DATA) + self.REAL_DATA, index_dtype) - def testReduceMax(self): + def testReduceMax(self, index_dtype): def reference_max(dtype, inp, axis): """Wrapper around np.amax that returns -infinity for an empty input.""" @@ -137,26 +141,28 @@ class ReduceOpsTest(XLATestCase): [np.float32, np.int32, np.int64]): self._testReduction(math_ops.reduce_max, functools.partial(reference_max, dtype), dtype, - self.REAL_DATA) + self.REAL_DATA, index_dtype) - def testReduceMeanF32(self): + def testReduceMeanF32(self, index_dtype): # TODO(phawkins): mean on XLA currently returns 0 instead of NaN when # reducing across zero inputs. 
self._testReduction(math_ops.reduce_mean, np.mean, np.float32, - self.NONEMPTY_REAL_DATA) + self.NONEMPTY_REAL_DATA, index_dtype) - def testReduceMeanC64(self): + def testReduceMeanC64(self, index_dtype): self._testReduction(math_ops.reduce_mean, np.mean, np.complex64, - self.NONEMPTY_COMPLEX_DATA) + self.NONEMPTY_COMPLEX_DATA, index_dtype) - def testReduceAll(self): - self._testReduction(math_ops.reduce_all, np.all, np.bool, self.BOOL_DATA) + def testReduceAll(self, index_dtype): + self._testReduction(math_ops.reduce_all, np.all, np.bool, self.BOOL_DATA, + index_dtype) - def testReduceAny(self): - self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA) + def testReduceAny(self, index_dtype): + self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA, + index_dtype) -class ReduceOpPrecisionTest(XLATestCase): +class ReduceOpPrecisionTest(xla_test.XLATestCase): def _testReduceSum(self, expected_result, @@ -178,7 +184,7 @@ class ReduceOpPrecisionTest(XLATestCase): """ for test_input in test_inputs: - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): a = array_ops.placeholder(dtype) index = array_ops.placeholder(dtypes.int32) @@ -213,7 +219,7 @@ class ReduceOpPrecisionTest(XLATestCase): bf16_max = np.float32(dtypes.bfloat16.max) f32_max = dtypes.float32.max - value = min(bf16_max, f32_max - bf16_max) + value = min(bf16_max, f32_max - bf16_max) / 2 self._testReduceSum( dtypes.bfloat16.as_numpy_dtype(value), dtypes.bfloat16.as_numpy_dtype, itertools.permutations([bf16_max, value, bf16_max * (-1.0)], 3)) diff --git a/tensorflow/compiler/tests/reduce_window_test.py b/tensorflow/compiler/tests/reduce_window_test.py index e78a63465b80644d8810d9fa7433653bc4639fed..ff20ea3f4287b4666684501fa4920435a77b4183 100644 --- a/tensorflow/compiler/tests/reduce_window_test.py +++ b/tensorflow/compiler/tests/reduce_window_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from 
tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.compiler.tf2xla.python import xla from tensorflow.python.framework import dtypes from tensorflow.python.framework import function @@ -28,11 +28,11 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest -class ReduceWindowTest(XLATestCase): +class ReduceWindowTest(xla_test.XLATestCase): """Test cases for xla.reduce_window.""" def _reduce_window(self, operand, init, reducer, **kwargs): - with self.test_session(): + with self.cached_session(): placeholder = array_ops.placeholder(operand.dtype) with self.test_scope(): output = xla.reduce_window(placeholder, init, reducer, **kwargs) diff --git a/tensorflow/compiler/tests/reshape_op_test.py b/tensorflow/compiler/tests/reshape_op_test.py new file mode 100644 index 0000000000000000000000000000000000000000..84c67779400f7a800bd88abc32d95058a6c0904d --- /dev/null +++ b/tensorflow/compiler/tests/reshape_op_test.py @@ -0,0 +1,50 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for slicing.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +class ReshapeTest(xla_test.XLATestCase, parameterized.TestCase): + + @parameterized.named_parameters(('32_bit_index', dtypes.int32), + ('64_bit_index', dtypes.int64)) + def testBasic(self, index_dtype): + for dtype in self.numeric_types: + with self.test_session(): + i = array_ops.placeholder(dtype, shape=[2, 3]) + with self.test_scope(): + shape = constant_op.constant([3, 2], dtype=index_dtype) + o = array_ops.reshape(i, shape) + params = { + i: [[1, 2, 3], [4, 5, 6]], + } + result = o.eval(feed_dict=params) + + self.assertAllEqual([[1, 2], [3, 4], [5, 6]], result) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/reverse_ops_test.py b/tensorflow/compiler/tests/reverse_ops_test.py index 18fabca28c9817fc8517595fa1694a18399f54b0..392290fd92d0c7c928581422433892147374b2dd 100644 --- a/tensorflow/compiler/tests/reverse_ops_test.py +++ b/tensorflow/compiler/tests/reverse_ops_test.py @@ -21,44 +21,51 @@ from __future__ import print_function import itertools import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest -class ReverseOpsTest(XLATestCase): +class ReverseOpsTest(xla_test.XLATestCase): def testReverseOneDim(self): shape = (7, 5, 9, 11) - for revdim in 
range(len(shape)): + for revdim in range(-len(shape), len(shape)): self._AssertReverseEqual([revdim], shape) def testReverseMoreThanOneDim(self): shape = (7, 5, 9, 11) + # The offset is used to test various (but not all) combinations of negative + # and positive axis indices that are guaranteed to not collide at the same + # index. for revdims in itertools.chain.from_iterable( - itertools.combinations(range(len(shape)), k) - for k in range(2, len(shape)+1)): + itertools.combinations(range(-offset, + len(shape) - offset), k) + for k in range(2, + len(shape) + 1) + for offset in range(0, len(shape))): self._AssertReverseEqual(revdims, shape) def _AssertReverseEqual(self, revdims, shape): np.random.seed(120) pval = np.random.randint(0, 100, size=shape).astype(float) - with self.test_session(): + with self.cached_session(): with self.test_scope(): p = array_ops.placeholder(dtypes.int32, shape=shape) axis = constant_op.constant( np.array(revdims, dtype=np.int32), - shape=(len(revdims),), dtype=dtypes.int32) + shape=(len(revdims),), + dtype=dtypes.int32) rval = array_ops.reverse(p, axis).eval({p: pval}) slices = [ - slice(-1, None, -1) if d in revdims else slice(None) - for d in range(len(shape))] - self.assertEqual( - pval[slices].flatten().tolist(), - rval.flatten().tolist()) + slice(-1, None, -1) + if d in revdims or d - len(shape) in revdims else slice(None) + for d in range(len(shape)) + ] + self.assertEqual(pval[slices].flatten().tolist(), rval.flatten().tolist()) if __name__ == '__main__': diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py index 1a5d05094e53cfecd9476d7d87f023e8a02d7458..60c2337743b44e9bad61c4d65280eb2b1a1ad9ea 100644 --- a/tensorflow/compiler/tests/reverse_sequence_op_test.py +++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py @@ -20,13 +20,13 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from 
tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class ReverseSequenceTest(XLATestCase): +class ReverseSequenceTest(xla_test.XLATestCase): def _testReverseSequence(self, x, @@ -35,7 +35,7 @@ class ReverseSequenceTest(XLATestCase): seq_lengths, truth, expected_err_re=None): - with self.test_session(): + with self.cached_session(): p = array_ops.placeholder(dtypes.as_dtype(x.dtype)) lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype)) with self.test_scope(): diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py index ecdce4f052bbe3eeae8697c02c891105103f4f69..8840a1329a907bddc6ef1cb6dd1c2a6d234def5c 100644 --- a/tensorflow/compiler/tests/rmsprop_test.py +++ b/tensorflow/compiler/tests/rmsprop_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables @@ -28,33 +28,104 @@ from tensorflow.python.platform import test from tensorflow.python.training import rmsprop -class RmspropTest(XLATestCase): +class RmspropTest(xla_test.XLATestCase): + + def _rmsprop_update_numpy(self, + var, + g, + mg, + rms, + mom, + lr, + decay=0.9, + momentum=0.0, + epsilon=1e-10, + centered=False): + rms_t = rms * decay + (1 - decay) * g * g + denom_t = rms_t + epsilon + if centered: + mg_t = mg * decay + (1 - decay) * g + denom_t -= mg_t * mg_t + else: + mg_t = mg + mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype) + var_t = var - mom_t + return var_t, mg_t, rms_t, mom_t def testBasic(self): for dtype in self.float_types: - with self.test_session(), self.test_scope(): - var0 = 
resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) - var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) - grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) - grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) - rms_opt = rmsprop.RMSPropOptimizer(3.0) - rms_update = rms_opt.apply_gradients( - zip([grads0, grads1], [var0, var1])) - variables.global_variables_initializer().run() - - # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) - - # Run 3 steps of RMSProp - for _ in range(3): - rms_update.run() - - # Validate updated params - self.assertAllCloseAccordingToType( - np.array([2.91705132e-04, 1.00029182e+00]), var0.eval()) - self.assertAllCloseAccordingToType( - np.array([2.89990854, 3.89990854]), var1.eval()) + for centered in [False, True]: + with self.cached_session(), self.test_scope(): + # Initialize variables for numpy implementation. + var0_np = np.array([1.0, 2.0], dtype=dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype) + mg0_np = np.array([0.0, 0.0], dtype=dtype) + mg1_np = np.array([0.0, 0.0], dtype=dtype) + rms0_np = np.array([1.0, 1.0], dtype=dtype) + rms1_np = np.array([1.0, 1.0], dtype=dtype) + mom0_np = np.array([0.0, 0.0], dtype=dtype) + mom1_np = np.array([0.0, 0.0], dtype=dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + learning_rate = 3.0 + rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered) + rms_update = rms_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + mg0 = rms_opt.get_slot(var0, "mg") + self.assertEqual(mg0 is not None, centered) + mg1 = rms_opt.get_slot(var1, "mg") + 
self.assertEqual(mg1 is not None, centered) + rms0 = rms_opt.get_slot(var0, "rms") + self.assertTrue(rms0 is not None) + rms1 = rms_opt.get_slot(var1, "rms") + self.assertTrue(rms1 is not None) + mom0 = rms_opt.get_slot(var0, "momentum") + self.assertTrue(mom0 is not None) + mom1 = rms_opt.get_slot(var1, "momentum") + self.assertTrue(mom1 is not None) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 3 steps of RMSProp + for _ in range(3): + rms_update.run() + + var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( + var0_np, + grads0_np, + mg0_np, + rms0_np, + mom0_np, + learning_rate, + centered=centered) + var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy( + var1_np, + grads1_np, + mg1_np, + rms1_np, + mom1_np, + learning_rate, + centered=centered) + + # Validate updated params + if centered: + self.assertAllCloseAccordingToType(mg0_np, mg0.eval()) + self.assertAllCloseAccordingToType(mg1_np, mg1.eval()) + self.assertAllCloseAccordingToType(rms0_np, rms0.eval()) + self.assertAllCloseAccordingToType(rms1_np, rms1.eval()) + self.assertAllCloseAccordingToType(mom0_np, mom0.eval()) + self.assertAllCloseAccordingToType(mom1_np, mom1.eval()) + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py index 3260e63b23226d736a7ddc0f21a94a8c791e0442..897db384b7e8067b0460b5f344201f101a4d8479 100644 --- a/tensorflow/compiler/tests/scan_ops_test.py +++ b/tensorflow/compiler/tests/scan_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import 
errors_impl from tensorflow.python.framework import ops @@ -69,7 +69,7 @@ def handle_options(func, x, axis, exclusive, reverse): return x -class CumsumTest(XLATestCase): +class CumsumTest(xla_test.XLATestCase): valid_dtypes = [np.float32] @@ -78,7 +78,7 @@ class CumsumTest(XLATestCase): def _compare(self, x, axis, exclusive, reverse): np_out = handle_options(np.cumsum, x, axis, exclusive, reverse) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): p = array_ops.placeholder(x.dtype) tf_out = math_ops.cumsum(p, axis, exclusive, reverse).eval( feed_dict={p: x}) @@ -100,7 +100,7 @@ class CumsumTest(XLATestCase): for dtype in self.valid_dtypes: x = np.arange(1, 6).reshape([5]).astype(dtype) for axis_dtype in self.axis_dtypes(): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): p = array_ops.placeholder(x.dtype) axis = constant_op.constant(0, axis_dtype) math_ops.cumsum(p, axis).eval(feed_dict={p: x}) @@ -131,7 +131,7 @@ class CumsumTest(XLATestCase): def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): input_tensor = ops.convert_to_tensor(x) with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, @@ -147,7 +147,7 @@ class CumsumTest(XLATestCase): math_ops.cumsum(input_tensor, [0]).eval() -class CumprodTest(XLATestCase): +class CumprodTest(xla_test.XLATestCase): valid_dtypes = [np.float32] @@ -156,7 +156,7 @@ class CumprodTest(XLATestCase): def _compare(self, x, axis, exclusive, reverse): np_out = handle_options(np.cumprod, x, axis, exclusive, reverse) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): p = array_ops.placeholder(x.dtype) prod = math_ops.cumprod(p, axis, exclusive, reverse) tf_out = prod.eval(feed_dict={p: x}) @@ -178,7 +178,7 @@ class CumprodTest(XLATestCase): for dtype in 
self.valid_dtypes: x = np.arange(1, 6).reshape([5]).astype(dtype) for axis_dtype in self.axis_dtypes(): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): p = array_ops.placeholder(x.dtype) axis = constant_op.constant(0, axis_dtype) math_ops.cumprod(x, axis).eval(feed_dict={p: x}) @@ -209,7 +209,7 @@ class CumprodTest(XLATestCase): def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): input_tensor = ops.convert_to_tensor(x) with self.assertRaisesWithPredicateMatch( errors_impl.InvalidArgumentError, diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py index 638946e234daf28dc4a34e6c33fc0f78b8e8699b..693f8513bc54e30060a2e963abd504768535a50a 100644 --- a/tensorflow/compiler/tests/scatter_nd_op_test.py +++ b/tensorflow/compiler/tests/scatter_nd_op_test.py @@ -22,7 +22,7 @@ import functools import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -68,7 +68,7 @@ def _NumpyUpdate(indices, updates, shape): return _NumpyScatterNd(ref, indices, updates, lambda p, u: u) -class ScatterNdTest(XLATestCase): +class ScatterNdTest(xla_test.XLATestCase): def _VariableRankTest(self, np_scatter, @@ -119,7 +119,7 @@ class ScatterNdTest(XLATestCase): self._VariableRankTest(np_scatter, tf_scatter, vtype, itype) def _runScatterNd(self, indices, updates, shape): - with self.test_session(): + with self.cached_session(): updates_placeholder = array_ops.placeholder(updates.dtype) indices_placeholder = array_ops.placeholder(indices.dtype) with self.test_scope(): diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py 
b/tensorflow/compiler/tests/segment_reduction_ops_test.py index 4a9c0e7471f9cdb2a47b54705495d2dda9748890..287bb0d84e24de3bdcde3aa4c61acee00626e88f 100644 --- a/tensorflow/compiler/tests/segment_reduction_ops_test.py +++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py @@ -21,26 +21,40 @@ from __future__ import print_function import functools import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest -class SegmentReductionOpsTest(XLATestCase): +class SegmentReductionOpsTest(xla_test.XLATestCase): """Test cases for segment reduction ops.""" - def UnsortedSegmentSum(self, data, indices, num_segments): - with self.test_session() as sess, self.test_scope(): + def _segmentReduction(self, op, data, indices, num_segments): + with self.cached_session() as sess, self.test_scope(): d = array_ops.placeholder(data.dtype, shape=data.shape) if isinstance(indices, int): i = array_ops.placeholder(np.int32, shape=[]) else: i = array_ops.placeholder(indices.dtype, shape=indices.shape) - return sess.run( - math_ops.unsorted_segment_sum(d, i, num_segments), - {d: data, - i: indices}) + return sess.run(op(d, i, num_segments), {d: data, i: indices}) + + def _unsortedSegmentSum(self, data, indices, num_segments): + return self._segmentReduction(math_ops.unsorted_segment_sum, data, indices, + num_segments) + + def _unsortedSegmentProd(self, data, indices, num_segments): + return self._segmentReduction(math_ops.unsorted_segment_prod, data, indices, + num_segments) + + def _unsortedSegmentMin(self, data, indices, num_segments): + return self._segmentReduction(math_ops.unsorted_segment_min, data, indices, + num_segments) + + def _unsortedSegmentMax(self, data, indices, num_segments): + return 
self._segmentReduction(math_ops.unsorted_segment_max, data, indices, + num_segments) def testUnsortedSegmentSum0DIndices1DData(self): for dtype in self.numeric_types: @@ -49,14 +63,14 @@ class SegmentReductionOpsTest(XLATestCase): [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5], [0, 0, 0, 0, 0, 0]], dtype=dtype), - self.UnsortedSegmentSum( + self._unsortedSegmentSum( np.array([0, 1, 2, 3, 4, 5], dtype=dtype), 2, 4)) def testUnsortedSegmentSum1DIndices1DData(self): for dtype in self.numeric_types: self.assertAllClose( np.array([1, 3, 2, 9], dtype=dtype), - self.UnsortedSegmentSum( + self._unsortedSegmentSum( np.array([0, 1, 2, 3, 4, 5], dtype=dtype), np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4)) @@ -64,7 +78,7 @@ class SegmentReductionOpsTest(XLATestCase): for dtype in self.numeric_types: self.assertAllClose( np.array([6, 3, 0, 6], dtype=dtype), - self.UnsortedSegmentSum( + self._unsortedSegmentSum( np.array([0, 1, 2, 3, 4, 5, 6], dtype=dtype), np.array([3, -1, 0, 1, 0, -1, 3], dtype=np.int32), 4)) @@ -76,7 +90,7 @@ class SegmentReductionOpsTest(XLATestCase): dtype=dtype) indices = np.array([8, 1, 0, 3, 7], dtype=np.int32) num_segments = 10 - y = self.UnsortedSegmentSum(data, indices, num_segments) + y = self._unsortedSegmentSum(data, indices, num_segments) self.assertAllClose( np.array( [[30, 31, 32, 33], [20, 21, 22, 23], [0, 0, 0, 0], @@ -92,7 +106,7 @@ class SegmentReductionOpsTest(XLATestCase): dtype=dtype) indices = np.array([0, 1, 2, 0, 1], dtype=np.int32) num_segments = 4 - y = self.UnsortedSegmentSum(data, indices, num_segments) + y = self._unsortedSegmentSum(data, indices, num_segments) self.assertAllClose( np.array( [[40, 42, 44, 46], [70, 72, 74, 76], [30, 31, 32, 33], @@ -102,30 +116,30 @@ class SegmentReductionOpsTest(XLATestCase): def testUnsortedSegmentSum2DIndices3DData(self): for dtype in self.numeric_types: data = np.array( - [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], - [[200, 201, 202], [210, 211, 212]], 
[[300, 301, 302], - [310, 311, 312]]], + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], [[ + 200, 201, 202 + ], [210, 211, 212]], [[300, 301, 302], [310, 311, 312]]], dtype=dtype) indices = np.array([[3, 5], [3, 1], [5, 0], [6, 2]], dtype=np.int32) num_segments = 8 - y = self.UnsortedSegmentSum(data, indices, num_segments) + y = self._unsortedSegmentSum(data, indices, num_segments) self.assertAllClose( np.array( - [[210, 211, 212], [110, 111, 112], [310, 311, 312], - [100, 102, 104], [0, 0, 0.], [210, 212, 214], [300, 301, - 302], [0, 0, 0]], + [[210, 211, 212], [110, 111, 112], [310, 311, 312], [ + 100, 102, 104 + ], [0, 0, 0.], [210, 212, 214], [300, 301, 302], [0, 0, 0]], dtype=dtype), y) def testUnsortedSegmentSum1DIndices3DData(self): for dtype in self.numeric_types: data = np.array( - [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], - [[200, 201, 202], [210, 211, 212]], [[300, 301, 302], - [310, 311, 312]]], + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], [[ + 200, 201, 202 + ], [210, 211, 212]], [[300, 301, 302], [310, 311, 312]]], dtype=dtype) indices = np.array([3, 0, 2, 5], dtype=np.int32) num_segments = 6 - y = self.UnsortedSegmentSum(data, indices, num_segments) + y = self._unsortedSegmentSum(data, indices, num_segments) self.assertAllClose( np.array( [[[100, 101, 102.], [110, 111, 112]], [[0, 0, 0], [0, 0, 0]], @@ -138,10 +152,40 @@ class SegmentReductionOpsTest(XLATestCase): data = np.ones((4, 8, 7), dtype=dtype) indices = np.ones((3, 2), dtype=np.int32) num_segments = 4 - self.assertRaises(ValueError, - functools.partial(self.UnsortedSegmentSum, data, - indices, num_segments)) + self.assertRaises( + ValueError, + functools.partial(self._segmentReduction, + math_ops.unsorted_segment_sum, data, indices, + num_segments)) + + def testUnsortedSegmentOps1DIndices1DDataNegativeIndices(self): + """Tests for min, max, and prod ops. 
+ + These share most of their implementation with sum, so we only test basic + functionality. + """ + for dtype in self.numeric_types: + self.assertAllClose( + np.array([8, 3, 1, 0], dtype=dtype), + self._unsortedSegmentProd( + np.array([0, 1, 2, 3, 4, 5, 6], dtype=dtype), + np.array([3, -1, 0, 1, 0, -1, 3], dtype=np.int32), 4)) + + for dtype in self.int_types | self.float_types: + minval = dtypes.as_dtype(dtype).min + maxval = dtypes.as_dtype(dtype).max + + self.assertAllClose( + np.array([2, 3, maxval, 0], dtype=dtype), + self._unsortedSegmentMin( + np.array([0, 1, 2, 3, 4, 5, 6], dtype=dtype), + np.array([3, -1, 0, 1, 0, -1, 3], dtype=np.int32), 4)) + self.assertAllClose( + np.array([4, 3, minval, 6], dtype=dtype), + self._unsortedSegmentMax( + np.array([0, 1, 2, 3, 4, 5, 6], dtype=dtype), + np.array([3, -1, 0, 1, 0, -1, 3], dtype=np.int32), 4)) -if __name__ == '__main__': +if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py index 305ca0c6b78d3ef985deb38816f9388e7983906b..2c611a959e1d71c53e44bc92c31258153d01507d 100644 --- a/tensorflow/compiler/tests/slice_ops_test.py +++ b/tensorflow/compiler/tests/slice_ops_test.py @@ -18,18 +18,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest -class SliceTest(XLATestCase): +class SliceTest(xla_test.XLATestCase): def test1D(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[10]) with self.test_scope(): o = array_ops.slice(i, [2], [4]) @@ -40,9 +40,22 @@ class SliceTest(XLATestCase): 
self.assertAllEqual([2, 3, 4, 5], result) + def testZeroSlice(self): + for dtype in self.numeric_types: + with self.cached_session(): + i = array_ops.placeholder(dtype, shape=[2]) + with self.test_scope(): + o = array_ops.slice(i, [0], [0]) + params = { + i: [0, 1], + } + result = o.eval(feed_dict=params) + + self.assertAllEqual([], result) + def test3D(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[3, 3, 10]) with self.test_scope(): o = array_ops.slice(i, [1, 2, 2], [1, 1, 4]) @@ -64,7 +77,7 @@ class SliceTest(XLATestCase): def test3DWithDynamicBegin(self): """Tests a slice where the start offset is not known at compile time.""" for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[3, 3, 10]) begin = array_ops.placeholder(dtypes.int32, shape=[3]) with self.test_scope(): @@ -88,7 +101,7 @@ class SliceTest(XLATestCase): def test3DWithDynamicBeginAndNegativeSize(self): """Tests a slice where `begin` is fed dynamically and `size` contains -1.""" for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[3, 3, 10]) begin = array_ops.placeholder(dtypes.int32, shape=[3]) with self.test_scope(): @@ -110,11 +123,11 @@ class SliceTest(XLATestCase): self.assertAllEqual([[[1, 1, 1, 1], [6, 5, 4, 3]]], result) -class StridedSliceTest(XLATestCase): +class StridedSliceTest(xla_test.XLATestCase): def test1D(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[10]) with self.test_scope(): o = array_ops.strided_slice(i, [2], [6], [2]) @@ -127,7 +140,7 @@ class StridedSliceTest(XLATestCase): def test1DNegativeStride(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[10]) with 
self.test_scope(): o = array_ops.strided_slice(i, [6], [2], [-2]) @@ -140,7 +153,7 @@ class StridedSliceTest(XLATestCase): def test2DDegenerate(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[2, 3]) with self.test_scope(): o = array_ops.strided_slice(i, [-1, 0], [0, 3]) @@ -154,7 +167,7 @@ class StridedSliceTest(XLATestCase): def test2DDegenerateNegativeStride(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[2, 3]) with self.test_scope(): o = array_ops.strided_slice(i, [0, 0], [-1, 3], [-1, 1]) @@ -168,7 +181,7 @@ class StridedSliceTest(XLATestCase): def test3D(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[3, 3, 10]) with self.test_scope(): o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2]) @@ -189,7 +202,7 @@ class StridedSliceTest(XLATestCase): def test3DNegativeStride(self): for dtype in self.numeric_types: - with self.test_session(): + with self.cached_session(): i = array_ops.placeholder(dtype, shape=[3, 4, 10]) with self.test_scope(): o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2]) diff --git a/tensorflow/compiler/tests/sort_ops_test.py b/tensorflow/compiler/tests/sort_ops_test.py new file mode 100644 index 0000000000000000000000000000000000000000..51c04b5c4796474700a92a8b23a1cbdf533fcbb4 --- /dev/null +++ b/tensorflow/compiler/tests/sort_ops_test.py @@ -0,0 +1,172 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for sorting operators.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.compiler.tf2xla.python import xla +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import test + + +class XlaSortOpTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, op, args, expected): + with self.cached_session() as session: + with self.test_scope(): + placeholders = [ + array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape) + for arg in args + ] + feeds = {placeholders[i]: args[i] for i in range(0, len(args))} + output = op(*placeholders) + if isinstance(output, ops.Tensor): + output = [output] + + results = session.run(output, feeds) + for result, v in zip(results, expected): + self.assertAllClose(v, result, rtol=1e-3) + + def testSort(self): + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. 
+ if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + supported_types = set([dtypes.bfloat16.as_numpy_dtype, np.float32]) + for dtype in supported_types.intersection(self.numeric_types): + x = np.arange(101, dtype=dtype) + np.random.shuffle(x) + self._assertOpOutputMatchesExpected( + xla.sort, [x], expected=[np.arange(101, dtype=dtype)]) + + def testTopK(self): + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. + if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + supported_types = set( + [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32]) + for dtype in supported_types.intersection(self.numeric_types): + # Use small input size for bfloat16. Otherwise, we'll get duplicate values + # after conversion to bfloat16, so the possible resulting index array is + # no longer unique. + if dtype == dtypes.bfloat16.as_numpy_dtype: + array_size = 20 + k_options = [0, 1, 2, 10, 20] + else: + array_size = 200 * 1000 + k_options = [0, 1, 2, 10, 20, 100, 1000, 200 * 1000] + for x in [np.arange(array_size)]: + np.random.shuffle(x) + for k in k_options: + indices = x.argsort()[::-1][:k] + + def topk(v, k=k): + return nn_ops.top_k(v, k=k, sorted=True) + + self._assertOpOutputMatchesExpected( + topk, [x.astype(dtype)], + expected=[x[indices].astype(dtype), indices]) + + def testTopK2D(self): + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. + if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + supported_types = set( + [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32]) + for dtype in supported_types.intersection(self.numeric_types): + # Use small input size for bfloat16. Otherwise, we'll get duplicate values + # after conversion to bfloat16, so the possible resulting index array is + # no longer unique. 
+ if dtype == dtypes.bfloat16.as_numpy_dtype: + array_size = 10 + k_options = [0, 1, 2, 10] + else: + array_size = 200 * 1000 + k_options = [0, 1, 2, 10, 20, 100, 1000, 200 * 1000] + batch = 16 + for x in [np.arange(batch * array_size)]: + np.random.shuffle(x) + x = np.reshape(x, [batch, array_size]) + for k in k_options: + indices = x.argsort(axis=1)[::, -1:-k - 1:-1] + expected = np.sort(x, axis=1)[::, -1:-k - 1:-1] + + def topk(v, k=k): + return nn_ops.top_k(v, k=k, sorted=True) + + self._assertOpOutputMatchesExpected( + topk, [x.astype(dtype)], + expected=[expected.astype(dtype), indices]) + + def testTopKZeros(self): + """Tests that positive and negative zeros sort correctly.""" + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. + if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + # Only bfloat16 is implemented. + bfloat16 = dtypes.bfloat16.as_numpy_dtype + if bfloat16 not in self.numeric_types: + return + + with self.cached_session() as sess: + p = array_ops.placeholder(dtypes.bfloat16) + with self.test_scope(): + topk = nn_ops.top_k(p, k=4) + results = sess.run( + topk, + {p: np.array([0., -0., 0., 3., -0., -4., 0., -0.], dtype=bfloat16)}) + self.assertAllEqual( + np.array([3., 0., 0., 0.], dtype=bfloat16), results[0]) + self.assertEqual(list([3, 0, 2, 6]), list(results[1])) + + def testTopKInfinities(self): + """Tests that positive and negative infinity sort correctly.""" + # TODO(b/26783907): The Sort HLO is not implemented on CPU or GPU. + if self.device in ["XLA_CPU", "XLA_GPU"]: + return + + # Only bfloat16 is implemented. 
+ bfloat16 = dtypes.bfloat16.as_numpy_dtype + if bfloat16 not in self.numeric_types: + return + + with self.cached_session() as sess: + p = array_ops.placeholder(dtypes.bfloat16) + with self.test_scope(): + topk = nn_ops.top_k(p, k=6) + results = sess.run(topk, { + p: np.array( + [1, 2, float("inf"), -float("inf"), -1, -2], dtype=bfloat16) + }) + self.assertAllEqual( + np.array( + [float("inf"), 2.0, 1.0, -1.0, -2.0, -float("inf")], + dtype=bfloat16), results[0]) + self.assertEqual(list([2, 1, 0, 4, 5, 3]), list(results[1])) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py index f37c34156f96761632247be4bc1b62fca54f666e..33b84cec7188c85a3bacb20a6df29c73adbd107c 100644 --- a/tensorflow/compiler/tests/spacetobatch_op_test.py +++ b/tensorflow/compiler/tests/spacetobatch_op_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops @@ -68,11 +68,11 @@ def space_to_batch_direct(input_array, block_shape, paddings): return permuted_reshaped_padded.reshape(output_shape) -class SpaceToBatchTest(XLATestCase): +class SpaceToBatchTest(xla_test.XLATestCase): """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.""" def _testPad(self, inputs, paddings, block_size, outputs): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): for dtype in self.float_types: # outputs = space_to_batch(inputs) placeholder = array_ops.placeholder(dtype) @@ -149,13 +149,13 @@ class SpaceToBatchTest(XLATestCase): self._testOne(x_np, block_size, x_out) -class SpaceToBatchNDTest(XLATestCase): +class SpaceToBatchNDTest(xla_test.XLATestCase): """Tests 
input-output pairs for the SpaceToBatchND and BatchToSpaceND ops.""" def _testPad(self, inputs, block_shape, paddings, outputs): block_shape = np.array(block_shape) paddings = np.array(paddings).reshape((len(block_shape), 2)) - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): for dtype in self.float_types: # TODO(b/68813416): Skip bfloat16's as the input type for direct is # float32 and results in a mismatch, while making testDirect provide the diff --git a/tensorflow/compiler/tests/sparse_to_dense_op_test.py b/tensorflow/compiler/tests/sparse_to_dense_op_test.py new file mode 100644 index 0000000000000000000000000000000000000000..07afd1ab3fb78d5accc52ee2382af0b9fb8079d3 --- /dev/null +++ b/tensorflow/compiler/tests/sparse_to_dense_op_test.py @@ -0,0 +1,118 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.kernels.sparse_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import sparse_ops +from tensorflow.python.platform import test + + +def _SparseToDense(sparse_indices, + output_size, + sparse_values, + default_value, + validate_indices=True): + feed_sparse_indices = array_ops.placeholder(dtypes.int32) + feed_dict = {feed_sparse_indices: sparse_indices} + return sparse_ops.sparse_to_dense( + feed_sparse_indices, + output_size, + sparse_values, + default_value=default_value, + validate_indices=validate_indices).eval(feed_dict=feed_dict) + + +class SparseToDenseTest(xla_test.XLATestCase): + + def testInt(self): + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([1, 3], [5], 1, 0) + np_ans = np.array([0, 1, 0, 1, 0]).astype(np.int32) + self.assertAllClose(np_ans, tf_ans) + + def testFloat(self): + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([1, 3], [5], 1.0, 0.0) + np_ans = np.array([0, 1, 0, 1, 0]).astype(np.float32) + self.assertAllClose(np_ans, tf_ans) + + def testSetValue(self): + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([1, 3], [5], [1, 2], -1) + np_ans = np.array([-1, 1, -1, 2, -1]).astype(np.int32) + self.assertAllClose(np_ans, tf_ans) + + def testSetSingleValue(self): + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([1, 3], [5], 1, -1) + np_ans = np.array([-1, 1, -1, 1, -1]).astype(np.int32) + self.assertAllClose(np_ans, tf_ans) + + def test2d(self): + # pylint: disable=bad-whitespace + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([[1, 3], [2, 0]], [3, 4], 1, 
-1) + np_ans = np.array([[-1, -1, -1, -1], + [-1, -1, -1, 1], + [ 1, -1, -1, -1]]).astype(np.int32) + self.assertAllClose(np_ans, tf_ans) + + def testZeroDefault(self): + with self.cached_session(): + x = sparse_ops.sparse_to_dense(2, [4], 7).eval() + self.assertAllEqual(x, [0, 0, 7, 0]) + + def test3d(self): + with self.cached_session(), self.test_scope(): + tf_ans = _SparseToDense([[1, 3, 0], [2, 0, 1]], [3, 4, 2], 1, -1) + np_ans = np.ones((3, 4, 2), dtype=np.int32) * -1 + np_ans[1, 3, 0] = 1 + np_ans[2, 0, 1] = 1 + self.assertAllClose(np_ans, tf_ans) + + def testBadShape(self): + with self.cached_session(), self.test_scope(): + with self.assertRaisesWithPredicateMatch(ValueError, "must be rank 1"): + _SparseToDense([1, 3], [[5], [3]], 1, -1) + + def testBadValue(self): + with self.cached_session(), self.test_scope(): + with self.assertRaisesOpError( + r"sparse_values has incorrect shape \[2,1\], " + r"should be \[\] or \[2\]"): + _SparseToDense([1, 3], [5], [[5], [3]], -1) + + def testBadNumValues(self): + with self.cached_session(), self.test_scope(): + with self.assertRaisesOpError( + r"sparse_values has incorrect shape \[3\], should be \[\] or \[2\]"): + _SparseToDense([1, 3], [5], [1, 2, 3], -1) + + def testBadDefault(self): + with self.cached_session(), self.test_scope(): + with self.assertRaisesOpError("default_value should be a scalar"): + _SparseToDense([1, 3], [5], [1, 2], [0]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/stack_ops_test.py b/tensorflow/compiler/tests/stack_ops_test.py index 94342f9567ca71274609e63b0482d55637c98d51..720595a159eea997be2246c4c7dad49612b257eb 100644 --- a/tensorflow/compiler/tests/stack_ops_test.py +++ b/tensorflow/compiler/tests/stack_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from 
tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -28,10 +28,10 @@ from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.platform import test -class StackOpTest(XLATestCase): +class StackOpTest(xla_test.XLATestCase): def testStackPushPop(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): size = array_ops.placeholder(dtypes.int32) v = array_ops.placeholder(dtypes.float32) h = gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo") @@ -41,7 +41,7 @@ class StackOpTest(XLATestCase): self.assertAllClose([[4.0, 5.0]], c1.eval({size: 5, v: [[4.0, 5.0]]})) def testStackPushPopSwap(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): a = np.arange(2000) x = array_ops.placeholder(dtypes.float32) h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo") @@ -51,7 +51,7 @@ class StackOpTest(XLATestCase): self.assertAllClose(a, c1.eval({x: a})) def testMultiStack(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): v = array_ops.placeholder(dtypes.float32) h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo") c1 = gen_data_flow_ops.stack_push_v2(h1, v) @@ -66,7 +66,7 @@ class StackOpTest(XLATestCase): def testSameNameStacks(self): """Different stacks with the same name do not interfere.""" - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): v1 = array_ops.placeholder(dtypes.float32) v2 = array_ops.placeholder(dtypes.float32) h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo") @@ -84,14 +84,14 @@ class StackOpTest(XLATestCase): self.assertAllClose(out2, 5.0) def testCloseStack(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): size = array_ops.placeholder(dtypes.int32) h = 
gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo") c1 = gen_data_flow_ops.stack_close_v2(h) sess.run(c1, {size: 5}) def testPushCloseStack(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): v = array_ops.placeholder(dtypes.float32) h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo") c = gen_data_flow_ops.stack_push_v2(h, v) diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index b6f8390a45d43bf7666b90e14cc6ff2f3f61947e..1bea7d9355e40c5a71f848dabc0fa7fa760429d2 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -22,14 +22,15 @@ import math import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.contrib import stateless from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import special_math from tensorflow.python.platform import test -class StatelessRandomOpsTest(XLATestCase): +class StatelessRandomOpsTest(xla_test.XLATestCase): """Test cases for stateless random-number generator operators.""" def _random_types(self): @@ -37,7 +38,7 @@ class StatelessRandomOpsTest(XLATestCase): def testDeterminism(self): # Stateless values should be equal iff the seeds are equal (roughly) - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) seeds = [(x, y) for x in range(5) for y in range(5)] * 3 for stateless_op in [ @@ -54,7 +55,7 @@ class StatelessRandomOpsTest(XLATestCase): self.assertEqual(s0 == s1, np.all(v0 == v1)) def testRandomUniformIsInRange(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): 
for dtype in self._random_types(): seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) x = stateless.stateless_random_uniform( @@ -73,7 +74,7 @@ class StatelessRandomOpsTest(XLATestCase): def testDistributionOfStatelessRandomUniform(self): """Use Pearson's Chi-squared test to test for uniformity.""" - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): for dtype in self._random_types(): seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) n = 1000 @@ -87,7 +88,7 @@ class StatelessRandomOpsTest(XLATestCase): self.assertTrue(self._chi_squared(y, 10) < 16.92) def testRandomNormalIsFinite(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): for dtype in self._random_types(): seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) x = stateless.stateless_random_uniform( @@ -110,7 +111,7 @@ class StatelessRandomOpsTest(XLATestCase): def testDistributionOfStatelessRandomNormal(self): """Use Anderson-Darling test to test distribution appears normal.""" - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): for dtype in self._random_types(): seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) n = 1000 @@ -122,6 +123,56 @@ class StatelessRandomOpsTest(XLATestCase): # so to avoid flakiness the seed is fixed. self.assertTrue(self._anderson_darling(y) < 2.492) + def testTruncatedNormalIsInRange(self): + # TODO(b/34339814): implement inverse erf support for non-F32 types. + for dtype in [dtypes.float32]: + with self.cached_session() as sess, self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + n = 10000000 + x = stateless.stateless_truncated_normal( + shape=[n], seed=seed_t, dtype=dtype) + y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]}) + + def normal_cdf(x): + return .5 * math.erfc(-x / math.sqrt(2)) + + def normal_pdf(x): + return math.exp(-(x**2) / 2.) 
/ math.sqrt(2 * math.pi) + + def probit(x, sess=sess): + return sess.run(special_math.ndtri(x)) + + a = -2. + b = 2. + mu = 0. + sigma = 1. + + alpha = (a - mu) / sigma + beta = (b - mu) / sigma + z = normal_cdf(beta) - normal_cdf(alpha) + + self.assertTrue((y >= a).sum() == n) + self.assertTrue((y <= b).sum() == n) + + # For more information on these calculations, see: + # Burkardt, John. "The Truncated Normal Distribution". + # Department of Scientific Computing website. Florida State University. + expected_mean = mu + (normal_pdf(alpha) - normal_pdf(beta)) / z * sigma + actual_mean = np.mean(y) + self.assertAllClose(actual_mean, expected_mean, atol=2e-4) + + expected_median = mu + probit( + (normal_cdf(alpha) + normal_cdf(beta)) / 2.) * sigma + actual_median = np.median(y) + self.assertAllClose(actual_median, expected_median, atol=8e-4) + + expected_variance = sigma**2 * (1 + ( + (alpha * normal_pdf(alpha) - beta * normal_pdf(beta)) / z) - ( + (normal_pdf(alpha) - normal_pdf(beta)) / z)**2) + actual_variance = np.var(y) + self.assertAllClose(actual_variance, expected_variance, rtol=1e-3) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index f332aa2e9b97e13654cf9b10588c18fed32f7ad4..78244d0b366d9128a4c59f786e4c5ac12e743b75 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -44,7 +44,7 @@ def _make_converter(dtype): class TensorArrayTest(xla_test.XLATestCase): def testTensorArrayWriteRead(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -66,7 +66,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([], flow_val.shape) def _testTensorArrayWritePack(self, tf_dtype): - with self.test_session(), 
self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=tf_dtype, tensor_array_name="foo", size=3) @@ -86,7 +86,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayWritePack(dtype) def testEmptyTensorArrayPack(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) @@ -100,7 +100,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([3, 0, 1], c0.eval().shape) def _testTensorArrayWriteConcat(self, tf_dtype): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=tf_dtype, tensor_array_name="foo", size=3) @@ -121,7 +121,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayWriteConcat(dtype) def _testTensorArrayUnpackRead(self, tf_dtype): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=tf_dtype, tensor_array_name="foo", size=3) @@ -176,7 +176,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayUnpackReadMaybeLegacy() def _testTensorArraySplitRead(self, tf_dtype): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=tf_dtype, tensor_array_name="foo", size=3) @@ -228,7 +228,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArraySplitRead(dtype) def testTensorGradArrayWriteRead(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -261,7 +261,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([[-2.0]], g_d2) def 
testTensorGradArrayDynamicWriteRead(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -300,7 +300,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual(3, g_vs) def testTensorGradAccessTwiceReceiveSameObject(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3, element_shape=[1, 2]) @@ -317,7 +317,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([[4.0, 5.0]], d_r1_0) def testTensorArrayWriteWrongIndexOrDataTypeFails(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) @@ -331,7 +331,7 @@ class TensorArrayTest(xla_test.XLATestCase): # the first type, but try to read the other type. 
if len(self.float_types) > 1: dtype1, dtype2 = list(self.float_types)[:2] - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtype1, tensor_array_name="foo", size=3) @@ -347,7 +347,7 @@ class TensorArrayTest(xla_test.XLATestCase): w0.read(1) def testTensorArraySplitIncompatibleShapesFails(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -379,7 +379,7 @@ class TensorArrayTest(xla_test.XLATestCase): ta.split([1.0], [1]).flow.eval() def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtype, tensor_array_name="foo", size=3, infer_shape=False) @@ -410,7 +410,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayWriteGradientAddMultipleAdds(dtype) def testMultiTensorArray(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): h1 = tensor_array_ops.TensorArray( size=1, dtype=dtypes.float32, tensor_array_name="foo") w1 = h1.write(0, 4.0) @@ -425,7 +425,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllClose(9.0, r.eval()) def _testTensorArrayGradientWriteReadType(self, dtype): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.as_dtype(dtype), tensor_array_name="foo", @@ -478,7 +478,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayGradientWriteReadType(dtype) def _testTensorArrayGradientWritePackConcatAndRead(self): - with self.test_session() as sess, self.test_scope(): + with self.cached_session() as sess, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, 
tensor_array_name="foo", @@ -513,7 +513,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayGradientWritePackConcatAndRead() def testTensorArrayReadTwice(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]]) ta_readtwice = tensor_array_ops.TensorArray( @@ -529,7 +529,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([1.0, -1.0], r1_readtwice.eval()) def _testTensorArrayGradientUnpackRead(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -557,7 +557,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayGradientUnpackRead() def testTensorArrayGradientSplitConcat(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=2) @@ -581,21 +581,21 @@ class TensorArrayTest(xla_test.XLATestCase): grad_vals[0]) def testCloseTensorArray(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) c1 = ta.close() session.run(c1) def testSizeTensorArray(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) s = ta.size() self.assertAllEqual(3, s.eval()) def testWriteCloseTensorArray(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -608,7 +608,7 @@ class 
TensorArrayTest(xla_test.XLATestCase): # TODO(phawkins): implement while loops. # def _testWhileLoopWritePackGradients(self, dynamic_size, dtype): # np_dtype = dtype.as_numpy_dtype - # with self.test_session() as session, self.test_scope(): + # with self.cached_session() as session, self.test_scope(): # v0 = array_ops.identity(np.arange(3 * 5, dtype=np_dtype).reshape(3, 5)) # var = variables.Variable(np.arange(100, 105, dtype=np_dtype)) # state0 = array_ops.identity(np.array([1] * 5, dtype=np_dtype)) @@ -692,7 +692,7 @@ class TensorArrayTest(xla_test.XLATestCase): # dynamic_size=True, dtype=dtypes.float32) # def testGradSerialTwoLoops(self): - # with self.test_session(), self.test_scope(): + # with self.cached_session(), self.test_scope(): # num_steps = 100 # acc = tensor_array_ops.TensorArray( # dtype=dtypes.float32, @@ -725,7 +725,7 @@ class TensorArrayTest(xla_test.XLATestCase): # self.assertAllClose(31.0, grad.eval()) def testSumOfTwoReadVariablesWithoutRepeatGrad(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): a = array_ops.identity( np.arange( 3 * 5, dtype=np.float32).reshape(3, 5) + 1) @@ -757,7 +757,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual(joint_grad_b_t, g0) def testWriteShape(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) c0 = constant_op.constant([4.0, 5.0]) @@ -781,7 +781,7 @@ class TensorArrayTest(xla_test.XLATestCase): w0.write(0, c2) def testPartlyUnknownShape(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=6) @@ -821,7 +821,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list()) def _testUnpackShape(self): - 
with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -846,7 +846,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testUnpackShape() def testSplitShape(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -867,7 +867,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape()) def testWriteUnknownShape(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -879,7 +879,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape()) def _testGradientWhenNotAllComponentsRead(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2) x = constant_op.constant([2.0, 3.0]) w = ta.unstack(x) @@ -893,7 +893,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testGradientWhenNotAllComponentsRead() def _testTensorArrayEvalEmpty(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, size=0, infer_shape=False) with self.assertRaisesOpError( @@ -906,7 +906,7 @@ class TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayEvalEmpty() def _testTensorArrayEvalEmptyWithDefault(self): - with self.test_session(), self.test_scope(): + with self.cached_session(), self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, size=0, infer_shape=True) self.assertEqual(0, ta.size().eval()) @@ -921,7 +921,7 @@ class 
TensorArrayTest(xla_test.XLATestCase): self._testTensorArrayEvalEmptyWithDefault() def testTensorArrayScatterReadAndGradients(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -946,7 +946,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0]) def testTensorArrayWriteGatherAndGradients(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", @@ -974,7 +974,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertAllEqual(expected_grad, grad_vals[0]) def testTensorArrayIdentity(self): - with self.test_session() as session, self.test_scope(): + with self.cached_session() as session, self.test_scope(): ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2, infer_shape=False) ta1 = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=4, diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index ef047005b60bd156a677050368ef67ae030d6c3a..55a992195f2df72677b77757ae86171fa662439f 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_math_ops @@ -28,10 +28,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest -class TernaryOpsTest(XLATestCase): +class TernaryOpsTest(xla_test.XLATestCase): def _testTernary(self, op, a, b, c, expected): - with 
self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): pa = array_ops.placeholder(dtypes.as_dtype(a.dtype), a.shape, name="a") pb = array_ops.placeholder(dtypes.as_dtype(b.dtype), b.shape, name="b") diff --git a/tensorflow/compiler/tests/test_utils.py b/tensorflow/compiler/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6abde18ea91f16d153a154b94effab037a911c6c --- /dev/null +++ b/tensorflow/compiler/tests/test_utils.py @@ -0,0 +1,63 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for helping test ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def ConvertBetweenDataFormats(x, data_format_src, data_format_dst): + """Converts 4D tensor between data formats.""" + + valid_data_formats = ["NHWC", "NCHW", "HWNC", "HWCN"] + if data_format_src not in valid_data_formats: + raise ValueError("data_format_src must be of %s, got %s." % + (valid_data_formats, data_format_src)) + if data_format_dst not in valid_data_formats: + raise ValueError("data_format_dst must be of %s, got %s." % + (valid_data_formats, data_format_dst)) + if len(x.shape) != 4: + raise ValueError("x must be 4D, got shape %s." 
% x.shape) + + if data_format_src == data_format_dst: + return x + + dim_map = {d: i for i, d in enumerate(data_format_src)} + transpose_dims = [dim_map[d] for d in data_format_dst] + return np.transpose(x, transpose_dims) + + +def PermuteDimsBetweenDataFormats(dims, data_format_src, data_format_dst): + """Get new shape for converting between data formats.""" + + valid_data_formats = ["NHWC", "NCHW", "HWNC", "HWCN"] + if data_format_src not in valid_data_formats: + raise ValueError("data_format_src must be of %s, got %s." % + (valid_data_formats, data_format_src)) + if data_format_dst not in valid_data_formats: + raise ValueError("data_format_dst must be of %s, got %s." % + (valid_data_formats, data_format_dst)) + if len(dims) != 4: + raise ValueError("dims must be of length 4, got %s." % dims) + + if data_format_src == data_format_dst: + return dims + + dim_map = {d: i for i, d in enumerate(data_format_src)} + permuted_dims = [dims[dim_map[d]] for d in data_format_dst] + return permuted_dims diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 689a4a1f4e02f5dd48f64dc94afd0fcb50df8b5b..5b0e57f83ff4b5a8d1891bef0675074bd67addce 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -23,7 +23,7 @@ import unittest import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops @@ -44,11 +44,16 @@ def nhwc_to_format(x, data_format): raise ValueError("Unknown format {}".format(data_format)) -class UnaryOpsTest(XLATestCase): +class UnaryOpsTest(xla_test.XLATestCase): """Test cases for unary operators.""" - def _assertOpOutputMatchesExpected(self, op, inp, expected, - equality_test=None, rtol=1e-3, 
atol=1e-5): + def _assertOpOutputMatchesExpected(self, + op, + inp, + expected, + equality_test=None, + rtol=1e-3, + atol=1e-5): """Verifies that 'op' produces 'expected' when fed input 'inp' . Args: @@ -60,7 +65,7 @@ class UnaryOpsTest(XLATestCase): rtol: relative tolerance for equality test. atol: absolute tolerance for equality test. """ - with self.test_session() as session: + with self.cached_session() as session: with self.test_scope(): pinp = array_ops.placeholder( dtypes.as_dtype(inp.dtype), inp.shape, name="a") @@ -81,10 +86,10 @@ class UnaryOpsTest(XLATestCase): def testAllTypeOps(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( - array_ops.diag, - np.array([1, 2, 3, 4], dtype=dtype), - np.array([[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]], - dtype=dtype)) + array_ops.diag, np.array([1, 2, 3, 4], dtype=dtype), + np.array( + [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]], + dtype=dtype)) self._assertOpOutputMatchesExpected( array_ops.diag_part, np.arange(36).reshape([2, 3, 2, 3]).astype(dtype), @@ -102,8 +107,7 @@ class UnaryOpsTest(XLATestCase): expected=np.array([[-1, 1]], dtype=dtype)) self._assertOpOutputMatchesExpected( - array_ops.matrix_diag, - np.array([[1, 2], [3, 4]], dtype=dtype), + array_ops.matrix_diag, np.array([[1, 2], [3, 4]], dtype=dtype), np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype)) self._assertOpOutputMatchesExpected( array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype), @@ -115,10 +119,10 @@ class UnaryOpsTest(XLATestCase): np.array( [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype), np.array( - [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]], - [[4, 0, 0], [0, 5, 0], [0, 0, 6]]], - [[[7, 0, 0], [0, 8, 0], [0, 0, 9]], - [[10, 0, 0], [0, 11, 0], [0, 0, 12]]]], + [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]], [[4, 0, 0], [0, 5, 0], [ + 0, 0, 6 + ]]], [[[7, 0, 0], [0, 8, 0], [0, 0, 9]], [[10, 0, 0], [0, 11, 0], + [0, 0, 12]]]], dtype=dtype)) 
self._assertOpOutputMatchesExpected( array_ops.matrix_diag_part, @@ -159,36 +163,30 @@ class UnaryOpsTest(XLATestCase): continue x = np.arange(-0.90, 0.90, 0.25) self._assertOpOutputMatchesExpected( - math_ops.acos, - x.astype(dtype), - expected=np.arccos(x).astype(dtype)) + math_ops.acos, x.astype(dtype), expected=np.arccos(x).astype(dtype)) self._assertOpOutputMatchesExpected( - math_ops.asin, - x.astype(dtype), - expected=np.arcsin(x).astype(dtype)) + math_ops.asin, x.astype(dtype), expected=np.arcsin(x).astype(dtype)) x = np.arange(-3, 3).reshape(1, 3, 2) self._assertOpOutputMatchesExpected( - math_ops.atan, - x.astype(dtype), - expected=np.arctan(x).astype(dtype)) + math_ops.atan, x.astype(dtype), expected=np.arctan(x).astype(dtype)) self._assertOpOutputMatchesExpected( math_ops.acosh, np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array([0, 1.3169579, 1.76274717, 2.06343707], - dtype=dtype)) + expected=np.array( + [0, 1.3169579, 1.76274717, 2.06343707], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.asinh, np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array([0.88137359, 1.44363548, 1.81844646, 2.09471255], - dtype=dtype)) + expected=np.array( + [0.88137359, 1.44363548, 1.81844646, 2.09471255], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.atanh, np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype), - expected=np.array([0.10033535, 0.20273255, 0.3095196, 0.42364893], - dtype=dtype)) + expected=np.array( + [0.10033535, 0.20273255, 0.3095196, 0.42364893], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.ceil, @@ -198,8 +196,18 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.cosh, np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array([1.54308063, 3.76219569, 10.067662, 27.30823284], - dtype=dtype)) + expected=np.array( + [1.54308063, 3.76219569, 10.067662, 27.30823284], dtype=dtype)) + + # Disable float16 testing for now + if dtype != np.float16: + x = np.arange(-10, 10, 1).astype(dtype) + 
with self.cached_session() as session: + erf_x = session.run(math_ops.erf(x)) + erfc_x = session.run(math_ops.erfc(x)) + + self._assertOpOutputMatchesExpected(math_ops.erf, x, expected=erf_x) + self._assertOpOutputMatchesExpected(math_ops.erfc, x, expected=erfc_x) self._assertOpOutputMatchesExpected( math_ops.exp, @@ -219,8 +227,8 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.is_finite, - np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], - dtype=dtype), + np.array( + [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype), expected=np.array([[0, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=np.bool)) # Tests for tf.nn ops. @@ -261,16 +269,20 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.rint, - np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5]], dtype=dtype), - expected=np.array([[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], - dtype=dtype)) + np.array( + [[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5]], + dtype=dtype), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.round, - np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], - [0.5, 1.5, 2.5, 3.5]], dtype=dtype), - expected=np.array([[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], - dtype=dtype)) + np.array( + [[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5]], + dtype=dtype), + expected=np.array( + [[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.rsqrt, @@ -279,10 +291,7 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.sigmoid, - np.array( - [[1, 1, 1, 1], - [1, 2, 3, 4]], - dtype=dtype), + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), expected=np.array( [[0.7310586, 0.7310586, 0.7310586, 0.7310586], [0.7310586, 0.880797, 0.95257413, 0.98201376]], @@ -296,8 +305,8 @@ class 
UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.sinh, np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array([1.17520119, 3.62686041, 10.01787493, 27.2899172], - dtype=dtype)) + expected=np.array( + [1.17520119, 3.62686041, 10.01787493, 27.2899172], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.sqrt, @@ -307,15 +316,12 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( math_ops.tan, np.array([1, 2, 3, 4], dtype=dtype), - expected=np.array([1.55740772, -2.18503986, -0.14254654, 1.15782128], - dtype=dtype)) + expected=np.array( + [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.tanh, - np.array( - [[1, 1, 1, 1], - [1, 2, 3, 4]], - dtype=dtype), + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), expected=np.array( [[0.76159418, 0.76159418, 0.76159418, 0.76159418], [0.76159418, 0.96402758, 0.99505478, 0.99932933]], @@ -323,10 +329,7 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( nn_ops.log_softmax, - np.array( - [[1, 1, 1, 1], - [1, 2, 3, 4]], - dtype=dtype), + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), expected=np.array( [[-1.3862944, -1.3862944, -1.3862944, -1.3862944], [-3.4401896, -2.4401896, -1.4401897, -0.44018969]], @@ -360,20 +363,31 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( nn_ops.softmax, - np.array( - [[1, 1, 1, 1], - [1, 2, 3, 4]], - dtype=dtype), + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array([0.032058604, 0.087144323, 0.23688284, 0.64391428], + dtype=dtype)) + + self._assertOpOutputMatchesExpected( + nn_ops.softmax, + np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), expected=np.array( [[0.25, 0.25, 0.25, 0.25], [0.032058604, 0.087144323, 0.23688284, 0.64391428]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + nn_ops.softmax, + np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), + expected=np.array( + [[[0.5, 0.5], [0.5, 0.5]], + 
[[0.26894142, 0.73105858], [0.26894142, 0.73105858]]], + dtype=dtype)) + self._assertOpOutputMatchesExpected( nn_ops.softsign, np.array([[-2, -1, 0, 1, 2]], dtype=dtype), - expected=np.array([[-0.66666669, -0.5, 0, 0.5, 0.66666669]], - dtype=dtype)) + expected=np.array( + [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.is_finite, @@ -383,9 +397,96 @@ class UnaryOpsTest(XLATestCase): [[True, False, True], [False, True, True]], dtype=np.bool)) self._assertOpOutputMatchesExpected( - lambda x: array_ops.quantize_and_dequantize_v2(x, -127, 127, True, 8), + math_ops.lgamma, + np.array(0.5, dtype=dtype), + expected=np.array(np.log(np.pi) / 2, dtype=dtype)) + + self._assertOpOutputMatchesExpected( + math_ops.lgamma, + np.array( + [[1, 2, 3], [4, 5, 6], [1 / 2, 3 / 2, 5 / 2], + [-3 / 2, -7 / 2, -11 / 2]], + dtype=dtype), + expected=np.array( + [ + [0, 0, np.log(2.0)], + [np.log(6.0), np.log(24.0), + np.log(120)], + [ + np.log(np.pi) / 2, + np.log(np.pi) / 2 - np.log(2), + np.log(np.pi) / 2 - np.log(4) + np.log(3) + ], + [ + np.log(np.pi) / 2 - np.log(3) + np.log(4), + np.log(np.pi) / 2 - np.log(105) + np.log(16), + np.log(np.pi) / 2 - np.log(10395) + np.log(64), + ], + ], + dtype=dtype)) + + # The actual result is complex. Take the real part. 
+ self._assertOpOutputMatchesExpected( + math_ops.lgamma, + np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype), + expected=np.array( + [ + np.log(np.pi) / 2 + np.log(2), + np.log(np.pi) / 2 - np.log(15) + np.log(8), + np.log(np.pi) / 2 - np.log(945) + np.log(32), + ], + dtype=dtype), + atol=1e-4) + + self._assertOpOutputMatchesExpected( + math_ops.digamma, + np.array( + [[1.0, 0.5, 1 / 3.0], [0.25, 1 / 6.0, 0.125], [2.0, 3.0, 4.0], + [6.0, 8.0, 9.0]], + dtype=dtype), + expected=np.array( + [ + [ + -np.euler_gamma, -2 * np.log(2) - np.euler_gamma, + -np.pi / 2 / np.sqrt(3) - 3 * np.log(3) / 2 - + np.euler_gamma + ], + [ + -np.pi / 2 - 3 * np.log(2) - np.euler_gamma, + -np.pi * np.sqrt(3) / 2 - 2 * np.log(2) - + 3 * np.log(3) / 2 - np.euler_gamma, + -np.pi / 2 - 4 * np.log(2) - + (np.pi + np.log(2 + np.sqrt(2)) - np.log(2 - np.sqrt(2))) + / np.sqrt(2) - np.euler_gamma + ], + [ + 1 - np.euler_gamma, 1.5 - np.euler_gamma, + 11 / 6.0 - np.euler_gamma + ], + [ + 137 / 60.0 - np.euler_gamma, 363 / 140.0 - np.euler_gamma, + 761 / 280.0 - np.euler_gamma + ], + ], + dtype=dtype)) + + def quantize_and_dequantize_v2(x): + return array_ops.quantize_and_dequantize_v2( + x, -127, 127, signed_input=True, num_bits=8) + + self._assertOpOutputMatchesExpected( + quantize_and_dequantize_v2, np.array([-1, -0.5, 0, 0.3], dtype=dtype), - expected=np.array([-1, -64.0 / 127, 0, 38.0 / 127], dtype=dtype)) + expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) + + def quantize_and_dequantize_v3(x): + return array_ops.quantize_and_dequantize_v3( + x, -127, 127, num_bits=8, signed_input=True, range_given=False) + + self._assertOpOutputMatchesExpected( + quantize_and_dequantize_v3, + np.array([-1, -0.5, 0, 0.3], dtype=dtype), + expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) def testComplexOps(self): for dtype in self.complex_types: @@ -566,13 +667,13 @@ class UnaryOpsTest(XLATestCase): for dtype in self.float_types: self._assertOpOutputMatchesExpected( math_ops.is_inf, - 
np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], - dtype=dtype), + np.array( + [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype), expected=np.array([[1, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=np.bool)) self._assertOpOutputMatchesExpected( math_ops.is_nan, - np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], - dtype=dtype), + np.array( + [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype), expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool)) def testLogicalOps(self): @@ -589,14 +690,15 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( lambda x: gen_nn_ops.bias_add_grad(x, data_format="NCHW"), - np.array([[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]], - dtype=np.float32), + np.array( + [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]], dtype=np.float32), expected=np.array([10., 26.], dtype=np.float32)) def testCast(self): shapes = [[], [4], [2, 3], [2, 0, 4]] - types = (set([dtypes.bool, dtypes.int32, dtypes.float32]) | - self.complex_tf_types) + types = ( + set([dtypes.bool, dtypes.int32, dtypes.float32]) + | self.complex_tf_types) for shape in shapes: for src_type in types: for dst_type in types: @@ -638,14 +740,11 @@ class UnaryOpsTest(XLATestCase): self._assertOpOutputMatchesExpected( rank_op, dtype(7), expected=np.int32(0)) self._assertOpOutputMatchesExpected( - rank_op, np.array( - [[], []], dtype=dtype), expected=np.int32(2)) + rank_op, np.array([[], []], dtype=dtype), expected=np.int32(2)) self._assertOpOutputMatchesExpected( - rank_op, np.array( - [-1, 1], dtype=dtype), expected=np.int32(1)) + rank_op, np.array([-1, 1], dtype=dtype), expected=np.int32(1)) self._assertOpOutputMatchesExpected( - rank_op, np.array( - [[-1, 1]], dtype=dtype), expected=np.int32(2)) + rank_op, np.array([[-1, 1]], dtype=dtype), expected=np.int32(2)) self._assertOpOutputMatchesExpected( rank_op, np.array([[-1], [1], [4]], dtype=dtype), @@ -710,97 +809,97 @@ class UnaryOpsTest(XLATestCase): 
equality_test=self.ListsAreClose) def testDepthToSpace(self): + def make_op(data_format): + def op(x): - return array_ops.depth_to_space(x, block_size=2, - data_format=data_format) + return array_ops.depth_to_space( + x, block_size=2, data_format=data_format) + return op for dtype in self.numeric_types: for data_format in ["NCHW", "NHWC"]: self._assertOpOutputMatchesExpected( make_op(data_format), - nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype), - data_format), - expected=nhwc_to_format(np.array([[[[1], [2]], - [[3], [4]]]], dtype=dtype), - data_format)) + nhwc_to_format( + np.array([[[[1, 2, 3, 4]]]], dtype=dtype), data_format), + expected=nhwc_to_format( + np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype), data_format)) self._assertOpOutputMatchesExpected( make_op(data_format), nhwc_to_format( - np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], - dtype=dtype), + np.array( + [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], dtype=dtype), data_format), expected=nhwc_to_format( - np.array([[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], - dtype=dtype), - data_format)) + np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], + dtype=dtype), data_format)) self._assertOpOutputMatchesExpected( make_op(data_format), nhwc_to_format( - np.array([[[[1, 2, 3, 4], - [5, 6, 7, 8]], - [[9, 10, 11, 12], - [13, 14, 15, 16]]]], dtype=dtype), - data_format), + np.array( + [[[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], + [13, 14, 15, 16]]]], + dtype=dtype), data_format), expected=nhwc_to_format( - np.array([[[[1], [2], [5], [6]], - [[3], [4], [7], [8]], - [[9], [10], [13], [14]], - [[11], [12], [15], [16]]]], dtype=dtype), - data_format)) + np.array( + [[[[1], [2], [5], [6]], [[3], [4], [7], [8]], + [[9], [10], [13], [14]], [[11], [12], [15], [16]]]], + dtype=dtype), data_format)) def testSpaceToDepth(self): + def make_op(data_format): + def op(x): - return array_ops.space_to_depth(x, block_size=2, - data_format=data_format) + return 
array_ops.space_to_depth( + x, block_size=2, data_format=data_format) + return op for dtype in self.numeric_types: for data_format in ["NCHW", "NHWC"]: self._assertOpOutputMatchesExpected( make_op(data_format), - nhwc_to_format(np.array([[[[1], [2]], - [[3], [4]]]], dtype=dtype), - data_format), - expected=nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype), - data_format)) + nhwc_to_format( + np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype), data_format), + expected=nhwc_to_format( + np.array([[[[1, 2, 3, 4]]]], dtype=dtype), data_format)) self._assertOpOutputMatchesExpected( make_op(data_format), - nhwc_to_format(np.array([[[[1, 2, 3], [4, 5, 6]], - [[7, 8, 9], [10, 11, 12]]]], dtype=dtype), - data_format), + nhwc_to_format( + np.array( + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], + dtype=dtype), data_format), expected=nhwc_to_format( - np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], - dtype=dtype), + np.array( + [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], dtype=dtype), data_format)) self._assertOpOutputMatchesExpected( make_op(data_format), - nhwc_to_format(np.array([[[[1], [2], [5], [6]], - [[3], [4], [7], [8]], - [[9], [10], [13], [14]], - [[11], [12], [15], [16]]]], dtype=dtype), - data_format), + nhwc_to_format( + np.array( + [[[[1], [2], [5], [6]], [[3], [4], [7], [8]], + [[9], [10], [13], [14]], [[11], [12], [15], [16]]]], + dtype=dtype), data_format), expected=nhwc_to_format( - np.array([[[[1, 2, 3, 4], - [5, 6, 7, 8]], - [[9, 10, 11, 12], - [13, 14, 15, 16]]]], dtype=dtype), - data_format)) + np.array( + [[[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], + [13, 14, 15, 16]]]], + dtype=dtype), data_format)) def _assertSoftplusMatchesExpected(self, features, dtype): features = np.array(features, dtype=dtype) zero = np.asarray(0).astype(dtype) expected = np.logaddexp(zero, features) self._assertOpOutputMatchesExpected( - nn_ops.softplus, features, expected=expected, - rtol=1e-6, - atol=9.1e-6) + nn_ops.softplus, features, 
expected=expected, rtol=1e-6, atol=9.1e-6) def testSoftplus(self): for dtype in self.float_types: @@ -814,9 +913,10 @@ class UnaryOpsTest(XLATestCase): one = dtype(1) ten = dtype(10) self._assertSoftplusMatchesExpected([ - log_eps, log_eps - one, log_eps + one, log_eps - ten, - log_eps + ten, -log_eps, -log_eps - one, -log_eps + one, - -log_eps - ten, -log_eps + ten], dtype) + log_eps, log_eps - one, log_eps + one, log_eps - ten, log_eps + ten, + -log_eps, -log_eps - one, -log_eps + one, -log_eps - ten, + -log_eps + ten + ], dtype) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py index 2c09b03d5a35cde2c42d8a145781270c0c908587..dd2c252d383bca9c59033ac07e442b487e4975a6 100644 --- a/tensorflow/compiler/tests/variable_ops_test.py +++ b/tensorflow/compiler/tests/variable_ops_test.py @@ -20,12 +20,13 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops @@ -36,7 +37,7 @@ from tensorflow.python.platform import googletest from tensorflow.python.training.gradient_descent import GradientDescentOptimizer -class VariableOpsTest(XLATestCase): +class VariableOpsTest(xla_test.XLATestCase): """Test cases for resource variable operators.""" def testOneWriteOneOutput(self): @@ -52,9 +53,7 @@ class VariableOpsTest(XLATestCase): with ops.control_dependencies([x]): y = v.read_value() self.assertAllClose( - np.array([[2, 1 + 2j], [4, 5]]).astype(dtype), sess.run(y, { - p: 
1 - })) + np.array([[2, 1 + 2j], [4, 5]]).astype(dtype), sess.run(y, {p: 1})) def testSparseRead0DIndices(self): for dtype in self.numeric_types: @@ -103,9 +102,9 @@ class VariableOpsTest(XLATestCase): x = v.sparse_read([[2, 1], [3, 0]]) self.assertAllClose( np.array( - [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]], - [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]], - ).astype(dtype), sess.run(x)) + [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]] + ], [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]] + ],).astype(dtype), sess.run(x)) def testShape(self): for dtype in self.numeric_types: @@ -206,6 +205,206 @@ class VariableOpsTest(XLATestCase): self.assertAllClose(update, result[1]) self.assertAllClose(update, result[2]) + def testScatterAdd(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[2, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[1], [7]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_add( + handle, [0], constant_op.constant([[2]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertAllEqual(sess.run(read), [[3], [7]]) + + def testScatterSub(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[2, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[4], [1]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_sub( + handle, [1], constant_op.constant([[2]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertAllEqual(sess.run(read), [[4], [-1]]) + + def testScatterMul(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + 
dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[1]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_mul( + handle, [0], constant_op.constant([[5]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[5]]) + + def testScatterDiv(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_div( + handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertAllEqual(sess.run(read), [[2]]) + + def testScatterMin(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_min( + handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[3]]) + + def testScatterMax(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_max( + handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[6]]) + + def 
testScatterUpdate(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_update( + handle, [0], constant_op.constant([[3]], dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[3]]) + + def testScatterAddScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[1]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_add( + handle, [0], constant_op.constant(2, dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[3]]) + + def testScatterSubScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[1]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_sub( + handle, [0], constant_op.constant(2, dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[-1]]) + + def testScatterMulScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[1]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_mul( + handle, [0], constant_op.constant(5, dtype=dtypes.int32))) + read = 
resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[5]]) + + def testScatterDivScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_div( + handle, [0], constant_op.constant(3, dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[2]]) + + def testScatterMinScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_min( + handle, [0], constant_op.constant(3, dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[3]]) + + def testScatterMaxScalar(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.int32, shape=[1, 1]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([[6]], dtype=dtypes.int32))) + sess.run( + resource_variable_ops.resource_scatter_max( + handle, [0], constant_op.constant(3, dtype=dtypes.int32))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) + self.assertEqual(sess.run(read), [[6]]) + + def testScatterNdAddOps(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.float32, shape=[8]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([1] * 8, dtype=dtypes.float32))) + indices = 
constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32) + updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32) + expected = np.array([1, 12, 1, 11, 10, 1, 1, 13]) + sess.run(gen_state_ops.resource_scatter_nd_add(handle, indices, updates)) + read = resource_variable_ops.read_variable_op( + handle, dtype=dtypes.float32) + self.assertAllClose(expected, sess.run(read)) + + def testScatterNdUpdateAddOps(self): + with self.test_session() as sess, self.test_scope(): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.float32, shape=[8]) + sess.run( + resource_variable_ops.assign_variable_op( + handle, constant_op.constant([1] * 8, dtype=dtypes.float32))) + indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32) + updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32) + expected = np.array([1, 11, 1, 10, 9, 1, 1, 12]) + sess.run( + gen_state_ops.resource_scatter_nd_update(handle, indices, updates)) + read = resource_variable_ops.read_variable_op( + handle, dtype=dtypes.float32) + self.assertAllClose(expected, sess.run(read)) + class StridedSliceAssignChecker(object): """Compares the results of a slice assignment using Tensorflow and numpy.""" @@ -236,12 +435,12 @@ class StridedSliceAssignChecker(object): self.test.assertAllEqual(val, valnp) -class SliceAssignTest(XLATestCase): +class SliceAssignTest(xla_test.XLATestCase): def testSliceAssign(self): for dtype in self.numeric_types: - checker = StridedSliceAssignChecker(self, [[1, 2, 3], [4, 5, 6]], - dtype=dtype) + checker = StridedSliceAssignChecker( + self, [[1, 2, 3], [4, 5, 6]], dtype=dtype) # No-op assignment checker[:] = [[10, 20, 30], [40, 50, 60]] # Checks trivial (1,1) shape tensor diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py index f79eb27435cc954cebde4357c1d946a320f4ed75..4ee144beb7f3243be069d59ee4a613484fe183b3 100644 --- a/tensorflow/compiler/tests/while_test.py +++ 
b/tensorflow/compiler/tests/while_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test from tensorflow.compiler.tf2xla.python import xla from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -29,7 +29,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class WhileTest(XLATestCase): +class WhileTest(xla_test.XLATestCase): def testSingletonLoopHandrolled(self): # Define a function for the loop body @@ -43,7 +43,7 @@ class WhileTest(XLATestCase): def loop_cond(step): return step < 10 - with self.test_session() as sess: + with self.cached_session() as sess: init_index = array_ops.placeholder(dtypes.int32, []) with self.test_scope(): loop_outputs = xla.while_loop([init_index], loop_cond, loop_body) @@ -65,7 +65,7 @@ class WhileTest(XLATestCase): del rsum return step < 10 - with self.test_session() as sess: + with self.cached_session() as sess: init_index = array_ops.placeholder(dtypes.int32, []) init_sum = array_ops.placeholder(dtypes.float32, []) with self.test_scope(): @@ -91,7 +91,7 @@ class WhileTest(XLATestCase): del rsum return step < 10 - with self.test_session() as sess: + with self.cached_session() as sess: init_index = array_ops.placeholder(dtypes.int32, []) init_sum = array_ops.placeholder(dtypes.complex64, []) with self.test_scope(): @@ -117,7 +117,7 @@ class WhileTest(XLATestCase): del x return step < 10 - with self.test_session() as sess: + with self.cached_session() as sess: init_index = array_ops.placeholder(dtypes.int32, []) with self.test_scope(): loop_outputs = xla.while_loop([init_index, 42], loop_cond, loop_body) diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index f0b010fa67f2ffb3f81fd14d4d89585f716b4890..28d61fb07dcb665fa0dbe3f3e566e291e24fa662 100644 --- 
a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -20,14 +20,16 @@ from __future__ import print_function import numpy as np -from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.platform import test -class XlaDeviceTest(XLATestCase): +class XlaDeviceTest(xla_test.XLATestCase): def testCopies(self): """Tests that copies onto and off XLA devices work.""" @@ -35,7 +37,7 @@ class XlaDeviceTest(XLATestCase): [16384, 1], [1, 16384], [1, 20000, 1, 1]] for dtype in self.numeric_types: for shape in shapes: - with self.test_session() as sess: + with self.cached_session() as sess: with ops.device("CPU"): x = array_ops.placeholder(dtype, shape) with self.test_scope(): @@ -47,8 +49,36 @@ class XlaDeviceTest(XLATestCase): result = sess.run(z, {x: inputs}) self.assertAllCloseAccordingToType(result, inputs + inputs) + def testCopiesOfUnsupportedTypesFailGracefully(self): + """Tests that copies of unsupported types don't crash.""" + test_types = set([ + np.uint8, np.uint16, np.uint32, np.uint64, np.int8, np.int16, np.int32, + np.int64, np.float16, np.float32, np.float16, + dtypes.bfloat16.as_numpy_dtype + ]) + shape = (10, 10) + for unsupported_dtype in test_types - self.all_types: + with self.cached_session() as sess: + with ops.device("CPU"): + x = array_ops.placeholder(unsupported_dtype, shape) + with self.test_scope(): + y, = array_ops.identity_n([x]) + with ops.device("CPU"): + z = array_ops.identity(y) + + inputs = np.random.randint(-100, 100, shape) + inputs = inputs.astype(unsupported_dtype) + # Execution should either succeed or raise an InvalidArgumentError, + # but not crash. 
Even "unsupported types" may succeed here since some + # backends (e.g., the CPU backend) are happy to handle buffers of + # unsupported types, even if they cannot compute with them. + try: + sess.run(z, {x: inputs}) + except errors.InvalidArgumentError: + pass + def testControlTrigger(self): - with self.test_session() as sess: + with self.cached_session() as sess: with self.test_scope(): x = gen_control_flow_ops.control_trigger() sess.run(x) diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b2f026df6c0c28fcbceaa0493871bc12c2d23b1f --- /dev/null +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -0,0 +1,301 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for XLA op wrappers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.compiler.tf2xla.python import xla +from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase): + + def _assertOpOutputMatchesExpected(self, op, args, expected, + equality_fn=None): + with self.test_session() as session: + with self.test_scope(): + placeholders = [ + array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape) + for arg in args + ] + feeds = {placeholders[i]: args[i] for i in range(0, len(args))} + output = op(*placeholders) + result = session.run(output, feeds) + if not equality_fn: + equality_fn = self.assertAllClose + equality_fn(result, expected, rtol=1e-3) + + def testAdd(self): + for dtype in self.numeric_types: + self._assertOpOutputMatchesExpected( + xla.add, + args=(np.array([1, 2, 3], dtype=dtype), + np.array([4, 5, 6], dtype=dtype)), + expected=np.array([5, 7, 9], dtype=dtype)) + + self._assertOpOutputMatchesExpected( + lambda x, y: xla.add(x, y, broadcast_dims=(0,)), + args=(np.array([[1, 2], [3, 4]], dtype=dtype), + np.array([7, 11], dtype=dtype)), + expected=np.array([[8, 9], [14, 15]], dtype=dtype)) + + self._assertOpOutputMatchesExpected( + lambda x, y: xla.add(x, y, broadcast_dims=(1,)), + args=(np.array([[1, 2], [3, 4]], dtype=dtype), + np.array([7, 11], dtype=dtype)), + expected=np.array([[8, 13], [10, 15]], dtype=dtype)) + + def testBroadcast(self): + for dtype in self.numeric_types: + v = np.arange(4, 
dtype=np.int32).astype(dtype).reshape([2, 2]) + self._assertOpOutputMatchesExpected( + lambda x: xla.broadcast(x, (7, 42)), + args=(v,), + expected=np.tile(v, (7, 42, 1, 1))) + + def testShiftRightLogical(self): + self._assertOpOutputMatchesExpected( + xla.shift_right_logical, + args=(np.array([-1, 16], dtype=np.int32), np.int32(4)), + expected=np.array([0x0FFFFFFF, 1], dtype=np.int32)) + + self._assertOpOutputMatchesExpected( + xla.shift_right_logical, + args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)), + expected=np.array([0x0FFFFFFF, 1], dtype=np.uint32)) + + def testShiftRightArithmetic(self): + self._assertOpOutputMatchesExpected( + xla.shift_right_arithmetic, + args=(np.array([-1, 16], dtype=np.int32), np.int32(4)), + expected=np.array([-1, 1], dtype=np.int32)) + + self._assertOpOutputMatchesExpected( + xla.shift_right_arithmetic, + args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)), + expected=np.array([0xFFFFFFFF, 1], dtype=np.uint32)) + + PRECISION_VALUES = (None, xla_data_pb2.PrecisionConfigProto.DEFAULT, + xla_data_pb2.PrecisionConfigProto.HIGH, + xla_data_pb2.PrecisionConfigProto.HIGHEST) + + @parameterized.parameters(*PRECISION_VALUES) + def testConv(self, precision): + for dtype in set(self.float_types).intersection( + set([dtypes.bfloat16.as_numpy_dtype, np.float32])): + + def conv_1d_fn(lhs, rhs): + dnums = xla_data_pb2.ConvolutionDimensionNumbers() + num_spatial_dims = 1 + dnums.input_batch_dimension = 0 + dnums.input_feature_dimension = 1 + dnums.output_batch_dimension = 0 + dnums.output_feature_dimension = 1 + dnums.kernel_output_feature_dimension = 0 + dnums.kernel_input_feature_dimension = 1 + dnums.input_spatial_dimensions.extend(range(2, 2 + num_spatial_dims)) + dnums.kernel_spatial_dimensions.extend(range(2, 2 + num_spatial_dims)) + dnums.output_spatial_dimensions.extend(range(2, 2 + num_spatial_dims)) + precision_config = None + if precision: + precision_config = xla_data_pb2.PrecisionConfigProto() + 
precision_config.operand_precision.extend([precision, precision]) + return xla.conv( + lhs, + rhs, + window_strides=(1,), + padding=((2, 1),), + lhs_dilation=(1,), + rhs_dilation=(2,), + dimension_numbers=dnums) + + self._assertOpOutputMatchesExpected( + conv_1d_fn, + args=( + np.array([[[3, 4, 5, 6]]], dtype=dtype), + np.array([[[-2, -3]]], dtype=dtype), + ), + expected=np.array([[[-9, -12, -21, -26, -10]]], dtype=dtype)) + + @parameterized.parameters(*PRECISION_VALUES) + def testDotGeneral(self, precision): + for dtype in self.float_types: + + def dot_fn(lhs, rhs): + dnums = xla_data_pb2.DotDimensionNumbers() + dnums.lhs_contracting_dimensions.append(2) + dnums.rhs_contracting_dimensions.append(1) + dnums.lhs_batch_dimensions.append(0) + dnums.rhs_batch_dimensions.append(0) + precision_config = None + if precision: + precision_config = xla_data_pb2.PrecisionConfigProto() + precision_config.operand_precision.extend([precision, precision]) + return xla.dot_general( + lhs, + rhs, + dimension_numbers=dnums, + precision_config=precision_config) + + lhs = np.array( + [ + [[1, 2], [3, 4]], + [[5, 6], [7, 8]], + ], dtype=dtype) + rhs = np.array( + [ + [[1, 2, 3], [4, 5, 6]], + [[7, 8, 9], [10, 11, 12]], + ], dtype=dtype) + self._assertOpOutputMatchesExpected( + dot_fn, + args=(lhs, rhs), + expected=np.array( + [ + [[9, 12, 15], [19, 26, 33]], + [[95, 106, 117], [129, 144, 159]], + ], + dtype=dtype)) + + def testNeg(self): + for dtype in self.numeric_types: + self._assertOpOutputMatchesExpected( + xla.neg, + args=(np.array([1, 2, 3], dtype=dtype),), + expected=np.array([-1, -2, -3], dtype=dtype)) + + def testPad(self): + for dtype in self.numeric_types: + + def pad_fn(x): + return xla.pad( + x, + padding_value=7, + padding_low=[2, 1], + padding_high=[1, 2], + padding_interior=[1, 0]) + + self._assertOpOutputMatchesExpected( + pad_fn, + args=(np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]),), + expected=np.array( + [[7, 7, 7, 7, 7], [7, 7, 7, 7, 7], [7, 0, 1, 7, 
7], + [7, 7, 7, 7, 7], [7, 2, 3, 7, 7], [7, 7, 7, 7, 7]], + dtype=dtype)) + + def testReduce(self): + for dtype in set(self.numeric_types).intersection( + set([dtypes.bfloat16.as_numpy_dtype, np.float32])): + + @function.Defun(dtype, dtype) + def sum_reducer(x, y): + return x + y + + def sum_reduction(dims): + + def fn(x): + return xla.reduce( + x, init_value=0, dimensions_to_reduce=dims, reducer=sum_reducer) + + return fn + + self._assertOpOutputMatchesExpected( + sum_reduction(dims=[]), + args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), + expected=np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4])) + self._assertOpOutputMatchesExpected( + sum_reduction(dims=[0]), + args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), + expected=np.array([12, 15, 18, 21], dtype=dtype)) + self._assertOpOutputMatchesExpected( + sum_reduction(dims=[1]), + args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), + expected=np.array([6, 22, 38], dtype=dtype)) + self._assertOpOutputMatchesExpected( + sum_reduction(dims=[0, 1]), + args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), + expected=dtype(66)) + + @function.Defun(dtype, dtype) + def mul_reducer(x, y): + return x * y + + def mul_reduction(dims): + + def fn(x): + return xla.reduce( + x, init_value=1, dimensions_to_reduce=dims, reducer=mul_reducer) + + return fn + + self._assertOpOutputMatchesExpected( + mul_reduction(dims=[0]), + args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), + expected=np.array([0, 45, 120, 231], dtype=dtype)) + + def testSelectAndScatter(self): + for dtype in set(self.numeric_types).intersection( + set([dtypes.bfloat16.as_numpy_dtype, np.float32])): + + @function.Defun(dtype, dtype) + def add_scatter(x, y): + return x + y + + @function.Defun(dtype, dtype) + def ge_select(x, y): + return x >= y + + def test_fn(operand, source): + return xla.select_and_scatter( + operand, + window_dimensions=[2, 3, 1, 1], + 
window_strides=[2, 2, 1, 1], + padding=[[0, 0]] * 4, + source=source, + init_value=0, + select=ge_select, + scatter=add_scatter) + + self._assertOpOutputMatchesExpected( + test_fn, + args=(np.array( + [[7, 2, 5, 3, 8], [3, 8, 9, 3, 4], [1, 5, 7, 5, 6], + [0, 6, 2, 10, 2]], + dtype=dtype).reshape((4, 5, 1, 1)), + np.array([[2, 6], [3, 1]], dtype=dtype).reshape((2, 2, 1, 1))), + expected=np.array( + [[0, 0, 0, 0, 0], [0, 0, 8, 0, 0], [0, 0, 3, 0, 0], + [0, 0, 0, 1, 0]], + dtype=dtype).reshape((4, 5, 1, 1))) + + def testTranspose(self): + for dtype in self.numeric_types: + v = np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]) + self._assertOpOutputMatchesExpected( + lambda x: xla.transpose(x, [1, 0]), args=(v,), expected=v.T) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index e924fe1e61454aefda622a5a46a0e483d26db5c1..88827cb53bee7bb809d0163d6badcef17e59aa78 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -49,6 +49,32 @@ flags.DEFINE_string('tf_xla_flags', None, 'Value to set the TF_XLA_FLAGS environment variable to') +def parse_disabled_manifest(manifest_content): + comments_re = re.compile('#.*$') + disabled_tests = [] + disabled_method_types = [] + for l in manifest_content.splitlines(): + stripped = comments_re.sub('', l).strip() + if not stripped: + continue + entry = stripped.split(' ') + if len(entry) == 1: + disabled_tests.append(entry[0]) + elif len(entry) == 2: + disabled_method_types.append((entry[0], entry[1].strip().split(','))) + else: + raise ValueError('Bad entry in manifest file.') + + disabled_regex = '|'.join(disabled_tests) + method_types_filter = dict() + for method, types in disabled_method_types: + method_types_filter[method] = set([ + dtypes.as_dtype(types_pb2.DataType.Value(name)).as_numpy_dtype + for name in types + ]) + return disabled_regex, method_types_filter + + class 
XLATestCase(test.TestCase): """XLA test cases are parameterized test cases.""" @@ -85,38 +111,21 @@ class XLATestCase(test.TestCase): # Parse the manifest file, if any, into a regex identifying tests to # disable - self.disabled_regex = None - self._method_types_filter = dict() # TODO(xpan): Make it text proto if it doesn't scale. # Each line of the manifest file specifies an entry. The entry can be # 1) TestNameRegex // E.g. CumprodTest.* Or # 2) TestName TypeName // E.g. AdamOptimizerTest.testSharing DT_BFLOAT16 # The 1) disables the entire test. While 2) only filter some numeric types # so that they are not used in those tests. + self.disabled_regex = None + self._method_types_filter = {} if FLAGS.disabled_manifest is not None: - comments_re = re.compile('#.*$') - manifest_file = open(FLAGS.disabled_manifest, 'r') - disabled_tests = [] - disabled_method_types = [] - for l in manifest_file.read().splitlines(): - if not l: - continue - entry = comments_re.sub('', l).strip().split(' ') - if len(entry) == 1: - disabled_tests.append(entry[0]) - elif len(entry) == 2: - disabled_method_types.append( - (entry[0], entry[1].strip().split(','))) - else: - raise ValueError('Bad entry in manifest file.') - - self.disabled_regex = re.compile('|'.join(disabled_tests)) - for method, types in disabled_method_types: - self._method_types_filter[method] = set([ - dtypes.as_dtype(types_pb2.DataType.Value(name)).as_numpy_dtype - for name in types]) - manifest_file.close() + with open(FLAGS.disabled_manifest, 'r') as manifest_file: + disabled_regex, self._method_types_filter = ( + parse_disabled_manifest(manifest_file.read())) + if disabled_regex: + self.disabled_regex = re.compile(disabled_regex) if FLAGS.tf_xla_flags is not None: os.environ['TF_XLA_FLAGS'] = FLAGS.tf_xla_flags diff --git a/tensorflow/compiler/tests/xla_test_test.py b/tensorflow/compiler/tests/xla_test_test.py new file mode 100644 index 
0000000000000000000000000000000000000000..24664451579445edaadb335c30d253ee55f003da --- /dev/null +++ b/tensorflow/compiler/tests/xla_test_test.py @@ -0,0 +1,44 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for the XLATestCase test fixture base class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.platform import test + + +class XlaTestCaseTestCase(test.TestCase): + + def testManifestEmptyLineDoesNotCatchAll(self): + manifest = """ +testCaseOne +""" + disabled_regex, _ = xla_test.parse_disabled_manifest(manifest) + self.assertEqual(disabled_regex, "testCaseOne") + + def testManifestWholeLineCommentDoesNotCatchAll(self): + manifest = """# I am a comment +testCaseOne +testCaseTwo +""" + disabled_regex, _ = xla_test.parse_disabled_manifest(manifest) + self.assertEqual(disabled_regex, "testCaseOne|testCaseTwo") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index cd57452302fcbde37d79ce760a80615a76d7ad8c..0797b2cb17f5aae4080f339a201b44d69bbb2187 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -39,6 +39,7 @@ cc_library( "//tensorflow/core:lib", 
"//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -81,13 +82,30 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/client", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "cpu_function_runtime", + srcs = ["cpu_function_runtime.cc"], + hdrs = ["cpu_function_runtime.h"], + visibility = [ + "//tensorflow/compiler/aot:__pkg__", + "//tensorflow/compiler/xla/service/cpu:__pkg__", + ], + deps = [ + # Keep dependencies to a minimum here; this library is used in every AOT + # binary produced by tfcompile. + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/core:framework_lite", ], ) @@ -99,12 +117,23 @@ cc_library( deps = [ # Keep dependencies to a minimum here; this library is used in every AOT # binary produced by tfcompile. 
- "//tensorflow/compiler/aot:runtime", + ":cpu_function_runtime", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/core:framework_lite", ], ) +tf_cc_test( + name = "cpu_function_runtime_test", + srcs = ["cpu_function_runtime_test.cc"], + deps = [ + ":cpu_function_runtime", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "xla_jit_compiled_cpu_function", srcs = ["xla_jit_compiled_cpu_function.cc"], @@ -119,7 +148,9 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service/cpu:buffer_info_util", "//tensorflow/compiler/xla/service/cpu:cpu_executable", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -162,15 +193,19 @@ cc_library( ":sharding_util", ":tf2xla_util", "//tensorflow/compiler/tf2xla/lib:util", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -178,6 +213,8 @@ cc_library( "//tensorflow/core:lib_internal", 
"//tensorflow/core:protos_all_cc", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], alwayslink = 1, ) @@ -187,24 +224,23 @@ cc_library( srcs = [ "literal_util.cc", "shape_util.cc", - "str_util.cc", "type_util.cc", ], hdrs = [ "literal_util.h", "shape_util.h", - "str_util.h", "type_util.h", ], visibility = [":friends"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:span", ], ) @@ -222,6 +258,7 @@ cc_library( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -254,6 +291,7 @@ cc_library( "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", ], ) @@ -272,6 +310,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -281,10 +320,12 @@ tf_cc_test( deps = [ ":tf2xla", ":tf2xla_proto", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -323,7 +364,7 @@ tf_cc_test( "//tensorflow/cc:ops", "//tensorflow/cc:resource_variable_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", 
"//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/client:client_library", @@ -337,19 +378,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", - ], -) - -tf_cc_test( - name = "str_util_test", - srcs = [ - "str_util_test.cc", - ], - deps = [ - ":common", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -360,6 +389,7 @@ tf_cc_test( ], deps = [ ":common", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/core:framework", "//tensorflow/core:test", @@ -406,22 +436,97 @@ cc_library( ], ) +cc_library( + name = "functionalize_control_flow_util", + srcs = [ + "functionalize_control_flow_util.cc", + ], + hdrs = [ + "functionalize_control_flow_util.h", + ], + deps = [ + "//tensorflow/compiler/tf2xla/ops:xla_ops", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:graph", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "functionalize_cond", + srcs = [ + "functionalize_cond.cc", + ], + hdrs = [ + "functionalize_cond.h", + ], + deps = [ + ":functionalize_control_flow_util", + ":tf2xla_util", + "//tensorflow/compiler/jit:union_find", + "//tensorflow/compiler/tf2xla:dump_graph", + "//tensorflow/compiler/tf2xla/ops:xla_ops", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "functionalize_control_flow", - srcs = ["functionalize_control_flow.cc"], - hdrs = ["functionalize_control_flow.h"], + srcs = [ + "functionalize_control_flow.cc", + ], + hdrs = [ + 
"functionalize_control_flow.h", + ], deps = [ + ":functionalize_cond", + ":functionalize_control_flow_util", + ":functionalize_while", ":tf2xla_util", "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:util", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:graph", - "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "functionalize_while", + srcs = [ + "functionalize_while.cc", + ], + hdrs = [ + "functionalize_while.h", + ], + deps = [ + ":functionalize_control_flow_util", + ":tf2xla_util", + "//tensorflow/compiler/jit:union_find", + "//tensorflow/compiler/tf2xla:dump_graph", + "//tensorflow/compiler/tf2xla/ops:xla_ops", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", ], ) @@ -449,6 +554,32 @@ tf_cc_test( ], ) +tf_cc_test( + name = "functionalize_cond_test", + srcs = ["functionalize_cond_test.cc"], + deps = [ + ":functionalize_cond", + ":functionalize_control_flow", + ":test_util", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:resource_variable_ops", + "//tensorflow/compiler/tf2xla/cc:xla_ops", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:ops", + "//tensorflow/core:resource_variable_ops_op_lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + 
cc_library( name = "test_util", testonly = 1, @@ -462,3 +593,40 @@ cc_library( "//tensorflow/core:protos_all_cc", ], ) + +tf_cc_test( + name = "xla_op_registry_test", + srcs = ["xla_op_registry_test.cc"], + deps = [ + ":xla_compiler", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "resource_operation_table", + srcs = ["resource_operation_table.cc"], + hdrs = ["resource_operation_table.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/algorithm:container", + ], +) + +tf_cc_test( + name = "resource_operation_table_test", + srcs = ["resource_operation_table_test.cc"], + deps = [ + ":resource_operation_table", + ":xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc index de1008803d69fefa415c7bdbe6c27a62e625b417..e8673d77903bd5a1a85412e9dfa86437f73d56bc 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.cc +++ b/tensorflow/compiler/tf2xla/const_analysis.cc @@ -23,11 +23,11 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" namespace tensorflow { - // Backwards dataflow analysis that finds arguments to a graph that must be // compile-time constants. Status BackwardsConstAnalysis(const Graph& g, - std::vector* compile_time_const_args) { + std::vector* compile_time_const_args, + std::vector* compile_time_const_nodes) { // Operators that don't look at the data of their inputs, just the shapes. 
const std::unordered_set metadata_ops = { "Rank", @@ -36,9 +36,16 @@ Status BackwardsConstAnalysis(const Graph& g, "Size", }; + std::vector compile_time_const_nodes_impl; + if (compile_time_const_nodes) { + CHECK_EQ(compile_time_const_nodes->size(), g.num_node_ids()); + } else { + compile_time_const_nodes_impl.resize(g.num_node_ids()); + compile_time_const_nodes = &compile_time_const_nodes_impl; + } + Status status; - std::unordered_set must_be_const; - auto visit = [&status, &metadata_ops, &must_be_const, + auto visit = [&status, &metadata_ops, compile_time_const_nodes, compile_time_const_args](Node* node) { if (!status.ok()) return; @@ -47,17 +54,19 @@ Status BackwardsConstAnalysis(const Graph& g, // If this node must be const, and it isn't a metadata op, then all of its // parents must be const. - if (must_be_const.find(node) != must_be_const.end()) { + if ((*compile_time_const_nodes)[node->id()]) { if (node->type_string() == "_Arg") { int index; status = GetNodeAttr(node->attrs(), "index", &index); if (!status.ok()) return; - compile_time_const_args->at(index) = true; + if (compile_time_const_args) { + (*compile_time_const_args)[index] = true; + } return; } for (const Edge* pred : node->in_edges()) { if (!pred->IsControlEdge()) { - must_be_const.insert(pred->src()); + (*compile_time_const_nodes)[pred->src()->id()] = true; } } return; @@ -80,7 +89,7 @@ Status BackwardsConstAnalysis(const Graph& g, for (Edge const* edge : node->in_edges()) { if (edge->dst_input() >= name_range->second.first && edge->dst_input() < name_range->second.second) { - must_be_const.insert(edge->src()); + (*compile_time_const_nodes)[edge->src()->id()] = true; } } } diff --git a/tensorflow/compiler/tf2xla/const_analysis.h b/tensorflow/compiler/tf2xla/const_analysis.h index 634b97d7e3760c0344c948a56353ade243284aa6..af57e5a4033248e3fd32dabeda252c4ca0a44050 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.h +++ b/tensorflow/compiler/tf2xla/const_analysis.h @@ -23,10 +23,18 @@ 
limitations under the License. namespace tensorflow { -// Backwards dataflow analysis that finds arguments (_Arg nodes) to a graph that -// must be compile-time constants. +// Backwards dataflow analysis that finds nodes in a graph that must be +// compile-time constants for us to be able to lower the graph to XLA. +// +// The indices of the arguments to `graph` that must be constant are returned in +// `compile_time_const_arg_indices`, if `compile_time_const_arg_indices` is not +// null. +// +// The ids of the nodes in `graph` that must be constant are returned in +// `compile_time_const_nodes`, if `compile_time_const_nodes` is not null. Status BackwardsConstAnalysis(const Graph& graph, - std::vector* compile_time_const_args); + std::vector* compile_time_const_arg_indices, + std::vector* compile_time_const_nodes); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc index 992b12c06db5efc0ae54284d0ea77017c1c79aca..56065be894697bc72ecc0089c665c19aafee7bf8 100644 --- a/tensorflow/compiler/tf2xla/const_analysis_test.cc +++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -38,17 +39,23 @@ TEST(ConstAnalysisTest, Basics) { auto c = ops::Reshape(root, arg2, b); auto d = ops::Mul(root, c, ops::Sum(root, arg3, arg3)); - Graph graph(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&graph)); + FixupSourceAndSinkEdges(root.graph()); std::vector const_args(4, false); - TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args)); + std::vector const_nodes(root.graph()->num_node_ids(), false); + TF_ASSERT_OK( + BackwardsConstAnalysis(*root.graph(), &const_args, &const_nodes)); // Arg 0 doesn't need to be constant since the graph only uses its shape. // Arg 1 must be constant because it flows to the shape argument of a Reshape. // Arg 2 is used only as the value input to a Reshape and need not be const. // Arg 3 is used as the reduction-indices argument to Sum and must be const. 
EXPECT_EQ(const_args, std::vector({false, true, false, true})); + + EXPECT_FALSE(const_nodes[arg0.node()->id()]); + EXPECT_TRUE(const_nodes[arg1.node()->id()]); + EXPECT_FALSE(const_nodes[arg2.node()->id()]); + EXPECT_TRUE(const_nodes[arg3.node()->id()]); } // Regression test for a case where the backward const analysis did @@ -73,7 +80,8 @@ TEST(ConstAnalysisTest, TopologicalOrder) { TF_ASSERT_OK(root.ToGraph(&graph)); std::vector const_args(3, false); - TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args)); + TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args, + /*compile_time_const_nodes=*/nullptr)); EXPECT_EQ(const_args, std::vector({true, true, false})); } @@ -93,7 +101,8 @@ TEST(ConstAnalysisTest, DontFollowControlDependencies) { TF_ASSERT_OK(root.ToGraph(&graph)); std::vector const_args(2, false); - TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args)); + TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args, + /*compile_time_const_nodes=*/nullptr)); EXPECT_EQ(const_args, std::vector({false, true})); } diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc new file mode 100644 index 0000000000000000000000000000000000000000..fcc4095e39673b786544984a41988c3e9c5b0efb --- /dev/null +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" + +#include "tensorflow/core/platform/dynamic_annotations.h" + +namespace tensorflow { +namespace { +// Inline memory allocation routines here, because depending on '//base' brings +// in libraries which use c++ streams, which adds considerable code size on +// android. +void* aligned_malloc(size_t size, int minimum_alignment) { +#if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN) + return memalign(minimum_alignment, size); +#elif defined(_WIN32) + return _aligned_malloc(size, minimum_alignment); +#else // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN + void* ptr = nullptr; + // posix_memalign requires that the requested alignment be at least + // sizeof(void*). In this case, fall back on malloc which should return memory + // aligned to at least the size of a pointer. + const int required_alignment = sizeof(void*); + if (minimum_alignment < required_alignment) return malloc(size); + if (posix_memalign(&ptr, minimum_alignment, size) != 0) + return nullptr; + else + return ptr; +#endif +} + +void aligned_free(void* aligned_memory) { +#if defined(_WIN32) + _aligned_free(aligned_memory); +#else + free(aligned_memory); +#endif +} + +size_t align_to(size_t n, size_t align) { + return (((n - 1) / align) + 1) * align; +} +} // namespace + +namespace cpu_function_runtime { +size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params) { + size_t total = 0; + for (size_t i = 0; i < n; ++i) { + bool should_allocate = + buffer_infos[i].is_temp_buffer() || + (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + + if (should_allocate) { + total += align_to(buffer_infos[i].size(), kAlign); + } + } + return total; +} + +void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params, void** bufs, + bool annotate_initialized) { + const 
size_t total = + AlignedBufferBytes(buffer_infos, n, allocate_entry_params); + void* contiguous = nullptr; + if (total > 0) { + contiguous = aligned_malloc(total, kAlign); + if (annotate_initialized) { + // Since the memory for temp buffers is written to by JITed code, msan has + // no way of knowing the memory was initialized, so explicitly mark it. + TF_ANNOTATE_MEMORY_IS_INITIALIZED(contiguous, total); + } + } + uintptr_t pos = reinterpret_cast(contiguous); + for (size_t i = 0; i < n; ++i) { + bool should_allocate = + buffer_infos[i].is_temp_buffer() || + (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + if (should_allocate) { + bufs[i] = reinterpret_cast(pos); + pos += align_to(buffer_infos[i].size(), kAlign); + } else { + bufs[i] = nullptr; + } + } + return contiguous; +} + +void FreeContiguous(void* contiguous) { + if (contiguous != nullptr) { + aligned_free(contiguous); + } +} +} // namespace cpu_function_runtime +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.h b/tensorflow/compiler/tf2xla/cpu_function_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..dfc1e8b8aebcf3142e9f61f60171c6b58634c71d --- /dev/null +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.h @@ -0,0 +1,165 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ +#define TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ + +#include "tensorflow/core/platform/types.h" + +#include + +namespace tensorflow { +namespace cpu_function_runtime { +// Stores information about one buffer used by an XLA:CPU compiled function. +// These buffers are used for holding inputs to the computation, outputs from +// the computation and as temporary scratch space. +class BufferInfo { + public: + // Creates a BufferInfo from a serialized encoding generated by `Encode`. + explicit BufferInfo(std::pair encoding) + : entry_param_number_(encoding.second) { + Kind kind; + uint64 size; + Unpack(encoding.first, &kind, &size); + kind_ = kind; + size_ = size; + } + + // Returns true if this buffer stores a constant. These never need to be + // allocated by the runtime. + bool is_constant() const { return kind() == Kind::kConstant; } + + // Returns true if this buffer stores an entry parameter. These may or may + // not need to be allocated by the runtime, depending on + // XlaCompiledCpuFunction::AllocMode. + bool is_entry_parameter() const { return kind() == Kind::kEntryParameter; } + + // Returns the entry parameter number of this buffer. + uint64 entry_parameter_number() const { + assert(is_entry_parameter()); + return entry_param_number_; + } + + // Returns true if this buffer is temporary scratch space required by the XLA + // computations. These are always allocated by the runtime. + bool is_temp_buffer() const { return kind() == Kind::kTempBuffer; } + + // Returns true if this buffer is allocated on the C stack or into registers. + // These buffers are never allocated by the runtime. + bool is_on_stack_buffer() const { return kind() == Kind::kOnStackBuffer; } + + // Returns the size for this buffer. 
+ uint64 size() const { return size_; } + + // Encodes this BufferInfo into two 64 bit integers that can be used to + // reconstruct the BufferInfo later using the constructor. We need this + // because we use BufferInfo in places where using protocol buffers would + // negatively impact binary size. + std::pair Encode() const { + static_assert(sizeof(*this) == 16, ""); + uint64 upper = Pack(kind(), size_); + uint64 lower = entry_param_number_; + return {upper, lower}; + } + + bool operator==(const BufferInfo& buffer_info) const { + if (kind() != buffer_info.kind() || size() != buffer_info.size()) { + return false; + } + return !is_entry_parameter() || + entry_parameter_number() == buffer_info.entry_parameter_number(); + } + + // Factory methods: + + static BufferInfo MakeTempBuffer(uint64 size) { + return BufferInfo(Kind::kTempBuffer, /*size=*/size, + /*entry_param_number=*/-1); + } + static BufferInfo MakeConstant(uint64 size) { + return BufferInfo(Kind::kConstant, /*size=*/size, + /*entry_param_number=*/-1); + } + static BufferInfo MakeEntryParameter(uint64 size, uint64 param_number) { + return BufferInfo(Kind::kEntryParameter, /*size=*/size, + /*entry_param_number=*/param_number); + } + static BufferInfo MakeOnStackBuffer(uint64 size) { + return BufferInfo(Kind::kOnStackBuffer, /*size=*/size, + /*entry_param_number=*/-1); + } + + private: + BufferInfo() = default; + + enum class Kind : unsigned { + kConstant, + kTempBuffer, + kEntryParameter, + kOnStackBuffer + }; + + Kind kind() const { return static_cast(kind_); } + + explicit BufferInfo(Kind kind, uint64 size, uint64 entry_param_number) + : kind_(kind), size_(size), entry_param_number_(entry_param_number) {} + + static uint64 Pack(Kind kind, uint64 size) { + return (static_cast(size) << 2) | static_cast(kind); + } + + static void Unpack(uint64 packed, Kind* kind, uint64* size) { + *size = packed >> 2; + *kind = static_cast((packed << 62) >> 62); + } + + Kind kind_ : 2; + uint64 size_ : 62; + int64 
entry_param_number_; +}; + +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +constexpr size_t kAlign = 64; + +// AlignedBufferBytes returns the sum of the size of each buffer in +// `buffer_infos`, skipping constants, on-stack buffers and, if +// allocate_entry_params is false, entry parameters. There are `n` entries in +// `buffer_infos`. Each buffer is aligned to kAlign byte boundaries. +size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params); + +// MallocContiguousBuffers allocates buffers for use by the entry point +// generated by tfcompile. There are `n` entries in `buffer_infos`. If +// `annotate_initialized` is set, the allocated memory will be annotated as +// having been initialized - this is useful when allocating temporary buffers. +// If allocate_entry_params is true then allocates temp buffers and entry +// parameters, otherwise allocates only temp buffers. Slots in `bufs` +// corresponding to unallocated buffers are set to nullptr. +// +// A single contiguous block of memory is allocated, and portions of it are +// parceled out into `bufs`, which must have space for `n` entries. Returns +// the head of the allocated contiguous block, which should be passed to +// FreeContiguous when the buffers are no longer in use. +void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params, void** bufs, + bool annotate_initialized); + +// FreeContiguous frees the contiguous block of memory allocated by +// MallocContiguousBuffers. 
+void FreeContiguous(void* contiguous); +} // namespace cpu_function_runtime +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ca628c4eb6700d7184899bc1753dd6c6aa392b0 --- /dev/null +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc @@ -0,0 +1,171 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using cpu_function_runtime::BufferInfo; + +TEST(XlaCompiledCpuFunctionTest, AlignmentValue) { + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the + // regular tensorflow allocator, which was chosen to play nicely with Eigen. + // The tfcompile runtime also has a requirement that comes from the xla + // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 + // So any value that we choose must abide by that constraint as well. 
+ EXPECT_EQ(cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment); +} + +std::vector SizesToBufferInfos(const intptr_t* sizes, size_t n) { + std::vector buffer_infos; + std::transform(sizes, sizes + n, std::back_inserter(buffer_infos), + [&](intptr_t size) { + if (size == -1) { + // Use a dummy on-stack buffer allocation to indicate that + // the current slot does not need an allocation. + int64 on_stack_buffer_size = 4; + return BufferInfo::MakeOnStackBuffer(on_stack_buffer_size); + } + return BufferInfo::MakeTempBuffer(size); + }); + return buffer_infos; +} + +// Simple wrappers to make writing tests more ergonomic. + +size_t AlignedBufferBytesFromSizes(const intptr_t* sizes, size_t n) { + std::vector buffer_infos = SizesToBufferInfos(sizes, n); + return AlignedBufferBytes(buffer_infos.data(), n, + /*allocate_entry_params=*/false); +} + +void* MallocContiguousBuffersFromSizes(const intptr_t* sizes, size_t n, + void** bufs, bool annotate_initialized) { + std::vector buffer_infos = SizesToBufferInfos(sizes, n); + return MallocContiguousBuffers(buffer_infos.data(), n, + /*allocate_entry_params=*/false, bufs, + annotate_initialized); +} + +TEST(XlaCompiledCpuFunctionTest, AlignedBufferBytes) { + EXPECT_EQ(AlignedBufferBytesFromSizes(nullptr, 0), 0); + + static constexpr intptr_t sizesA[1] = {-1}; + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesA, 1), 0); + + static constexpr intptr_t sizesB[1] = {3}; + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesB, 1), 64); + + static constexpr intptr_t sizesC[1] = {32}; + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesC, 1), 64); + + static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesD, 7), 320); +} + +void* add_ptr(void* base, uintptr_t delta) { + return reinterpret_cast(reinterpret_cast(base) + delta); +} + +// To test MallocContiguousBuffers and FreeContiguous, we just check for +// expected nullptrs, and write to each byte of allocated memory. 
We rely on +// the leak checker to tell us if there's an inconsistency between malloc and +// free. We also check the contiguous property. +TEST(XlaCompiledCpuFunctionTest, MallocFreeContiguousBuffers) { + // Test empty sizes. + void* base = MallocContiguousBuffersFromSizes(nullptr, 0, nullptr, false); + EXPECT_EQ(base, nullptr); + cpu_function_runtime::FreeContiguous(base); + + // Test non-empty sizes with 0 sum. + static constexpr intptr_t sizesA[1] = {-1}; + void* bufA[1]; + base = MallocContiguousBuffersFromSizes(sizesA, 1, bufA, false); + EXPECT_EQ(base, nullptr); + EXPECT_EQ(bufA[0], nullptr); + cpu_function_runtime::FreeContiguous(base); + + // Test non-empty sizes with non-0 sum. + static constexpr intptr_t sizesB[1] = {3}; + void* bufB[1]; + base = MallocContiguousBuffersFromSizes(sizesB, 1, bufB, false); + EXPECT_NE(base, nullptr); + EXPECT_EQ(bufB[0], add_ptr(base, 0)); + char* bufB0_bytes = static_cast(bufB[0]); + bufB0_bytes[0] = 'A'; + bufB0_bytes[1] = 'B'; + bufB0_bytes[2] = 'C'; + cpu_function_runtime::FreeContiguous(base); + + // Test non-empty sizes with non-0 sum, and annotate_initialized. + static constexpr intptr_t sizesC[1] = {3}; + void* bufC[1]; + base = MallocContiguousBuffersFromSizes(sizesC, 1, bufC, true); + EXPECT_NE(base, nullptr); + EXPECT_EQ(bufC[0], add_ptr(base, 0)); + char* bufC0_bytes = static_cast(bufC[0]); + bufC0_bytes[0] = 'A'; + bufC0_bytes[1] = 'B'; + bufC0_bytes[2] = 'C'; + cpu_function_runtime::FreeContiguous(base); + + // Test mixed sizes. 
+ static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; + void* bufD[7]; + base = MallocContiguousBuffersFromSizes(sizesD, 7, bufD, false); + EXPECT_NE(base, nullptr); + EXPECT_EQ(bufD[0], add_ptr(base, 0)); + EXPECT_EQ(bufD[1], nullptr); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); + EXPECT_EQ(bufD[3], nullptr); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); + for (int i = 0; i < 7; ++i) { + const intptr_t size = sizesD[i]; + if (size != -1) { + char* bufD_bytes = static_cast(bufD[i]); + for (size_t j = 0; j < size; ++j) { + bufD_bytes[j] = 'A' + j; + } + } + } + cpu_function_runtime::FreeContiguous(base); +} + +void CheckRoundTripIsOk(const BufferInfo& buffer_info) { + BufferInfo round_trip(buffer_info.Encode()); + ASSERT_EQ(round_trip, buffer_info); +} + +TEST(XlaCompiledCpuFunctionTest, BufferInfoTest) { + CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(0)); + CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(4)); + CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(0)); + CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(4)); + CheckRoundTripIsOk(BufferInfo::MakeConstant(0)); + CheckRoundTripIsOk(BufferInfo::MakeConstant(4)); + CheckRoundTripIsOk( + BufferInfo::MakeEntryParameter(/*size=*/0, /*param_number=*/4)); + CheckRoundTripIsOk( + BufferInfo::MakeEntryParameter(/*size=*/4, /*param_number=*/0)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc index 03603ee9baefd1d20d220faf63c9c1c427ebdf31..24616c01c7e54b2e8662457ca6af23a0bc563e08 100644 --- a/tensorflow/compiler/tf2xla/dump_graph.cc +++ b/tensorflow/compiler/tf2xla/dump_graph.cc @@ -33,7 +33,7 @@ struct NameCounts { std::unordered_map counts; }; -string MakeUniquePath(string name) { +string MakeUniqueFilename(string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. 
@@ -50,26 +50,41 @@ string MakeUniquePath(string name) { count = instance.counts[name]++; } - legacy_flags::DumpGraphFlags* flags = legacy_flags::GetDumpGraphFlags(); - string path = strings::StrCat(flags->tf_dump_graph_prefix, "/", name); + string filename = name; if (count > 0) { - strings::StrAppend(&path, "_", count); + strings::StrAppend(&filename, "_", count); } - strings::StrAppend(&path, ".pbtxt"); - return path; + strings::StrAppend(&filename, ".pbtxt"); + return filename; +} + +string WriteTextProtoToUniqueFile( + Env* env, const string& name, const char* proto_type, + const ::tensorflow::protobuf::Message& proto) { + const string& dirname = + legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix; + Status status = env->RecursivelyCreateDir(dirname); + if (!status.ok()) { + LOG(WARNING) << "Failed to create " << dirname << " for dumping " + << proto_type << ": " << status; + return "(unavailable)"; + } + string filepath = strings::StrCat(dirname, "/", MakeUniqueFilename(name)); + status = WriteTextProto(Env::Default(), filepath, proto); + if (!status.ok()) { + LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath + << " : " << status; + return "(unavailable)"; + } + LOG(INFO) << "Dumped " << proto_type << " to " << filepath; + return filepath; } } // anonymous namespace string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) { - string path = MakeUniquePath(name); - Status status = WriteTextProto(Env::Default(), path, graph_def); - if (!status.ok()) { - VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status; - path.clear(); - path = "(unavailable)"; - } - return path; + return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef", + graph_def); } string DumpGraphToFile(const string& name, Graph const& graph, @@ -83,15 +98,7 @@ string DumpGraphToFile(const string& name, Graph const& graph, } string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) { - string path = 
MakeUniquePath(name); - Status status = WriteTextProto(Env::Default(), path, fdef); - if (!status.ok()) { - VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : " - << status; - path.clear(); - path = "(unavailable)"; - } - return path; + return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef); } } // namespace dump_graph diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5667ca0d3ba35bea9da2d702b5b49fb38fe6f02 --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -0,0 +1,1385 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/functionalize_cond.h" + +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/str_join.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/tf2xla/dump_graph.h" +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/node_builder.h" + +using xla::StatusOr; + +namespace tensorflow { +namespace functionalize_cond { + +string DebugString(const CondStateMap::CondNode& node) { + return node.ToString(); +} + +// TODO(jpienaar): Move to OutputTensor. +string DebugString(const OutputTensor& tensor) { + return strings::StrCat(tensor.node->name(), ":", tensor.index); +} + +string DebugString(CondStateMap::CondId cond_state) { + if (cond_state == nullptr || cond_state->empty()) return "[]"; + return strings::StrCat( + "[", + absl::StrJoin(*cond_state, ", ", + [](string* output, const CondStateMap::CondNode& node) { + strings::StrAppend(output, node.ToString()); + }), + "]"); +} + +string Branch_Name(BranchType b) { + switch (b) { + case BranchType::kElseBranch: + return "else"; + case BranchType::kThenBranch: + return "then"; + case BranchType::kBoth: + return "both"; + case BranchType::kNeither: + return "neither"; + } +} + +// Returns the predicate of a switch. +Status GetSwitchPredicate(const Node& switch_node, OutputTensor* pred) { + const Edge* pred_edge; + TF_RETURN_IF_ERROR(switch_node.input_edge(1, &pred_edge)); + // The predicate can be preceded by a identity node. 
Look through + // identity nodes to predicate. + while (pred_edge->src()->IsIdentity()) { + TF_RETURN_IF_ERROR(pred_edge->src()->input_edge(0, &pred_edge)); + } + *pred = OutputTensor(pred_edge->src(), pred_edge->src_output()); + return Status::OK(); +} + +CondStateMap::CondNode::CondNode(Type type, Node* switch_node, + BranchType branch) + : type(type), branch(branch) { + if (type == Type::kSwitch) { + TF_CHECK_OK(GetSwitchPredicate(*switch_node, &predicate)); + } +} + +string CondStateMap::CondNode::ToString() const { + switch (type) { + case Type::kSwitch: + return strings::StrCat("s(", DebugString(predicate), ",", + Branch_Name(branch), ")"); + case Type::kMerge: + return "m"; + case Type::kDead: + return "d"; + } +} + +bool CondStateMap::CondNode::operator==(const CondNode& other) const { + if (type != Type::kSwitch) return type == other.type; + return type == other.type && predicate == other.predicate && + branch == other.branch; +} + +bool CondStateMap::CondNode::operator!=(const CondNode& other) const { + return !(*this == other); +} + +CondStateMap::CondStateMap(Graph* graph) { + node_to_condid_map_.resize(graph->num_node_ids()); + // Initialize the dead state (empty state is designated with a nullptr). 
+ dead_id_ = GetUniqueId({CondNode(CondStateMap::CondNode::Type::kDead)}); +} + +bool CondStateMap::IsDead(CondStateMap::CondId id) const { + return id == dead_id_; +} + +bool CondStateMap::IsEmpty(CondStateMap::CondId id) const { + return id == nullptr; +} + +size_t CondStateMap::CondHash::operator()( + const CondStateMap::CondNode& item) const { + return Hash64Combine(Hash64Combine(OutputTensor::Hash()(item.predicate), + hash()(item.branch)), + hash()(item.type)); +} + +size_t CondStateMap::CondHash::operator()( + const CondStateMap::CondState& vec) const { + if (vec.empty()) return 0; + size_t h = (*this)(vec.front()); + auto it = vec.begin(); + for (++it; it != vec.end(); ++it) { + h = Hash64Combine(h, (*this)(*it)); + } + return h; +} + +// CondArgNode represents a input to the conditional and its corresponding +// switch nodes. +struct CondArgNode { + explicit CondArgNode(Node* src, int src_output) + : src(src), src_output(src_output) {} + + string ToString() const { + return strings::StrCat("src=", src->name(), ":", src_output, + " switches=", NodesToString(switches)); + } + + Node* src; + int src_output; + std::array branch_copy; + std::vector switches; +}; +using CondArgNodes = std::vector; + +string DebugString(const CondArgNodes& nodes) { + return strings::StrCat( + "[", + absl::StrJoin(nodes, ", ", + [](string* output, const CondArgNode& node) { + strings::StrAppend(output, node.ToString()); + }), + "]"); +} + +CondStateMap::CondId CondStateMap::LookupId(const Node* node) const { + if (node->id() < node_to_condid_map_.size()) + return node_to_condid_map_[node->id()]; + return added_node_mapping_.at(node->id()); +} + +CondStateMap::CondId CondStateMap::GetUniqueId( + const CondStateMap::CondState& state) { + if (state.empty()) return nullptr; + return &*condstate_set_.insert(state).first; +} + +const CondStateMap::CondState& CondStateMap::LookupState( + const Node* node) const { + return *LookupId(node); +} + +void CondStateMap::ResetId(const Node* node, 
CondStateMap::CondId id) { + if (node->id() < node_to_condid_map_.size()) + node_to_condid_map_[node->id()] = id; + else + added_node_mapping_[node->id()] = id; +} + +void CondStateMap::MarkDead(const Node* node) { ResetId(node, dead_id_); } + +string CondStateMap::CondStateToString(const Node* node) const { + return CondStateToString(LookupId(node)); +} + +string CondStateMap::CondStateToString(CondStateMap::CondId id) const { + return DebugString(id); +} + +FunctionalizeCond::FunctionalizeCond(Graph* graph, + FunctionLibraryDefinition* library) + : cond_state_map_(graph), library_(library), graph_(graph) {} + +// Class representing the merge/switch nodes that will become a conditional. +class Conditional { + public: + Conditional(OutputTensor predicate, FunctionalizeCond* parent, + CondStateMap* cond_state_map); + + // Adds merge node that is part of this conditional. + Status AddMerge(Node* m); + + // Constructs an If node from the merge nodes. + Status BuildAndReplace(Graph* graph, FunctionLibraryDefinition* library); + + private: + // Extracts the then/else bodies: creates new graphs with the nodes + // corresponding to the nodes in the then/else branches as of this conditional + // as function bodies. + Status ExtractBodies(Graph* graph); + + // Builds the arguments that are the input to the If. + Status BuildArgumentNodes(); + + // Builds the If node for the extracted bodies with the given predicate. + Status BuildIfNode(Graph* graph, FunctionLibraryDefinition* library); + + // Adds input edges to If node. + Status AddInputEdges(Graph* graph); + + // Adds output edges from If node. + Status AddOutputEdges(Graph* graph); + + // Adds switch node that is part of this conditional. + Status AddSwitch(Node* s); + + // Internal name of conditional. The name is based on the first merge node + // added. + string name() const; + + // The FunctionalizeCond instance that created this. + FunctionalizeCond* parent_; + + // Mapping between nodes and their cond state. 
+ CondStateMap* cond_state_map_; + + // The predicate of the conditional. + OutputTensor predicate_; + + // The predicate of the switches of the conditional. This may be different + // than predicate (which is initialized from the original graph) as the + // predicate could be the output of a newly created If node. + OutputTensor switch_predicate_; + + // Switch nodes in graph that are part of this conditional. + std::set switches_; + + // Merge nodes in graph that are part of this conditional. + std::set merges_; + + // Vector of control inputs from outside the conditional to a node inside. + std::vector external_control_inputs_; + std::vector external_control_outputs_; + + // Graphs corresponding to the then and else branch. + std::array, 2> bodies_; + + // Maps from graph_ to the branch body's graph. + std::array, 2> node_maps_; + + // The argument nodes created for the switches. + CondArgNodes cond_arg_nodes_; + + // The constructed If node. + Node* if_node_ = nullptr; + + // Whether the merge nodes of this conditional have been replaced. 
+ bool replaced_ = false; +}; + +Conditional::Conditional(OutputTensor predicate, FunctionalizeCond* parent, + CondStateMap* cond_state_map) + : parent_(parent), cond_state_map_(cond_state_map), predicate_(predicate) {} + +Status Conditional::AddMerge(Node* m) { + merges_.insert(m); + return Status::OK(); +} + +Status Conditional::AddSwitch(Node* s) { + VLOG(5) << "Adding switch " << s->DebugString(); + OutputTensor predicate; + TF_RETURN_IF_ERROR(GetSwitchPredicate(*s, &predicate)); + if (switch_predicate_.node == nullptr) switch_predicate_ = predicate; + if (!(switch_predicate_ == predicate)) { + return errors::InvalidArgument( + "Merge nodes ", NodesToString(merges_), + " directly dominated by switch nodes with different predicates (", + DebugString(switch_predicate_), " vs ", DebugString(predicate), ")."); + } + switches_.insert(s); + return Status::OK(); +} + +Status Conditional::BuildArgumentNodes() { + VLOG(1) << "Build function arguments"; + struct Hash { + size_t operator()(const std::pair& item) const { + return Hash64Combine(hash()(item.first), + std::hash()(item.second)); + } + }; + + std::unordered_map, int, Hash> input_index; + for (Node* switch_node : switches_) { + const Edge* e; + TF_RETURN_IF_ERROR(switch_node->input_edge(0, &e)); + std::pair key = std::make_pair(e->src(), e->src_output()); + if (input_index.find(key) == input_index.end()) { + input_index[key] = cond_arg_nodes_.size(); + cond_arg_nodes_.emplace_back(key.first, key.second); + } + cond_arg_nodes_.at(input_index.at(key)).switches.push_back(switch_node); + } + VLOG(5) << "CondArg nodes created: " << DebugString(cond_arg_nodes_); + + int arg_count = 0; + for (CondArgNode& cond_arg_node : cond_arg_nodes_) { + DataType dtype = cond_arg_node.src->output_type(cond_arg_node.src_output); + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + int branch_index = static_cast(branch); + TF_RETURN_IF_ERROR( + NodeBuilder(strings::StrCat("_Arg", arg_count), + 
FunctionLibraryDefinition::kArgOp) + .Attr("T", dtype) + .Attr("index", arg_count) + .Finalize(bodies_[branch_index].get(), + &cond_arg_node.branch_copy[branch_index])); + } + for (Node* node : cond_arg_node.switches) { + for (const Edge* e : node->out_edges()) { + if (e->IsControlEdge()) continue; + int branch_index = e->src_output(); + Node* src_copy = cond_arg_node.branch_copy[branch_index]; + Node* dst_copy = node_maps_[branch_index][e->dst()->id()]; + + // The graph may contain dead switch nodes, + if (dst_copy == nullptr) continue; + + TF_RET_CHECK(dst_copy != nullptr) + << "Unable to find copied node for " << e->dst()->DebugString() + << " on branch " << Branch_Name(BranchType(branch_index)); + // If the input goes directly to a merge then the merge has + // been replaced by a retval so the dst input is 0 instead of + // dst_input. + int dst_input = IsMerge(e->dst()) ? 0 : e->dst_input(); + bodies_[branch_index]->AddEdge(src_copy, 0, dst_copy, dst_input); + } + } + ++arg_count; + } + + // Verify that all retvals have an input. + // TODO(jpienaar): One could add a ZerosLike in the branch that doesn't have + // input. 
+ for (Node* m : merges_) { + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + bool has_input = false; + for (auto e : node_maps_[static_cast(branch)][m->id()]->in_edges()) { + if (!e->IsControlEdge()) { + has_input = true; + break; + } + } + if (!has_input) { + return errors::Internal( + "Failed to functionalize control flow with merge ", + FormatNodeForError(*m), " that doesn't have input on ", + Branch_Name(branch), " branch."); + } + } + } + + return Status::OK(); +} + +Status Conditional::ExtractBodies(Graph* graph) { + VLOG(2) << "Extracting bodies for " << name(); + for (auto b : {BranchType::kElseBranch, BranchType::kThenBranch}) { + bodies_[static_cast(b)] = + absl::make_unique(graph->op_registry()); + } + + auto find_branch = [&](const Edge* e) { + const auto& id = cond_state_map_->LookupId(e->src()); + return IsSwitch(e->src()) ? BranchType(e->src_output()) + : cond_state_map_->FindBranchOf(id, predicate_); + }; + + std::array, 2> stacks; + VLOG(5) << "Merges: " << NodesToString(merges_); + for (Node* m : merges_) { + VLOG(5) << "For merge: " << m->DebugString() << " " + << cond_state_map_->CondStateToString(m); + for (auto e : m->in_edges()) { + if (e->IsControlEdge()) continue; + BranchType branch = find_branch(e); + TF_RET_CHECK(branch == BranchType::kThenBranch || + branch == BranchType::kElseBranch) + << "Error: " << e->src()->name() + << " is not on either then or else branch (" << Branch_Name(branch) + << ")."; + Node* src = e->src(); + if (IsSwitch(src)) { + // Switch node outputs and dependencies are handled separately. 
+ TF_RETURN_IF_ERROR(AddSwitch(src)); + } else { + stacks[static_cast(branch)].push_back(src); + } + } + } + + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + int branch_index = static_cast(branch); + auto output = bodies_[branch_index].get(); + auto& stack = stacks[branch_index]; + VLOG(5) << "In branch: " << Branch_Name(branch) << " " + << NodesToString(stack); + std::vector visited(graph->num_node_ids(), false); + node_maps_[branch_index].resize(graph->num_node_ids(), nullptr); + auto& node_map = node_maps_[branch_index]; + + while (!stack.empty()) { + Node* n = stack.back(); + stack.pop_back(); + + if (visited.at(n->id())) continue; + visited[n->id()] = true; + + // Verify output edges and record control edges exitting scope. + for (const Edge* e : n->out_edges()) { + Node* dst = e->dst(); + if (IsMerge(dst)) continue; + Node* src = e->src(); + + auto dst_id = cond_state_map_->LookupId(dst); + auto src_id = cond_state_map_->LookupId(src); + if (dst_id != src_id) { + if (e->IsControlEdge()) { + external_control_outputs_.push_back(e->src()); + } else { + // Constants are treated specially to workaround the case of + // non-dominated constant nodes. + if (!IsConstant(src)) { + // TODO(b/78882471): A node that feeds into two different + // CondState is not necessarily an error so log a warning for now + // but revisit to improve the testing to enable making this an + // error. + LOG(WARNING) << errors::InvalidArgument( + "Graph contains node ", FormatNodeForError(*src), + " that feeds into node ", FormatNodeForError(*dst), + " but these nodes are in different control contexts (", + DebugString(src_id), " vs ", DebugString(dst_id), + " (detected during out edge testing)"); + } + } + } + } + + // Copying incomming edges to dst node. + for (const Edge* e : n->in_edges()) { + Node* src = e->src(); + // Skip src/dst node. 
+ if (!src->IsOp()) continue; + + Node* dst = e->dst(); + if (IsSwitch(src)) { + // Switch node outputs and dependencies are handled separately. + TF_RETURN_IF_ERROR(AddSwitch(src)); + continue; + } + + // Verify input is from the same context. + auto src_id = cond_state_map_->LookupId(src); + auto dst_id = cond_state_map_->LookupId(dst); + if (IsMerge(dst) || src_id == dst_id) { + // TODO(jpienaar): The merge case can be more strict. + if (node_map.at(src->id()) == nullptr) { + node_map.at(src->id()) = output->CopyNode(src); + stack.push_back(src); + } + } else if (e->IsControlEdge()) { + external_control_inputs_.push_back(src); + } else { + // This shouldn't happen, this means we have an external data input + // not entering via a switch node. Work around this for constant + // nodes as some constant nodes are inserted without the required + // control context dominance. + if (IsConstant(src)) { + node_map.at(src->id()) = output->CopyNode(src); + } else { + return errors::InvalidArgument( + "Graph contains node ", FormatNodeForError(*src), + " that feeds into node ", FormatNodeForError(*dst), + " but these nodes are in different control contexts (", + DebugString(src_id), " vs ", DebugString(dst_id), + " (detected during in edge testing)"); + } + } + + Node* src_copy = node_map.at(e->src()->id()); + int src_output = e->src_output(); + if (node_map.at(dst->id()) == nullptr) { + node_map.at(dst->id()) = output->CopyNode(dst); + } + Node* dst_copy = node_map.at(e->dst()->id()); + if (e->IsControlEdge()) { + // Skip control inputs from external context. + if (src_copy != nullptr) output->AddControlEdge(src_copy, dst_copy); + } else { + output->AddEdge(src_copy, src_output, dst_copy, e->dst_input()); + } + } + } + } + + // Build return values from the merge nodes. 
+ int index = 0; + for (Node* m : merges_) { + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + int branch_index = static_cast(branch); + auto& node_map = node_maps_[branch_index]; + auto output = bodies_[branch_index].get(); + TF_ASSIGN_OR_RETURN(node_map[m->id()], + BuildRetvalNode(output, m->output_type(0), index)); + } + ++index; + + // Connect the input to the merge_ with the retval, except if it is a + // Swich node, which is handled separately. + for (auto e : m->in_edges()) { + if (e->IsControlEdge()) continue; + int branch_index = static_cast(find_branch(e)); + auto& node_map = node_maps_[branch_index]; + auto output = bodies_[branch_index].get(); + Node* in = e->src(); + if (!IsSwitch(in)) { + if (node_map.at(in->id()) == nullptr) { + node_map[in->id()] = output->CopyNode(in); + } + output->AddEdge(node_map[in->id()], e->src_output(), + node_map.at(m->id()), 0); + } + } + } + return Status::OK(); +} + +Status Conditional::BuildIfNode(Graph* graph, + FunctionLibraryDefinition* library) { + VLOG(2) << "Build cond function for " << name(); + NodeDefBuilder builder(name(), "If"); + const string branch_name[] = {"else_branch", "then_branch"}; + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + int branch_index = static_cast(branch); + static std::atomic sequence_num(0LL); + int64 id = ++sequence_num; + + NameAttrList body_name; + body_name.set_name(strings::StrCat("_functionalize_if_", + branch_name[branch_index], "_", id)); + + VLOG(3) << "FunctionalizeControlFlow (" << branch_name[branch_index] + << "): " + << dump_graph::DumpGraphToFile( + "functionalize_cond_body_" + branch_name[branch_index], + *bodies_[branch_index], nullptr); + + FunctionDef body_fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(*bodies_[branch_index], + body_name.name(), &body_fdef)); + TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef)); + builder.Attr(branch_name[branch_index], body_name); + } + + VLOG(3) << "Build input type"; + 
std::vector inputs; + DataTypeVector in_arg_types; + for (auto& kv : cond_arg_nodes_) { + bool inserted = false; + for (const Node* arg : kv.switches) { + const Edge* in_edge; + TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge)); + if (in_edge->IsControlEdge()) { + builder.ControlInput(in_edge->src()->name()); + } else { + if (!inserted) { + DataType dtype = arg->input_type(0); + inputs.emplace_back(NodeDefBuilder::NodeOut( + in_edge->src()->name(), in_edge->src_output(), dtype)); + in_arg_types.push_back(dtype); + inserted = true; + } + } + } + } + builder.Attr("Tin", in_arg_types); + + DataTypeVector out_type; + for (const Node* merge : merges_) { + DataType dtype = merge->output_type(0); + out_type.push_back(dtype); + } + builder.Attr("Tout", out_type); + VLOG(3) << "Build output type: " << DataTypeVectorString(out_type); + + builder.Attr("Tcond", DT_BOOL); + builder.Device(predicate_.node->assigned_device_name()); + // Conditional should be the first input ... + builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(), + predicate_.index, + predicate_.node->output_type(0))); + // ... followed by the other inputs. + builder.Input(inputs); + + VLOG(3) << "Build If node"; + NodeDef if_def; + TF_RETURN_IF_ERROR(builder.Finalize(&if_def)); + TF_ASSIGN_OR_RETURN(if_node_, parent_->AddIfNode(if_def, *merges_.begin())); + + return Status::OK(); +} + +Status Conditional::AddInputEdges(Graph* graph) { + VLOG(2) << "AddInputEdges for " << if_node_->name(); + int index = 0; + // Add predicate input. + graph->AddEdge(const_cast(predicate_.node), predicate_.index, if_node_, + index++); + // Add function body inputs. 
+ for (auto& arg : cond_arg_nodes_) { + if (arg.src_output == Graph::kControlSlot) { + graph->AddControlEdge(arg.src, if_node_); + } else { + graph->AddEdge(arg.src, arg.src_output, if_node_, index++); + } + } + for (Node* n : external_control_inputs_) { + graph->AddControlEdge(n, if_node_); + } + return Status::OK(); +} + +Status Conditional::AddOutputEdges(Graph* graph) { + VLOG(2) << "AddOutputEdges for " << if_node_->name(); + int i = 0; + for (Node* node : merges_) { + TF_RETURN_IF_ERROR(parent_->AddIdentityNode(node, if_node_, i)); + std::vector edges(node->out_edges().begin(), + node->out_edges().end()); + for (const Edge* edge : edges) { + Node* dst = edge->dst(); + int dst_input = edge->dst_input(); + if (edge->src_output() > 0) { + return errors::Unimplemented("Output of index (", edge->src_output(), + ") of merge node ", + FormatNodeForError(*node)); + } + + bool control_edge = edge->IsControlEdge(); + graph->RemoveEdge(edge); + if (control_edge) { + graph->AddControlEdge(if_node_, dst); + } else { + graph->AddEdge(if_node_, i, dst, dst_input); + } + } + ++i; + } + for (Node* n : external_control_outputs_) { + graph->AddControlEdge(if_node_, n); + } + + return Status::OK(); +} + +Status Conditional::BuildAndReplace(Graph* graph, + FunctionLibraryDefinition* library) { + VLOG(1) << "Build If and replace merge nodes " << name(); + if (replaced_) return Status::OK(); + + TF_RETURN_IF_ERROR(ExtractBodies(graph)); + TF_RETURN_IF_ERROR(BuildArgumentNodes()); + + if (VLOG_IS_ON(3)) { + LOG(INFO) << "Extracted bodies:"; + for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) { + int branch_index = static_cast(branch); + auto output = bodies_[branch_index].get(); + LOG(INFO) << Branch_Name(branch) << ": " + << DebugString(output->ToGraphDefDebug()); + } + } + + TF_RETURN_IF_ERROR(BuildIfNode(graph, library)); + TF_RETURN_IF_ERROR(AddInputEdges(graph)); + TF_RETURN_IF_ERROR(AddOutputEdges(graph)); + 
TF_RETURN_IF_ERROR(parent_->PropagateUpdatedState(if_node_)); + for (Node* m : merges_) cond_state_map_->MarkDead(m); + + // Check that the if_node doesn't feed into itself. + TF_RETURN_WITH_CONTEXT_IF_ERROR( + CheckNodeNotInCycle(if_node_, graph->num_node_ids()), + "Converting to If failed."); + + replaced_ = true; + return Status::OK(); +} + +string Conditional::name() const { + CHECK(!merges_.empty()); + return strings::StrCat((*merges_.begin())->name(), "_if"); +} + +bool CondStateMap::ScopeIn(CondStateMap::CondId id, + CondStateMap::CondId* scope) { + if (id == nullptr) { + *scope = nullptr; + return true; + } + CondState state; + for (const CondNode& node : *id) { + if (node.type == CondNode::Type::kSwitch) { + state.push_back(node); + } + if (node.type == CondNode::Type::kMerge) { + if (state.empty()) { + return false; + } + DCHECK(state.back().type == CondNode::Type::kSwitch && + state.back().branch == BranchType::kBoth); + state.pop_back(); + } + } + *scope = GetUniqueId(state); + return true; +} + +Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node, + int port) { + Node* id; + TF_RETURN_IF_ERROR(NodeBuilder(replacee->name(), "Identity") + .Input(if_node, port) + .Finalize(graph_, &id)); + cond_state_map_.ResetId(id, cond_state_map_.LookupId(if_node)); + return Status::OK(); +} + +StatusOr FunctionalizeCond::AddIfNode(const NodeDef& def, + const Node* replacee) { + Status status; + Node* ret = graph_->AddNode(def, &status); + TF_RETURN_IF_ERROR(status); + CondStateMap::CondState state = cond_state_map_.LookupState(replacee); + state.pop_back(); + VLOG(1) << "Adding If for " << replacee->name(); + cond_state_map_.ResetId(ret, cond_state_map_.GetUniqueId(state)); + return ret; +} + +Status FunctionalizeCond::PropagateUpdatedState(const Node* replacee) { + VLOG(2) << "Propagating update state for " << replacee->name() << " " + << cond_state_map_.CondStateToString(replacee); + // Redo topological sort as the order could have changed. 
+ // TODO(jpienaar): The original topological order could also be updated + // dynamically if needed. + std::vector rev_topo_order; + GetPostOrder(*graph_, &rev_topo_order); + + // All the outputs of the new node could potentially be updated. + std::unordered_set changed; + for (auto n : replacee->out_nodes()) + if (n->IsOp()) changed.insert(n); + + // Iterate through the changed/possible changed nodes in topological order. + for (auto it = rev_topo_order.rbegin(); + it != rev_topo_order.rend() && !changed.empty(); ++it) { + if (changed.find(*it) != changed.end()) { + // Update the node state. + Node* n = *it; + CondStateMap::CondId old_state = cond_state_map_.LookupId(n); + cond_state_map_.ResetId(n, nullptr); + TF_RETURN_IF_ERROR(DetermineCondState(n)); + if (cond_state_map_.LookupId(n) != old_state) { + for (auto out : n->out_nodes()) + if (out->IsOp()) changed.insert(out); + } + changed.erase(n); + } + } + return Status::OK(); +} + +// Returns the most restrictive branch of two branches or neither. This is the +// meet operator of the BranchType lattice. +BranchType MeetBranch(const BranchType& lhs, const BranchType& rhs) { + if (lhs == rhs) return lhs; + if (lhs == BranchType::kNeither) return rhs; + if (rhs == BranchType::kNeither) return lhs; + if (lhs == BranchType::kBoth) return rhs; + if (rhs == BranchType::kBoth) return lhs; + return BranchType::kNeither; +} + +CondStateMap::ContainsResult CondStateMap::LhsHoldsWhereverRhsHolds( + CondStateMap::CondId lhs, CondStateMap::CondId rhs) { + CondId lhs_scope; + CondId rhs_scope; + bool could_determine_scope = ScopeIn(lhs, &lhs_scope); + could_determine_scope = could_determine_scope && ScopeIn(rhs, &rhs_scope); + if (!could_determine_scope) return kIncomparable; + + // Returns whether a contains b. + auto contains = [&](CondId a, CondId b) { + // Handle empty states. 
+ if (a == nullptr && b != nullptr) return true; + if (a == nullptr && b == nullptr) return true; + if (a != nullptr && b == nullptr) return false; + + if (a->size() > b->size()) return false; + auto a_it = a->begin(); + auto b_it = b->begin(); + while (a_it != a->end()) { + if (*a_it != *b_it) { + if (!(a_it->predicate == b_it->predicate)) return false; + BranchType mb = MeetBranch(a_it->branch, b_it->branch); + if (mb != b_it->branch) return false; + } + ++a_it; + ++b_it; + } + return true; + }; + + bool lhs_contains_rhs = contains(lhs_scope, rhs_scope); + bool rhs_contains_lhs = contains(rhs_scope, lhs_scope); + if (lhs_contains_rhs && rhs_contains_lhs) return kEqual; + if (lhs_contains_rhs) return kLhsContainsRhs; + if (rhs_contains_lhs) return kRhsContainsLhs; + return kIncomparable; +} + +BranchType CondStateMap::FindBranchOf(CondId id, OutputTensor predicate) const { + if (IsEmpty(id)) return BranchType::kNeither; + absl::optional b; + const CondState& nodes = *id; + for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) { + if (it->type == CondStateMap::CondNode::Type::kSwitch && + it->predicate == predicate) { + if (b.has_value()) { + b = MeetBranch(*b, it->branch); + } else { + b = it->branch; + } + if (*b == BranchType::kNeither) { + LOG(FATAL) << "Inconsistent state for node: " << DebugString(id); + } + } + } + return b.has_value() ? *b : BranchType::kNeither; +} + +StatusOr FunctionalizeCond::JoinCondStatesNonMerge( + CondStateMap::CondId src, CondStateMap::CondId dst) { + VLOG(4) << "Joining src=" << DebugString(src) << " [" << src + << "] and dst=" << DebugString(dst) << " [" << dst << "]"; + + if (cond_state_map_.IsEmpty(dst) || cond_state_map_.IsDead(src)) return src; + if (cond_state_map_.IsDead(dst)) return dst; + + // Nothing to do if the CondState is the same. 
+ if (src == dst) return src; + + CondStateMap::CondId src_scope; + CondStateMap::CondId dst_scope; + if (!cond_state_map_.ScopeIn(src, &src_scope)) + return errors::Unimplemented( + "Predicates that must hold for node to execute are invalid! ", + DebugString(src)); + if (!cond_state_map_.ScopeIn(dst, &dst_scope)) + return errors::Unimplemented( + "Predicates that must hold for node to execute are invalid! ", + DebugString(dst)); + + auto result = cond_state_map_.LhsHoldsWhereverRhsHolds(src_scope, dst_scope); + switch (result) { + case CondStateMap::kIncomparable: + return errors::InvalidArgument( + "Graph contains node with inputs predicated on incompatible " + "predicates: ", + DebugString(src), " and ", DebugString(dst)); + case CondStateMap::kEqual: + // If both respect the same predicates, propagate the longer constraint. + if ((src != nullptr && dst == nullptr) || + (src != nullptr && dst != nullptr && src->size() > dst->size())) + return src; + else + return dst; + case CondStateMap::kLhsContainsRhs: + // src contains dst, so dst is already more restrictive. + return dst; + case CondStateMap::kRhsContainsLhs: + // dst contains src, so src is more restrictive. + return src; + } +} + +StatusOr +FindThenElseSwitchForPredicate(const OutputTensor& pred, + CondStateMap::CondId id) { + for (auto it = id->begin(); it != id->end(); ++it) { + // Along every path one there can be only one instance of a then or else + // switch for a given predicate, so return once found. + if (it->type == CondStateMap::CondNode::Type::kSwitch && + it->predicate == pred && + (it->branch == BranchType::kThenBranch || + it->branch == BranchType::kElseBranch)) + return it; + } + return errors::Internal("Unable to find then/else branch with predicate ", + DebugString(pred), " for ", DebugString(id)); +} + +StatusOr FunctionalizeCond::JoinCondStatesMerge( + CondStateMap::CondId src, CondStateMap::CondId dst) { + // Determine the flow state when joining two states for a merge + // node. 
Combining the two states for a merge node is effectively performing a + // disjunction of the states along the different input edges. For a merge that + // can be transformed into a If the two inputs paths have to have a predicate + // on which they differ (e.g., along one edge predicate `p` has to hold while + // on another it should not). This function first determines this predicate + // and then the resultant state is the common path between the two inputs + // followed by s(p, both). + VLOG(4) << "Joining (for merge) " << DebugString(src) << " and " + << DebugString(dst); + if (cond_state_map_.IsEmpty(dst)) return src; + + if (cond_state_map_.IsDead(src)) return src; + if (cond_state_map_.IsDead(dst)) return dst; + + CondStateMap::CondId src_scope; + CondStateMap::CondId dst_scope; + if (!cond_state_map_.ScopeIn(src, &src_scope)) + return errors::Unimplemented( + "Predicates that must hold for node to execute are invalid! ", + DebugString(src)); + if (!cond_state_map_.ScopeIn(dst, &dst_scope)) + return errors::Unimplemented( + "Predicates that must hold for node to execute are invalid! ", + DebugString(dst)); + + TF_RET_CHECK(src_scope != nullptr && dst_scope != nullptr) + << "Illegal merge inputs from outer scope: src=" << DebugString(src) + << " dst=" << DebugString(dst); + auto src_it = src_scope->begin(); + auto dst_it = dst_scope->begin(); + + // Find branch divergent condition. 
+ OutputTensor pred; + while (src_it != src_scope->end() && dst_it != dst_scope->end()) { + if (*src_it != *dst_it) { + VLOG(5) << "Diverges with: " << DebugString(*src_it) << " and " + << DebugString(*dst_it); + if (!(src_it->predicate == dst_it->predicate)) { + return errors::InvalidArgument( + "Unable to find common predicate which holds for one input " + "but not the other of the merge node."); + } + pred = src_it->predicate; + break; + } + ++src_it; + ++dst_it; + } + + if (pred.node == nullptr) + return errors::InvalidArgument("Unable to determine predicate for merge."); + + TF_ASSIGN_OR_RETURN(auto div_src_it, + FindThenElseSwitchForPredicate(pred, src)); + TF_ASSIGN_OR_RETURN(auto div_dst_it, + FindThenElseSwitchForPredicate(pred, dst)); + TF_RET_CHECK(*div_src_it != *div_dst_it); + + CondStateMap::CondState result; + // Populate result with the longest/most restrictive path up to the divergent + // node. For example, if the one input is `[switch(pred:0, then)]` and the + // other is `[switch(pred:0, both), merge, switch(pred:0, else)]` (as created + // in gradient of cond test), then the resultant state here should be + // `[switch(pred:0, both), merge, switch(pred:0, both)]`. 
+ if (std::distance(src->begin(), div_src_it) > + std::distance(dst->begin(), div_dst_it)) { + result.assign(src->begin(), std::next(div_src_it)); + } else { + result.assign(dst->begin(), std::next(div_dst_it)); + } + result.back().branch = BranchType::kBoth; + return cond_state_map_.GetUniqueId(result); +} + +CondStateMap::CondId FunctionalizeCond::StateAlongEdge(const Edge* e) { + Node* src = e->src(); + CondStateMap::CondId id = cond_state_map_.LookupId(e->src()); + if (IsMerge(src)) { + CondStateMap::CondState state; + if (id != nullptr) state = *id; + state.emplace_back(CondStateMap::CondNode::Type::kMerge); + return cond_state_map_.GetUniqueId(state); + } + if (IsSwitch(src)) { + CondStateMap::CondState state; + if (id != nullptr) state = *id; + if (e->IsControlEdge()) { + state.emplace_back(CondStateMap::CondNode::Type::kSwitch, src, + BranchType::kBoth); + } else { + state.emplace_back(CondStateMap::CondNode::Type::kSwitch, src, + BranchType(e->src_output())); + } + return cond_state_map_.GetUniqueId(state); + } + return id; +} + +Status FunctionalizeCond::DetermineCondStateMerge(Node* dst) { + // Only Merge nodes with two inputs are supported, but if this is a redundant + // merge, then the dead edge may already have been removed (if due to a + // switch) and so the input count would be incorrect. 
+ if (cond_state_map_.IsDead(cond_state_map_.LookupId(dst))) + return Status::OK(); + + int data_inputs = 0; + for (auto e : dst->in_edges()) { + Node* src = e->src(); + VLOG(5) << "Processing forward flow for merge: " << e->DebugString() << " " + << cond_state_map_.CondStateToString(src); + if (!src->IsOp()) continue; + if (!e->IsControlEdge()) ++data_inputs; + + CondStateMap::CondId prop = StateAlongEdge(e); + auto id_or = JoinCondStatesMerge(prop, cond_state_map_.LookupId(dst)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ", + FormatNodeForError(*dst)); + cond_state_map_.ResetId(dst, id_or.ValueOrDie()); + } + + // Incomplete Merge nodes are not supported. + if (data_inputs != 2) { + return errors::Unimplemented( + dst->name(), " only has ", data_inputs, + " inputs, while only merge nodes with two inputs supported."); + } + return Status::OK(); +} + +Status FunctionalizeCond::DetermineCondState(Node* dst) { + // The logic for the merge and non-merge case differ: for non-merge it is + // the most restrictive CondState, while for merge nodes the + // resultant state is less restrictive than either. + if (IsMerge(dst)) { + TF_RETURN_IF_ERROR(DetermineCondStateMerge(dst)); + } else { + // Handle non-merge join. + for (auto e : dst->in_edges()) { + VLOG(5) << "Processing forward flow for: " << e->DebugString() << " " + << cond_state_map_.CondStateToString(dst); + Node* src = e->src(); + if (!src->IsOp()) continue; + + // Joining the state between the current and propagated state. + CondStateMap::CondId prop = StateAlongEdge(e); + auto id_or = JoinCondStatesNonMerge(prop, cond_state_map_.LookupId(dst)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ", + FormatNodeForError(*dst)); + cond_state_map_.ResetId(dst, id_or.ValueOrDie()); + } + } + return Status::OK(); +} + +Status FunctionalizeCond::RemoveRedundantMerge(Node* node) { + // Handle redundant merge nodes. 
A merge node is considered redundant if + // one input edge is dead while the other has a value. + if (!cond_state_map_.IsDead(cond_state_map_.LookupId(node))) + return Status::OK(); + + const Edge* non_dead_edge = nullptr; + for (auto e : node->in_edges()) { + if (e->IsControlEdge()) continue; + Node* src = e->src(); + + // Handle merge with dead state. + const auto& src_id = cond_state_map_.LookupId(src); + if (!cond_state_map_.IsDead(src_id)) { + non_dead_edge = e; + break; + } + } + + if (non_dead_edge == nullptr) { + return errors::InvalidArgument("Merge node ", FormatNodeForError(*node), + " has no non-dead inputs."); + } + cond_state_map_.MarkDead(node); + delete_nodes_.push_back(node->id()); + VLOG(5) << "removing redundant merge: " << node->name(); + while (!node->out_edges().empty()) { + const Edge* oe = *node->out_edges().begin(); + Node* dst_node = oe->dst(); + int dst_port = oe->dst_input(); + graph_->RemoveEdge(oe); + graph_->AddEdge(non_dead_edge->src(), + dst_port == Graph::kControlSlot + ? Graph::kControlSlot + : non_dead_edge->src_output(), + dst_node, dst_port); + } + return Status::OK(); +} + +Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) { + // Handle redundant switch nodes. A switch node is considered redundant if + // the predicate of the switch already holds on the current branch. E.g., if + // p is the predicate of the switch but p is already known to hold on this + // branch, then the switch can be removed and the dead state propagated + // along one. The checking of predicate is based on the exact predicate + // (rather than boolean equivalence) and aimed at redundant switches as + // currently generated by gradient code. + OutputTensor pred; + TF_RETURN_IF_ERROR(GetSwitchPredicate(*node, &pred)); + auto dst_id = cond_state_map_.LookupId(node); + BranchType b = cond_state_map_.FindBranchOf(dst_id, pred); + // Determine if we are already on a branch where the switch predicate is + // true/false. 
+ if (b != BranchType::kThenBranch && b != BranchType::kElseBranch) + return Status::OK(); + + VLOG(5) << "Redundant switch " << node->name(); + const Edge* value_edge; + TF_RETURN_IF_ERROR(node->input_edge(0, &value_edge)); + Node* val_node = value_edge->src(); + int val_port = value_edge->src_output(); + while (!node->out_edges().empty()) { + auto e = *node->out_edges().begin(); + Node* dst_node = e->dst(); + int dst_input = e->dst_input(); + int switch_branch = e->src_output(); + graph_->RemoveEdge(e); + if (switch_branch == Graph::kControlSlot) { + if (IsMerge(dst_node)) { + auto id_or = + JoinCondStatesMerge(dst_id, cond_state_map_.LookupId(dst_node)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ", + FormatNodeForError(*dst_node)); + cond_state_map_.ResetId(dst_node, id_or.ValueOrDie()); + } else { + auto id_or = + JoinCondStatesNonMerge(dst_id, cond_state_map_.LookupId(dst_node)); + TF_RETURN_IF_ERROR(id_or.status()); + cond_state_map_.ResetId(dst_node, id_or.ValueOrDie()); + } + } else if (BranchType(switch_branch) != b) { + cond_state_map_.MarkDead(dst_node); + delete_nodes_.push_back(dst_node->id()); + continue; + } + graph_->AddEdge( + val_node, + switch_branch == Graph::kControlSlot ? Graph::kControlSlot : val_port, + dst_node, dst_input); + } + return Status::OK(); +} + +Status FunctionalizeCond::DetermineCondStates( + std::vector rev_topo_order) { + // The state that is propagated along the given edge. + for (auto it = rev_topo_order.rbegin(); it != rev_topo_order.rend(); ++it) { + Node* dst = *it; + TF_RETURN_IF_ERROR(DetermineCondState(dst)); + if (IsSwitch(dst)) TF_RETURN_IF_ERROR(RemoveRedundantSwitch(dst)); + if (IsMerge(dst)) TF_RETURN_IF_ERROR(RemoveRedundantMerge(dst)); + + VLOG(5) << dst->name() << " :: " << cond_state_map_.CondStateToString(dst); + } + return Status::OK(); +} + +void FunctionalizeCond::DeleteReachableNodes() { + // Delete all nodes that have been extracted or are reachable from + // deleted/dead nodes. 
The input and outgoing edges should have already been + // removed. + std::vector deleted(graph_->num_node_ids(), false); + // Don't try to delete source or sink nodes. + deleted[graph_->kSourceId] = true; + deleted[graph_->kSinkId] = true; + while (!delete_nodes_.empty()) { + int d_id = delete_nodes_.front(); + delete_nodes_.pop_front(); + if (deleted[d_id]) continue; + Node* d = graph_->FindNodeId(d_id); + // Switch and Merge nodes could have been deleted already. + if (d == nullptr) continue; + for (const Edge* e : d->out_edges()) { + delete_nodes_.push_back(e->dst()->id()); + } + deleted[d_id] = true; + graph_->RemoveNode(d); + } +} + +void FunctionalizeCond::SortMergeNodes(std::vector* merge_order) { + // Sort merge nodes by nesting depth. + using sort_pair = std::pair; + std::vector inner_to_outer_merge_order; + inner_to_outer_merge_order.reserve(merge_order->size()); + for (auto it = merge_order->rbegin(); it != merge_order->rend(); ++it) { + Node* merge = *it; + CondStateMap::CondId id = cond_state_map_.LookupId(merge); + int depth = 0; + for (auto cond_node_it = id->begin(); cond_node_it != id->end(); + ++cond_node_it) { + if (cond_node_it->type == CondStateMap::CondNode::Type::kSwitch && + (cond_node_it->branch == BranchType::kThenBranch || + cond_node_it->branch == BranchType::kElseBranch)) { + ++depth; + } + } + inner_to_outer_merge_order.emplace_back(depth, merge); + } + std::stable_sort( + inner_to_outer_merge_order.begin(), inner_to_outer_merge_order.end(), + [](sort_pair lhs, sort_pair rhs) { return lhs.first > rhs.first; }); + merge_order->clear(); + for (sort_pair t : inner_to_outer_merge_order) { + merge_order->push_back(t.second); + } +} + +Status FunctionalizeCond::FunctionalizeInternal() { + // The general approach for converting a tf.cond (as lowered via switch/merge + // nodes) to a functional if is as follows: + // 1. Determine the topological order and collect all the switch and merge + // nodes in the graph; + // 2. 
Compute the predicates and dominance structure for all the nodes in the + // graph - this includes which predicate must be true for a op to execute + // (predicate values are considered directly rather than attempting to + // determine deeper equivalence). We shall refer to this structure as the + // CondState; + // 3. Sort the merge nodes by nesting depth; + // 4. Extract merge nodes together that have the same CondState and whose + // input nodes have the same state from the innermost to the outermost into + // IfOps; Note: In the above only nodes paths that converge to a merge node + // will be considered for removal. + + // Perform a DFS over the graph and + // * Determine the reverse topological order of the nodes (there should be no + // cycles at this point so the post-order numbering corresponds to the + // reverse topological sorting); + // * Record reverse topological for merge and switch nodes; + std::vector rev_topo_order; + std::vector switch_ids; + std::vector merge_order; + DFS(*graph_, nullptr, [&](Node* n) { + if (IsSwitch(n)) { + switch_ids.push_back(n->id()); + } + if (IsMerge(n)) { + merge_order.push_back(n); + } + if (n->IsOp()) { + rev_topo_order.push_back(n); + } + }); + + // No merges to functionalize. + if (merge_order.empty()) { + // No merges mean no switch values consumed (as only considering values + // fetchable as output of merge); + for (auto it = switch_ids.begin(); it != switch_ids.end(); ++it) { + graph_->RemoveNode(graph_->FindNodeId(*it)); + } + return Status::OK(); + } + + TF_RETURN_IF_ERROR(DetermineCondStates(std::move(rev_topo_order))); + + if (VLOG_IS_ON(4)) DumpGraphWithCondState("cond_id"); + + // Sort the merge nodes from innermost outwards. + SortMergeNodes(&merge_order); + + // Extract from innermost out. 
+ for (auto it = merge_order.begin(); it != merge_order.end(); ++it) { + Node* merge = *it; + auto id = cond_state_map_.LookupId(merge); + if (cond_state_map_.IsDead(id)) continue; + + // Construct a Conditional with the predicate of the merge (which is the + // last entry of the CondState for the merge) and this as parent. + DCHECK(id->back().predicate.node != nullptr); + Conditional cond(id->back().predicate, this, &cond_state_map_); + TF_RETURN_IF_ERROR(cond.AddMerge(merge)); + + // Find all merge nodes with the same CondId. This is done repeatedly as + // the CondId can change due replaced conditionals. E.g., the one branch + // could previously have had a conditional nested in it, and so would have + // had CondState with sub-state [switch(p,b),m] (where p is some predicate), + // post removing the nested conditional that sub-state would no longer be + // path of the propagated state along that path. + auto end = merge_order.end(); + for (auto merge_candidate_it = std::next(it); merge_candidate_it != end; + ++merge_candidate_it) { + auto merge_candidate_it_id = + cond_state_map_.LookupId(*merge_candidate_it); + if (merge_candidate_it_id != id) continue; + TF_RETURN_IF_ERROR(cond.AddMerge(*merge_candidate_it)); + } + + TF_RETURN_IF_ERROR(cond.BuildAndReplace(graph_, library_)); + + if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract"); + } + + // All remaining Switch nodes are not reachable from a Merge node and + // removed. This is to account for dead Switch nodes. 
+ for (int s_id : switch_ids) delete_nodes_.push_back(s_id); + for (Node* m : merge_order) delete_nodes_.push_back(m->id()); + DeleteReachableNodes(); + + return Status::OK(); +} + +void FunctionalizeCond::DumpGraphWithCondState(const string& name) { + const char* const kCondGroupDebugAttr = "_XlaFunctionalizeCondGroup"; + + for (Node* n : graph_->nodes()) { + n->ClearAttr(kCondGroupDebugAttr); + n->AddAttr(kCondGroupDebugAttr, cond_state_map_.CondStateToString(n)); + } + LOG(INFO) << "FunctionalizeControlFlow (" << name << "): " + << dump_graph::DumpGraphToFile( + strings::StrCat("functionalize_", name), *graph_, library_); +} + +Status FunctionalizeCond::Functionalize(Graph* graph, + FunctionLibraryDefinition* library) { + VLOG(1) << "FunctionalizeCond::Functionalize"; + FunctionalizeCond fc(graph, library); + return fc.FunctionalizeInternal(); +} + +} // namespace functionalize_cond + +Status FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library) { + // FunctionalizeControlFlow is invoked for every function, so the loops's + // bodies and conditionals that were extracted into functions will be handled + // in successive invocations. + return functionalize_cond::FunctionalizeCond::Functionalize(graph, library); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h new file mode 100644 index 0000000000000000000000000000000000000000..86436011c6ebdc608a5811a1b0d6a10015d405bd --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_cond.h @@ -0,0 +1,248 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ + +#include +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Functionalize all the switch-merge nodes of a loop-free graph into If +// nodes. That is, attempt to transform every remaining switch and merge nodes +// in the graph into If nodes. +// Precondition: All while loops have been removed from graph. +Status FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library); + +// Internal functions/classes exposed for testing purposes. +namespace functionalize_cond { + +// All nodes are assumed to be either in no branch, then branch, else branch, +// or both branches (such as merge nodes). +// The code below relies on Else and Then being 0 and 1 (corresponding to the +// switch outputs). Both and Neither are arbitrary. +enum class BranchType { + kElseBranch = 0, + kThenBranch = 1, + kBoth = 2, + kNeither = 3, +}; + +// CondStateMap is responsible for mapping from each graph Node to a CondState, +// where each CondState is the array of CondNodes (corresponding to switch, +// merge or dead states) as described below. For efficiency, this class interns +// the CondState, so that CondState equality comparisons are simply pointer +// comparisons. 
+class CondStateMap { + public: + explicit CondStateMap(Graph* graph); + + // Represents an entry in the CondState. An entry can either be the + // switch (along with predicate), merge, or dead: + // * switch node indicates a node that is executed along a branch with the + // given predicate - a branch can be then, else or both; + // * merge node indicates that the node is executed as output of a merge; + // * dead indicates that this node can never be executed; + struct CondNode { + enum class Type { kSwitch = 1, kMerge = 2, kDead = 3 }; + + CondNode(Type type, Node* switch_node = nullptr, + BranchType branch = BranchType::kNeither); + + string ToString() const; + bool operator==(const CondNode& other) const; + bool operator!=(const CondNode& other) const; + + // Type of node. + Type type; + + // Predicate and branch, only used when type is kSwitch. + OutputTensor predicate; + BranchType branch; + }; + + // A node in the graph is executed when multiple conditions hold. The order + // represents the nesting of the predicates that hold and is used when + // extracting the nested conditionals. + using CondState = std::vector; + + // Every unique ID is mapped to a CondState. + using CondId = const CondState*; + + // Returns the CondId for a given node. + CondId LookupId(const Node* node) const; + + // Returns the unique CondId for CondState. + CondId GetUniqueId(const CondState& state); + + // Returns the CondState for a Node. + // REQUIRES: node has a non-empty CondState. + const CondState& LookupState(const Node* node) const; + + // Resets the CondId for a given node. + void ResetId(const Node* node, CondId id); + + // Marks `node` as dead. + void MarkDead(const Node* node); + + // Determine branch execution of CondState. + BranchType FindBranchOf(CondId id, OutputTensor predicate) const; + + // Enum to represent whether one cond flow state contains another. 
+ enum ContainsResult { + kIncomparable, + kEqual, + kLhsContainsRhs, + kRhsContainsLhs + }; + + // Returns whether the lhs CondState holds wherever rhs CondState hols. I.e., + // [(p,t)] contains [(p,t), (r,t)]. + ContainsResult LhsHoldsWhereverRhsHolds(CondId lhs, CondId rhs); + + // Returns textual representation of node's CondState. + string CondStateToString(const Node* node) const; + string CondStateToString(CondId id) const; + + // Returns whether the cond state is the dead state. + bool IsDead(CondId id) const; + + // Returns whether the cond state is the empty state. + bool IsEmpty(CondId id) const; + + // Computes the predicates that have to hold for a node to execute and returns + // whether it was possible to determine the predicates that must hold. `scope` + // is populated with these predicates. Scope differs from state in that it + // does not include merge and both nodes. + bool ScopeIn(CondId id, CondId* scope); + + private: + // Hash for CondNode and CondState. + struct CondHash { + size_t operator()(const CondNode& item) const; + size_t operator()(const CondState& vec) const; + }; + + // Set to keep track of unique CondStates. + // Pointers to the entries in the unordered set are used as identifiers: + // unordered_set guarantees that the pointers remain the same. + std::unordered_set condstate_set_; + + // Mapping from Node id to CondId. + std::vector node_to_condid_map_; + + // Track the CondId for newly inserted nodes. We use a vector to quickly map + // from Node id in the original graph to the CondId, but there will be nodes + // added to the original graph (such as If nodes) whose CondState needs to be + // tracked too. + std::unordered_map added_node_mapping_; + + // Identifier of the dead flow state. The empty flow state is represented with + // a nullptr. + CondId dead_id_; +}; + +// FunctionalizeCond groups all the state used by functionalizing conditionals +// of the given graph together. 
+class FunctionalizeCond { + public: + // Functionalize all the switch-merge nodes of a loop-free graph into If + // nodes. That is, attempt to transform every remaining switch and merge nodes + // in the graph into If nodes. + // Precondition: All while loops have been removed from graph. + static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library); + + // Build identity node with the same name as the merge that will be replaced + // in case the output is fetched/colocated. + Status AddIdentityNode(const Node* replacee, Node* if_node, int port); + + // Add a If node to the graph defined by def that will, amongst other, replace + // replacee in the graph. + xla::StatusOr AddIfNode(const NodeDef& def, const Node* replacee); + + // Propagates the state of a newly inserted node. + Status PropagateUpdatedState(const Node* replacee); + + // Dump graph with the CondState annotated. + void DumpGraphWithCondState(const string& name); + + private: + FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library); + + // Performs the actual cond functionalization. Iterate over groups of merge + // nodes (linked by common predicate & CondIds of the incomming edges), + // from innermost to outermost, and extract into If nodes. + Status FunctionalizeInternal(); + + // Returns the forward flow state propagated along edge `e`. + // This may modify cond_state_map_. + CondStateMap::CondId StateAlongEdge(const Edge* e); + + // Determines the CondState of all the nodes in the given vector where + // the input is expected in reverse topological order. + // This populates the cond_state_map_. + Status DetermineCondStates(std::vector rev_topo_order); + + // Determine the CondState for a given node using the incomming edges + // to the node. Note: it is expected that this node's CondState is only + // determined once its input's CondState is. + Status DetermineCondState(Node* dst); + + // Helper functions for DetermineCondState. 
+ Status DetermineCondStateMerge(Node* dst); + + // Helper functions for DetermineCondStates. Determines the dst node's + // CondState by joining the src and dst's CondState where either + // the dst node is a merge or not. + // These may modify cond_state_map_. + xla::StatusOr JoinCondStatesMerge( + CondStateMap::CondId src, CondStateMap::CondId dst); + xla::StatusOr JoinCondStatesNonMerge( + CondStateMap::CondId src, CondStateMap::CondId dst); + + // Checks if a merge node is redundant and if so removes it from the graph. + Status RemoveRedundantMerge(Node* node); + + // Checks if a switch node is redundant and if so removes it from the graph. + Status RemoveRedundantSwitch(Node* node); + + // Sorts merge nodes (in reverse topological order) in order of increasing + // nesting depth. + void SortMergeNodes(std::vector* merge_order); + + // Deletes all nodes in/consumers of `delete_nodes_`. + void DeleteReachableNodes(); + + // Member used to unique the CondState to a unique CondId and keep track of + // CondState/CondId per Node. + CondStateMap cond_state_map_; + + // Nodes to be deleted. + std::deque delete_nodes_; + + FunctionLibraryDefinition* library_; + Graph* graph_; + + friend class FunctionalizeCondTest; +}; + +} // namespace functionalize_cond + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_ diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a27f8893925855f536801a8a68855b82ac07462d --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc @@ -0,0 +1,184 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Tests for the backward const analysis. + +#include "tensorflow/compiler/tf2xla/functionalize_cond.h" + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace functionalize_cond { + +class FunctionalizeCondTest : public ::testing::Test { + protected: + FunctionalizeCondTest() { + graph_.reset(new Graph(OpRegistry::Global())); + flib_def_.reset( + new FunctionLibraryDefinition(OpRegistry::Global(), fdef_lib_)); + fc_.reset(new functionalize_cond::FunctionalizeCond(graph_.get(), + flib_def_.get())); + } + + CondStateMap::CondId GetUniqueId( + const CondStateMap::CondStateMap::CondState& state) { + return fc_->cond_state_map_.GetUniqueId(state); + } + + xla::StatusOr JoinCondStatesNonMerge( + CondStateMap::CondId src, CondStateMap::CondId dst) { + return fc_->JoinCondStatesNonMerge(src, dst); + } + + xla::StatusOr JoinCondStatesMerge( + CondStateMap::CondId src, CondStateMap::CondId dst) { + return fc_->JoinCondStatesMerge(src, dst); + } + + bool ScopeIn(CondStateMap::CondId ff, CondStateMap::CondId* scope) { + return fc_->cond_state_map_.ScopeIn(ff, scope); + } + + CondStateMap::ContainsResult LhsHoldsWhereverRhsHolds( + CondStateMap::CondId lhs, CondStateMap::CondId rhs) { + return fc_->cond_state_map_.LhsHoldsWhereverRhsHolds(lhs, 
rhs); + } + + FunctionDefLibrary fdef_lib_; + std::unique_ptr fc_; + std::unique_ptr flib_def_; + std::unique_ptr graph_; +}; + +namespace { + +TEST_F(FunctionalizeCondTest, ScopeIn) { + Tensor pred_tensor(DT_BOOL, TensorShape()); + pred_tensor.flat().setZero(); + Node* pred = test::graph::Constant(graph_.get(), pred_tensor, "pred"); + Tensor val_tensor(DT_INT32, TensorShape()); + val_tensor.flat().setZero(); + Node* val = test::graph::Constant(graph_.get(), val_tensor, "val"); + Node* s = test::graph::Switch(graph_.get(), val, pred); + + { + CondStateMap::CondStateMap::CondState ss; + ss.emplace_back(CondStateMap::CondNode( + CondStateMap::CondNode::Type::kSwitch, s, BranchType::kThenBranch)); + CondStateMap::CondId id = GetUniqueId(ss); + CondStateMap::CondId scope; + ASSERT_TRUE(ScopeIn(id, &scope)); + ASSERT_TRUE(id == scope); + } + + CondStateMap::CondState empty; + { + CondStateMap::CondState ss; + ss.emplace_back(CondStateMap::CondNode( + CondStateMap::CondNode::Type::kSwitch, s, BranchType::kBoth)); + ss.emplace_back( + CondStateMap::CondNode(CondStateMap::CondNode::Type::kMerge)); + CondStateMap::CondId id = GetUniqueId(ss); + CondStateMap::CondId scope_1; + ASSERT_TRUE(ScopeIn(id, &scope_1)); + ASSERT_TRUE(scope_1 == GetUniqueId(empty)); + ASSERT_TRUE(id != scope_1); + + ss.clear(); + ss.emplace_back(CondStateMap::CondNode( + CondStateMap::CondNode::Type::kSwitch, s, BranchType::kBoth)); + id = GetUniqueId(ss); + CondStateMap::CondId scope_2; + ASSERT_TRUE(ScopeIn(id, &scope_2)); + + ASSERT_TRUE(LhsHoldsWhereverRhsHolds(scope_1, scope_2) == + CondStateMap::ContainsResult::kLhsContainsRhs); + } +} + +TEST_F(FunctionalizeCondTest, JoinCondStates) { + Tensor pred_tensor(DT_BOOL, TensorShape()); + pred_tensor.flat().setZero(); + Node* pred = test::graph::Constant(graph_.get(), pred_tensor, "pred"); + Tensor val_tensor(DT_INT32, TensorShape()); + val_tensor.flat().setZero(); + Node* val = test::graph::Constant(graph_.get(), val_tensor, "val"); + Node* s = 
test::graph::Switch(graph_.get(), val, pred); + + CondStateMap::CondId empty = GetUniqueId({}); + + CondStateMap::CondId then_branch; + { + CondStateMap::CondState ss; + ss.emplace_back(CondStateMap::CondNode( + CondStateMap::CondNode::Type::kSwitch, s, BranchType::kThenBranch)); + then_branch = GetUniqueId(ss); + } + CondStateMap::CondId else_branch; + { + CondStateMap::CondState ss; + ss.emplace_back(CondStateMap::CondNode( + CondStateMap::CondNode::Type::kSwitch, s, BranchType::kElseBranch)); + else_branch = GetUniqueId(ss); + } + + // An non-merge op with inputs from then and else branch. + Status status = JoinCondStatesNonMerge(then_branch, else_branch).status(); + EXPECT_TRUE(errors::IsInvalidArgument(status)); + + // Merge between then and else branch. + auto joined_or = JoinCondStatesMerge(then_branch, else_branch); + TF_EXPECT_OK(joined_or.status()); + CondStateMap::CondId joined = joined_or.ValueOrDie(); + + // Merge between then branch and both branch. + auto t = JoinCondStatesNonMerge(then_branch, joined); + // Note: this is OK in terms of constraint predication, but + TF_EXPECT_OK(t.status()); + + // Post merge the propagated forward flow state has an additional merge. + CondStateMap::CondId post_merge; + { + CondStateMap::CondState ss; + ss = *joined; + ss.emplace_back( + CondStateMap::CondNode(CondStateMap::CondNode::Type::kMerge)); + post_merge = GetUniqueId(ss); + } + + t = JoinCondStatesNonMerge(post_merge, joined); + TF_EXPECT_OK(t.status()); + EXPECT_TRUE(joined == t.ValueOrDie()); + + // No predicate that results in two paths predicated on different conditions + // merge. + t = JoinCondStatesMerge(post_merge, joined); + EXPECT_FALSE(t.ok()); + + // Post the merge we are effectively in the root scope and merging should + // result in the more restrictive post merge state. 
+ t = JoinCondStatesNonMerge(post_merge, empty); + TF_EXPECT_OK(t.status()); + EXPECT_TRUE(post_merge == t.ValueOrDie()); +} + +} // namespace +} // namespace functionalize_cond +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 42585ad4d8a17d71146e48b69f9fa56f9ff24c3e..5932be4e525dec11a8f3c59bb85e0449e76e79c0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -21,1412 +21,24 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" +#include "tensorflow/compiler/tf2xla/functionalize_cond.h" +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" +#include "tensorflow/compiler/tf2xla/functionalize_while.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" -#include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/graph/node_builder.h" namespace tensorflow { -namespace { - -using xla::StatusOr; - -const char* const kArgOp = "_Arg"; -const char* const kRetValOp = "_Retval"; - -// Information about a loop argument. -struct Arg { - // Every loop argument has an Enter node. - Node* enter; - - // Is the loop argument a loop-invariant value? Taken from the `is_constant` - // attribute on the Enter node. - bool is_loop_invariant; - - // If 'is_loop_invariant' is true, the following are all nullptr. 
Non-constant - // arguments must have all of the following nodes: - Node* merge = nullptr; - Node* switch_node = nullptr; - Node* next_iteration = nullptr; - Node* exit = nullptr; -}; - -// Information about a loop frame. -struct Frame { - string name; - - // Pointer to the parent frame. The root frame has a pointer to itself. - Frame* parent = nullptr; - int num_children = 0; - - // Arguments to this loop. - std::vector args; - - // The loop condition of the loop. There should be exactly one loop condition - // in every loop. - Node* loop_cond = nullptr; - - // Set of nodes that belong to the loop frame. - std::unordered_set nodes; -}; - -// Comparison function used for sorting nodes consistently. -// a) resource variables are last, and -// b) sort lexicographically by name (for deterministic output). -struct NodeCmp { - bool operator()(const Node* lhs, const Node* rhs) const { - bool lhs_is_resource = - lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false; - bool rhs_is_resource = - rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false; - return std::tie(lhs_is_resource, lhs->name()) < - std::tie(rhs_is_resource, rhs->name()); - } -}; - -// Returns a textual representation of the names of the nodes in the input. -template -string NodesToString(const T& nodes) { - return strings::StrCat("{", - str_util::Join(nodes, ",", - [](string* output, const Node* node) { - strings::StrAppend(output, - node->name()); - }), - "}"); -} - -// Copies a subgraph from `graph` to `output` by performing a reverse DFS -// starting at nodes in vector `stack`. -// `node_map` is a vector indexed by source node ID to dest nodes. -// Does not traverse into nodes in `node_map`, so by adding nodes to `node_map` -// before the traversal clients can cut the graph. 
If a frame is provided (frame -// != nullptr), then this functions will return an error if the -// traversal leaves 'frame'; the client must add enough nodes to `node_map` to -// cut the graph and prevent the traversal from escaping. -// -// `squash_src_outputs` contains a bool for each source node ID. If true, then -// the source output on that node will be replaced by zero when copied. This is -// used when replacing a Switch node with an _Arg node. The output we are -// taking from the Switch node was not necessarily the first output, but _Arg -// nodes only have one output. By adding the Switch node to `squash_src_outputs` -// we rewrite the src_output of the corresponding edge to be 0. -Status CopySubgraph(const Graph& graph, const Frame* frame, - std::vector stack, - const std::vector& squash_src_outputs, - std::vector* node_map, Graph* output) { - VLOG(3) << "Stack: " << NodesToString(stack); - std::vector visited(graph.num_node_ids(), false); - while (!stack.empty()) { - Node* n = stack.back(); - stack.pop_back(); - - VLOG(5) << "Copying node " << n->name(); - - if (visited[n->id()]) continue; - visited[n->id()] = true; - - for (const Edge* e : n->in_edges()) { - Node* src = e->src(); - if (frame != nullptr && frame->nodes.find(src) == frame->nodes.end()) { - // We traversed out of the loop frame, without encountering a cut node. - return errors::Internal("Graph traversal of loop frame ", frame->name, - " escaped frame at ", src->name(), - " without encountering an argument node."); - } - if ((*node_map)[src->id()] == nullptr) { - (*node_map)[src->id()] = output->CopyNode(src); - stack.push_back(src); - } - Node* src_copy = (*node_map)[e->src()->id()]; - int src_output = squash_src_outputs[e->src()->id()] && !e->IsControlEdge() - ? 
0 - : e->src_output(); - Node* dst_copy = (*node_map)[e->dst()->id()]; - output->AddEdge(src_copy, src_output, dst_copy, e->dst_input()); - } - } - return Status::OK(); -} - -StatusOr AddNode(const NodeDef& node_def, Graph* graph) { - Status status; - Node* inserted_node = graph->AddNode(node_def, &status); - if (!status.ok()) { - return status; - } - return inserted_node; -} - -StatusOr BuildArgNode(Graph* graph, DataType type, int index) { - NodeDef arg_def; - NodeDefBuilder builder(strings::StrCat(kArgOp, index), kArgOp); - builder.Attr("T", type); - builder.Attr("index", index); - TF_RETURN_IF_ERROR(builder.Finalize(&arg_def)); - return AddNode(arg_def, graph); -} - -StatusOr BuildRetvalNode(Graph* graph, DataType type, int index) { - NodeDef ret_def; - ret_def.set_op(kRetValOp); - ret_def.set_name(strings::StrCat(kRetValOp, index)); - AddNodeAttr("T", type, &ret_def); - AddNodeAttr("index", index, &ret_def); - return AddNode(ret_def, graph); -} - -// Builds a graph for the loop condition. -Status BuildLoopCondition(const Graph& graph, Frame* frame, - std::unique_ptr* cond_output) { - VLOG(2) << "Building loop condition for " << frame->name; - *cond_output = xla::MakeUnique(graph.op_registry()); - Graph* output = cond_output->get(); - - // Map from nodes in the original graph to the condition graph. - std::vector node_map(graph.num_node_ids(), nullptr); - std::vector squash_src_outputs(graph.num_node_ids(), false); - - // Build one _Arg node for each Enter node. - for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; - - TF_ASSIGN_OR_RETURN(Node * arg_node, - BuildArgNode(output, arg.enter->input_type(0), i)); - if (arg.is_loop_invariant) { - node_map[arg.enter->id()] = arg_node; - } else { - node_map[arg.merge->id()] = arg_node; - } - } - - // Build a Retval node for the loop condition. The LoopCond nodes are always - // boolean because of the type constraints on the LoopCond op. 
- TF_ASSIGN_OR_RETURN(node_map[frame->loop_cond->id()], - BuildRetvalNode(output, DT_BOOL, 0)); - - // Performs a reverse DFS, copying nodes and edges to the output graph. - // The _Arg and _Retval nodes were added unconditionally above, so we are - // guaranteed to get the correct function signature. - return CopySubgraph(graph, frame, {frame->loop_cond}, squash_src_outputs, - &node_map, output); -} - -// Builds a graph for the loop body. -Status BuildLoopBody(const Graph& graph, Frame* frame, - DataTypeVector* arg_types, - std::unique_ptr* body_output) { - VLOG(2) << "Building loop body for " << frame->name; - *body_output = xla::MakeUnique(graph.op_registry()); - Graph* output = body_output->get(); - - // Map from nodes in the original graph to the condition graph. - std::vector node_map(graph.num_node_ids(), nullptr); - std::vector squash_src_outputs(graph.num_node_ids(), false); - - // Build one _Arg node for each Enter node. - std::vector next_iterations; - next_iterations.reserve(frame->args.size()); - arg_types->reserve(frame->args.size()); - for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; - - DataType dtype = arg.enter->input_type(0); - arg_types->push_back(dtype); - - TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, dtype, i)); - - if (dtype == DT_RESOURCE) { - // The convention of the XLA bridge is that resource variable arguments - // are only inputs to the loop body and have no corresponding output. - // TODO(b/37741920): change the convention so that DT_RESOURCE variables - // are both inputs and outputs, and then remove this case. - TF_RET_CHECK(arg.is_loop_invariant); - node_map[arg.enter->id()] = arg_node; - } else { - TF_ASSIGN_OR_RETURN(Node * retval_node, - BuildRetvalNode(output, dtype, i)); - - if (arg.is_loop_invariant) { - // Argument is loop-invariant. Forward it from the Arg to the Retval. 
- node_map[arg.enter->id()] = arg_node; - output->AddEdge(arg_node, 0, retval_node, 0); - } else { - // Argument is loop-varying. - node_map[arg.switch_node->id()] = arg_node; - // The Switch node has two outputs, but _Arg only has one. This tells - // the CopySubgraph function to rewrite the output number of edges from - // the _Arg node to be 0 rather than copying the output number from the - // Switch node. - squash_src_outputs[arg.switch_node->id()] = true; - node_map[arg.next_iteration->id()] = retval_node; - next_iterations.push_back(arg.next_iteration); - } - } - } - - // Performs a reverse DFS, copying nodes and edges to the output graph. - // The _Arg and _Retval nodes were added unconditionally above, so we are - // guaranteed to get the correct function signature. - TF_RETURN_IF_ERROR(CopySubgraph(graph, frame, std::move(next_iterations), - squash_src_outputs, &node_map, output)); - - return Status::OK(); -} - -// Copy the FunctionDef of given function from lookup_library to library, if -// it can be found in lookup_library but is missing from library. -Status AddMissingFunctionByName(const string& function_name, - const FunctionLibraryDefinition* lookup_library, - FunctionLibraryDefinition* library) { - if (!library->Find(function_name) && lookup_library->Find(function_name)) { - return library->AddFunctionDef(*lookup_library->Find(function_name)); - } - return Status::OK(); -} - -// Iterate over all functions that the given fdef refers to. Copy the missing -// FunctionDefs from lookup_library to library. -Status AddMissingFunctionDef(const FunctionDef& fdef, - const FunctionLibraryDefinition* lookup_library, - FunctionLibraryDefinition* library) { - TF_RET_CHECK(lookup_library); - for (const NodeDef& node : fdef.node_def()) { - if (library->Find(node.op())) { - continue; - } - // The function refered by 'SymbolicGradient' node is specified in its - // attribute 'f'. 
- if (node.op() == FunctionLibraryDefinition::kGradientOp) { - const AttrValue* attr = - AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr); - if (!attr) { - return errors::InvalidArgument("SymbolicGradient is missing attr: f"); - } - const string& func_name = attr->func().name(); - TF_RETURN_IF_ERROR( - AddMissingFunctionByName(func_name, lookup_library, library)); - // Copy the user-defined gradient function if it exists. - const string grad_name = lookup_library->FindGradient(func_name); - if (!grad_name.empty() && library->FindGradient(func_name).empty()) { - TF_RETURN_IF_ERROR( - AddMissingFunctionByName(grad_name, lookup_library, library)); - GradientDef grad_def; - grad_def.set_function_name(func_name); - grad_def.set_gradient_func(grad_name); - TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def)); - } - } else if (lookup_library->Find(node.op())) { - TF_RETURN_IF_ERROR( - library->AddFunctionDef(*lookup_library->Find(node.op()))); - } - } - return Status::OK(); -} - -Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, - Graph* graph, Frame* frame, - FunctionLibraryDefinition* library) { - VLOG(2) << "Frame " << frame->name << " before: " - << dump_graph::DumpGraphToFile("functionalize_before", *graph, - library); - - // Split loop-varying Enter nodes with multiple successors. If the same - // Tensor is fed as input to multiple loop arguments, we may end up with a - // shared Enter node. We clone Enter nodes with multiple successors to - // maintain the invariant of a unique Enter node per argument of the final - // loop. 
- std::vector args; - for (const Arg& arg : frame->args) { - if (arg.is_loop_invariant) { - args.push_back(arg); - } else { - std::vector edges(arg.enter->out_edges().begin(), - arg.enter->out_edges().end()); - for (int i = 0; i < edges.size(); ++i) { - if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) { - continue; - } - TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name(); - Arg new_arg; - new_arg.is_loop_invariant = false; - if (i == 0) { - new_arg.enter = arg.enter; - } else { - new_arg.enter = graph->CopyNode(arg.enter); - frame->nodes.insert(new_arg.enter); - for (Edge const* e : arg.enter->in_edges()) { - graph->AddEdge(e->src(), e->src_output(), new_arg.enter, - e->IsControlEdge() ? Graph::kControlSlot : 0); - } - Node* dst = edges[i]->dst(); - int dst_input = edges[i]->dst_input(); - graph->RemoveEdge(edges[i]); - graph->AddEdge(new_arg.enter, 0, dst, dst_input); - } - args.push_back(new_arg); - } - } - } - frame->args = std::move(args); - - std::sort( - frame->args.begin(), frame->args.end(), - [](const Arg& a, const Arg& b) { return NodeCmp()(a.enter, b.enter); }); - - if (frame->loop_cond == nullptr) { - return errors::InvalidArgument("Loop ", frame->name, - " has no LoopCond node"); - } - - // Find the set of Switch nodes that are successors of the LoopCond. - std::unordered_set switches; - for (const Edge* edge : frame->loop_cond->out_edges()) { - if (!edge->IsControlEdge() && IsSwitch(edge->dst()) && - edge->dst_input() == 1) { - switches.insert(edge->dst()); - } - } - - // For each non-constant argument, looks for the following pattern of nodes: - // Enter ----> Merge --------> Switch --> Exit - // ^ ^ - // | | - // NextIteration LoopCond - // ^ ^ - // | | - // ... ... - for (Arg& arg : frame->args) { - if (!arg.is_loop_invariant) { - // Follow the edge from the Enter to Merge. - const Edge* enter_merge = nullptr; - for (const Edge* e : arg.enter->out_edges()) { - // Ignore control-edges to the sink node. 
These are allowed by the - // graph invariants, although probably they should have been stripped - // off earlier. - if (e->IsControlEdge() && e->dst()->IsSink()) { - continue; - } - if (enter_merge != nullptr) { - return errors::Internal( - "Enter node for loop-varying argument ", arg.enter->name(), - " has multiple successors: ", enter_merge->dst()->name(), " and ", - e->dst()->name()); - } - enter_merge = e; - } - if (enter_merge == nullptr) { - return errors::Internal("Enter node for loop-varying argument ", - arg.enter->name(), " has zero successors"); - } - arg.merge = enter_merge->dst(); - if (!IsMerge(arg.merge)) { - return errors::InvalidArgument( - "Successor of Enter node for loop-varying argument ", - arg.merge->name(), - " is not a Merge node; got: ", arg.merge->type_string()); - } - - // Find the NextIteration from the merge. There should be two inputs to - // the Merge and the NextIteration should be the other input. - if (arg.merge->input_types().size() != 2) { - return errors::InvalidArgument( - "Unexpected number of inputs to Merge node for loop-varying " - "argument ", - arg.merge->name(), "; expected 2, got ", - arg.merge->input_types().size()); - } - TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(), - &arg.next_iteration)); - if (!IsNextIteration(arg.next_iteration)) { - return errors::InvalidArgument( - "Expected NextIteration node as input to Merge node; got node ", - arg.next_iteration->name(), " with kind ", - arg.next_iteration->type_string()); - } - - // Find the Switch successor of the Merge. There should be exactly one - // Switch node that is a successor of both the Merge and the LoopCond. 
- for (const Edge* edge : arg.merge->out_edges()) { - if (edge->dst_input() == 0 && IsSwitch(edge->dst()) && - switches.find(edge->dst()) != switches.end()) { - if (arg.switch_node != nullptr) { - return errors::InvalidArgument("Duplicate Switch successors to ", - arg.merge->name()); - } - arg.switch_node = edge->dst(); - } - } - if (arg.switch_node == nullptr) { - return errors::InvalidArgument("Missing Switch successor to ", - arg.merge->name()); - } - - // Update the device on the Identity outputs of the switch to match their - // target. These Identity outputs do not - - // Loop over the switch node's output to: - // - Find the Exit successor. - // - Set the sharding on all Identity outputs of the switch. These - // identity nodes are values used by the loop body or condition. - // The Identity node may have the wrong device so copy the device from - // one of its outputs instead. - std::deque possible_exit; - for (const Edge* edge : arg.switch_node->out_edges()) { - if (edge->src_output() == 0) { - possible_exit.push_back(edge); - } - if (IsIdentity(edge->dst())) { - TF_RETURN_IF_ERROR( - SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); - } - } - // TODO(b/67425339): Allow general graph between switch and exit. - while (!possible_exit.empty()) { - const Edge* edge = possible_exit.front(); - possible_exit.pop_front(); - if (IsExit(edge->dst())) { - if (arg.exit != nullptr) { - return errors::InvalidArgument("Duplicate Exit successors to ", - arg.switch_node->name()); - } - arg.exit = edge->dst(); - } else { - if (!IsIdentity(edge->dst())) { - return errors::Unimplemented("General graph between switch (", - arg.switch_node->name(), - ") and exit node of frame ", - frame->name, " not supported yet."); - } - for (const Edge* out : edge->dst()->out_edges()) { - possible_exit.push_back(out); - } - } - } - } - } - - // Builds the condition and body functions. 
- std::unique_ptr cond_graph; - TF_RETURN_IF_ERROR(BuildLoopCondition(*graph, frame, &cond_graph)); - DataTypeVector arg_types; - std::unique_ptr body_graph; - TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph)); - - VLOG(2) << "Frame " << frame->name << " condition: " - << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library) - << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph); - - static std::atomic sequence_num(0LL); - int64 id = ++sequence_num; - NameAttrList cond_name; - cond_name.set_name(strings::StrCat("_functionalize_cond_", id)); - NameAttrList body_name; - body_name.set_name(strings::StrCat("_functionalize_body_", id)); - FunctionDef cond_fdef; - TF_RETURN_IF_ERROR( - GraphToFunctionDef(*cond_graph, cond_name.name(), &cond_fdef)); - FunctionDef body_fdef; - TF_RETURN_IF_ERROR( - GraphToFunctionDef(*body_graph, body_name.name(), &body_fdef)); - - TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef)); - TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef)); - if (lookup_library) { - // Copy missing FunctionDefs from lookup_library to library to make library - // self-contained. - TF_RETURN_IF_ERROR( - AddMissingFunctionDef(cond_fdef, lookup_library, library)); - TF_RETURN_IF_ERROR( - AddMissingFunctionDef(body_fdef, lookup_library, library)); - } - - // Builds a While operator. 
- NodeDef while_def; - NodeDefBuilder builder(frame->loop_cond->name(), "XlaWhile"); - builder.Attr("T", arg_types); - builder.Attr("cond", cond_name); - builder.Attr("body", body_name); - std::vector inputs; - for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; - const Edge* in_edge; - TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); - if (in_edge->IsControlEdge()) { - builder.ControlInput(in_edge->src()->name()); - } else { - inputs.push_back(NodeDefBuilder::NodeOut( - in_edge->src()->name(), in_edge->src_output(), arg_types[i])); - } - } - builder.Input(inputs); - TF_RETURN_IF_ERROR(builder.Finalize(&while_def)); - TF_ASSIGN_OR_RETURN(Node * while_node, AddNode(while_def, graph)); - - // Copies edges to the Enter nodes and from the Exit nodes onto the While. - for (int i = 0; i < frame->args.size(); ++i) { - const Arg& arg = frame->args[i]; - const Edge* in_edge; - TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); - if (in_edge->IsControlEdge()) { - graph->AddControlEdge(in_edge->src(), while_node); - } else { - graph->AddEdge(in_edge->src(), in_edge->src_output(), while_node, i); - } - - if (!arg.is_loop_invariant) { - // Add output edges if the output of the loop is consumed. - if (arg.exit != nullptr) { - std::vector edges(arg.exit->out_edges().begin(), - arg.exit->out_edges().end()); - for (const Edge* edge : edges) { - Node* dst = edge->dst(); - int dst_input = edge->dst_input(); - graph->RemoveEdge(edge); - - if (dst_input == Graph::kControlSlot) { - graph->AddControlEdge(while_node, dst); - } else { - graph->AddEdge(while_node, i, dst, dst_input); - } - } - } - } - } - - // Remove the old nodes from the graph, and add the while node to the parent - // frame. 
- for (Node* node : frame->nodes) { - graph->RemoveNode(node); - } - frame->nodes.clear(); - frame->parent->nodes.insert(while_node); - - VLOG(2) << "Frame " << frame->name << " after: " - << dump_graph::DumpGraphToFile("functionalize_after", *graph, - library); - - return Status::OK(); -} - -class FunctionalizeCond { - public: - // All nodes are assumed to be either in no branch, then branch, else branch, - // or both branches (such as merge nodes). - enum Branch { - kElseBranch = 0, - kThenBranch = 1, - kBoth = 2, - kNeither = 3, - kNumBranchTypes = 4 - }; - - // Returns a textual representation of the Branch b. - static string Branch_Name(FunctionalizeCond::Branch b); - - // Functionalize all the switch-merge nodes of a loop-free graph into XlaIf - // nodes. That is, attempt to transform every remaining switch and merge nodes - // in the graph into XlaIf nodes. - // Precondition: All while loops have been removed from graph. - static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library); - - private: - // CondArgNode represents a input to the conditional and its corresponding - // switch nodes. - struct CondArgNode { - explicit CondArgNode(Node* src, int src_output) - : src(src), src_output(src_output) {} - string ToString() const { - return strings::StrCat("src=", src->name(), ":", src_output, - " switches=", NodesToString(switches)); - } - - Node* src; - int src_output; - std::vector switches; - }; - using CondArgNodes = std::vector; - - struct ForwardFlowNode { - explicit ForwardFlowNode(Branch branch = Branch::kNeither) - : branch(branch), count(0) {} - string ToString() const { - return strings::StrCat("branch=", Branch_Name(branch), " count=", count); - } - Branch branch; - int count; - }; - - // Group of switch nodes that will be part of the same XlaIf. 
- struct SwitchCluster { - explicit SwitchCluster(const Edge* predicate_edge) - : predicate_edge(predicate_edge) {} - string ToString() const { - return strings::StrCat(name, " predicate=", predicate_edge->src()->name(), - " switches=", NodesToString(switches)); - } - - string name; - const Edge* predicate_edge; - std::vector switches; - }; - - FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library, - bool dump_graphs) - : library_(library), graph_(graph), dump_graphs_(dump_graphs) {} - - // Perform the actual cond functionalization. Iterate over groups of switch - // nodes (linked by common predicate), from innermost to outermost, and - // extract into XlaIf nodes. - Status FunctionalizeInternal(); - - // Determines the branch_map (mapping from node to branch of cond) and - // frontier (the nodes where the cond ends). - StatusOr, - std::unordered_set>> - DetermineBranchMapAndFrontier(const SwitchCluster& switch_cluster); - - // Returns XlaIf node created from subgraph of merge and switch nodes. This - // encapsulates the process of extracting the bodies needed for the then and - // else branch, creates a XlaIf node, removing the nodes of the branches from - // the graph and replacing the merge node with a XlaIf. - StatusOr ConvertToXlaIf(const CondArgNodes& cond_arg_nodes, - const SwitchCluster& switch_cluster, - const std::vector& switches); - - // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with. - StatusOr BuildAndAddXlaIfOp(const CondArgNodes& cond_arg_nodes, - const SwitchCluster& switch_cluster, - const std::vector& merge_nodes); - - // Extracts a function body corresponding to the given input edge of the merge - // node. - Status ExtractBody(const CondArgNodes& cond_arg_nodes, - const std::vector& switches, - const std::vector& merge_nodes, int input_edge, - Graph* body); - - // Adds all the input edges to `if_node` corresponding to the arguments. 
- Status AddInputEdges(const CondArgNodes& cond_arg_nodes, - const Edge* predicate_edge, Node* if_node); - - // Adds all output edges from the `if_node`. - Status AddOutputEdges(const std::vector& outputs, Node* if_node); - - // Returns the switch clusters of graph_ in postorder. Dead switch nodes are - // skipped and removed from the graph. - StatusOr> DeterminePredicateSwitchOrder(); - - // Update the state for destination based on the state of source and the node - // being updated. - Status Join(const ForwardFlowNode& src_state, const Node* dst, - ForwardFlowNode* dst_state); - - // Ensure that all nodes in the branch_map are dominated by the switch - // nodes. Returns nodes that are not dominated by the switches but are a - // control dependency of a node in the cond, and remove such control - // dependencies. - StatusOr> EnsureDominanceAndReturnNonDominatedControlNodes( - const std::unordered_map& branch_map, - const std::vector& switches); - - // Validates that the frontier of nodes for the conditional - // section are as expected. 
- Status ValidateFrontier( - const std::unordered_map& branch_map, - const std::unordered_set& frontier); - - FunctionLibraryDefinition* library_; - Graph* graph_; - bool dump_graphs_; -}; - -bool IsDeadSwitch(const Node* node) { - for (const Edge* e : node->out_edges()) { - const Node* dst = e->dst(); - if (!dst->IsIdentity()) { - return false; - } - for (const Edge* ee : dst->out_edges()) { - if (!ee->IsControlEdge() || !ee->dst()->IsSink()) { - return false; - } - } - } - return true; -} - -string FunctionalizeCond::Branch_Name(FunctionalizeCond::Branch b) { - const string branch_name[FunctionalizeCond::kNumBranchTypes + 1] = { - "else", "then", "both", "neither", "count"}; - return branch_name[b]; -} - -Status FunctionalizeCond::ValidateFrontier( - const std::unordered_map& - branch_map, - const std::unordered_set& frontier) { - std::unordered_set pending[kNumBranchTypes]; - for (Node* n : frontier) { - pending[branch_map.at(n).branch].insert(n); - } - TF_RET_CHECK(pending[kNeither].empty()) << NodesToString(pending[kNeither]); - for (const Node* n : pending[kBoth]) { - TF_RET_CHECK(IsMerge(n)) << n->DebugString(); - // Merge nodes may be in then or else branch too - } - int index = (pending[kThenBranch].size() <= pending[kElseBranch].size()) - ? kThenBranch - : kElseBranch; - int other = 1 - index; - for (const Node* n : pending[index]) { - if (pending[other].find(n) != pending[other].end()) { - return errors::Internal( - "Node (", n->DebugString().c_str(), - ") in both Else and Then branch should be in Both."); - } - } - // An empty frontier indicates a dead switch. Above we attempt to remove dead - // switch nodes, but not all are removed so don't treat it as an error yet. - // TODO(jpienaar): Find out why dead switch nodes remain. 
- // if (pending[kBoth].empty() && pending[kThenBranch].empty() && - // pending[kElseBranch].empty()) { - // return errors::Internal("Unexpected empty frontier for switch nodes"); - // } - return Status::OK(); -} - -Status FunctionalizeCond::Join(const ForwardFlowNode& src_state, - const Node* dst, ForwardFlowNode* dst_state) { - TF_RET_CHECK(dst_state->branch != Branch::kBoth && - dst_state->branch != Branch::kNumBranchTypes) - << "Unexpected/Invalid branch type: Merging " - << Branch_Name(src_state.branch) << " with " - << Branch_Name(dst_state->branch); - if (dst_state->branch == Branch::kNeither) { - dst_state->branch = src_state.branch; - } else if (src_state.branch != dst_state->branch && - src_state.branch != Branch::kNeither) { - if (IsMerge(dst)) { - dst_state->branch = Branch::kBoth; - } else { - return errors::Internal("Illegal merge:\n", src_state.ToString(), - " with ", dst_state->ToString(), " for\n", - dst->DebugString()); - } - } - ++dst_state->count; - return Status::OK(); -} - -StatusOr> -FunctionalizeCond::DeterminePredicateSwitchOrder() { - struct Cluster { - bool operator==(const Cluster& other) const { - return representative == other.representative; - } - int representative = -1; - }; - - // Perform a DFS over the graph and - // * Determine the reverse topological order of the nodes (there should be no - // cycles at this point so the post-order numbering corresponds to the - // reverse topological sorting); - // * Identify dead switches; - // * Initialize the cluster's representative; - std::vector> clusters(graph_->num_node_ids()); - std::vector dead_switches; - std::vector switch_order; - std::vector rev_topo_sorted_nodes; - DFS(*graph_, nullptr, [&](Node* n) { - clusters[n->id()].Get().representative = n->id(); - if (IsSwitch(n)) { - if (IsDeadSwitch(n)) { - dead_switches.push_back(n); - } else { - rev_topo_sorted_nodes.push_back(n); - switch_order.push_back(n); - } - } else if (n->IsOp()) { - // Exclude src and sink nodes from further 
consideration. - rev_topo_sorted_nodes.push_back(n); - } - }); - - std::vector switch_clusters; - // Return early if there are no switches in the graph. - if (switch_order.empty()) { - return switch_clusters; - } - - // Remove all dead switch nodes. - for (Node* n : dead_switches) { - VLOG(2) << "Removing dead switch: " << n->DebugString(); - graph_->RemoveNode(n); - } - - // Identify switch nodes that are part of the same control flow context by - // considering the operands of operations: an operation is part of the same - // control context as its operands unless the operation is a switch. Control - // dependencies are considered part of the same control flow context if the - // switch depth is the same (see comment below). - - // entry_cluster records the input cluster to a switch node. This is used when - // merging with a merge node where the dst's cluster is merged with the entry - // cluster of the merge node's cluster (which corresponds to a switch cluster - // and so has an entry cluster). - std::unordered_map*> entry_cluster; - - // Returns the output cluster of a node. Where the output cluster is cluster - // where the output of the node is used. For non-merge nodes this is simply - // the cluster they are part of, while for merge nodes it is the entry cluster - // of the cluster they are part of (this will correspond to the entry node of - // a switch node that dominates the merge). - auto find_output_cluster = [&](Node* n) { - UnionFind* cluster = &clusters[n->id()]; - if (!IsMerge(n)) return cluster; - auto it = entry_cluster.find(clusters[n->id()].Get().representative); - // If the cluster is not found in the entry_cluster map then an - // instruction not dominated by a switch node has been merged into the - // cluster of the merge. This indicates a failure of the clustering. 
- CHECK(it != entry_cluster.end()) - << "Unable to find entry for n=" << n->id() << " (" - << cluster->Get().representative << ")"; - return it->second; - }; - - // TODO(jpienaar): This could be combined with DetermineBranchMapAndFrontier. - std::vector switch_depth(graph_->num_node_ids()); - for (auto it = rev_topo_sorted_nodes.rbegin(); - it != rev_topo_sorted_nodes.rend(); ++it) { - Node* n = *it; - - // Compute switch depth. - int new_switch_depth = 0; - for (const Edge* e : n->in_edges()) { - Node* src = e->src(); - new_switch_depth = std::max( - new_switch_depth, switch_depth[src->id()] - (IsMerge(src) ? 1 : 0)); - } - switch_depth[n->id()] = new_switch_depth + (IsSwitch(n) ? 1 : 0); - - // Only merge the input operands of a switch. The switch's clustering itself - // is determined by the interaction of the switch's outputs. - if (IsSwitch(n)) { - Node* input; - TF_CHECK_OK(n->input_node(0, &input)); - entry_cluster[n->id()] = find_output_cluster(input); - UnionFind* cluster = entry_cluster[n->id()]; - int cluster_depth = switch_depth[cluster->Get().representative]; - // Merge the inputs of the switch node with one another. This results in - // predicates and control input residing in the same cluster. - for (const Edge* e : n->in_edges()) { - // Only consider the data inputs to the Switch node. 
- if (e->IsControlEdge()) continue; - - Node* src = e->src(); - UnionFind* src_cluster = find_output_cluster(src); - int src_cluster_depth = switch_depth[src_cluster->Get().representative]; - if (cluster_depth != src_cluster_depth) { - return errors::InvalidArgument( - "Unable to functionalize control flow in graph: Switch ('", - n->name(), "') has operands ('", input->name(), "' and '", - src->name(), "') that have different switch depths (", - cluster_depth, " != ", src_cluster_depth, ")"); - } - cluster->Merge(src_cluster); - } - continue; - } - - for (const Edge* e : n->in_edges()) { - Node* src = e->src(); - if (!src->IsOp()) continue; - UnionFind* cluster = find_output_cluster(src); - // Merge a node with its data operands and with its control operands if - // the src and dst are in the same ControlContext. The ControlContext is - // not explicitly available here, and instead the switch depth is used as - // a proxy here. Due to the invariant that control edges can only be from - // a containing scope to an inner scope or from the inner scope to its - // containing scope (for exit nodes), the switch depth will only match if - // the src and dst are in the same ControlContext. Control edges between - // ControlContexts are handled during the extraction. - int src_id = cluster->Get().representative; - int src_depth = switch_depth[src_id]; - if (!e->IsControlEdge() || new_switch_depth == src_depth) { - if (src_depth != new_switch_depth) { - // TODO(b/77601805) remove this when outside_compilation supports - // control flow. 
- if (str_util::StrContains(src->name(), "outside_compilation") || - str_util::StrContains(n->name(), "outside_compilation")) { - return errors::InvalidArgument( - "outside_compilation is not yet supported within TensorFlow " - "control flow constructs b/77601805"); - } - return errors::InvalidArgument( - "Unable to functionalize control flow in graph: Operand ('", - src->name(), "') and operator ('", n->name(), - "') have different switch depths (", src_depth, - " != ", new_switch_depth, ")"); - } - cluster->Merge(&clusters[n->id()]); - } - } - } - - if (dump_graphs_) { - // Mark the switch cluster each node is part of. - for (Node* n : graph_->nodes()) { - n->ClearAttr("_XlaFunctionalizeSwitchGroup"); - n->AddAttr("_XlaFunctionalizeSwitchGroup", - clusters[n->id()].Get().representative); - } - LOG(INFO) << "FunctionalizeControlFlow (with_clusters): " - << dump_graph::DumpGraphToFile("functionalize_clustered", *graph_, - library_); - } - - // Verify all the nodes of a cluster are at the same depth. - std::unordered_map> cluster_to_depth_node; - for (Node* n : graph_->nodes()) { - int depth = switch_depth[n->id()]; - int cluster_rep = clusters[n->id()].Get().representative; - auto it = cluster_to_depth_node.find(cluster_rep); - if (it == cluster_to_depth_node.end()) { - cluster_to_depth_node[cluster_rep] = std::make_pair(depth, n); - } else { - if (it->second.first != depth) { - return errors::Internal( - "Illegal clustering created, mismatch in depths:", "\n\t", - n->DebugString(), "(", clusters[n->id()].Get().representative, - ") at depth=", depth, " vs\n\t", it->second.second->DebugString(), - "(", clusters[n->id()].Get().representative, ") at depth ", - it->second.first); - } - } - } - - struct Hash { - size_t operator()(const std::pair& item) const { - return Hash64Combine(hash()(item.first), - std::hash()(item.second.representative)); - } - }; - - // Merge Switch nodes with common predicate. 
- std::unordered_map, int, Hash> predicate_index; - // The nodes in switch_order are in reverse topological order, but the - // clustered switches need not be (i.e., when considered as a cluster one - // element of a cluster may be later in the topological order than another - // node whose cluster is later in the topological order of clustered - // switches). - for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) { - const Edge* pred_edge; - TF_CHECK_OK((*it)->input_edge(1, &pred_edge)); - // The predicate can be preceded by a identity node. Look through identity - // nodes to predicate. - while (pred_edge->src()->IsIdentity()) { - TF_CHECK_OK(pred_edge->src()->input_edge(0, &pred_edge)); - } - auto repr = std::make_pair(pred_edge->src(), clusters[(*it)->id()].Get()); - if (predicate_index.find(repr) == predicate_index.end()) { - predicate_index[repr] = switch_clusters.size(); - switch_clusters.emplace_back(pred_edge); - // Generate a name by concatenating with the cluster representative as - // there could be multiple switch clusters with the same predicate. 
- switch_clusters[predicate_index[repr]].name = strings::StrCat( - pred_edge->src()->name(), "_", repr.second.representative, "_If"); - } - switch_clusters[predicate_index[repr]].switches.push_back(*it); - } - - return switch_clusters; -} - -StatusOr> -FunctionalizeCond::EnsureDominanceAndReturnNonDominatedControlNodes( - const std::unordered_map& branch_map, - const std::vector& switches) { - std::vector old_control_nodes; - for (const auto& kv : branch_map) { - if (kv.second.count != kv.first->in_edges().size()) { - std::vector delete_edges; - for (const Edge* in : kv.first->in_edges()) { - auto it = branch_map.find(in->src()); - if (it == branch_map.end()) { - if (in->IsControlEdge()) { - old_control_nodes.push_back(in->src()); - delete_edges.push_back(in); - } else { - if (IsSwitch(in->src())) { - if (std::find(switches.begin(), switches.end(), in->src()) == - switches.end()) { - return errors::Internal( - "Unexpected switch node found during flow forward: ", - in->src()->DebugString()); - } - continue; - } - return errors::InvalidArgument( - "Value ", kv.first->name(), "'s input, ", in->src()->name(), - ", is not dominated by switch nodes ", NodesToString(switches)); - } - } - } - // Remove control edges from nodes that are not dominated by the switch - // nodes. New control dependencies will be added between these nodes and - // the XlaIf node inserted. 
- for (const Edge* e : delete_edges) { - graph_->RemoveEdge(e); - } - } - } - return old_control_nodes; -} - -StatusOr< - std::pair, - std::unordered_set>> -FunctionalizeCond::DetermineBranchMapAndFrontier( - const SwitchCluster& switch_cluster) { - std::unordered_map branch_map; - std::unordered_set frontier; - std::vector stack = switch_cluster.switches; - std::vector visited(graph_->num_node_ids(), false); - while (!stack.empty()) { - Node* n = stack.back(); - stack.pop_back(); - - if (visited[n->id()]) { - continue; - } - visited[n->id()] = true; - - // Propagate branch state along each edge of a switch node. - bool sink_only = true; - for (const Edge* e : n->out_edges()) { - Node* out = e->dst(); - if (!out->IsOp()) { - continue; - } - sink_only = false; - // Propagate branch information. - ForwardFlowNode& ffn = branch_map[out]; - if (IsSwitch(n)) { - int index = e->IsControlEdge() ? Branch::kNeither : e->src_output(); - TF_RETURN_WITH_CONTEXT_IF_ERROR( - Join(ForwardFlowNode(Branch(index)), out, &ffn), " when joining ", - e->DebugString()); - } else { - TF_RETURN_WITH_CONTEXT_IF_ERROR(Join(branch_map[n], out, &ffn), - " when joining ", e->DebugString()); - } - if (IsMerge(out)) { - if (out->in_edges().size() == ffn.count) { - frontier.insert(out); - } - } else if (!visited[out->id()]) { - stack.push_back(out); - } - } - if (sink_only) { - if (!IsIdentity(n)) { - VLOG(1) << "Feeding into sink: " << n->DebugString(); - } - } - } - - if (dump_graphs_) { - for (const auto& kv : branch_map) { - // Append attribute to the graph if running with logging to make the - // changes clearer in the visualization. 
- kv.first->AddAttr("_XlaFunctionalizeBranch", - Branch_Name(kv.second.branch)); - } - } - return std::make_pair(std::move(branch_map), std::move(frontier)); -} - -Status FunctionalizeCond::FunctionalizeInternal() { - TF_ASSIGN_OR_RETURN(std::vector predicate_switch_order, - DeterminePredicateSwitchOrder()); - - // Iterate from innermost set of clustered switches to outermost, replacing - // matching switch->merge subgraphs with single XlaIf nodes. - for (auto it = predicate_switch_order.rbegin(); - it != predicate_switch_order.rend(); ++it) { - auto& ps = *it; - VLOG(3) << "Flow down from: " << ps.ToString(); - - std::unordered_map branch_map; - std::unordered_set frontier; - TF_ASSIGN_OR_RETURN(std::tie(branch_map, frontier), - DetermineBranchMapAndFrontier(ps)); - - if (dump_graphs_) - LOG(INFO) << "FunctionalizeControlFlow (before XlaIf conversion): " - << dump_graph::DumpGraphToFile("functionalize_bc", *graph_, - library_); - TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier)); - - struct Hash { - size_t operator()(const std::pair& item) const { - return Hash64Combine(hash()(item.first), - std::hash()(item.second)); - } - }; - - // Sort the merge and switch nodes using NodeCmp. The switch-nodes are - // further grouped (post sorting) by input to the switch node as in the - // functionalized form each input will be passed in only once. This grouping - // should retain the sorted order. 
- CondArgNodes cond_arg_nodes; - std::sort(ps.switches.begin(), ps.switches.end(), NodeCmp()); - std::unordered_map, int, Hash> input_index; - for (Node* switch_node : ps.switches) { - const Edge* e; - TF_RETURN_IF_ERROR(switch_node->input_edge(0, &e)); - std::pair key = std::make_pair(e->src(), e->src_output()); - if (input_index.find(key) == input_index.end()) { - input_index[key] = cond_arg_nodes.size(); - cond_arg_nodes.emplace_back(key.first, key.second); - } - cond_arg_nodes.at(input_index.at(key)).switches.push_back(switch_node); - } - std::vector merge_nodes(frontier.begin(), frontier.end()); - std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp()); - - TF_ASSIGN_OR_RETURN(std::vector old_control_nodes, - EnsureDominanceAndReturnNonDominatedControlNodes( - branch_map, ps.switches)); - - TF_ASSIGN_OR_RETURN(Node * if_node, - ConvertToXlaIf(cond_arg_nodes, ps, merge_nodes)); - for (Node* old : old_control_nodes) { - graph_->AddControlEdge(old, if_node); - } - - for (auto& del_kv : branch_map) { - graph_->RemoveNode(del_kv.first); - } - for (auto& kv : cond_arg_nodes) { - for (Node* node : kv.switches) { - graph_->RemoveNode(node); - } - } - if (dump_graphs_) - LOG(INFO) << "FunctionalizeControlFlow (after XlaIf conversion): " - << dump_graph::DumpGraphToFile("functionalize_ac", *graph_, - library_); - } - return Status::OK(); -} - -StatusOr FunctionalizeCond::BuildAndAddXlaIfOp( - const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster, - const std::vector& merge_nodes) { - VLOG(2) << "Build if op for " << switch_cluster.name; - - NodeDef if_def; - // Create a new If node using the name of the merge node. 
- NodeDefBuilder builder(switch_cluster.name, "XlaIf"); - string branch[] = {"else_branch", "then_branch"}; - for (int i = 0; i < 2; ++i) { - static std::atomic sequence_num(0LL); - int64 id = ++sequence_num; - - NameAttrList body_name; - body_name.set_name( - strings::StrCat("_functionalize_if_", branch[i], "_", id)); - auto body = xla::MakeUnique(graph_->op_registry()); - TF_RETURN_IF_ERROR(ExtractBody(cond_arg_nodes, switch_cluster.switches, - merge_nodes, i, body.get())); - VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get()); - FunctionDef body_fdef; - TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef)); - TF_RETURN_IF_ERROR(library_->AddFunctionDef(body_fdef)); - builder.Attr(branch[i], body_name); - } - - // Build input type. - std::vector inputs; - DataTypeVector in_arg_types; - for (auto& kv : cond_arg_nodes) { - bool inserted = false; - for (const Node* arg : kv.switches) { - const Edge* in_edge; - TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge)); - if (in_edge->IsControlEdge()) { - builder.ControlInput(in_edge->src()->name()); - } else { - if (!inserted) { - DataType dtype = arg->input_type(0); - inputs.emplace_back(NodeDefBuilder::NodeOut( - in_edge->src()->name(), in_edge->src_output(), dtype)); - in_arg_types.push_back(dtype); - inserted = true; - } - } - } - } - builder.Attr("Tin", in_arg_types); - - // Build output type. - DataTypeVector out_type; - for (const Node* merge : merge_nodes) { - DataType dtype = merge->output_type(0); - out_type.push_back(dtype); - } - builder.Attr("Tout", out_type); - - builder.Attr("Tcond", DT_BOOL); - builder.Device(switch_cluster.predicate_edge->src()->assigned_device_name()); - // Conditional should be the first input ... - builder.Input(NodeDefBuilder::NodeOut( - switch_cluster.predicate_edge->src()->name(), - switch_cluster.predicate_edge->src_output(), - switch_cluster.predicate_edge->src()->output_type(0))); - // ... followed by the other inputs. 
- builder.Input(inputs); - - TF_RETURN_IF_ERROR(builder.Finalize(&if_def)); - TF_ASSIGN_OR_RETURN(Node * if_node, AddNode(if_def, graph_)); - return if_node; -} - -Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes, - const std::vector& switches, - const std::vector& merge_nodes, - int input_edge, Graph* body) { - VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge " - << input_edge; - std::vector squash_src_outputs(graph_->num_node_ids(), false); - std::vector node_map(graph_->num_node_ids(), nullptr); - int arg_count = 0; - for (auto& kv : cond_arg_nodes) { - Node* arg_node = nullptr; - for (const auto* arg : kv.switches) { - DataType dtype = arg->input_type(0); - if (arg_node == nullptr) { - TF_ASSIGN_OR_RETURN(arg_node, BuildArgNode(body, dtype, arg_count++)); - } - node_map.at(arg->id()) = arg_node; - squash_src_outputs.at(arg->id()) = true; - } - } - - std::vector stack; - stack.reserve(merge_nodes.size()); - for (int j = 0; j < merge_nodes.size(); ++j) { - Node* node = merge_nodes[j]; - TF_ASSIGN_OR_RETURN(node_map.at(node->id()), - BuildRetvalNode(body, node->output_type(0), - /*index=*/j)); - const Edge* in_edge; - TF_RETURN_IF_ERROR(node->input_edge(input_edge, &in_edge)); - Node* in = in_edge->src(); - if (node_map.at(in->id()) == nullptr) { - node_map.at(in->id()) = body->CopyNode(in); - } - - if (std::find(switches.begin(), switches.end(), in) == switches.end()) { - body->AddEdge(node_map.at(in->id()), in_edge->src_output(), - node_map.at(node->id()), 0); - } else { - body->AddEdge(node_map.at(in->id()), 0, node_map.at(node->id()), 0); - // Don't include input nodes that are already just returned in stack. 
- continue; - } - stack.push_back(in); - } - - return CopySubgraph(*graph_, nullptr, stack, squash_src_outputs, &node_map, - body); -} - -Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes, - const Edge* predicate_edge, - Node* if_node) { - VLOG(3) << "AddInputEdges for " << if_node->name(); - int index = 0; - graph_->AddEdge(predicate_edge->src(), predicate_edge->src_output(), if_node, - index++); - for (auto& arg : cond_arg_nodes) { - if (arg.src_output == Graph::kControlSlot) { - graph_->AddControlEdge(arg.src, if_node); - } else { - graph_->AddEdge(arg.src, arg.src_output, if_node, index++); - } - } - return Status::OK(); -} - -Status FunctionalizeCond::AddOutputEdges(const std::vector& outputs, - Node* if_node) { - VLOG(3) << "AddOutputEdges for " << if_node->name(); - for (int i = 0; i < outputs.size(); ++i) { - Node* node = outputs[i]; - std::vector edges(node->out_edges().begin(), - node->out_edges().end()); - for (const Edge* edge : edges) { - Node* dst = edge->dst(); - int dst_input = edge->dst_input(); - - if (edge->src_output() > 0) { - return errors::Unimplemented("Output of index (", edge->src_output(), - ") of merge node ", node->name()); - } - - int src_output = - dst_input == Graph::kControlSlot ? Graph::kControlSlot : i; - graph_->RemoveEdge(edge); - graph_->AddEdge(if_node, src_output, dst, dst_input); - } - } - return Status::OK(); -} - -StatusOr FunctionalizeCond::ConvertToXlaIf( - const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster, - const std::vector& merge_nodes) { - VLOG(1) << "ConvertToXlaIf for " << switch_cluster.ToString() << " -> " - << NodesToString(merge_nodes); - - // Extract bodies and builds a If operator. 
- TF_ASSIGN_OR_RETURN( - Node * if_node, - BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes)); - TF_RETURN_IF_ERROR( - AddInputEdges(cond_arg_nodes, switch_cluster.predicate_edge, if_node)); - TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node)); - - return if_node; -} - -Status FunctionalizeCond::Functionalize(Graph* graph, - FunctionLibraryDefinition* library) { - VLOG(1) << "FunctionalizeCond::Functionalize"; - FunctionalizeCond fc(graph, library, /*dump_graphs=*/VLOG_IS_ON(2)); - return fc.FunctionalizeInternal(); -} - -} // namespace - -// Transformation that converts TensorFlow's graph control flow constructs into -// functional equivalents. -Status FunctionalizeControlFlow(Graph* graph, - FunctionLibraryDefinition* library) { - return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library); -} - Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, Graph* graph, FunctionLibraryDefinition* library) { @@ -1434,90 +46,26 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, << dump_graph::DumpGraphToFile("functionalize_initial", *graph, library); - // Note: BuildControlFlowInfo() requires that the graph's source node is - // connected to all source nodes in the graph. Many graphs violate this - // invariant. - std::vector cf_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info)); - - // Builds Frames, indexed by name. - std::unordered_map frames; - for (Node* node : graph->op_nodes()) { - const ControlFlowInfo& cf = cf_info[node->id()]; - - VLOG(2) << "node: " << node->name() << " (" << node->id() - << ") frame_name: " << cf.frame_name - << " frame: " << (cf.frame ? cf.frame->name() : "---") - << " parent_frame: " - << (cf.parent_frame ? 
cf.parent_frame->name() : "---"); - TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr); - - Frame& frame = frames[cf.frame_name]; - Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name]; - if (frame.parent == nullptr) { - frame.parent = parent; - frame.name = cf.frame_name; - ++parent->num_children; - } else if (frame.parent != parent) { - return errors::InvalidArgument("Mismatched parent frames for ", - cf.frame->id(), ": ", parent->name, " vs ", - frame.parent->name); - } - - if (IsEnter(node)) { - Arg arg; - arg.enter = node; - TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant", - &arg.is_loop_invariant)); - frame.args.push_back(arg); - } else if (IsLoopCond(node)) { - if (frame.loop_cond) { - return errors::InvalidArgument( - "Loop ", cf.frame_name, - " has more than one LoopCond node: ", node->name(), " and ", - frame.loop_cond->name()); - } - frame.loop_cond = node; - } - frame.nodes.insert(node); - } - - // Adds frames with no children (i.e., the innermost frames) to a worklist. - std::deque worklist; - for (auto& frame : frames) { - if (frame.second.num_children == 0) { - worklist.push_back(&frame.second); - } - } - - // Eliminate loops from innermost to outermost. - while (!worklist.empty()) { - Frame* frame = worklist.front(); - worklist.pop_front(); - if (frame->parent == frame) { - // Skip the root frame. - continue; - } - - TF_RETURN_IF_ERROR( - FunctionalizeLoop(lookup_library, graph, frame, library)); - - // If the parent has no remaining children, add it to the worklist. - --frame->parent->num_children; - if (frame->parent->num_children == 0) { - worklist.push_back(frame->parent); - } - } + // Functionalize and remove while loops from graph. + TF_RETURN_IF_ERROR(FunctionalizeWhileLoop(lookup_library, graph, library)); // FunctionalizeControlFlow is invoked for every function, so the loops's // bodies and conditionals that were extracted into functions will be handled // in successive invocations. 
- TF_RETURN_IF_ERROR(FunctionalizeCond::Functionalize(graph, library)); + TF_RETURN_IF_ERROR(FunctionalizeCond(graph, library)); VLOG(2) << "FunctionalizeControlFlow (final): " << dump_graph::DumpGraphToFile("functionalize_final", *graph, library); + return Status::OK(); } +// Transformation that converts TensorFlow's graph control flow constructs into +// functional equivalents. +Status FunctionalizeControlFlow(Graph* graph, + FunctionLibraryDefinition* library) { + return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index d941041d15532446d1413f16fe64602bfb1a7daa..55600f2a8b5302cef26b9be4ccd0f8804476a17a 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -16,14 +16,16 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_ #define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_ +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/graph/graph.h" namespace tensorflow { // Transformation that converts tf.while_loop() loops into functional While -// operators, suitable for XLA compilation. If lookup_library is provided, use -// it to make the library for control flow self-contained. +// operators and tf.cond() conditionals into function If operators, suitable for +// XLA compilation. If lookup_library is provided, use it to make the library +// for control flow self-contained. 
Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library); Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 14977a908ae2b0ff7e13b634c41b6d331b4b8a36..c068a4110c0bb14282379eb7a3cbdae4e80ddbd6 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/util/equal_graph_def.h" @@ -36,12 +37,12 @@ limitations under the License. namespace tensorflow { namespace { -// Returns the names of the "then" and "else" functions for the XlaIf node in a +// Returns the names of the "then" and "else" functions for the If node in a // graph. 
Status FindIfThenAndElse(const GraphDef& graph, string* op_name, NameAttrList* then_fn, NameAttrList* else_fn) { for (const NodeDef& node : graph.node()) { - if (node.op() == "XlaIf") { + if (node.op() == "If") { *op_name = node.name(); const NameAttrList* result; TF_RETURN_IF_ERROR(GetNodeAttr(node, "then_branch", &result)); @@ -51,7 +52,7 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name, return Status::OK(); } } - return errors::NotFound("No XlaIf node found in graph"); + return errors::NotFound("No If node found in graph"); } // Graph: @@ -114,8 +115,13 @@ TEST(FunctionalizeControlFlow, Conditional) { auto if_op = ops::XlaIf(scope.WithOpName(op_name), less, std::initializer_list{less, y, x}, then_fn, else_fn, {DT_INT32}); + auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); GraphDef expected; TF_EXPECT_OK(scope.ToGraphDef(&expected)); + // TODO(jpienaar): Create wrapper for IfOp. + for (NodeDef& n : *expected.mutable_node()) { + if (n.op() == "XlaIf") n.set_op("If"); + } TF_EXPECT_GRAPH_EQ(expected, graph_def); } @@ -799,11 +805,11 @@ TEST(FunctionalizeControlFlow, Complex) { auto assign = ops::AssignAddVariableOp( scope.WithOpName("outer/inner/assign_add"), enter_var, add_jkx); - auto one = - ops::Const(scope.WithOpName("outer/inner/One") - .WithControlDependencies( - gtl::ArraySlice{assign.operation}), - 1); + auto one = ops::Const( + scope.WithOpName("outer/inner/One") + .WithControlDependencies( + absl::Span{assign.operation}), + 1); auto add_j = ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); @@ -817,7 +823,7 @@ TEST(FunctionalizeControlFlow, Complex) { scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); auto add_i = ops::Add(scope.WithOpName("outer/add") - .WithControlDependencies(gtl::ArraySlice{ + .WithControlDependencies(absl::Span{ exit_j.output.op(), exit_k.output.op()}), identity_i, one_outer); auto next_iteration_i = @@ -923,7 +929,7 @@ TEST(FunctionalizeControlFlow, 
Complex) { scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); auto add_i = ops::Add(scope.WithOpName("outer/add") - .WithControlDependencies(gtl::ArraySlice{ + .WithControlDependencies(absl::Span{ while_op[0].op(), while_op[1].op()}), identity_i, one_outer); @@ -985,11 +991,11 @@ TEST(FunctionalizeControlFlow, Complex) { auto assign = ops::AssignAddVariableOp( scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); - auto one = - ops::Const(scope.WithOpName("outer/inner/One") - .WithControlDependencies( - gtl::ArraySlice{assign.operation}), - 1); + auto one = ops::Const( + scope.WithOpName("outer/inner/One") + .WithControlDependencies( + absl::Span{assign.operation}), + 1); auto add_j = ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..924fcdd9cd72a6472e0b2748680f2552fa65ec79 --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc @@ -0,0 +1,72 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" + +#include "tensorflow/core/framework/node_def.pb.h" + +namespace tensorflow { + +bool NodeCmpByNameResourcesLast::operator()(const Node* lhs, + const Node* rhs) const { + bool lhs_is_resource = + lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false; + bool rhs_is_resource = + rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false; + return std::tie(lhs_is_resource, lhs->name()) < + std::tie(rhs_is_resource, rhs->name()); +} + +xla::StatusOr AddNodeDefToGraph(const NodeDef& node_def, Graph* graph) { + Status status; + Node* inserted_node = graph->AddNode(node_def, &status); + if (!status.ok()) { + return status; + } + return inserted_node; +} + +xla::StatusOr BuildRetvalNode(Graph* graph, DataType type, int index) { + const char* const kRetValOp = "_Retval"; + NodeDef ret_def; + ret_def.set_op(kRetValOp); + ret_def.set_name(strings::StrCat(kRetValOp, index)); + AddNodeAttr("T", type, &ret_def); + AddNodeAttr("index", index, &ret_def); + return AddNodeDefToGraph(ret_def, graph); +} + +// Check that the graph has no cycle containing the given node. 
+Status CheckNodeNotInCycle(const Node* node, const int num_nodes) { + std::vector ready; + ready.push_back(node); + std::vector visited(num_nodes); + while (!ready.empty()) { + const Node* current_node = ready.back(); + ready.pop_back(); + visited[current_node->id()] = true; + for (const Edge* out : current_node->out_edges()) { + if (out->dst() == node) { + return errors::Internal("Detected a cycle: ", FormatNodeForError(*node), + " (", node->def().op(), ") feeds into itself."); + } else if (!visited[out->dst()->id()]) { + ready.push_back(out->dst()); + } + } + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h new file mode 100644 index 0000000000000000000000000000000000000000..61940e3586c59ffc660eaac8f8d035fbbbdfeffd --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h @@ -0,0 +1,57 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_ + +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/graph/graph.h" + +// Utility functions shared between functionalize cond and while. 
+ +namespace tensorflow { + +// Check that the graph has no cycle containing the given node. +Status CheckNodeNotInCycle(const Node* node, const int num_nodes); + +// Comparison function used for sorting nodes consistently. +// a) resource variables are last, and +// b) sort lexicographically by name (for deterministic output). +struct NodeCmpByNameResourcesLast { + bool operator()(const Node* lhs, const Node* rhs) const; +}; + +// Returns the Node* created from the NodeDef in the Graph. +xla::StatusOr AddNodeDefToGraph(const NodeDef& node_def, Graph* graph); + +// Build a retval node of given type and index. +xla::StatusOr BuildRetvalNode(Graph* graph, DataType type, int index); + +// Returns a textual representation of the names of the nodes in the input. +template +string NodesToString(const T& nodes) { + return strings::StrCat("{", + absl::StrJoin(nodes, ",", + [](string* output, const Node* node) { + strings::StrAppend(output, + node->name()); + }), + "}"); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e3c4b0e0f695f0073f2c8aa1a4b342e39ea4be5 --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -0,0 +1,668 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/functionalize_while.h" + +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/tf2xla/dump_graph.h" +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/graph/node_builder.h" + +namespace tensorflow { +namespace { + +using xla::StatusOr; + +// Information about a loop argument. +struct Arg { + // Every loop argument has an Enter node. + Node* enter; + + // Is the loop argument a loop-invariant value? Taken from the `is_constant` + // attribute on the Enter node. + bool is_loop_invariant; + + // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant + // arguments must have all of the following nodes: + Node* merge = nullptr; + Node* switch_node = nullptr; + Node* next_iteration = nullptr; + Node* exit = nullptr; +}; + +// Information about a loop frame. +struct Frame { + string name; + + // Pointer to the parent frame. The root frame has a pointer to itself. + Frame* parent = nullptr; + int num_children = 0; + + // Arguments to this loop. + std::vector args; + + // The loop condition of the loop. There should be exactly one loop condition + // in every loop. + Node* loop_cond = nullptr; + + // Set of nodes that belong to the loop frame. 
+ std::unordered_set nodes; +}; + +// Copies a subgraph from `graph` to `output` by performing a reverse DFS +// starting at nodes in vector `stack`. +// `node_map` is a vector, indexed by source node ID, of the corresponding dest +// nodes. +// Does not traverse into nodes in `node_map`, so by adding nodes to `node_map` +// before the traversal, clients can cut the graph. If a frame is provided (frame +// != nullptr), then this function will return an error if the +// traversal leaves 'frame'; the client must add enough nodes to `node_map` to +// cut the graph and prevent the traversal from escaping. +// +// `squash_src_outputs` contains a bool for each source node ID. If true, then +// the source output on that node will be replaced by zero when copied. This is +// used when replacing a Switch node with an _Arg node. The output we are +// taking from the Switch node was not necessarily the first output, but _Arg +// nodes only have one output. By adding the Switch node to `squash_src_outputs` +// we rewrite the src_output of the corresponding edge to be 0. +Status CopySubgraph(const Graph& graph, const Frame* frame, + std::vector stack, + const std::vector& squash_src_outputs, + std::vector* node_map, Graph* output) { + VLOG(3) << "Stack: " << NodesToString(stack); + std::vector visited(graph.num_node_ids(), false); + while (!stack.empty()) { + Node* n = stack.back(); + stack.pop_back(); + + VLOG(5) << "Copying node " << n->name(); + + if (visited[n->id()]) continue; + visited[n->id()] = true; + + for (const Edge* e : n->in_edges()) { + Node* src = e->src(); + if (frame != nullptr && frame->nodes.find(src) == frame->nodes.end()) { + // We traversed out of the loop frame, without encountering a cut node.
+ return errors::Internal("Graph traversal of loop frame ", frame->name, + " escaped frame at ", src->name(), + " without encountering an argument node."); + } + if ((*node_map)[src->id()] == nullptr) { + (*node_map)[src->id()] = output->CopyNode(src); + stack.push_back(src); + } + Node* src_copy = (*node_map)[e->src()->id()]; + int src_output = squash_src_outputs[e->src()->id()] && !e->IsControlEdge() + ? 0 + : e->src_output(); + Node* dst_copy = (*node_map)[e->dst()->id()]; + output->AddEdge(src_copy, src_output, dst_copy, e->dst_input()); + } + } + return Status::OK(); +} + +StatusOr BuildArgNode(Graph* graph, DataType type, int index) { + const char* const kArgOp = "_Arg"; + NodeDef arg_def; + NodeDefBuilder builder(strings::StrCat(kArgOp, index), kArgOp); + builder.Attr("T", type); + builder.Attr("index", index); + TF_RETURN_IF_ERROR(builder.Finalize(&arg_def)); + return AddNodeDefToGraph(arg_def, graph); +} + +// Builds a graph for the loop condition. +Status BuildLoopCondition(const Graph& graph, Frame* frame, + std::unique_ptr* cond_output) { + VLOG(2) << "Building loop condition for " << frame->name; + *cond_output = absl::make_unique(graph.op_registry()); + Graph* output = cond_output->get(); + + // Map from nodes in the original graph to the condition graph. + std::vector node_map(graph.num_node_ids(), nullptr); + std::vector squash_src_outputs(graph.num_node_ids(), false); + + // Build one _Arg node for each Enter node. + for (int i = 0; i < frame->args.size(); ++i) { + const Arg& arg = frame->args[i]; + + TF_ASSIGN_OR_RETURN(Node * arg_node, + BuildArgNode(output, arg.enter->input_type(0), i)); + if (arg.is_loop_invariant) { + node_map[arg.enter->id()] = arg_node; + } else { + node_map[arg.merge->id()] = arg_node; + } + } + + // Build a Retval node for the loop condition. The LoopCond nodes are always + // boolean because of the type constraints on the LoopCond op. 
+ TF_ASSIGN_OR_RETURN(node_map[frame->loop_cond->id()], + BuildRetvalNode(output, DT_BOOL, 0)); + + // Performs a reverse DFS, copying nodes and edges to the output graph. + // The _Arg and _Retval nodes were added unconditionally above, so we are + // guaranteed to get the correct function signature. + return CopySubgraph(graph, frame, {frame->loop_cond}, squash_src_outputs, + &node_map, output); +} + +// Builds a graph for the loop body. +Status BuildLoopBody(const Graph& graph, Frame* frame, + DataTypeVector* arg_types, + std::unique_ptr* body_output) { + VLOG(2) << "Building loop body for " << frame->name; + *body_output = absl::make_unique(graph.op_registry()); + Graph* output = body_output->get(); + + // Map from nodes in the original graph to the body graph. + std::vector node_map(graph.num_node_ids(), nullptr); + std::vector squash_src_outputs(graph.num_node_ids(), false); + + // Build one _Arg node for each Enter node. + std::vector next_iterations; + next_iterations.reserve(frame->args.size()); + arg_types->reserve(frame->args.size()); + for (int i = 0; i < frame->args.size(); ++i) { + const Arg& arg = frame->args[i]; + + DataType dtype = arg.enter->input_type(0); + arg_types->push_back(dtype); + + TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, dtype, i)); + + if (dtype == DT_RESOURCE) { + // The convention of the XLA bridge is that resource variable arguments + // are only inputs to the loop body and have no corresponding output. + // TODO(b/37741920): change the convention so that DT_RESOURCE variables + // are both inputs and outputs, and then remove this case. + TF_RET_CHECK(arg.is_loop_invariant); + node_map[arg.enter->id()] = arg_node; + } else { + TF_ASSIGN_OR_RETURN(Node * retval_node, + BuildRetvalNode(output, dtype, i)); + + if (arg.is_loop_invariant) { + // Argument is loop-invariant. Forward it from the Arg to the Retval.
+ node_map[arg.enter->id()] = arg_node; + output->AddEdge(arg_node, 0, retval_node, 0); + } else { + // Argument is loop-varying. + node_map[arg.switch_node->id()] = arg_node; + // The Switch node has two outputs, but _Arg only has one. This tells + // the CopySubgraph function to rewrite the output number of edges from + // the _Arg node to be 0 rather than copying the output number from the + // Switch node. + squash_src_outputs[arg.switch_node->id()] = true; + node_map[arg.next_iteration->id()] = retval_node; + next_iterations.push_back(arg.next_iteration); + } + } + } + + // Performs a reverse DFS, copying nodes and edges to the output graph. + // The _Arg and _Retval nodes were added unconditionally above, so we are + // guaranteed to get the correct function signature. + TF_RETURN_IF_ERROR(CopySubgraph(graph, frame, std::move(next_iterations), + squash_src_outputs, &node_map, output)); + + return Status::OK(); +} + +// Copies the FunctionDef of the given function from lookup_library to library, +// if it can be found in lookup_library but is missing from library. +Status AddMissingFunctionByName(const string& function_name, + const FunctionLibraryDefinition* lookup_library, + FunctionLibraryDefinition* library) { + if (!library->Find(function_name) && lookup_library->Find(function_name)) { + return library->AddFunctionDef(*lookup_library->Find(function_name)); + } + return Status::OK(); +} + +// Iterates over all functions that the given fdef refers to. Copies the missing +// FunctionDefs from lookup_library to library. +Status AddMissingFunctionDef(const FunctionDef& fdef, + const FunctionLibraryDefinition* lookup_library, + FunctionLibraryDefinition* library) { + TF_RET_CHECK(lookup_library); + for (const NodeDef& node : fdef.node_def()) { + if (library->Find(node.op())) { + continue; + } + // The function referred to by a 'SymbolicGradient' node is specified in its + // attribute 'f'.
+ if (node.op() == FunctionLibraryDefinition::kGradientOp) { + const AttrValue* attr = + AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr); + if (!attr) { + return errors::InvalidArgument("SymbolicGradient is missing attr: f"); + } + const string& func_name = attr->func().name(); + TF_RETURN_IF_ERROR( + AddMissingFunctionByName(func_name, lookup_library, library)); + // Copy the user-defined gradient function if it exists. + const string grad_name = lookup_library->FindGradient(func_name); + if (!grad_name.empty() && library->FindGradient(func_name).empty()) { + TF_RETURN_IF_ERROR( + AddMissingFunctionByName(grad_name, lookup_library, library)); + GradientDef grad_def; + grad_def.set_function_name(func_name); + grad_def.set_gradient_func(grad_name); + TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def)); + } + } else if (lookup_library->Find(node.op())) { + TF_RETURN_IF_ERROR( + library->AddFunctionDef(*lookup_library->Find(node.op()))); + } + } + return Status::OK(); +} + +Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, + Graph* graph, Frame* frame, + FunctionLibraryDefinition* library) { + VLOG(2) << "Frame " << frame->name << " before: " + << dump_graph::DumpGraphToFile("functionalize_before", *graph, + library); + + // Split loop-varying Enter nodes with multiple successors. If the same + // Tensor is fed as input to multiple loop arguments, we may end up with a + // shared Enter node. We clone Enter nodes with multiple successors to + // maintain the invariant of a unique Enter node per argument of the final + // loop. 
+ std::vector args; + for (const Arg& arg : frame->args) { + if (arg.is_loop_invariant) { + args.push_back(arg); + } else { + std::vector edges(arg.enter->out_edges().begin(), + arg.enter->out_edges().end()); + for (int i = 0; i < edges.size(); ++i) { + if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) { + continue; + } + TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name(); + Arg new_arg; + new_arg.is_loop_invariant = false; + if (i == 0) { + new_arg.enter = arg.enter; + } else { + new_arg.enter = graph->CopyNode(arg.enter); + frame->nodes.insert(new_arg.enter); + for (Edge const* e : arg.enter->in_edges()) { + graph->AddEdge(e->src(), e->src_output(), new_arg.enter, + e->IsControlEdge() ? Graph::kControlSlot : 0); + } + Node* dst = edges[i]->dst(); + int dst_input = edges[i]->dst_input(); + graph->RemoveEdge(edges[i]); + graph->AddEdge(new_arg.enter, 0, dst, dst_input); + } + args.push_back(new_arg); + } + } + } + frame->args = std::move(args); + + std::sort(frame->args.begin(), frame->args.end(), + [](const Arg& a, const Arg& b) { + return NodeCmpByNameResourcesLast()(a.enter, b.enter); + }); + + if (frame->loop_cond == nullptr) { + return errors::InvalidArgument("Loop ", frame->name, + " has no LoopCond node"); + } + + // Find the set of Switch nodes that are successors of the LoopCond. + std::unordered_set switches; + for (const Edge* edge : frame->loop_cond->out_edges()) { + if (!edge->IsControlEdge() && IsSwitch(edge->dst()) && + edge->dst_input() == 1) { + switches.insert(edge->dst()); + } + } + + // For each non-constant argument, looks for the following pattern of nodes: + // Enter ----> Merge --------> Switch --> Exit + // ^ ^ + // | | + // NextIteration LoopCond + // ^ ^ + // | | + // ... ... + for (Arg& arg : frame->args) { + if (!arg.is_loop_invariant) { + // Follow the edge from the Enter to Merge. + const Edge* enter_merge = nullptr; + for (const Edge* e : arg.enter->out_edges()) { + // Ignore control-edges to the sink node. 
These are allowed by the + // graph invariants, although probably they should have been stripped + // off earlier. + if (e->IsControlEdge() && e->dst()->IsSink()) { + continue; + } + if (enter_merge != nullptr) { + return errors::Internal("Enter node for loop-varying argument ", + FormatNodeForError(*arg.enter), + " has multiple successors: ", + FormatNodeForError(*enter_merge->dst()), + " and ", FormatNodeForError(*e->dst())); + } + enter_merge = e; + } + if (enter_merge == nullptr) { + return errors::Internal("Enter node for loop-varying argument ", + FormatNodeForError(*arg.enter), + " has zero successors"); + } + arg.merge = enter_merge->dst(); + if (!IsMerge(arg.merge)) { + return errors::InvalidArgument( + "Successor of Enter node for loop-varying argument ", + FormatNodeForError(*arg.merge), + " is not a Merge node; got: ", arg.merge->type_string()); + } + + // Find the NextIteration from the merge. There should be two inputs to + // the Merge and the NextIteration should be the other input. + if (arg.merge->input_types().size() != 2) { + return errors::InvalidArgument( + "Unexpected number of inputs to Merge node for loop-varying " + "argument ", + FormatNodeForError(*arg.merge), "; expected 2, got ", + arg.merge->input_types().size()); + } + TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(), + &arg.next_iteration)); + if (!IsNextIteration(arg.next_iteration)) { + return errors::InvalidArgument( + "Expected NextIteration node as input to Merge node; got node ", + FormatNodeForError(*arg.next_iteration), " with kind ", + arg.next_iteration->type_string()); + } + + // Find the Switch successor of the Merge. There should be exactly one + // Switch node that is a successor of both the Merge and the LoopCond. 
+ for (const Edge* edge : arg.merge->out_edges()) { + if (edge->dst_input() == 0 && IsSwitch(edge->dst()) && + switches.find(edge->dst()) != switches.end()) { + if (arg.switch_node != nullptr) { + return errors::InvalidArgument("Duplicate Switch successors to ", + FormatNodeForError(*arg.merge)); + } + arg.switch_node = edge->dst(); + } + } + if (arg.switch_node == nullptr) { + return errors::InvalidArgument("Missing Switch successor to ", + FormatNodeForError(*arg.merge)); + } + + // Update the device on the Identity outputs of the switch to match their + // target. These Identity outputs may not have the right device themselves. + + // Loop over the switch node's outputs to: + // - Find the Exit successor. + // - Set the sharding on all Identity outputs of the switch. These + // identity nodes are values used by the loop body or condition. + // The Identity node may have the wrong device so copy the device from + // one of its outputs instead. + std::deque possible_exit; + for (const Edge* edge : arg.switch_node->out_edges()) { + if (edge->src_output() == 0) { + possible_exit.push_back(edge); + } + if (IsIdentity(edge->dst())) { + TF_RETURN_IF_ERROR( + SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); + } + } + // TODO(b/67425339): Allow general graph between switch and exit. + while (!possible_exit.empty()) { + const Edge* edge = possible_exit.front(); + possible_exit.pop_front(); + if (IsExit(edge->dst())) { + if (arg.exit != nullptr) { + return errors::InvalidArgument( + "Duplicate Exit successors to ", + FormatNodeForError(*arg.switch_node)); + } + arg.exit = edge->dst(); + } else { + if (!IsIdentity(edge->dst())) { + return errors::Unimplemented("General graph between switch (", + FormatNodeForError(*arg.switch_node), + ") and exit node of frame ", + frame->name, " not supported yet."); + } + for (const Edge* out : edge->dst()->out_edges()) { + possible_exit.push_back(out); + } + } + } + } + } + + // Builds the condition and body functions.
+ std::unique_ptr cond_graph; + TF_RETURN_IF_ERROR(BuildLoopCondition(*graph, frame, &cond_graph)); + DataTypeVector arg_types; + std::unique_ptr body_graph; + TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph)); + + VLOG(2) << "Frame " << frame->name << " condition: " + << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library) + << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph); + + static std::atomic sequence_num(0LL); + int64 id = ++sequence_num; + NameAttrList cond_name; + cond_name.set_name(strings::StrCat("_functionalize_cond_", id)); + NameAttrList body_name; + body_name.set_name(strings::StrCat("_functionalize_body_", id)); + FunctionDef cond_fdef; + TF_RETURN_IF_ERROR( + GraphToFunctionDef(*cond_graph, cond_name.name(), &cond_fdef)); + FunctionDef body_fdef; + TF_RETURN_IF_ERROR( + GraphToFunctionDef(*body_graph, body_name.name(), &body_fdef)); + + TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef)); + TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef)); + if (lookup_library) { + // Copy missing FunctionDefs from lookup_library to library to make library + // self-contained. + TF_RETURN_IF_ERROR( + AddMissingFunctionDef(cond_fdef, lookup_library, library)); + TF_RETURN_IF_ERROR( + AddMissingFunctionDef(body_fdef, lookup_library, library)); + } + + // Builds a While operator. 
+ NodeDef while_def; + NodeDefBuilder builder(frame->loop_cond->name(), "XlaWhile"); + builder.Attr("T", arg_types); + builder.Attr("cond", cond_name); + builder.Attr("body", body_name); + std::vector inputs; + for (int i = 0; i < frame->args.size(); ++i) { + const Arg& arg = frame->args[i]; + const Edge* in_edge; + TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); + if (in_edge->IsControlEdge()) { + builder.ControlInput(in_edge->src()->name()); + } else { + inputs.push_back(NodeDefBuilder::NodeOut( + in_edge->src()->name(), in_edge->src_output(), arg_types[i])); + } + } + builder.Input(inputs); + TF_RETURN_IF_ERROR(builder.Finalize(&while_def)); + TF_ASSIGN_OR_RETURN(Node * while_node, AddNodeDefToGraph(while_def, graph)); + + // Copies edges to the Enter nodes and from the Exit nodes onto the While. + for (int i = 0; i < frame->args.size(); ++i) { + const Arg& arg = frame->args[i]; + const Edge* in_edge; + TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge)); + if (in_edge->IsControlEdge()) { + graph->AddControlEdge(in_edge->src(), while_node); + } else { + graph->AddEdge(in_edge->src(), in_edge->src_output(), while_node, i); + } + + if (!arg.is_loop_invariant) { + // Add output edges if the output of the loop is consumed. + if (arg.exit != nullptr) { + std::vector edges(arg.exit->out_edges().begin(), + arg.exit->out_edges().end()); + for (const Edge* edge : edges) { + Node* dst = edge->dst(); + int dst_input = edge->dst_input(); + graph->RemoveEdge(edge); + + if (dst_input == Graph::kControlSlot) { + graph->AddControlEdge(while_node, dst); + } else { + graph->AddEdge(while_node, i, dst, dst_input); + } + } + } + } + } + + // Remove the old nodes from the graph, and add the while node to the parent + // frame. 
+ for (Node* node : frame->nodes) { + graph->RemoveNode(node); + } + frame->nodes.clear(); + frame->parent->nodes.insert(while_node); + + VLOG(2) << "Frame " << frame->name << " after: " + << dump_graph::DumpGraphToFile("functionalize_after", *graph, + library); + + return Status::OK(); +} +} // namespace + +Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library, + Graph* graph, + FunctionLibraryDefinition* library) { + // Note: BuildControlFlowInfo() requires that the graph's source node is + // connected to all source nodes in the graph. Many graphs violate this + // invariant. + std::vector cf_info; + std::vector unreachable_nodes; + TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes)); + if (!unreachable_nodes.empty()) { + return errors::InvalidArgument( + "The following nodes are unreachable from the source in the graph: ", + errors::FormatNodeNamesForError(unreachable_nodes)); + } + + // Builds Frames, indexed by name. + std::unordered_map frames; + for (Node* node : graph->op_nodes()) { + const ControlFlowInfo& cf = cf_info[node->id()]; + + VLOG(2) << "node: " << node->name() << " (" << node->id() + << ") frame_name: " << cf.frame_name + << " frame: " << (cf.frame ? cf.frame->name() : "---") + << " parent_frame: " + << (cf.parent_frame ? cf.parent_frame->name() : "---"); + TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr); + + Frame& frame = frames[cf.frame_name]; + Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name]; + if (frame.parent == nullptr) { + frame.parent = parent; + frame.name = cf.frame_name; + ++parent->num_children; + } + + if (IsEnter(node)) { + Arg arg; + arg.enter = node; + TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant", + &arg.is_loop_invariant)); + frame.args.push_back(arg); + } else if (IsLoopCond(node)) { + frame.loop_cond = node; + } + frame.nodes.insert(node); + } + + // Adds frames with no children (i.e., the innermost frames) to a worklist. 
+ std::deque worklist; + for (auto& frame : frames) { + if (frame.second.num_children == 0) { + worklist.push_back(&frame.second); + } + } + + // Eliminate loops from innermost to outermost. + while (!worklist.empty()) { + Frame* frame = worklist.front(); + worklist.pop_front(); + if (frame->parent == frame) { + // Skip the root frame. + continue; + } + + TF_RETURN_IF_ERROR( + FunctionalizeLoop(lookup_library, graph, frame, library)); + + // If the parent has no remaining children, add it to the worklist. + --frame->parent->num_children; + if (frame->parent->num_children == 0) { + worklist.push_back(frame->parent); + } + } + + // There should be no cycle at this point, since while loops have been removed + // from graph. + // Check that the newly added XlaWhile nodes don't feed into themselves. + for (const Node* node : graph->op_nodes()) { + if (node->def().op() == "XlaWhile") { + TF_RETURN_WITH_CONTEXT_IF_ERROR( + CheckNodeNotInCycle(node, graph->num_node_ids()), + "Functionalizing loop failed."); + } + } + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_while.h b/tensorflow/compiler/tf2xla/functionalize_while.h new file mode 100644 index 0000000000000000000000000000000000000000..a708c6e4ec4e13527b4ee2d6c435dddee0a2b4e2 --- /dev/null +++ b/tensorflow/compiler/tf2xla/functionalize_while.h @@ -0,0 +1,32 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_ +#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Transformation that converts tf.while_loop() loops into functional While +// operators, suitable for XLA compilation. If lookup_library is provided, use +// it to make the library for control flow self-contained. +Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library, + Graph* graph, FunctionLibraryDefinition* library); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_ diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 212f6f3966149ca0b2d2e012b19300e1f488f996..1ed1fb3b021b27be00086b2e71cc9309e3d76049 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/function.h" @@ -39,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" @@ -55,7 +57,8 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, std::vector compile_time_constant_flags(expressions.size()); TF_RETURN_IF_ERROR( - BackwardsConstAnalysis(*graph, &compile_time_constant_flags)); + BackwardsConstAnalysis(*graph, &compile_time_constant_flags, + /*compile_time_const_nodes=*/nullptr)); args->resize(expressions.size()); for (int i = 0; i < args->size(); ++i) { @@ -87,6 +90,8 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, } } // namespace Status GraphCompiler::Compile() { + // Check that the graph has no illegal cycles. + TF_RETURN_IF_ERROR(graph::ValidateGraphHasNoCycle(*graph_)); // Maintain a mapping from node id to node outputs. using NodeOutputs = std::vector; std::vector output_registry(graph_->num_node_ids()); @@ -141,6 +146,7 @@ Status GraphCompiler::Compile() { } OpKernelContext op_context(¶ms, n->num_outputs()); + VLOG(3) << "Translating " << params.op_kernel->name(); if (IsFunctional(n)) { TF_RETURN_IF_ERROR(CompileFunctionalNode(n, &op_context)); } else { @@ -157,9 +163,8 @@ Status GraphCompiler::Compile() { outputs.resize(n->num_outputs()); for (int o = 0; o < n->num_outputs(); ++o) { outputs[o] = op_context.release_output(o); - if (*op_context.is_output_dead() || outputs[o].tensor == nullptr) { + if (outputs[o].tensor == nullptr) { return errors::Internal("Missing xla_context ", o, "-th output from ", - (*op_context.is_output_dead() ? 
"(dead)" : ""), SummarizeNode(*n)); } } @@ -227,7 +232,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, XlaContext& context = XlaContext::Get(op_context); auto* b = context.builder(); - auto output_handle = b->Call(*result.computation, handles); + auto output_handle = xla::Call(b, *result.computation, handles); // The output handle of `Call` computation is a tuple type. Unzip it so // that it can fit into future computations. int computation_output = 0; @@ -236,7 +241,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, xla_op_context.SetConstantOutput(i, result.outputs[i].constant_value); } else { xla_op_context.SetOutput( - i, b->GetTupleElement(output_handle, computation_output)); + i, xla::GetTupleElement(output_handle, computation_output)); ++computation_output; } } diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index edd2ab6301ee891c433639ce300cde0c72929cea..4c776fb1781e4d0b0d1fa5f313536eb42d6856bb 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -6,6 +6,10 @@ package( load("//tensorflow:tensorflow.bzl", "tf_copts") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", +) tf_kernel_library( name = "xla_ops", @@ -18,6 +22,7 @@ tf_kernel_library( "bcast_ops.cc", "bias_ops.cc", "binary_ops.cc", + "broadcast_to_op.cc", "bucketize_op.cc", "cast_op.cc", "categorical_op.cc", @@ -58,6 +63,7 @@ tf_kernel_library( "pack_op.cc", "pad_op.cc", "pooling_ops.cc", + "qr_op.cc", "quantize_and_dequantize_op.cc", "random_ops.cc", "reduce_window_op.cc", @@ -79,19 +85,28 @@ tf_kernel_library( "shape_util.cc", "slice_op.cc", "softmax_op.cc", + "sort_ops.cc", "spacetobatch_op.cc", "spacetodepth_op.cc", + "sparse_to_dense_op.cc", "split_op.cc", "stack_ops.cc", "stateless_random_ops.cc", "strided_slice_op.cc", "tensor_array_ops.cc", "tile_ops.cc", + "topk_op.cc", "training_ops.cc", "transpose_op.cc", 
"unary_ops.cc", "unpack_op.cc", "variable_ops.cc", + "xla_broadcast_helper_op.cc", + "xla_conv_op.cc", + "xla_dot_op.cc", + "xla_pad_op.cc", + "xla_reduce_op.cc", + "xla_select_and_scatter_op.cc", ], hdrs = [ "index_ops.h", @@ -100,24 +115,37 @@ tf_kernel_library( deps = [ ":if_op", ":while_op", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/lib:batch_dot", "//tensorflow/compiler/tf2xla/lib:cholesky", + "//tensorflow/compiler/tf2xla/lib:qr", + "//tensorflow/compiler/tf2xla/lib:random", "//tensorflow/compiler/tf2xla/lib:scatter", "//tensorflow/compiler/tf2xla/lib:triangular_solve", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/tf2xla/lib:while_loop", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:numeric", + "//tensorflow/compiler/xla/client/lib:pooling", + "//tensorflow/compiler/xla/client/lib:prng", + "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/core:framework", "//tensorflow/core:image_ops_op_lib", "//tensorflow/core:lib", @@ -140,8 +168,14 @@ tf_kernel_library( "//tensorflow/core/kernels:sparse_to_dense_op", "//tensorflow/core/kernels:stack_ops", 
"//tensorflow/core/kernels:training_ops", - "//tensorflow/core/kernels:transpose_op", - ], + ] + if_mkl( + [ + "//tensorflow/core/kernels:mkl_transpose_op", + ], + [ + "//tensorflow/core/kernels:transpose_op", + ], + ), ) tf_kernel_library( @@ -152,8 +186,9 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/ops:xla_ops", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -168,8 +203,8 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/ops:xla_ops", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -203,10 +238,11 @@ tf_kernel_library( ":index_ops_kernel_argmax_float_2d", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/kernels:argmax_op", diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc index 1e59868621475cf72f4cc8b14dafec2dd8cd5c95..41a453da80dec6b6f57a4d222e2c33ef6b786a10 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { namespace { @@ -31,7 +32,7 @@ class AddNOp : public XlaOpKernel { xla::XlaOp sum = ctx->Input(0); for (int i = 1; i < ctx->num_inputs(); ++i) { - sum = ctx->builder()->Add(sum, ctx->Input(i)); + sum = xla::Add(sum, ctx->Input(i)); } ctx->SetOutput(0, sum); diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc index 26fc1620a4f032b3af28de6e3a5af0e965e82341..276d744c096f8996c774964204feaa3762bdb844 100644 --- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -65,6 +65,6 @@ class XlaArgOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp); }; -REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes(), XlaArgOp); +REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes().CompilationOnly(), XlaArgOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index b0ba25b9983c3a9af26728ce4b1c263c844327db..4cfe946b2e6146f034867c06e996ffae42b90705 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -28,11 +28,10 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), + auto result = BatchDot(ctx->Input(0), ctx->Input(1), /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_, /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_); - OP_REQUIRES_OK(ctx, result.status()); - ctx->SetOutput(0, result.ValueOrDie()); + ctx->SetOutput(0, result); } private: diff --git 
a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index 15e1815a4cf07ff50dd1431b6790d14781da590f..b3ad0aea84eef601de08909f760699b8700d28f4 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { @@ -34,10 +35,11 @@ class FusedBatchNormOp : public XlaOpKernel { ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format: ", data_format_str)); OP_REQUIRES(ctx, - (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW), + (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW || + data_format_ == FORMAT_HWNC || data_format_ == FORMAT_HWCN), errors::InvalidArgument( "Unsupported data format ", ToString(data_format_), - "; supported formats are NHWC and NCHW")); + "; supported formats are NHWC, NCHW, HWNC and HWCN")); } void Compile(XlaOpKernelContext* ctx) override { @@ -48,8 +50,6 @@ class FusedBatchNormOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(1), &scale_type)); - xla::XlaBuilder* builder = ctx->builder(); - xla::XlaOp input = ctx->Input(0); TensorShape input_shape = ctx->InputShape(0); @@ -59,30 +59,30 @@ class FusedBatchNormOp : public XlaOpKernel { // TODO(b/69928690): support mixed precision in the XLA batch normalization // operators. As a workaround, cast everything to the statistics type (which // may be more precise than the input type). 
- input = builder->ConvertElementType(input, scale_type); + input = xla::ConvertElementType(input, scale_type); if (is_training_) { - xla::XlaOp output = builder->BatchNormTraining( + xla::XlaOp output = xla::BatchNormTraining( input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index); // In training mode, outputs the normalized value as well as the // calculated mean and variance. - ctx->SetOutput(0, builder->ConvertElementType( - builder->GetTupleElement(output, 0), input_type)); - ctx->SetOutput(1, builder->GetTupleElement(output, 1)); - ctx->SetOutput(2, builder->GetTupleElement(output, 2)); + ctx->SetOutput(0, xla::ConvertElementType(xla::GetTupleElement(output, 0), + input_type)); + ctx->SetOutput(1, xla::GetTupleElement(output, 1)); + ctx->SetOutput(2, xla::GetTupleElement(output, 2)); // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved // space 1 & 2". They are used to pass the per-batch mean and // variance to the gradient. Here we maintain the same behavior by setting // them to the mean and variance calculated by BatchNormTraining. - ctx->SetOutput(3, builder->GetTupleElement(output, 1)); - ctx->SetOutput(4, builder->GetTupleElement(output, 2)); + ctx->SetOutput(3, xla::GetTupleElement(output, 1)); + ctx->SetOutput(4, xla::GetTupleElement(output, 2)); } else { - xla::XlaOp output = builder->BatchNormInference( + xla::XlaOp output = xla::BatchNormInference( input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4), epsilon_, feature_index); - ctx->SetOutput(0, builder->ConvertElementType(output, input_type)); + ctx->SetOutput(0, xla::ConvertElementType(output, input_type)); // Directly send input to output as mean and variance in inference mode. 
ctx->SetOutput(1, ctx->Input(3)); ctx->SetOutput(2, ctx->Input(4)); @@ -111,10 +111,11 @@ class FusedBatchNormGradOp : public XlaOpKernel { ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format: ", data_format_str)); OP_REQUIRES(ctx, - (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW), + (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW || + data_format_ == FORMAT_HWNC || data_format_ == FORMAT_HWCN), errors::InvalidArgument( "Unsupported data format ", ToString(data_format_), - "; supported formats are NHWC and NCHW")); + "; supported formats are NHWC, NCHW, HWNC and HWCN")); } void Compile(XlaOpKernelContext* ctx) override { @@ -142,12 +143,12 @@ class FusedBatchNormGradOp : public XlaOpKernel { xla::XlaOp offset_backprop; if (is_training_) { xla::XlaOp output = - b->BatchNormGrad(activations, scale, mean, var, grad_backprop, - epsilon_, feature_index); + xla::BatchNormGrad(activations, scale, mean, var, grad_backprop, + epsilon_, feature_index); - x_backprop = b->GetTupleElement(output, 0); - scale_backprop = b->GetTupleElement(output, 1); - offset_backprop = b->GetTupleElement(output, 2); + x_backprop = xla::GetTupleElement(output, 0); + scale_backprop = xla::GetTupleElement(output, 1); + offset_backprop = xla::GetTupleElement(output, 2); } else { // Reduce over all dimensions except the feature dim. 
std::vector reduction_dims(input_dims - 1); @@ -164,35 +165,35 @@ class FusedBatchNormGradOp : public XlaOpKernel { auto converted = XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type); auto reduce = - b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); + xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype); // scratch1 = rsqrt(pop_var + epsilon) auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5); - auto scratch1 = - b->Pow(b->Add(var, b->ConstantR0(epsilon_)), neg_half); + auto scratch1 = xla::Pow( + xla::Add(var, xla::ConstantR0(b, epsilon_)), neg_half); // scratch2 = sum(y_backprop * (x - mean)) auto mul = - b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})); + xla::Mul(grad_backprop, xla::Sub(activations, mean, {feature_index})); converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type); reduce = - b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); + xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), reduction_dims); auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype); x_backprop = - b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index}); - scale_backprop = b->Mul(scratch1, scratch2); + xla::Mul(grad_backprop, xla::Mul(scratch1, scale), {feature_index}); + scale_backprop = xla::Mul(scratch1, scratch2); } ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, x_backprop, input_dtype)); ctx->SetOutput(1, scale_backprop); ctx->SetOutput(2, offset_backprop); - ctx->SetConstantOutput(3, Tensor(scale_dtype, {})); - ctx->SetConstantOutput(4, Tensor(scale_dtype, {})); + ctx->SetConstantOutput(3, Tensor()); + ctx->SetConstantOutput(4, Tensor()); } private: diff 
--git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc index 642278ab994bf3cc84396f093ed56b009a1435c1..edced6bc0e57cfc2b1c62f1e4a010dd316f7d092 100644 --- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -16,13 +16,14 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { namespace { void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, DataType input_dtype, const TensorShape& input_tensor_shape, - gtl::ArraySlice block_shape, + absl::Span block_shape, const xla::Literal& crops) { const int input_rank = input_tensor_shape.dims(); const gtl::InlinedVector input_shape = @@ -33,7 +34,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, ctx, input_rank >= 1 + block_rank, errors::InvalidArgument("input rank should be >= ", 1 + block_rank, " instead of ", input_rank)); - gtl::ArraySlice remainder_shape(input_shape); + absl::Span remainder_shape(input_shape); remainder_shape.remove_prefix(1 + block_rank); OP_REQUIRES( @@ -45,7 +46,6 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, ", 2] instead of ", xla::ShapeUtil::HumanString(crops.shape()))); - xla::XlaBuilder* b = ctx->builder(); const int64 batch_size = input_shape[0]; // Compute the product of the block_shape values. @@ -72,7 +72,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, reshaped_shape[block_rank] = batch_size / block_num_elems; std::copy(input_shape.begin() + 1, input_shape.end(), reshaped_shape.begin() + block_rank + 1); - xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = xla::Reshape(input, reshaped_shape); // 2. 
Permute dimensions of `reshaped` to produce `permuted` of shape // [batch / prod(block_shape), @@ -90,7 +90,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, } std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(), 1 + block_rank * 2); - xla::XlaOp permuted = b->Transpose(reshaped, permutation); + xla::XlaOp permuted = xla::Transpose(reshaped, permutation); // 3. Reshape `permuted` to produce `reshaped_permuted` of shape // [batch / prod(block_shape), @@ -110,7 +110,8 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, std::copy(remainder_shape.begin(), remainder_shape.end(), reshaped_permuted_shape.begin() + 1 + block_rank); - xla::XlaOp reshaped_permuted = b->Reshape(permuted, reshaped_permuted_shape); + xla::XlaOp reshaped_permuted = + xla::Reshape(permuted, reshaped_permuted_shape); // 4. Crop the start and end of dimensions `[1, ..., M]` of // `reshaped_permuted` according to `crops` to produce the output of shape: @@ -138,7 +139,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i])); } xla::XlaOp output = - b->Slice(reshaped_permuted, start_indices, end_indices, strides); + xla::Slice(reshaped_permuted, start_indices, end_indices, strides); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc index ee2c920453c3bbaef2c145df743fddf999167c39..2e383b1473590403823863f89264e5381d8e8806 100644 --- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc @@ -16,10 +16,11 @@ limitations under the License. // XLA-specific Ops for broadcasting used in gradient // code. 
+#include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/bcast.h" @@ -51,8 +52,8 @@ class BCastArgsOp : public XlaOpKernel { BCast bcast(shapes[0], shapes[1]); OP_REQUIRES(ctx, bcast.IsValid(), errors::InvalidArgument( - "Incompatible shapes: [", str_util::Join(shapes[0], ","), - "] vs. [", str_util::Join(shapes[1], ","), "]")); + "Incompatible shapes: [", absl::StrJoin(shapes[0], ","), + "] vs. [", absl::StrJoin(shapes[1], ","), "]")); const int64 len = bcast.output_shape().size(); Tensor output(DT_INT32, TensorShape({len})); @@ -105,8 +106,8 @@ class BCastGradArgsOp : public XlaOpKernel { BCast bcast(shapes[0], shapes[1]); OP_REQUIRES(ctx, bcast.IsValid(), errors::InvalidArgument( - "Incompatible shapes: [", str_util::Join(shapes[0], ","), - "] vs. [", str_util::Join(shapes[1], ","), "]")); + "Incompatible shapes: [", absl::StrJoin(shapes[0], ","), + "] vs. [", absl::StrJoin(shapes[1], ","), "]")); Output(ctx, 0, bcast.grad_x_reduce_idx()); Output(ctx, 1, bcast.grad_y_reduce_idx()); } diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc index 9d677f426650ea17a49e5ab1401078f04623fe97..41f540506ba41fbe7f91393e7b8e26a89e72ef0a 100644 --- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/util/tensor_format.h" @@ -60,8 +61,7 @@ class BiasOp : public XlaOpKernel { "of the input tensor: ", bias_shape.DebugString(), " vs. ", input_shape.DebugString())); - xla::XlaOp result = - ctx->builder()->Add(ctx->Input(0), ctx->Input(1), {feature_dim}); + xla::XlaOp result = xla::Add(ctx->Input(0), ctx->Input(1), {feature_dim}); ctx->SetOutput(0, result); } @@ -109,8 +109,8 @@ class BiasAddGradOp : public XlaOpKernel { auto converted = XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type); auto reduce = - b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), reduce_dims); + xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), reduce_dims); ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0))); } diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index f04cde878e98002d9442e0f3ec251c5197ef7969..df17da4c1ca07053cf63757f1acf2b1a3735e705 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" @@ -30,29 +30,30 @@ namespace { // A subclass of a XlaBinaryOp must build the computation that // describes the (tensor,tensor)->tensor function to apply to each element of // the input. -#define XLA_MAKE_BINARY(NAME, HLO) \ - class NAME##Op : public XlaBinaryOp { \ - public: \ - explicit NAME##Op(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} \ - xla::XlaOp Computation( \ - XlaOpKernelContext* ctx, const xla::XlaOp& lhs, \ - const gtl::ArraySlice& lhs_shape, const xla::XlaOp& rhs, \ - const gtl::ArraySlice& rhs_shape, \ - const BCast& broadcast_helper, \ - const std::vector& extend_dimensions) override { \ - xla::XlaBuilder* b = ctx->builder(); \ - return HLO; \ - } \ - }; \ +#define XLA_MAKE_BINARY(NAME, HLO) \ + class NAME##Op : public XlaBinaryOp { \ + public: \ + explicit NAME##Op(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} \ + xla::XlaOp Computation( \ + XlaOpKernelContext* ctx, const xla::XlaOp& lhs, \ + const absl::Span& lhs_shape, const xla::XlaOp& rhs, \ + const absl::Span& rhs_shape, \ + const BCast& broadcast_helper, \ + const std::vector& extend_dimensions) override { \ + xla::XlaBuilder* b = ctx->builder(); \ + (void)b; \ + return HLO; \ + } \ + }; \ REGISTER_XLA_OP(Name(#NAME), NAME##Op) -XLA_MAKE_BINARY(Add, b->Add(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Sub, b->Sub(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Mul, b->Mul(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Div, b->Div(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Add, xla::Add(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Sub, 
xla::Sub(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Mul, xla::Mul(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Div, xla::Div(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Atan2, b->Atan2(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Complex, b->Complex(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Atan2, xla::Atan2(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Complex, xla::Complex(lhs, rhs, extend_dimensions)); // Implementation of FloorDiv. Pseudo-code: // if ((x < 0) != (y < 0)) { @@ -67,13 +68,13 @@ static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper); auto zero = XlaHelpers::Zero(b, dtype); auto one = XlaHelpers::One(b, dtype); - auto different_sign = b->Ne(b->Lt(x, zero), b->Lt(y, zero)); - auto abs_x = b->Abs(x); - auto abs_y = b->Abs(y); - auto t = b->Neg(b->Sub(b->Add(abs_x, abs_y), one)); - auto result = b->Select(different_sign, b->Div(t, abs_y), b->Div(x, y)); + auto different_sign = xla::Ne(xla::Lt(x, zero), xla::Lt(y, zero)); + auto abs_x = xla::Abs(x); + auto abs_y = xla::Abs(y); + auto t = xla::Neg(xla::Sub(xla::Add(abs_x, abs_y), one)); + auto result = xla::Select(different_sign, xla::Div(t, abs_y), xla::Div(x, y)); if (DataTypeIsFloating(dtype)) { - result = b->Floor(result); + result = xla::Floor(result); } return result; } @@ -87,75 +88,78 @@ static xla::XlaOp FloorModImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, xla::XlaOp y, const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper); auto zero = XlaHelpers::Zero(b, dtype); - auto same_sign = b->Eq(b->Lt(x, zero), b->Lt(y, zero)); - auto trunc_mod = b->Rem(x, y); - return b->Select(same_sign, trunc_mod, b->Rem(b->Add(trunc_mod, y), y)); + auto same_sign = xla::Eq(xla::Lt(x, zero), xla::Lt(y, zero)); + auto trunc_mod = xla::Rem(x, y); + return xla::Select(same_sign, trunc_mod, xla::Rem(xla::Add(trunc_mod, y), y)); } 
XLA_MAKE_BINARY(FloorMod, FloorModImpl(b, input_type(0), lhs, rhs, broadcast_helper)); -XLA_MAKE_BINARY(BitwiseAnd, b->And(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(BitwiseOr, b->Or(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(BitwiseAnd, xla::And(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(BitwiseOr, xla::Or(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(BitwiseXor, xla::Xor(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(LeftShift, b->ShiftLeft(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(LeftShift, xla::ShiftLeft(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(RightShift, (DataTypeIsUnsigned(ctx->input_type(0)) - ? b->ShiftRightLogical(lhs, rhs, extend_dimensions) - : b->ShiftRightArithmetic(lhs, rhs, extend_dimensions))); - -XLA_MAKE_BINARY(LogicalAnd, b->And(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(LogicalOr, b->Or(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Mod, b->Rem(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Maximum, b->Max(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Minimum, b->Min(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(RealDiv, b->Div(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(ReciprocalGrad, b->Neg(b->Mul(rhs, b->Mul(lhs, lhs)))); + ? 
xla::ShiftRightLogical(lhs, rhs, extend_dimensions) + : xla::ShiftRightArithmetic(lhs, rhs, extend_dimensions))); + +XLA_MAKE_BINARY(LogicalAnd, xla::And(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(LogicalOr, xla::Or(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Mod, xla::Rem(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Maximum, xla::Max(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Minimum, xla::Min(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(RealDiv, xla::Div(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(ReciprocalGrad, xla::Neg(xla::Mul(rhs, xla::Mul(lhs, lhs)))); XLA_MAKE_BINARY( RsqrtGrad, - b->Mul(b->Pow(lhs, XlaHelpers::IntegerLiteral(b, input_type(0), 3)), - b->Div(rhs, XlaHelpers::IntegerLiteral(b, input_type(0), -2)), - extend_dimensions)); -XLA_MAKE_BINARY(SqrtGrad, - b->Div(b->Mul(rhs, - XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), - lhs, extend_dimensions)); + xla::Mul(xla::Pow(lhs, XlaHelpers::IntegerLiteral(b, input_type(0), 3)), + xla::Div(rhs, XlaHelpers::IntegerLiteral(b, input_type(0), -2)), + extend_dimensions)); +XLA_MAKE_BINARY( + SqrtGrad, + xla::Div(xla::Mul(rhs, XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), + lhs, extend_dimensions)); static xla::XlaOp Square(xla::XlaBuilder* builder, const xla::XlaOp& x) { - return builder->Mul(x, x); + return xla::Mul(x, x); } XLA_MAKE_BINARY(SquaredDifference, - Square(b, b->Sub(lhs, rhs, extend_dimensions))); + Square(b, xla::Sub(lhs, rhs, extend_dimensions))); -XLA_MAKE_BINARY(TruncateDiv, b->Div(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(TruncateMod, b->Rem(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(TruncateDiv, xla::Div(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(TruncateMod, xla::Rem(lhs, rhs, extend_dimensions)); // Comparison ops -XLA_MAKE_BINARY(Equal, b->Eq(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(NotEqual, b->Ne(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(Greater, b->Gt(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(GreaterEqual, b->Ge(lhs, 
rhs, extend_dimensions)); -XLA_MAKE_BINARY(Less, b->Lt(lhs, rhs, extend_dimensions)); -XLA_MAKE_BINARY(LessEqual, b->Le(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Equal, xla::Eq(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(NotEqual, xla::Ne(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Greater, xla::Gt(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(GreaterEqual, xla::Ge(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Less, xla::Lt(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(LessEqual, xla::Le(lhs, rhs, extend_dimensions)); // Non-linear ops XLA_MAKE_BINARY(SigmoidGrad, - b->Mul(b->Mul(rhs, lhs), - b->Sub(XlaHelpers::One(b, input_type(0)), lhs))); + xla::Mul(xla::Mul(rhs, lhs), + xla::Sub(XlaHelpers::One(b, input_type(0)), lhs))); XLA_MAKE_BINARY(SoftplusGrad, - b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), - XlaHelpers::One(b, input_type(1))))); + xla::Div(lhs, xla::Add(xla::Exp(xla::Neg(rhs)), + XlaHelpers::One(b, input_type(1))))); // softsigngrad(gradients, features) = gradients / (1 + abs(features)) ** 2 XLA_MAKE_BINARY(SoftsignGrad, - b->Div(lhs, Square(b, b->Add(XlaHelpers::One(b, input_type(0)), - b->Abs(rhs))))); + xla::Div(lhs, + Square(b, xla::Add(XlaHelpers::One(b, input_type(0)), + xla::Abs(rhs))))); -XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)), - b->Mul(lhs, lhs)))); +XLA_MAKE_BINARY(TanhGrad, + xla::Mul(rhs, xla::Sub(XlaHelpers::One(b, input_type(0)), + xla::Mul(lhs, lhs)))); -XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(Pow, xla::Pow(lhs, rhs, extend_dimensions)); #undef XLA_MAKE_BINARY @@ -168,12 +172,13 @@ class ApproximateEqualOp : public XlaOpKernel { // Computes the max of the scalar input x and 0. 
void Compile(XlaOpKernelContext* ctx) override { xla::XlaBuilder* b = ctx->builder(); - auto abs = b->Abs(b->Sub(ctx->Input(0), ctx->Input(1))); + auto abs = xla::Abs(xla::Sub(ctx->Input(0), ctx->Input(1))); auto abs_shape = b->GetShape(abs); OP_REQUIRES_OK(ctx, abs_shape.status()); auto abs_type = abs_shape.ValueOrDie().element_type(); - auto result = b->Lt( - abs, b->ConvertElementType(b->ConstantR0(tolerance_), abs_type)); + auto result = + xla::Lt(abs, xla::ConvertElementType( + xla::ConstantR0(b, tolerance_), abs_type)); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bd7c74dca2a7cbb51f2a329ac575d635f314516 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { +namespace { + +class BroadcastToOp : public XlaOpKernel { + public: + explicit BroadcastToOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + TensorShape output_shape; + OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape)); + + OP_REQUIRES(context, input_shape.dims() <= output_shape.dims(), + errors::InvalidArgument( + "Input rank (", input_shape.dims(), + ") must be less than or equal to the output rank (", + output_shape.dims(), ")")); + + auto input_dims = input_shape.dim_sizes(); + auto output_dims = output_shape.dim_sizes(); + + // Broadcasting is done right-to-left on right-aligned dimensions; reverse + // the two vectors so elements to be broadcast are aligned. 
+ absl::c_reverse(input_dims); + absl::c_reverse(output_dims); + + std::vector broadcast_dims; + std::vector broadcast_shape; + for (int i = 0; i < output_shape.dims(); ++i) { + if (i < input_shape.dims()) { + OP_REQUIRES( + context, + (output_dims[i] == 0 && input_dims[i] == 0) || + (input_dims[i] != 0 && output_dims[i] % input_dims[i] == 0), + errors::InvalidArgument("invalid shape to broadcast from ", + input_shape.DebugString(), " to ", + output_shape.DebugString())); + + broadcast_dims.push_back(broadcast_shape.size()); + if (output_dims[i] == input_dims[i] || input_dims[i] == 1) { + broadcast_shape.push_back(output_dims[i]); + } + if (output_dims[i] != input_dims[i]) { + // Add dimensions [I, O/I], which we will later flatten to just + // [O]. We must do this in two phases since XLA broadcasting does not + // support tiling. + broadcast_shape.push_back(input_dims[i]); + broadcast_shape.push_back(output_dims[i] / input_dims[i]); + } + } else { + broadcast_shape.push_back(output_dims[i]); + } + } + absl::c_reverse(broadcast_dims); + int broadcast_shape_size = broadcast_shape.size(); + for (int64& broadcast_dim : broadcast_dims) { + broadcast_dim = broadcast_shape_size - broadcast_dim - 1; + } + absl::c_reverse(broadcast_shape); + xla::XlaOp output = xla::Reshape( + xla::BroadcastInDim(context->Input(0), + xla::ShapeUtil::MakeShape( + context->input_xla_type(0), broadcast_shape), + broadcast_dims), + output_shape.dim_sizes()); + context->SetOutput(0, output); + } +}; + +REGISTER_XLA_OP(Name("BroadcastTo").CompileTimeConstInput("shape"), + BroadcastToOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc index ca9a6b40688d1e8496d1b823e20d273d519f65e8..5078f8662bd397eaa51274ec816c130b8ced92cc 100644 --- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc @@ -18,6 +18,7 @@ limitations under the 
License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -36,22 +37,22 @@ class BucketizeOp : public XlaOpKernel { const DataType dtype = context->input_type(0); xla::XlaOp input = context->Input(0); - xla::XlaOp boundaries = builder->ConstantR1(boundaries_); + xla::XlaOp boundaries = xla::ConstantR1(builder, boundaries_); // TODO(phawkins): the following behavior matches the behavior of the core // Bucketize kernel. However, comparing an int32 or int64 against float may // lead to inaccurate bucketing due to rounding. if (dtype == DT_DOUBLE) { - input = builder->ConvertElementType(input, xla::F64); - boundaries = builder->ConvertElementType(boundaries, xla::F64); + input = xla::ConvertElementType(input, xla::F64); + boundaries = xla::ConvertElementType(boundaries, xla::F64); } else { - input = builder->ConvertElementType(input, xla::F32); + input = xla::ConvertElementType(input, xla::F32); } - xla::XlaOp comparison = builder->ConvertElementType( - builder->Ge(builder->Broadcast(input, {1}), boundaries, - /*broadcast_dimensions=*/{0}), - xla::S32); - xla::XlaOp buckets = builder->Reduce( - comparison, /*init_value=*/builder->ConstantR0(0), + xla::XlaOp comparison = + xla::ConvertElementType(xla::Ge(xla::Broadcast(input, {1}), boundaries, + /*broadcast_dimensions=*/{0}), + xla::S32); + xla::XlaOp buckets = xla::Reduce( + comparison, /*init_value=*/xla::ConstantR0(builder, 0), /*computation=*/xla::CreateScalarAddComputation(xla::S32, builder), /*dimensions_to_reduce=*/{0}); context->SetOutput(0, buckets); diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc index e9d98c768572c52825fa5192ecec834889f040fe..8cc2479dd555380da7500abe6b2aca380110333b 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/cast_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" @@ -40,14 +41,14 @@ class CastOp : public XlaOpKernel { if (src_dtype_ == dst_dtype_) { output = input; } else if (dst_dtype_ == DT_BOOL) { - output = builder->Ne(input, XlaHelpers::Zero(builder, src_dtype_)); + output = xla::Ne(input, XlaHelpers::Zero(builder, src_dtype_)); } else if (xla::primitive_util::IsComplexType(src_type_) && !xla::primitive_util::IsComplexType(dst_type_)) { // As in cast_op.h, we replicate the numpy behavior of truncating the // imaginary part. - output = builder->ConvertElementType(builder->Real(input), dst_type_); + output = xla::ConvertElementType(xla::Real(input), dst_type_); } else { - output = builder->ConvertElementType(input, dst_type_); + output = xla::ConvertElementType(input, dst_type_); } ctx->SetOutput(0, output); @@ -72,7 +73,6 @@ class BitcastOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); xla::XlaOp input = ctx->Input(0); xla::XlaOp output; @@ -92,7 +92,7 @@ class BitcastOp : public XlaOpKernel { xla::primitive_util::BitWidth(dst_type_), errors::Unimplemented( "Only bitcasts between equally sized types supported.")); - output = builder->BitcastConvertType(input, dst_type_); + output = xla::BitcastConvertType(input, dst_type_); } ctx->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index 835a7f568945f0bee86fe2b39491c3326726e1aa..e7fef77edcba0ea5a521956a704225ac4f7fcb22 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -65,24 +66,22 @@ class CategoricalOp : public XlaOpKernel { DataTypeToPrimitiveType(input_type(0), &uniform_xla_type)); xla::Shape uniform_shape = xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array); - auto uniforms = builder->RngUniform( - XlaHelpers::Zero(builder, input_type(0)), - XlaHelpers::One(builder, input_type(0)), uniform_shape); + auto uniforms = + xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)), + XlaHelpers::One(builder, input_type(0)), uniform_shape); // Use Gumbel softmax trick to generate categorical samples. // See: // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/ // TODO(b/68769470): Switch to using a cumulative sum approach. 
- auto softmax_entries = - builder->Sub(logits, builder->Log(builder->Neg(builder->Log(uniforms))), - /*broadcast_dimensions=*/{0, 2}); - - TensorShape softmax_shape(uniform_shape_array); - xla::XlaOp argmax; - OP_REQUIRES_OK( - ctx, - XlaHelpers::ArgMax(builder, ctx, softmax_entries, softmax_shape, - input_type(0), output_type(0), /*axis=*/2, &argmax)); + auto softmax_entries = xla::Sub(logits, xla::Log(-xla::Log(uniforms)), + /*broadcast_dimensions=*/{0, 2}); + + xla::PrimitiveType xla_output_type; + OP_REQUIRES_OK(ctx, + DataTypeToPrimitiveType(output_type(0), &xla_output_type)); + xla::XlaOp argmax = + XlaHelpers::ArgMax(softmax_entries, xla_output_type, /*axis=*/2); ctx->SetOutput(0, argmax); } diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc index fe6651793dc763d13f4a4b0ac294ec3ecf64af8f..9fcbc86adc0967cbb7fb73da8bdabc58b60953da 100644 --- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc @@ -24,12 +24,7 @@ class CholeskyOp : public XlaOpKernel { public: explicit CholeskyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - auto result = Cholesky(ctx->builder(), ctx->Input(0)); - if (!result.ok()) { - ctx->SetStatus(result.status()); - return; - } - ctx->SetOutput(0, result.ValueOrDie()); + ctx->SetOutput(0, Cholesky(ctx->Input(0))); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc index a00bc912f9f40052565446c6bf9390629af9a4cd..547fe48046e8c934e3bc14d02c8448e107c1a406 100644 --- a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { @@ -29,7 +30,6 @@ class ClipByValueOp : public XlaOpKernel { const TensorShape min_shape = ctx->InputShape(1); const TensorShape max_shape = ctx->InputShape(2); - xla::XlaBuilder* builder = ctx->builder(); auto input = ctx->Input(0); auto min = ctx->Input(1); auto max = ctx->Input(2); @@ -45,13 +45,13 @@ class ClipByValueOp : public XlaOpKernel { if (shape != min_shape) { OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error()); - min = builder->Broadcast(min, shape.dim_sizes()); + min = xla::Broadcast(min, shape.dim_sizes()); } if (shape != max_shape) { OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error()); - max = builder->Broadcast(max, shape.dim_sizes()); + max = xla::Broadcast(max, shape.dim_sizes()); } - ctx->SetOutput(0, builder->Clamp(min, input, max)); + ctx->SetOutput(0, xla::Clamp(min, input, max)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc index 78285affa1c399ae107a9172fb85cf257457c368..f4106051043859a6786705009d76b02a64cd3ff1 100644 --- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -88,7 +89,7 @@ class ConcatBaseOp : public XlaOpKernel { "] = ", in_shape.DebugString())); if (in_shape.dims() == 0) { // Inputs that come in as scalars must be reshaped to 1-vectors. - input_data.push_back(ctx->builder()->Reshape(handle, {1})); + input_data.push_back(xla::Reshape(handle, {1})); } else { input_data.push_back(handle); } @@ -96,7 +97,7 @@ class ConcatBaseOp : public XlaOpKernel { } VLOG(1) << "Concat dim " << concat_dim << " equivalent to " << axis; - ctx->SetOutput(0, ctx->builder()->ConcatInDim(input_data, axis)); + ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), input_data, axis)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc index 59d06c654de18c9003fe0bdc706d0c2443de6d7b..da8cf3fc6fa694f592280f8c249d317827d9cd09 100644 --- a/tensorflow/compiler/tf2xla/kernels/const_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -53,41 +54,41 @@ class ConstOp : public XlaOpKernel { switch (proto_.dtype()) { case DT_BOOL: if (proto_.bool_val_size() == 1) { - ctx->SetOutput(0, - b->Broadcast(b->ConstantR0(proto_.bool_val(0)), - shape.dim_sizes())); + ctx->SetOutput( + 0, xla::Broadcast(xla::ConstantR0(b, proto_.bool_val(0)), + shape.dim_sizes())); return; } break; case DT_FLOAT: if (proto_.float_val_size() == 1) { - ctx->SetOutput( - 0, b->Broadcast(b->ConstantR0(proto_.float_val(0)), - shape.dim_sizes())); + ctx->SetOutput(0, xla::Broadcast(xla::ConstantR0( + b, proto_.float_val(0)), + shape.dim_sizes())); return; } break; case DT_DOUBLE: if (proto_.double_val_size() == 1) { - ctx->SetOutput( - 0, b->Broadcast(b->ConstantR0(proto_.double_val(0)), - shape.dim_sizes())); + ctx->SetOutput(0, xla::Broadcast(xla::ConstantR0( + b, proto_.double_val(0)), + shape.dim_sizes())); return; } break; case DT_INT32: if (proto_.int_val_size() == 1) { - ctx->SetOutput(0, - b->Broadcast(b->ConstantR0(proto_.int_val(0)), - shape.dim_sizes())); + ctx->SetOutput( + 0, xla::Broadcast(xla::ConstantR0(b, proto_.int_val(0)), + shape.dim_sizes())); return; } break; case DT_INT64: if (proto_.int64_val_size() == 1) { - ctx->SetOutput( - 0, b->Broadcast(b->ConstantR0(proto_.int64_val(0)), - shape.dim_sizes())); + ctx->SetOutput(0, xla::Broadcast(xla::ConstantR0( + b, proto_.int64_val(0)), + shape.dim_sizes())); return; } break; diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index 627bad12f33c82e91bc3c6f3323f562bc8174056..674720e22fbf9d995e74c7dbd0ef7d7765941867 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -18,6 +18,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -51,8 +53,8 @@ xla::XlaOp CreateExpandedZero(const TensorShape& filter_shape, DataType dtype, xla::XlaBuilder* builder) { TensorShape expanded_filter_shape = ExpandedFilterShapeForDepthwiseConvolution(filter_shape); - return builder->Broadcast(XlaHelpers::Zero(builder, dtype), - expanded_filter_shape.dim_sizes()); + return xla::Broadcast(XlaHelpers::Zero(builder, dtype), + expanded_filter_shape.dim_sizes()); } // Create a mask for depthwise convolution that will make a normal convolution @@ -95,84 +97,63 @@ xla::XlaOp CreateExpandedFilterMask(const TensorShape& filter_shape, // Create a M sized linspace and an M*N sized linspace that will be // broadcasted into perpendicular dimensions and compared. - xla::XlaOp input_feature_iota; - // DT_INT32 Iota will always return status::OK(). - TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature, - &input_feature_iota)); - xla::XlaOp expanded_feature_iota; - TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, - input_feature * depthwise_multiplier, - &expanded_feature_iota)); + xla::XlaOp input_feature_iota = xla::Iota(builder, xla::S32, input_feature); + xla::XlaOp expanded_feature_iota = + xla::Iota(builder, xla::S32, input_feature * depthwise_multiplier); // Divide the M*N sized linspace by the depthwise_multiplier to create // [0 0 1 1 2 2] in the example in the function comment. 
expanded_feature_iota = - builder->Div(expanded_feature_iota, - XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32, - depthwise_multiplier)); + xla::Div(expanded_feature_iota, + XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32, + depthwise_multiplier)); // Broadcast the N*M linspace to [H, W, ..., M, M*N]. auto expanded_feature_broadcast_dims = expanded_filter_shape.dim_sizes(); expanded_feature_broadcast_dims.pop_back(); - auto broadcasted_expanded_feature_iota = builder->Broadcast( - expanded_feature_iota, expanded_feature_broadcast_dims); + auto broadcasted_expanded_feature_iota = + xla::Broadcast(expanded_feature_iota, expanded_feature_broadcast_dims); // Compare the broadcasted linspace to the input feature linspace in the // input feature dimension to create a diagonal predicate. - return builder->Eq(broadcasted_expanded_feature_iota, input_feature_iota, - {expanded_filter_shape.dims() - 2}); + return xla::Eq(broadcasted_expanded_feature_iota, input_feature_iota, + {expanded_filter_shape.dims() - 2}); } -// Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding -// zeros for the cross-depth filters. Used to build a depthwise convolution. -xla::XlaOp ExpandFilterForDepthwiseConvolution(const TensorShape& filter_shape, - DataType dtype, - const xla::XlaOp& filter, - xla::XlaBuilder* builder) { - int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1); - int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2); - TensorShape expanded_filter_shape = - ExpandedFilterShapeForDepthwiseConvolution(filter_shape); +// Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to +// build a depthwise convolution. 
+xla::XlaOp ReshapeFilterForDepthwiseConvolution(const TensorShape& filter_shape, + const xla::XlaOp& filter) { + int64 input_feature_dim = filter_shape.dims() - 2; + int64 output_feature_dim = filter_shape.dims() - 1; + int64 depthwise_multiplier = filter_shape.dim_size(output_feature_dim); + int64 input_feature = filter_shape.dim_size(input_feature_dim); // Create a [H, W, ..., 1, N*M] reshape of the filter. - TensorShape implicit_broadcast_filter_shape = expanded_filter_shape; - implicit_broadcast_filter_shape.set_dim( - implicit_broadcast_filter_shape.dims() - 2, 1); - implicit_broadcast_filter_shape.set_dim( - implicit_broadcast_filter_shape.dims() - 1, - depthwise_multiplier * input_feature); - auto implicit_broadcast_filter = - builder->Reshape(filter, implicit_broadcast_filter_shape.dim_sizes()); - - // Broadcast the filter to [H, W, ..., M, M*N]. - auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder); - auto expanded_filter = builder->Add(implicit_broadcast_filter, expanded_zero); - - // If the filter mask is set, choose the broadcasted filter, othwerwise, - // choose zero. - return builder->Select(CreateExpandedFilterMask(filter_shape, builder), - expanded_filter, expanded_zero); + TensorShape implicit_broadcast_filter_shape = filter_shape; + implicit_broadcast_filter_shape.set_dim(input_feature_dim, 1); + implicit_broadcast_filter_shape.set_dim(output_feature_dim, + depthwise_multiplier * input_feature); + return xla::Reshape(filter, implicit_broadcast_filter_shape.dim_sizes()); } -// Inverse of ExpandFilterForDepthwiseConvolution. +// Reduces the results of the convolution with an expanded filter to the +// non-expanded filter. 
xla::XlaOp ContractFilterForDepthwiseBackprop(XlaOpKernelContext* ctx, const TensorShape& filter_shape, DataType dtype, const xla::XlaOp& filter_backprop, xla::XlaBuilder* builder) { - TensorShape expanded_filter_shape = - ExpandedFilterShapeForDepthwiseConvolution(filter_shape); - auto masked_expanded_filter = builder->Select( + auto masked_expanded_filter = xla::Select( CreateExpandedFilterMask(filter_shape, builder), filter_backprop, CreateExpandedZero(filter_shape, dtype, builder)); - return builder->Reshape( + return xla::Reshape( // This reduce does not need inputs to be converted with // XlaHelpers::SumAccumulationType() since the ExpandedFilterMask with // ExpandedZero guarantees that only one element is non zero, so there // cannot be accumulated precision error. - builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype), - *ctx->GetOrCreateAdd(dtype), - {expanded_filter_shape.dims() - 2}), + xla::Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype), + *ctx->GetOrCreateAdd(dtype), {filter_shape.dims() - 2}), filter_shape.dim_sizes()); } @@ -248,15 +229,9 @@ class ConvOp : public XlaOpKernel { "input and filter must have the same depth: ", in_depth, " vs ", input_shape.dim_size(feature_dim))); - xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp filter = ctx->Input(1); - TensorShape expanded_filter_shape = filter_shape; if (depthwise_) { - filter = ExpandFilterForDepthwiseConvolution( - filter_shape, ctx->input_type(0), filter, b); - expanded_filter_shape = - ExpandedFilterShapeForDepthwiseConvolution(filter_shape); + filter = ReshapeFilterForDepthwiseConvolution(filter_shape, filter); } xla::ConvolutionDimensionNumbers dims; @@ -283,14 +258,15 @@ class ConvOp : public XlaOpKernel { int64 unused_output_size; OP_REQUIRES_OK( ctx, GetWindowedOutputSizeVerboseV2( - input_shape.dim_size(dim), expanded_filter_shape.dim_size(i), + input_shape.dim_size(dim), filter_shape.dim_size(i), rhs_dilation[i], window_strides[i], padding_, 
&unused_output_size, &padding[i].first, &padding[i].second)); } - xla::XlaOp conv = - b->ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, - lhs_dilation, rhs_dilation, dims); + xla::XlaOp conv = xla::ConvGeneralDilated( + ctx->Input(0), filter, window_strides, padding, lhs_dilation, + rhs_dilation, dims, + /*feature_group_count=*/depthwise_ ? in_depth : 1); ctx->SetOutput(0, conv); } @@ -391,7 +367,6 @@ class ConvBackpropInputOp : public XlaOpKernel { expanded_filter_shape, out_backprop_shape, dilations_, strides_, padding_, data_format_, &dims)); - xla::XlaBuilder* b = ctx->builder(); auto filter = ctx->Input(1); auto out_backprop = ctx->Input(2); @@ -428,20 +403,18 @@ class ConvBackpropInputOp : public XlaOpKernel { rhs_dilation[i] = dilations_[dim]; } - // If this is a depthwise convolution, expand the filter. - if (depthwise_) { - filter = ExpandFilterForDepthwiseConvolution( - filter_shape, ctx->input_type(1), filter, b); - } - // Mirror the filter in the spatial dimensions. - xla::XlaOp mirrored_weights = b->Rev(filter, kernel_spatial_dims); + xla::XlaOp mirrored_weights = xla::Rev(filter, kernel_spatial_dims); // activation gradients // = gradients (with padding and dilation) mirrored_weights - xla::XlaOp in_backprop = b->ConvGeneralDilated( + xla::XlaOp in_backprop = xla::ConvGeneralDilated( out_backprop, mirrored_weights, /*window_strides=*/ones, padding, - lhs_dilation, rhs_dilation, dnums); + lhs_dilation, rhs_dilation, dnums, + /*feature_group_count=*/ + depthwise_ ? out_backprop_shape.dim_size(feature_dim) / + filter_shape.dim_size(num_spatial_dims_ + 1) + : 1); ctx->SetOutput(0, in_backprop); } @@ -638,8 +611,8 @@ class ConvBackpropFilterOp : public XlaOpKernel { // This is done by specifying the window dilation factors in the // convolution HLO below. 
auto filter_backprop = - b->ConvGeneralDilated(activations, gradients, window_strides, padding, - /*lhs_dilation=*/ones, rhs_dilation, dnums); + xla::ConvGeneralDilated(activations, gradients, window_strides, padding, + /*lhs_dilation=*/ones, rhs_dilation, dnums); if (depthwise_) { filter_backprop = ContractFilterForDepthwiseBackprop( diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc index 7fcd4170fb79a574663c1abffe873d4b53f471d3..db579a5b35d69deb3dca578e31c1b54fada76342 100644 --- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { namespace { @@ -58,21 +59,21 @@ class CrossOp : public XlaOpKernel { auto in1 = ctx->Input(1); starts.back() = 0; limits.back() = 1; - auto u1 = b->Slice(in0, starts, limits, strides); - auto v1 = b->Slice(in1, starts, limits, strides); + auto u1 = xla::Slice(in0, starts, limits, strides); + auto v1 = xla::Slice(in1, starts, limits, strides); starts.back() = 1; limits.back() = 2; - auto u2 = b->Slice(in0, starts, limits, strides); - auto v2 = b->Slice(in1, starts, limits, strides); + auto u2 = xla::Slice(in0, starts, limits, strides); + auto v2 = xla::Slice(in1, starts, limits, strides); starts.back() = 2; limits.back() = 3; - auto u3 = b->Slice(in0, starts, limits, strides); - auto v3 = b->Slice(in1, starts, limits, strides); + auto u3 = xla::Slice(in0, starts, limits, strides); + auto v3 = xla::Slice(in1, starts, limits, strides); - auto s1 = b->Sub(b->Mul(u2, v3), b->Mul(u3, v2)); - auto s2 = b->Sub(b->Mul(u3, v1), b->Mul(u1, v3)); - auto s3 = b->Sub(b->Mul(u1, v2), b->Mul(u2, v1)); - auto output = b->ConcatInDim({s1, s2, s3}, 
in0_shape.dims() - 1); + auto s1 = xla::Sub(xla::Mul(u2, v3), xla::Mul(u3, v2)); + auto s2 = xla::Sub(xla::Mul(u3, v1), xla::Mul(u1, v3)); + auto s3 = xla::Sub(xla::Mul(u1, v2), xla::Mul(u2, v1)); + auto output = xla::ConcatInDim(b, {s1, s2, s3}, in0_shape.dims() - 1); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc index 01aa1a83e7967921f1583b3ef18ec57e452dcfea..ef1015552d181a183d412f9c269dd5ec608b388f 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/util/bcast.h" @@ -96,18 +96,16 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { // First reshape the inputs, which should be a metadata-only // operation since we are flattening the dimensions in order. - auto lhs_shaped = builder->Reshape(lhs, broadcast_helper.x_reshape()); - auto rhs_shaped = builder->Reshape(rhs, broadcast_helper.y_reshape()); + auto lhs_shaped = xla::Reshape(lhs, broadcast_helper.x_reshape()); + auto rhs_shaped = xla::Reshape(rhs, broadcast_helper.y_reshape()); // Next broadcast the necessary input dimensions. We rely on the // XLA optimizer to be smart about the fact that we are asking // it to broadcast size 1 on some of these dimensions, to avoid // adding complexity to this code. 
- auto lhs_broadcast = - builder->Broadcast(lhs_shaped, broadcast_helper.x_bcast()); + auto lhs_broadcast = xla::Broadcast(lhs_shaped, broadcast_helper.x_bcast()); int lhs_size = broadcast_helper.x_bcast().size(); - auto rhs_broadcast = - builder->Broadcast(rhs_shaped, broadcast_helper.y_bcast()); + auto rhs_broadcast = xla::Broadcast(rhs_shaped, broadcast_helper.y_bcast()); int rhs_size = broadcast_helper.y_bcast().size(); // Now reshape them to the correct output shape. After the @@ -122,15 +120,15 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { lhs_reorder.push_back(i); lhs_reorder.push_back(i + lhs_size); } - auto lhs_output = builder->Reshape(lhs_broadcast, lhs_reorder, - broadcast_helper.output_shape()); + auto lhs_output = + xla::Reshape(lhs_broadcast, lhs_reorder, broadcast_helper.output_shape()); std::vector rhs_reorder; for (int i = 0; i < rhs_size; ++i) { rhs_reorder.push_back(i); rhs_reorder.push_back(i + rhs_size); } - auto rhs_output = builder->Reshape(rhs_broadcast, rhs_reorder, - broadcast_helper.output_shape()); + auto rhs_output = + xla::Reshape(rhs_broadcast, rhs_reorder, broadcast_helper.output_shape()); return {lhs_output, rhs_output}; } diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h index 4f92dbc8740b697322424058530b8477c35d809a..6653944a911588b7bc88d67b8cdd2c17850530f0 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/bcast.h" @@ -57,8 +57,8 @@ class XlaBinaryOp : public XlaOpKernel { // in the XLA documentation. 
virtual xla::XlaOp Computation( XlaOpKernelContext* ctx, const xla::XlaOp& lhs, - const gtl::ArraySlice& lhs_shape, const xla::XlaOp& rhs, - const gtl::ArraySlice& rhs_shape, const BCast& broadcast_helper, + const absl::Span& lhs_shape, const xla::XlaOp& rhs, + const absl::Span& rhs_shape, const BCast& broadcast_helper, const std::vector& extend_dimensions) = 0; void Compile(XlaOpKernelContext* ctx) override; diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc index 23243f62462c6315e359d9621823b19fc98c6218..12b0e38288e8f222ed506a75ec2575f27141c859 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { @@ -50,7 +51,6 @@ class DepthToSpaceOp : public XlaOpKernel { const gtl::InlinedVector input_shape = input_tensor_shape.dim_sizes(); - xla::XlaBuilder* b = ctx->builder(); xla::XlaOp input = ctx->Input(0); int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_); @@ -130,7 +130,7 @@ class DepthToSpaceOp : public XlaOpKernel { ") is not divisible by square of the block size (", block_size_, ")")); - xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = xla::Reshape(input, reshaped_shape); // 2. 
Permute dimensions of `reshaped` to produce // `permuted_reshaped` of shape: @@ -141,7 +141,7 @@ class DepthToSpaceOp : public XlaOpKernel { // input_shape[2], // block_size_, // depth / (block_size_ * block_size_)] - xla::XlaOp permuted_reshaped = b->Transpose(reshaped, transpose_order); + xla::XlaOp permuted_reshaped = xla::Transpose(reshaped, transpose_order); // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the // batch dimension, producing an output tensor of shape: @@ -151,7 +151,7 @@ class DepthToSpaceOp : public XlaOpKernel { // input_shape[2] * block_size_, // depth / (block_size_ * block_size_)] // - xla::XlaOp output = b->Reshape(permuted_reshaped, output_shape); + xla::XlaOp output = xla::Reshape(permuted_reshaped, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 931705ba837153e1175cd9a209876ef5ec93f0fc..49c12fc232092873b69961644a059abc6035f64f 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -18,6 +18,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" @@ -25,10 +28,10 @@ namespace tensorflow { namespace { // Create a diagonal / batch diagonal matrix with 'input' on the diagonal. 
-xla::StatusOr CreateDiagonal( - const xla::XlaOp& input, int64 last_dim_size, - tensorflow::gtl::ArraySlice other_dims, XlaOpKernelContext* ctx, - xla::XlaBuilder* builder) { +xla::XlaOp CreateDiagonal(xla::XlaOp input, int64 last_dim_size, + absl::Span other_dims, + xla::PrimitiveType element_type) { + xla::XlaBuilder* builder = input.builder(); // Create two matrices that have the following forms, and compare them: // // [[0, 0, 0, 0] [[0, 1, 2, 3] @@ -38,16 +41,14 @@ xla::StatusOr CreateDiagonal( // // This produces a predicate matrix of the right size, with "true" on the // diagonal. - xla::XlaOp iota; - TF_RETURN_IF_ERROR( - XlaHelpers::Iota(builder, DataType::DT_INT32, last_dim_size, &iota)); - xla::XlaOp iota_broadcast = builder->Broadcast(iota, {last_dim_size}); - xla::XlaOp mask = builder->Eq(iota_broadcast, iota, {0}); + xla::XlaOp iota = xla::Iota(builder, xla::S32, last_dim_size); + xla::XlaOp iota_broadcast = xla::Broadcast(iota, {last_dim_size}); + xla::XlaOp mask = xla::Eq(iota_broadcast, iota, {0}); // If this is a batched diagonal, broadcast the mask across the other // dimensions. 
if (!other_dims.empty()) { - mask = builder->Broadcast(mask, other_dims); + mask = xla::Broadcast(mask, other_dims); } // Broadcast the input, and then use the mask computed above to select the @@ -64,18 +65,15 @@ xla::StatusOr CreateDiagonal( std::vector broadcast_dims(other_dims.begin(), other_dims.end()); broadcast_dims.push_back(1LL); broadcast_dims.push_back(last_dim_size); - xla::XlaOp input_broadcast = builder->Reshape(input, broadcast_dims); + xla::XlaOp input_broadcast = xla::Reshape(input, broadcast_dims); broadcast_dims[broadcast_dims.size() - 2] = last_dim_size; - xla::PrimitiveType element_type; - TF_RETURN_IF_ERROR( - DataTypeToPrimitiveType(ctx->input_type(0), &element_type)); auto broadcast_shape = xla::ShapeUtil::MakeShape(element_type, broadcast_dims); - xla::XlaOp zeros = Zeros(builder, broadcast_shape); + xla::XlaOp zeros = xla::Zeros(builder, broadcast_shape); - input_broadcast = builder->Add(input_broadcast, zeros); - return builder->Select(mask, input_broadcast, zeros); + input_broadcast = xla::Add(input_broadcast, zeros); + return xla::Select(mask, input_broadcast, zeros); } class DiagOp : public XlaOpKernel { @@ -83,8 +81,6 @@ class DiagOp : public XlaOpKernel { explicit DiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); - OP_REQUIRES(ctx, ctx->num_inputs() >= 1, errors::InvalidArgument("Diag op must have at an input")); const TensorShape input_shape = ctx->InputShape(0); @@ -104,19 +100,17 @@ class DiagOp : public XlaOpKernel { // Flattens the input to 1D. int64 size = input_shape.num_elements(); - input = builder->Reshape(input, {size}); + input = xla::Reshape(input, {size}); // Create an R2 with the R1 diagonal. 
- auto diag_or_status = - CreateDiagonal(input, size, /*other_dims=*/{}, ctx, builder); - OP_REQUIRES_OK(ctx, diag_or_status.status()); - xla::XlaOp diag = diag_or_status.ValueOrDie(); + xla::XlaOp diag = + CreateDiagonal(input, size, /*other_dims=*/{}, ctx->input_xla_type(0)); // Reshapes to the final shape. std::vector new_dims(dims.size() * 2); std::copy(dims.begin(), dims.end(), new_dims.begin()); std::copy(dims.begin(), dims.end(), new_dims.begin() + dims.size()); - diag = builder->Reshape(diag, new_dims); + diag = xla::Reshape(diag, new_dims); ctx->SetOutput(0, diag); } @@ -129,8 +123,6 @@ class DiagPartOp : public XlaOpKernel { explicit DiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); - const TensorShape input_shape = ctx->InputShape(0); auto dims = input_shape.dim_sizes(); @@ -156,37 +148,13 @@ class DiagPartOp : public XlaOpKernel { new_dims.push_back(dims[i]); } - xla::XlaOp diag = ctx->Input(0); - - // TODO(b/30878775): use Slice with strides when supported, in place of - // the Pad -> Reshape -> Slice. - - // Picture: - // [[1, 0, 0, 0] pad and reshape to [[1, 0, 0, 0, 0], - // [0, 2, 0, 0] =================> [2, 0, 0, 0, 0], - // [0, 0, 3, 0] [3, 0, 0, 0, 0], - // [0, 0, 0, 4]] [4, 0, 0, 0, 0]] - // and then slice out the first column. - - // Flattens the input to 1D. - int64 size = input_shape.num_elements(); - diag = builder->Reshape(diag, {size}); - - // Adds padding after the last element of 'new_size'. - xla::PaddingConfig config; - auto* dim = config.add_dimensions(); - dim->set_edge_padding_high(new_size); - auto zero = XlaHelpers::Zero(builder, input_type(0)); - diag = builder->Pad(diag, zero, config); - - // Reshapes so the diagonal is now in the first column. - diag = builder->Reshape(diag, {new_size, new_size + 1}); + xla::XlaOp input = ctx->Input(0); - // Slices out the first column and reshapes to the final shape. 
- diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1}); - diag = builder->Reshape(diag, new_dims); + xla::XlaOp output = xla::Reshape( + xla::GetMatrixDiagonal(xla::Reshape(input, {new_size, new_size})), + new_dims); - ctx->SetOutput(0, diag); + ctx->SetOutput(0, output); } }; @@ -197,8 +165,6 @@ class MatrixDiagOp : public XlaOpKernel { explicit MatrixDiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); - OP_REQUIRES(ctx, ctx->num_inputs() >= 1, errors::InvalidArgument("MatrixDiag op must have at an input")); const TensorShape input_shape = ctx->InputShape(0); @@ -208,17 +174,15 @@ class MatrixDiagOp : public XlaOpKernel { errors::InvalidArgument("Expected 1 <= dims, got shape ", input_shape.DebugString())); - xla::XlaOp diag = ctx->Input(0); int last_dim = dims.size() - 1; int64 last_dim_size = input_shape.dim_size(last_dim); - tensorflow::gtl::ArraySlice other_dims(dims); - other_dims.pop_back(); + absl::Span other_dims(dims); + other_dims.remove_suffix(1); - auto diag_or_status = - CreateDiagonal(diag, last_dim_size, other_dims, ctx, builder); - OP_REQUIRES_OK(ctx, diag_or_status.status()); - diag = diag_or_status.ValueOrDie(); + xla::XlaOp input = ctx->Input(0); + xla::XlaOp diag = CreateDiagonal(input, last_dim_size, other_dims, + ctx->input_xla_type(0)); ctx->SetOutput(0, diag); } }; @@ -230,8 +194,6 @@ class MatrixDiagPartOp : public XlaOpKernel { explicit MatrixDiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* builder = ctx->builder(); - const TensorShape input_shape = ctx->InputShape(0); auto dims = input_shape.dim_sizes(); @@ -239,71 +201,8 @@ class MatrixDiagPartOp : public XlaOpKernel { errors::InvalidArgument("Expected 2 <= dims, got shape ", input_shape.DebugString())); - xla::XlaOp diag = ctx->Input(0); - - int last_dim = dims.size() - 1; - int64 last_dim_size = 
dims[last_dim]; - - // The smaller of the last two dimension sizes. - int64 smaller_dim_size = std::min(dims[last_dim - 1], dims[last_dim]); - - // TODO(b/30878775): use Slice with strides when supported, in place of - // the Pad -> Reshape -> Slice. - - // Picture: for each 2D matrix in the tensor's last two dimensions: - // [[1, 0, 0, 0] pad and reshape to [[1, 0, 0, 0, 0], - // [0, 2, 0, 0] =================> [2, 0, 0, 0, 0], - // [0, 0, 3, 0]] [3, 0, 0, 0, 0], - // and then slice out the first column. - // - // Another example, with tall and narrow input. - // [[1, 0] pad and reshape to [[1, 0, 0], - // [0, 2] =================> [2, 0, 0]] - // [0, 0] - // [0, 0]] - - // Collapses the last two dimensions. - std::vector flattened_dims(dims.begin(), dims.end() - 1); - flattened_dims.back() *= dims.back(); - diag = builder->Reshape(diag, flattened_dims); - - // Slices or pads the last dimension to 'target_size'. - int64 actual_size = flattened_dims.back(); - int64 target_size = smaller_dim_size * (last_dim_size + 1); - if (actual_size < target_size) { - xla::PaddingConfig config = - xla::MakeNoPaddingConfig(flattened_dims.size()); - auto* dim = config.mutable_dimensions(flattened_dims.size() - 1); - dim->set_edge_padding_high(target_size - actual_size); - auto zero = XlaHelpers::Zero(builder, input_type(0)); - diag = builder->Pad(diag, zero, config); - } else if (actual_size > target_size) { - std::vector start(flattened_dims.size(), 0); - std::vector limits(flattened_dims.begin(), flattened_dims.end()); - std::vector strides(flattened_dims.size(), 1); - limits[flattened_dims.size() - 1] = target_size; - diag = builder->Slice(diag, start, limits, strides); - } - - // Reshape so the target values are in the first position of the last - // dimension. 
- std::vector unflattened_dims(dims.begin(), dims.end()); - dims[last_dim - 1] = smaller_dim_size; - dims[last_dim] = last_dim_size + 1; - diag = builder->Reshape(diag, dims); - - // Slices out the first column and reshapes to the final shape. - std::vector start(dims.size(), 0); - std::vector limits(dims.begin(), dims.end()); - std::vector strides(dims.size(), 1); - limits[last_dim] = 1; - diag = builder->Slice(diag, start, limits, strides); - - // Collapses away the last dimension. - dims.pop_back(); - diag = builder->Reshape(diag, dims); - - ctx->SetOutput(0, diag); + xla::XlaOp input = ctx->Input(0); + ctx->SetOutput(0, xla::GetMatrixDiagonal(input)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index 0419de78b2ee83fd395e8bf23444fde84f30bba2..a3389d5b905bf3ee15744ab4fcee193d312e2ae0 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -57,8 +57,8 @@ class DynamicUpdateSliceOp : public XlaOpKernel { input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); - xla::XlaOp result = ctx->builder()->DynamicUpdateSlice( - ctx->Input(0), ctx->Input(1), ctx->Input(2)); + xla::XlaOp result = + xla::DynamicUpdateSlice(ctx->Input(0), ctx->Input(1), ctx->Input(2)); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index dd4a16908779508380b36f43ce2306ff2f5fb8c4..cb73053666d4c32bc0a2ef19b174aee1a29f101e 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -150,8 +151,7 @@ class DynamicStitchOp : public XlaOpKernel { if (new_shape == data_shapes[input_num]) { input[input_num] = handle; } else { - input[input_num] = - ctx->builder()->Reshape(handle, new_shape.dim_sizes()); + input[input_num] = xla::Reshape(handle, new_shape.dim_sizes()); } } @@ -175,10 +175,10 @@ class DynamicStitchOp : public XlaOpKernel { // And place it in the concat list in the place indicated by // the index. 
to_concat[index_num] = - ctx->builder()->Slice(expression, slice_start, slice_limit, stride); + xla::Slice(expression, slice_start, slice_limit, stride); } - ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0)); + ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), to_concat, 0)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc index 493781a1e68b8906f1a7e018e5710130e2eb08b5..5fdb1d972c55efb876972d3f472b53a1f7cde1c2 100644 --- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc @@ -18,8 +18,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/no_op.h" @@ -34,9 +34,9 @@ class EluOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::XlaBuilder* b = ctx->builder(); const auto zero = XlaHelpers::Zero(b, input_type(0)); - const auto pred = b->Gt(ctx->Input(0), zero); - const auto expm1 = b->Expm1(ctx->Input(0)); - ctx->SetOutput(0, b->Select(pred, ctx->Input(0), expm1)); + const auto pred = xla::Gt(ctx->Input(0), zero); + const auto expm1 = xla::Expm1(ctx->Input(0)); + ctx->SetOutput(0, xla::Select(pred, ctx->Input(0), expm1)); } }; @@ -51,9 +51,9 @@ class EluGradOp : public XlaOpKernel { const auto one = XlaHelpers::One(b, input_type(0)); const auto grad = ctx->Input(0); const auto activation = ctx->Input(1); - const auto exp_grad = b->Mul(grad, b->Add(activation, one)); - const auto pred = b->Gt(activation, zero); - 
ctx->SetOutput(0, b->Select(pred, grad, exp_grad)); + const auto exp_grad = xla::Mul(grad, xla::Add(activation, one)); + const auto pred = xla::Gt(activation, zero); + ctx->SetOutput(0, xla::Select(pred, grad, exp_grad)); } }; @@ -71,10 +71,10 @@ class SeluOp : public XlaOpKernel { 1.0507009873554804934193349852946); const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0), 1.7580993408473768599402175208123); - const auto pred = b->Gt(ctx->Input(0), zero); - const auto expm1 = b->Expm1(ctx->Input(0)); - ctx->SetOutput(0, b->Select(pred, b->Mul(scale, ctx->Input(0)), - b->Mul(scale_alpha, expm1))); + const auto pred = xla::Gt(ctx->Input(0), zero); + const auto expm1 = xla::Expm1(ctx->Input(0)); + ctx->SetOutput(0, xla::Select(pred, xla::Mul(scale, ctx->Input(0)), + xla::Mul(scale_alpha, expm1))); } }; @@ -92,10 +92,10 @@ class SeluGradOp : public XlaOpKernel { 1.7580993408473768599402175208123); const auto grad = ctx->Input(0); const auto activation = ctx->Input(1); - const auto lin_grad = b->Mul(grad, scale); - const auto exp_grad = b->Mul(grad, b->Add(activation, scale_alpha)); - const auto pred = b->Gt(activation, zero); - ctx->SetOutput(0, b->Select(pred, lin_grad, exp_grad)); + const auto lin_grad = xla::Mul(grad, scale); + const auto exp_grad = xla::Mul(grad, xla::Add(activation, scale_alpha)); + const auto pred = xla::Gt(activation, zero); + ctx->SetOutput(0, xla::Select(pred, lin_grad, exp_grad)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index 6df01cabbf1d98c0299bfd808bcc6db6223c4777..c68b0bfd7961892294c2931e5c4c44de534a7740 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { @@ -110,13 +112,11 @@ class ExtractImagePatchesOp : public XlaOpKernel { // Builds an identity matrix as a broadcast equality of iotas. // iota = np.arange(np.prod(ksize), depth) // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32) - xla::XlaOp iota; - TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, - kernel_size * depth, &iota)); + xla::XlaOp iota = xla::Iota(builder, xla::S32, kernel_size * depth); - auto lhs = builder->Reshape(iota, lhs_shape); - auto filter = builder->ConvertElementType( - builder->Eq(lhs, iota, {num_spatial_dims + 1}), type); + auto lhs = xla::Reshape(iota, lhs_shape); + auto filter = xla::ConvertElementType( + xla::Eq(lhs, iota, {num_spatial_dims + 1}), type); xla::ConvolutionDimensionNumbers dims; std::vector window_strides(num_spatial_dims); @@ -148,8 +148,8 @@ class ExtractImagePatchesOp : public XlaOpKernel { } xla::XlaOp conv = - builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides, - padding, lhs_dilation, rhs_dilation, dims); + xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, + lhs_dilation, rhs_dilation, dims); ctx->SetOutput(0, conv); } diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc index 8f0de0a524c908b598c1a2165a462275346ad137..cdba6680dee3fade5bdf0c453ed672b653072b0d 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { @@ -49,20 +50,20 @@ void XlaNudge(xla::XlaBuilder* b, const DataType data_type, const float quant_min_value, const float quant_max_value, xla::XlaOp* nudged_min, xla::XlaOp* nudged_max, xla::XlaOp* scale) { - *scale = b->Div(b->Sub(max, min), - XlaHelpers::FloatLiteral(b, data_type, - quant_max_value - quant_min_value)); + *scale = xla::Div(xla::Sub(max, min), + XlaHelpers::FloatLiteral( + b, data_type, quant_max_value - quant_min_value)); xla::XlaOp quant_min = XlaHelpers::FloatLiteral(b, data_type, quant_min_value); - xla::XlaOp zero_point_from_min = b->Sub(quant_min, b->Div(min, *scale)); + xla::XlaOp zero_point_from_min = xla::Sub(quant_min, xla::Div(min, *scale)); xla::XlaOp quant_max = XlaHelpers::FloatLiteral(b, data_type, quant_max_value); xla::XlaOp nudged_zero_point = - b->Select(b->Le(zero_point_from_min, quant_min), quant_min, - b->Select(b->Ge(zero_point_from_min, quant_max), quant_max, - b->Round(zero_point_from_min))); - *nudged_min = b->Mul(b->Sub(quant_min, nudged_zero_point), *scale); - *nudged_max = b->Mul(b->Sub(quant_max, nudged_zero_point), *scale); + xla::Select(xla::Le(zero_point_from_min, quant_min), quant_min, + xla::Select(xla::Ge(zero_point_from_min, quant_max), + quant_max, xla::Round(zero_point_from_min))); + *nudged_min = xla::Mul(xla::Sub(quant_min, nudged_zero_point), *scale); + *nudged_max = xla::Mul(xla::Sub(quant_max, nudged_zero_point), *scale); } xla::XlaOp Quantize(xla::XlaBuilder* b, const xla::XlaOp& input, @@ -71,14 +72,14 @@ xla::XlaOp Quantize(xla::XlaBuilder* b, const xla::XlaOp& input, const xla::XlaOp& nudged_input_max, const xla::XlaOp& input_scale) { xla::XlaOp one = XlaHelpers::FloatLiteral(b, data_type, 1.0f); - xla::XlaOp 
inv_scale = b->Div(one, input_scale); + xla::XlaOp inv_scale = xla::Div(one, input_scale); xla::XlaOp half = XlaHelpers::FloatLiteral(b, data_type, 0.5f); - xla::XlaOp clamped = b->Clamp(nudged_input_min, input, nudged_input_max); - xla::XlaOp clamped_shifted = b->Sub(clamped, nudged_input_min); + xla::XlaOp clamped = xla::Clamp(nudged_input_min, input, nudged_input_max); + xla::XlaOp clamped_shifted = xla::Sub(clamped, nudged_input_min); xla::XlaOp rounded = - b->Floor(b->Add(b->Mul(clamped_shifted, inv_scale), half)); - return b->Add(b->Mul(rounded, input_scale), nudged_input_min); + xla::Floor(xla::Add(xla::Mul(clamped_shifted, inv_scale), half)); + return xla::Add(xla::Mul(rounded, input_scale), nudged_input_min); } class FakeQuantWithMinMaxArgsOp : public XlaOpKernel { @@ -163,11 +164,11 @@ class FakeQuantWithMinMaxArgsGradOp : public XlaOpKernel { xla::XlaOp nudged_input_max = XlaHelpers::FloatLiteral(b, data_type, nudged_input_max_); - xla::XlaOp between_nudged_min_max = - b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max)); - xla::XlaOp zeroes = b->Broadcast(XlaHelpers::Zero(b, data_type), - gradient_shape.dim_sizes()); - xla::XlaOp output = b->Select(between_nudged_min_max, gradient, zeroes); + xla::XlaOp between_nudged_min_max = xla::And( + xla::Le(nudged_input_min, input), xla::Le(input, nudged_input_max)); + xla::XlaOp zeroes = xla::Broadcast(XlaHelpers::Zero(b, data_type), + gradient_shape.dim_sizes()); + xla::XlaOp output = xla::Select(between_nudged_min_max, gradient, zeroes); ctx->SetOutput(0, output); } @@ -249,25 +250,25 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel { XlaNudge(b, data_type, input_min, input_max, quant_min_, quant_max_, &nudged_input_min, &nudged_input_max, &input_scale); - xla::XlaOp between_nudged_min_max = - b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max)); + xla::XlaOp between_nudged_min_max = xla::And( + xla::Le(nudged_input_min, input), xla::Le(input, nudged_input_max)); 
xla::XlaOp zero = XlaHelpers::Zero(b, data_type); - xla::XlaOp zeroes = b->Broadcast(zero, gradient_shape.dim_sizes()); - xla::XlaOp output0 = b->Select(between_nudged_min_max, gradient, zeroes); + xla::XlaOp zeroes = xla::Broadcast(zero, gradient_shape.dim_sizes()); + xla::XlaOp output0 = xla::Select(between_nudged_min_max, gradient, zeroes); ctx->SetOutput(0, output0); - xla::XlaOp below_min = b->Lt(input, nudged_input_min); - xla::XlaOp select1 = b->Select(below_min, gradient, zeroes); - xla::XlaOp reduce1 = b->ReduceAll( + xla::XlaOp below_min = xla::Lt(input, nudged_input_min); + xla::XlaOp select1 = xla::Select(below_min, gradient, zeroes); + xla::XlaOp reduce1 = xla::ReduceAll( XlaHelpers::ConvertElementType(b, select1, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); xla::XlaOp output1 = XlaHelpers::ConvertElementType(b, reduce1, data_type); ctx->SetOutput(1, output1); - xla::XlaOp above_max = b->Gt(input, nudged_input_max); - xla::XlaOp select2 = b->Select(above_max, gradient, zeroes); - xla::XlaOp reduce2 = b->ReduceAll( + xla::XlaOp above_max = xla::Gt(input, nudged_input_max); + xla::XlaOp select2 = xla::Select(above_max, gradient, zeroes); + xla::XlaOp reduce2 = xla::ReduceAll( XlaHelpers::ConvertElementType(b, select2, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc index 933924cad1c7cac2879bd4720cb21ffc33c23f50..80bcef966360ec9a1ca63a02741108ce41b31846 100644 --- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -62,8 +63,7 @@ class GenericFftOp : public XlaOpKernel { } } - xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp fft = b->Fft(ctx->Input(0), fft_type_, fft_length); + xla::XlaOp fft = xla::Fft(ctx->Input(0), fft_type_, fft_length); ctx->SetOutput(0, fft); } diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index e4467a0fb138ed7919af62ed032c0f5abee3e4f6..54b21a278229024e3e54e9135548be6b69b077e1 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" @@ -59,11 +60,11 @@ class FillOp : public XlaOpKernel { xla::XlaOp data = ctx->Input(1); if (value_shape.dims() > 0) { CHECK_EQ(value_shape.dims(), 1); - data = ctx->builder()->Reshape(data, {}); + data = xla::Reshape(data, {}); } // Emit the actual computation, which broadcasts the scalar to the // desired shape. 
- auto result = ctx->builder()->Broadcast(data, broadcast); + auto result = xla::Broadcast(data, broadcast); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index d13e25bcddae16d0cd630403219657121b80868d..44140304fdf5cdf60d8ad8b85c532fcadff8ba86 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -75,8 +76,8 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, out_shape.AppendShape(indices_shape_no_index_vectors); out_shape.AppendShape(input_shape_post_axis); - *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype), - out_shape.dim_sizes()); + *gather_output = + xla::Broadcast(XlaHelpers::Zero(builder, dtype), out_shape.dim_sizes()); return Status::OK(); } @@ -94,11 +95,11 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, // operand = s32[3,3] parameter(0) // indices = s32[2] parameter(1) // gather = s32[3,2] gather(operand, indices), - // output_window_dims={0}, - // elided_window_dims={1}, - // gather_dims_to_operand_dims={1}, + // offset_dims={0}, + // collapsed_slice_dims={1}, + // start_index_map={1}, // index_vector_dim=1, - // window_bounds={3, 1} + // slice_sizes={3, 1} // // // Example of an N-D gather pulling out slices of shape [1,1,2] out of a @@ -107,42 +108,42 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, // operand = s32[3,3,2] parameter(0) // indices = s32[2,2] parameter(1) // gather = s32[2,2] gather(operand, indices), - // 
output_window_dims={1}, - // elided_window_dims={0,1}, - // gather_dims_to_operand_dims={0,1}, + // offset_dims={1}, + // collapsed_slice_dims={0,1}, + // start_index_map={0,1}, // index_vector_dim=0, - // window_bounds={1,1,2} + // slice_sizes={1,1,2} xla::GatherDimensionNumbers dim_numbers; - std::vector window_bounds; - window_bounds.reserve(input_shape.dims()); + std::vector slice_sizes; + slice_sizes.reserve(input_shape.dims()); for (int64 i = 0; i < input_shape.dims(); i++) { int64 window_bound; if (axis <= i && i < (axis + num_index_dims)) { - dim_numbers.add_elided_window_dims(i); + dim_numbers.add_collapsed_slice_dims(i); window_bound = 1; } else { window_bound = input_shape.dim_size(i); } - window_bounds.push_back(window_bound); + slice_sizes.push_back(window_bound); if (i < axis) { - dim_numbers.add_output_window_dims(i); + dim_numbers.add_offset_dims(i); } else if (i >= (axis + num_index_dims)) { int64 indices_rank = indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims(); - dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims); + dim_numbers.add_offset_dims(i + indices_rank - num_index_dims); } } dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims()); for (int64 i = axis; i < axis + num_index_dims; i++) { - dim_numbers.add_gather_dims_to_operand_dims(i); + dim_numbers.add_start_index_map(i); } - *gather_output = builder->Gather(input, indices, dim_numbers, window_bounds); + *gather_output = xla::Gather(input, indices, dim_numbers, slice_sizes); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h index d898e43b858bac706d524c7c271f48b1b5fa258f..92346283c31dfe1d638526ac4b26ef762cd7fd14 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -20,7 +20,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/bcast.h" diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc index e72200bfbcff20c55ac03030f1afc4bacaabf7ce..19dd38c46ef154ea74bcbb6721dd04924702efcc 100644 --- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc @@ -25,7 +25,10 @@ class IdentityOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { for (int i = 0; i < ctx->num_inputs(); ++i) { - ctx->SetOutput(i, ctx->Input(i)); + // Forwards using the underlying op_kernel_context so both tensor and + // resource values are forwarded correctly. + ctx->op_kernel_context()->set_output(i, + ctx->op_kernel_context()->input(i)); } } @@ -35,9 +38,10 @@ class IdentityOp : public XlaOpKernel { // XLA_* devices also register a "real" Identity operator so we suppress the // dummy operator using CompilationOnly(). 
-REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp); - -REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp); +REGISTER_XLA_OP(Name("Identity").AllowResourceTypes().CompilationOnly(), + IdentityOp); +REGISTER_XLA_OP(Name("IdentityN").AllowResourceTypes().CompilationOnly(), + IdentityOp); REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp); REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp); REGISTER_XLA_OP(Name("StopGradient"), IdentityOp); diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 8b9b026643cf35216a2082dfcce9270c017bd14f..6e1dbf5472f0b1eb0abcbe29c553ae926ecf2d8a 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { @@ -48,11 +49,11 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { VLOG(1) << "Building If: " << input_types_.size() << " inputs"; - std::vector inputs(input_types_.size()); std::vector arguments(input_types_.size()); for (int i = 0; i < input_types_.size(); ++i) { XlaCompiler::Argument& arg = arguments[i]; DataType type = ctx->input_type(i + 1); + if (type == DT_RESOURCE) { XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource)); @@ -60,7 +61,6 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { arg.initialized = resource->initialized(); arg.kind = XlaCompiler::Argument::kResource; arg.resource_kind = resource->kind(); - OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b)); arg.type = resource->type(); arg.shape = resource->shape(); @@ -79,7 +79,6 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { arg.kind = XlaCompiler::Argument::kParameter; arg.type = input_types_[i]; arg.shape = 
ctx->InputShape(i + 1); - inputs[i] = ctx->Input(i + 1); VLOG(2) << "Arg type: " << DataTypeString(arg.type) << " shape: " << arg.shape.DebugString(); } @@ -100,6 +99,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_, arguments, &else_result)); + bool has_tensor_array_gradients = false; for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) { for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) { XlaResource* resource; @@ -121,9 +121,21 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { for (const auto& gradient : resource->tensor_array_gradients()) { arg.tensor_array_gradients.insert(gradient.first); } + if (!resource->tensor_array_gradients().empty()) + has_tensor_array_gradients = true; } } + // Recompile the functions to update the argument shapes for tensor arrays. + if (has_tensor_array_gradients) { + then_result = {}; + OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, then_branch_, + arguments, &then_result)); + else_result = {}; + OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_, + arguments, &else_result)); + } + // Check that both branches have identical input shapes. 
OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1, errors::FailedPrecondition("Expected one input shape")); @@ -175,25 +187,37 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { "Mismatch in resource of then and else branch for resource ", i)); } + int num_inputs = then_result.input_mapping.size(); + std::vector inputs(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + int input_num = then_result.input_mapping[i] + 1; + if (ctx->input_type(input_num) == DT_RESOURCE) { + XlaResource* resource; + OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource)); + OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b)); + } else { + inputs[i] = ctx->Input(i + 1); + } + } + + auto input_tuple = xla::Tuple(b, inputs); xla::XlaOp outputs = - b->Conditional(ctx->Input(0), b->Tuple(inputs), *then_result.computation, - b->Tuple(inputs), *else_result.computation); + xla::Conditional(ctx->Input(0), input_tuple, *then_result.computation, + input_tuple, *else_result.computation); // Sets non-variable outputs. 
for (int i = 0; i < output_types_.size(); ++i) { - if (ctx->input_type(i) != DT_RESOURCE) { - xla::XlaOp output_handle = b->GetTupleElement(outputs, i); - if (VLOG_IS_ON(2)) { - LOG(INFO) << "Setting output " << i; - auto shape_or = b->GetShape(output_handle); - if (shape_or.ok()) { - LOG(INFO) << "Shape for output " << i << ": " - << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); - } else { - LOG(INFO) << "Shape unknown for output " << i; - } + xla::XlaOp output_handle = xla::GetTupleElement(outputs, i); + if (VLOG_IS_ON(2)) { + LOG(INFO) << "Setting output " << i; + auto shape_or = b->GetShape(output_handle); + if (shape_or.ok()) { + LOG(INFO) << "Shape for output " << i << ": " + << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); + } else { + LOG(INFO) << "Shape unknown for output " << i; } - ctx->SetOutput(i, output_handle); } + ctx->SetOutput(i, output_handle); } // Updates the values of any resource variables modified by the conditional @@ -209,7 +233,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, resource->SetFromPack( arguments[update.input_index].tensor_array_gradients, - b->GetTupleElement(outputs, pos), b)); + xla::GetTupleElement(outputs, pos), b)); } VLOG(2) << "If variable: pos: " << update.input_index << " name: " << resource->name() @@ -221,6 +245,8 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { VLOG(1) << "Done building If"; } +REGISTER_XLA_OP(Name("If").AllowResourceTypes(), XlaIfOp); +REGISTER_XLA_OP(Name("StatelessIf").AllowResourceTypes(), XlaIfOp); REGISTER_XLA_OP(Name("XlaIf").AllowResourceTypes(), XlaIfOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc index 1568b33679963c1a6630525f60560180d40b8d53..33a73fe5fdf403e513be085dd7bcea3255277b4a 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc @@ -17,6 +17,12 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { namespace { @@ -32,23 +38,26 @@ std::array RGBToHSV(XlaOpKernelContext* ctx, xla::XlaBuilder* b, auto red = rgb[0]; auto green = rgb[1]; auto blue = rgb[2]; - auto value = b->Max(b->Max(red, green), blue); - auto minimum = b->Min(b->Min(red, green), blue); - auto range = b->Sub(value, minimum); - - auto zeros = b->Broadcast(zero, shape.dim_sizes()); - auto saturation = b->Select(b->Gt(value, zero), b->Div(range, value), zeros); - - auto norm = b->Div(XlaHelpers::FloatLiteral(b, dtype, 1.0 / 6.0), range); - - auto hue = b->Select(b->Eq(green, value), - b->Add(b->Mul(norm, b->Sub(blue, red)), - XlaHelpers::FloatLiteral(b, dtype, 2.0 / 6.0)), - b->Add(b->Mul(norm, b->Sub(red, green)), - XlaHelpers::FloatLiteral(b, dtype, 4.0 / 6.0))); - hue = b->Select(b->Eq(red, value), b->Mul(norm, b->Sub(green, blue)), hue); - hue = b->Select(b->Gt(range, zero), hue, zeros); - hue = b->Select(b->Lt(hue, zero), b->Add(hue, one), hue); + auto value = xla::Max(xla::Max(red, green), blue); + auto minimum = xla::Min(xla::Min(red, green), blue); + auto range = xla::Sub(value, minimum); + + auto zeros = xla::Broadcast(zero, shape.dim_sizes()); + auto saturation = + xla::Select(xla::Gt(value, zero), xla::Div(range, value), zeros); + + auto norm = xla::Div(XlaHelpers::FloatLiteral(b, dtype, 1.0 / 6.0), range); + + auto hue = + xla::Select(xla::Eq(green, value), + xla::Add(xla::Mul(norm, xla::Sub(blue, red)), + XlaHelpers::FloatLiteral(b, dtype, 2.0 / 6.0)), + 
xla::Add(xla::Mul(norm, xla::Sub(red, green)), + XlaHelpers::FloatLiteral(b, dtype, 4.0 / 6.0))); + hue = xla::Select(xla::Eq(red, value), xla::Mul(norm, xla::Sub(green, blue)), + hue); + hue = xla::Select(xla::Gt(range, zero), hue, zeros); + hue = xla::Select(xla::Lt(hue, zero), xla::Add(hue, one), hue); return {hue, saturation, value}; } @@ -66,15 +75,15 @@ std::array HSVToRGB(xla::XlaBuilder* b, auto four = XlaHelpers::FloatLiteral(b, dtype, 4.0); auto six = XlaHelpers::FloatLiteral(b, dtype, 6.0); - auto dh = b->Mul(hue, six); - auto dr = b->Clamp(zero, b->Sub(b->Abs(b->Sub(dh, three)), one), one); - auto dg = b->Clamp(zero, b->Sub(two, b->Abs(b->Sub(dh, two))), one); - auto db = b->Clamp(zero, b->Sub(two, b->Abs(b->Sub(dh, four))), one); - auto one_minus_s = b->Sub(one, saturation); + auto dh = xla::Mul(hue, six); + auto dr = xla::Clamp(zero, xla::Sub(xla::Abs(xla::Sub(dh, three)), one), one); + auto dg = xla::Clamp(zero, xla::Sub(two, xla::Abs(xla::Sub(dh, two))), one); + auto db = xla::Clamp(zero, xla::Sub(two, xla::Abs(xla::Sub(dh, four))), one); + auto one_minus_s = xla::Sub(one, saturation); - auto red = b->Mul(b->Add(one_minus_s, b->Mul(saturation, dr)), value); - auto green = b->Mul(b->Add(one_minus_s, b->Mul(saturation, dg)), value); - auto blue = b->Mul(b->Add(one_minus_s, b->Mul(saturation, db)), value); + auto red = xla::Mul(xla::Add(one_minus_s, xla::Mul(saturation, dr)), value); + auto green = xla::Mul(xla::Add(one_minus_s, xla::Mul(saturation, dg)), value); + auto blue = xla::Mul(xla::Add(one_minus_s, xla::Mul(saturation, db)), value); return {red, green, blue}; } @@ -97,21 +106,21 @@ class RGBToHSVOp : public XlaOpKernel { xla::XlaBuilder* b = context->builder(); xla::XlaOp input = context->Input(0); - xla::XlaOp red = - b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp green = - b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, - /*dimno=*/channel_dim); - 
xla::XlaOp blue = - b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, - /*dimno=*/channel_dim); + xla::XlaOp red = xla::SliceInDim(input, /*start_index=*/0, + /*limit_index=*/1, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp green = xla::SliceInDim(input, /*start_index=*/1, + /*limit_index=*/2, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp blue = xla::SliceInDim(input, /*start_index=*/2, + /*limit_index=*/3, /*stride=*/1, + /*dimno=*/channel_dim); TensorShape channel_shape = input_shape; channel_shape.set_dim(channel_dim, 1); auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0), channel_shape); - context->SetOutput(0, b->ConcatInDim(hsv, channel_dim)); + context->SetOutput(0, xla::ConcatInDim(b, hsv, channel_dim)); } }; REGISTER_XLA_OP(Name("RGBToHSV"), RGBToHSVOp); @@ -134,20 +143,20 @@ class HSVToRGBOp : public XlaOpKernel { xla::XlaBuilder* b = context->builder(); xla::XlaOp input = context->Input(0); - xla::XlaOp hue = - b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp saturation = - b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp value = - b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, - /*dimno=*/channel_dim); + xla::XlaOp hue = xla::SliceInDim(input, /*start_index=*/0, + /*limit_index=*/1, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp saturation = xla::SliceInDim(input, /*start_index=*/1, + /*limit_index=*/2, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp value = xla::SliceInDim(input, /*start_index=*/2, + /*limit_index=*/3, /*stride=*/1, + /*dimno=*/channel_dim); auto rgb = HSVToRGB(context->builder(), {hue, saturation, value}, context->input_type(0)); - context->SetOutput(0, b->ConcatInDim(rgb, channel_dim)); + context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim)); } }; REGISTER_XLA_OP(Name("HSVToRGB"), HSVToRGBOp); @@ -182,18 
+191,20 @@ class AdjustContrastOpV2 : public XlaOpKernel { const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); auto converted = XlaHelpers::ConvertElementType(b, input, accumulation_type); - auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *context->GetOrCreateAdd(accumulation_type), - {height_dim, width_dim}); + auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), + *context->GetOrCreateAdd(accumulation_type), + {height_dim, width_dim}); auto output = XlaHelpers::ConvertElementType(b, reduce, type); - output = b->Div(output, XlaHelpers::FloatLiteral(b, type, height * width)); + output = + xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width)); std::vector broadcast_dims(input_shape.dims() - 2); std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0); broadcast_dims.back() = channel_dim; - output = b->Add(b->Mul(input, factor), - b->Mul(output, b->Sub(XlaHelpers::One(b, type), factor)), - broadcast_dims); + output = + xla::Add(xla::Mul(input, factor), + xla::Mul(output, xla::Sub(XlaHelpers::One(b, type), factor)), + broadcast_dims); context->SetOutput(0, output); } }; @@ -226,26 +237,26 @@ class AdjustSaturationOp : public XlaOpKernel { DataType type = context->input_type(0); - xla::XlaOp red = - b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp green = - b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp blue = - b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, - /*dimno=*/channel_dim); + xla::XlaOp red = xla::SliceInDim(input, /*start_index=*/0, + /*limit_index=*/1, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp green = xla::SliceInDim(input, /*start_index=*/1, + /*limit_index=*/2, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp blue = xla::SliceInDim(input, /*start_index=*/2, + /*limit_index=*/3, /*stride=*/1, + 
/*dimno=*/channel_dim); TensorShape channel_shape = input_shape; channel_shape.set_dim(channel_dim, 1); auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0), channel_shape); - hsv[1] = b->Clamp(XlaHelpers::Zero(b, type), b->Mul(hsv[1], scale), - XlaHelpers::One(b, type)); + hsv[1] = xla::Clamp(XlaHelpers::Zero(b, type), xla::Mul(hsv[1], scale), + XlaHelpers::One(b, type)); auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0)); - context->SetOutput(0, b->ConcatInDim(rgb, channel_dim)); + context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim)); } }; REGISTER_XLA_OP(Name("AdjustSaturation"), AdjustSaturationOp); @@ -276,15 +287,15 @@ class AdjustHueOp : public XlaOpKernel { DataType type = context->input_type(0); - xla::XlaOp red = - b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp green = - b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, - /*dimno=*/channel_dim); - xla::XlaOp blue = - b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, - /*dimno=*/channel_dim); + xla::XlaOp red = xla::SliceInDim(input, /*start_index=*/0, + /*limit_index=*/1, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp green = xla::SliceInDim(input, /*start_index=*/1, + /*limit_index=*/2, /*stride=*/1, + /*dimno=*/channel_dim); + xla::XlaOp blue = xla::SliceInDim(input, /*start_index=*/2, + /*limit_index=*/3, /*stride=*/1, + /*dimno=*/channel_dim); TensorShape channel_shape = input_shape; channel_shape.set_dim(channel_dim, 1); auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0), @@ -294,15 +305,161 @@ class AdjustHueOp : public XlaOpKernel { auto one = XlaHelpers::One(b, type); auto& hue = hsv[0]; - hue = b->Rem(b->Add(hsv[0], delta), one); - hue = b->Select(b->Lt(hue, zero), b->Rem(b->Add(one, hue), one), hue); + hue = xla::Rem(xla::Add(hsv[0], delta), one); + hue = + xla::Select(xla::Lt(hue, zero), 
xla::Rem(xla::Add(one, hue), one), hue); auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0)); - context->SetOutput(0, b->ConcatInDim(rgb, channel_dim)); + context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim)); } }; REGISTER_XLA_OP(Name("AdjustHue"), AdjustHueOp); +class NonMaxSuppressionOp : public XlaOpKernel { + public: + explicit NonMaxSuppressionOp(OpKernelConstruction* context) + : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size", + &pad_to_max_output_size_)); + } + + void Compile(XlaOpKernelContext* context) override { + // TODO(b/111646731): Improve scalability of this op, using blocking. + int num_boxes_dim = 0; + int coords_dim = 1; + const TensorShape& boxes_shape = context->InputShape("boxes"); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(boxes_shape), + errors::InvalidArgument("boxes must be 2-D, currently: ", + boxes_shape.DebugString())); + const int64 num_boxes = boxes_shape.dim_size(num_boxes_dim); + OP_REQUIRES(context, boxes_shape.dim_size(coords_dim) == 4, + errors::InvalidArgument("boxes must have 4 columns", + boxes_shape.DebugString())); + const TensorShape& scores_shape = context->InputShape("scores"); + OP_REQUIRES(context, TensorShapeUtils::IsVector(scores_shape), + errors::InvalidArgument("scores must be 1-D, currently: ", + scores_shape.DebugString())); + OP_REQUIRES( + context, scores_shape.dim_size(0) == num_boxes, + errors::InvalidArgument("scores size must equal number of boxes", + scores_shape.DebugString())); + OP_REQUIRES(context, pad_to_max_output_size_, + errors::InvalidArgument( + "XLA compilation requires pad_to_max_output_size == True")); + + xla::XlaOp boxes = context->Input("boxes"); + xla::XlaOp scores = context->Input("scores"); + int64 output_size; + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &output_size)); + OP_REQUIRES( + context, output_size >= 0, + errors::InvalidArgument("Need output_size >= 0, got ", output_size)); + xla::XlaOp 
score_thresh = context->Input("score_threshold"); + xla::XlaOp iou_thresh = context->Input("iou_threshold"); + + xla::XlaBuilder* const builder = context->builder(); + + // Choose a more convenient layout. + xla::XlaOp boxes_t = xla::Transpose(boxes, {1, 0}); + coords_dim = 0; + num_boxes_dim = 1; + + // Shapes are henceforth [1, num_boxes]. + xla::XlaOp coord_y0 = xla::SliceInDim(boxes_t, + /*start_index=*/0, + /*limit_index=*/1, + /*stride=*/1, + /*dimno=*/coords_dim); + xla::XlaOp coord_x0 = xla::SliceInDim(boxes_t, + /*start_index=*/1, + /*limit_index=*/2, + /*stride=*/1, + /*dimno=*/coords_dim); + xla::XlaOp coord_y1 = xla::SliceInDim(boxes_t, + /*start_index=*/2, + /*limit_index=*/3, + /*stride=*/1, + /*dimno=*/coords_dim); + xla::XlaOp coord_x1 = xla::SliceInDim(boxes_t, + /*start_index=*/3, + /*limit_index=*/4, + /*stride=*/1, + /*dimno=*/coords_dim); + xla::XlaOp y1 = + xla::Select(xla::Le(coord_y0, coord_y1), coord_y0, coord_y1); + xla::XlaOp y2 = + xla::Select(xla::Le(coord_y0, coord_y1), coord_y1, coord_y0); + xla::XlaOp x1 = + xla::Select(xla::Le(coord_x0, coord_x1), coord_x0, coord_x1); + xla::XlaOp x2 = + xla::Select(xla::Le(coord_x0, coord_x1), coord_x1, coord_x0); + xla::XlaOp area = (y2 - y1) * (x2 - x1); + + // Transpose the 1xN tensors, instead of the NxN tensors. + xla::XlaOp y1_t = xla::Transpose(y1, {1, 0}); + xla::XlaOp y2_t = xla::Transpose(y2, {1, 0}); + xla::XlaOp x1_t = xla::Transpose(x1, {1, 0}); + xla::XlaOp x2_t = xla::Transpose(x2, {1, 0}); + xla::XlaOp area_t = xla::Transpose(area, {1, 0}); + + // Shapes are henceforth [num_boxes, num_boxes]. 
+ xla::XlaOp i_xmin = xla::Max(x1, x1_t); + xla::XlaOp i_ymin = xla::Max(y1, y1_t); + xla::XlaOp i_xmax = xla::Min(x2, x2_t); + xla::XlaOp i_ymax = xla::Min(y2, y2_t); + auto square_zero = xla::ZerosLike(i_xmin); + + xla::XlaOp i_area = xla::Max(i_xmax - i_xmin, square_zero) * + xla::Max(i_ymax - i_ymin, square_zero); + xla::XlaOp u_area = area + area_t - i_area; + xla::XlaOp iou = i_area / u_area; + + xla::XlaOp iou_thresh_mask = xla::Gt(iou, iou_thresh + square_zero); + xla::XlaOp scores_2d = xla::Reshape(scores, {num_boxes, 1}); + xla::XlaOp score_cmp_mask = + xla::Gt(scores_2d, xla::Transpose(scores_2d, {1, 0})); + xla::XlaOp suppress = xla::And(iou_thresh_mask, score_cmp_mask); + + // Shapes are [num_boxes] after the reduce. + xla::XlaOp included_iou = xla::Not(xla::Reduce( + suppress, + /*init_value=*/xla::ConstantR0(builder, false), + /*computation=*/CreateScalarOrComputation(xla::PRED, builder), + /*dimensions_to_reduce=*/{0})); + xla::XlaOp included_score = + xla::Gt(scores, xla::Broadcast(score_thresh, {num_boxes})); + xla::XlaOp included = xla::And(included_iou, included_score); + xla::XlaOp neg_inf = + xla::Broadcast(xla::MinValue(builder, xla::F32), {num_boxes}); + xla::XlaOp scores_included = xla::Select(included, scores, neg_inf); + + xla::XlaOp ones_included = xla::Select( + included, + xla::Broadcast(xla::ConstantR0(builder, 1), {num_boxes}), + xla::Broadcast(xla::ConstantR0(builder, 0), {num_boxes})); + + // num_valid is scalar. 
+ xla::XlaOp num_valid = xla::Reduce( + ones_included, + /*init_value=*/xla::ConstantR0(builder, 0), + /*computation=*/CreateScalarAddComputation(xla::S32, builder), + /*dimensions_to_reduce=*/{0}); + + xla::XlaOp output_tuple = TopK(scores_included, output_size); + xla::XlaOp selected_indices = xla::GetTupleElement(output_tuple, 1); + + context->SetOutput(0, selected_indices); + context->SetOutput(1, num_valid); + } + + private: + bool pad_to_max_output_size_; +}; + +REGISTER_XLA_OP( + Name("NonMaxSuppressionV4").CompileTimeConstInput("max_output_size"), + NonMaxSuppressionOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 9058cbc74762576c7e6f8ec1b2b0f6b247ac0502..d9a0257b70bcf302dea77db2e9f7fa7b4543e038 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -18,6 +18,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/array4d.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/lib/math/math_util.h" @@ -30,13 +32,13 @@ namespace { // // 1. S := (N - 1) / gcd(N-1, R-1) // 2. k := (R - 1) / gcd(N-1, R-1) -// 3. Convolution(kxk, stride=S, lhs_dilation=k, padding=k-1) +// 3. Convolution((2k-1)x(2k-1), stride=S, lhs_dilation=k, padding=k-1) // // For example, to Scale from 7x7 -> 15x15: // // 1. S := (7-1) / gcd(7-1, 15-1) = 6 / gcd(6, 14) = 6 / 2 = 3 // 2. k := (15 - 1) / gcd(7-1, 15-1) = 14 / gcd(6, 14) = 14 / 2 = 7 -// 3. Convolution(7x7, stride=3, lhs_dilation=3, padding=2) +// 3. 
Convolution(13x13, stride=3, lhs_dilation=7, padding=6) // // // The 7x7 -> 15x15 case is much too large to write out in full as an @@ -63,6 +65,8 @@ namespace { // 1/9 * 3 6 9 6 3 // 2 4 6 4 2 // 1 2 3 2 1 +// Note that the convolution kernel matrix is separable and thus we can instead +// use 2 consecutive 1D kernels of dimension 2k-1, one along each axis. // Computes the size of the convolutional kernel and stride to use when resizing // from in_size to out_size. @@ -74,7 +78,8 @@ struct ResizeConvolutionDims { std::vector stride; }; ResizeConvolutionDims ComputeResizeConvolutionParameters( - gtl::ArraySlice in_size, gtl::ArraySlice out_size) { + absl::Span in_size, absl::Span out_size, + bool align_corners) { CHECK_EQ(in_size.size(), out_size.size()); int num_spatial_dims = in_size.size(); ResizeConvolutionDims dims; @@ -90,61 +95,104 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters( // entry before resizing. dims.stride[i] = dims.kernel_size[i] = 1; } else { - int64 gcd = MathUtil::GCD(static_cast(in_size[i] - 1), - static_cast(out_size[i] - 1)); - dims.stride[i] = (in_size[i] - 1) / gcd; - dims.kernel_size[i] = (out_size[i] - 1) / gcd; + // The scaling factor changes depending on the alignment of corners. + const int64 in_size_factor = align_corners ? in_size[i] - 1 : in_size[i]; + const int64 out_size_factor = + align_corners ? out_size[i] - 1 : out_size[i]; + + int64 gcd = MathUtil::GCD(static_cast(in_size_factor), + static_cast(out_size_factor)); + dims.stride[i] = in_size_factor / gcd; + dims.kernel_size[i] = out_size_factor / gcd; } } return dims; } +// The upper padding of the input needed by ConvGeneralDilated calls is +// determined by solving two related relationships (assuming rhs_dilation == 1): +// 1. dilated_input_dim = lower_padding + upper_padding +// + lhs_dilation * (in_size - 1) + 1 +// 2. 
dilated_input_dim = (2 * dims.kernel_size - 1) +// + dims.stride * (out_size - 1) +int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size, + int64 stride) { + return (2 * kernel_size - 1) + (out_size - 1) * stride - (kernel_size - 1) - + 1 - (kernel_size * (in_size - 1)); +} + +// Form a 2D convolution kernel like: +// 1 2 3 2 1 +// 2 4 6 4 2 +// 1/9 * 3 6 9 6 3 +// 2 4 6 4 2 +// 1 2 3 2 1 +// by multiplying two 1D kernels of the form: +// 1/3 * [1 2 3 2 1] +// If the 2D kernel would be very large, the 1D kernel can be applied once in +// each dimension due to the symmetry of the kernel along all axes to reduce the +// computational intensity. +std::vector Make1DKernel(int64 n) { + std::vector kernel(n * 2 - 1); + for (int64 i = 0; i < n; ++i) { + float v = (i + 1.0f) / n; + kernel[i] = v; + kernel[n * 2 - 2 - i] = v; + } + return kernel; +} + +// Kernels with more than 16 spatial elements are considered intense and the +// kernel should be applied to each dimension independently. +const int64 kMax2DKernelSize = 16; + xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder, - gtl::ArraySlice kernel_size, + absl::Span kernel_size, int64 channels) { - // Form a 2D convolution kernel like: - // 1 2 3 2 1 - // 2 4 6 4 2 - // 1/9 * 3 6 9 6 3 - // 2 4 6 4 2 - // 1 2 3 2 1 - // by multiplying two 1D kernels of the form: - // 1/3 * [1 2 3 2 1] - auto make_1d_kernel = [](int64 n) { - std::vector kernel(n * 2 - 1); - for (int64 i = 0; i < n; ++i) { - float v = (i + 1.0f) / n; - kernel[i] = v; - kernel[n * 2 - 2 - i] = v; - } - return kernel; - }; - - xla::XlaOp channels_iota; - // DT_INT32 Iota will always return status::OK(). 
- TF_CHECK_OK( - XlaHelpers::Iota(builder, DataType::DT_INT32, channels, &channels_iota)); + xla::XlaOp channels_iota = xla::Iota(builder, xla::S32, channels); - auto diag = builder->ConvertElementType( - builder->Eq( - builder->Broadcast(channels_iota, {2 * kernel_size[0] - 1, + auto diag = xla::ConvertElementType( + xla::Eq(xla::Broadcast(channels_iota, {2 * kernel_size[0] - 1, 2 * kernel_size[1] - 1, channels}), - channels_iota, /*broadcast_dimensions=*/{2}), + channels_iota, /*broadcast_dimensions=*/{2}), xla::PrimitiveType::F32); - return builder->Mul( - builder->Mul(diag, - builder->ConstantR1(make_1d_kernel(kernel_size[1])), - /*broadcast_dimensions=*/{1}), - builder->ConstantR1(make_1d_kernel(kernel_size[0])), + return xla::Mul( + xla::Mul(diag, + xla::ConstantR1(builder, Make1DKernel(kernel_size[1])), + /*broadcast_dimensions=*/{1}), + xla::ConstantR1(builder, Make1DKernel(kernel_size[0])), /*broadcast_dimensions=*/{0}); } +xla::XlaOp MakeBilinearResizeKernelInDim(xla::XlaBuilder* builder, + absl::Span kernel_size, + int64 channels, int64 dim) { + xla::XlaOp channels_iota = xla::Iota(builder, xla::S32, channels); + + auto diag = xla::ConvertElementType( + xla::Eq( + xla::Broadcast(channels_iota, + {dim == 0 ? (2 * kernel_size[0] - 1) : 1, + dim == 1 ? 
(2 * kernel_size[1] - 1) : 1, channels}), + channels_iota, /*broadcast_dimensions=*/{2}), + xla::PrimitiveType::F32); + if (dim == 1) { + return xla::Mul( + diag, xla::ConstantR1(builder, Make1DKernel(kernel_size[1])), + /*broadcast_dimensions=*/{1}); + } + return xla::Mul(diag, + xla::ConstantR1(builder, Make1DKernel(kernel_size[0])), + /*broadcast_dimensions=*/{0}); +} + xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, const xla::XlaOp& input, const int num_spatial_dims, std::vector in_size, std::vector out_size, - const int64 channels) { + const int64 channels, + const bool align_corners) { // Picture for a 1x3 to 1x4 resize: // stride = 2, kernel size = 3 // Input: @@ -165,27 +213,104 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, dimension_numbers.add_output_spatial_dimensions(1 + i); dimension_numbers.add_kernel_spatial_dimensions(i); } - dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims); - dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1); + dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims + 1); + dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims); ResizeConvolutionDims dims = - ComputeResizeConvolutionParameters(in_size, out_size); - xla::XlaOp kernel = - MakeBilinearResizeKernel(builder, dims.kernel_size, channels); - xla::XlaOp output = builder->ConvGeneralDilated( - input, kernel, dims.stride, - /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, - {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, - /*lhs_dilation=*/dims.kernel_size, - /*rhs_dilation=*/{1, 1}, dimension_numbers); + ComputeResizeConvolutionParameters(in_size, out_size, align_corners); + xla::XlaOp output; + + // Concatenation and padding below currently assumes num_spatial_dims is 2 to + // prevent needless code complexity. 
+ CHECK_EQ(num_spatial_dims, 2) + << "ResizeUsingDilationAndConvolution pads only 2 dimensions currently."; + std::vector upper_padding(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + upper_padding[i] = dims.kernel_size[i] - 1; + } + xla::XlaOp input_data = input; + + if (!align_corners) { + // When Tensorflow does not align_corners, the resize indexing can access + // beyond the upper bound and is instead clamped to prevent out of bounds + // reads. This is conceptually the same as extending the edges of the input. + // We emulate this by copying the last row/column of the input. + // Calculate what padding would be needed then determine how far to extend + // the border before lhs dilation. + std::vector num_extended(num_spatial_dims); + upper_padding[0] = CalculateUpperPadding( + in_size[0], out_size[0], dims.kernel_size[0], dims.stride[0]); + upper_padding[1] = CalculateUpperPadding( + in_size[1], out_size[1], dims.kernel_size[1], dims.stride[1]); + num_extended[0] = upper_padding[0] / (dims.kernel_size[0]); + num_extended[1] = upper_padding[1] / (dims.kernel_size[1]); + + if (num_extended[0] > 0) { + auto slice = + xla::Slice(input_data, {0, in_size[0] - 1, 0, 0}, + {1, in_size[0], in_size[1], channels}, {1, 1, 1, 1}); + for (int i = 0; i < num_extended[0]; i++) { + input_data = xla::ConcatInDim(builder, {input_data, slice}, 1); + } + } + + if (num_extended[1] > 0) { + auto slice = + xla::Slice(input_data, {0, 0, in_size[1] - 1, 0}, + {1, in_size[0] + num_extended[0], in_size[1], channels}, + {1, 1, 1, 1}); + for (int i = 0; i < num_extended[1]; i++) { + input_data = xla::ConcatInDim(builder, {input_data, slice}, 2); + } + } + + // Setting in_size to (in_size + num_extended) due to the above Slice and + // ConcatInDim. Recalculate needed padding after the above Slice/Concat. 
+ upper_padding[0] = + CalculateUpperPadding(in_size[0] + num_extended[0], out_size[0], + dims.kernel_size[0], dims.stride[0]); + upper_padding[1] = + CalculateUpperPadding(in_size[1] + num_extended[1], out_size[1], + dims.kernel_size[1], dims.stride[1]); + } + + // Split convolutions into independent dimensions if they would be a very + // large kernel. + if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) { + xla::XlaOp kernel = + MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + output = + xla::ConvGeneralDilated(input_data, kernel, dims.stride, + /*padding=*/ + {{dims.kernel_size[0] - 1, upper_padding[0]}, + {dims.kernel_size[1] - 1, upper_padding[1]}}, + /*lhs_dilation=*/dims.kernel_size, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } else { + xla::XlaOp kernel0 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0); + output = xla::ConvGeneralDilated( + input_data, kernel0, {dims.stride[0], 1}, + /*padding=*/ + {{dims.kernel_size[0] - 1, upper_padding[0]}, {0, 0}}, + /*lhs_dilation=*/{dims.kernel_size[0], 1}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + xla::XlaOp kernel1 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1); + output = xla::ConvGeneralDilated( + output, kernel1, {1, dims.stride[1]}, + /*padding=*/ + {{0, 0}, {dims.kernel_size[1] - 1, upper_padding[1]}}, + /*lhs_dilation=*/{1, dims.kernel_size[1]}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } // Add broadcasts to handle expanding from a size == 1 dimension to a // size > 1 dimension. 
for (int i = 0; i < num_spatial_dims; ++i) { if (in_size[i] == 1 && out_size[i] > 1) { - output = builder->Add(output, builder->ConstantR1(out_size[i], 0), - /*broadcast_dimensions=*/{1 + i}); + output = xla::Add(output, xla::ConstantR1(builder, out_size[i], 0), + /*broadcast_dimensions=*/{1 + i}); } } return output; @@ -196,9 +321,10 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder, const int num_spatial_dims, std::vector in_size, std::vector grad_size, - const int64 channels) { + const int64 channels, + const bool align_corners) { ResizeConvolutionDims dims = - ComputeResizeConvolutionParameters(in_size, grad_size); + ComputeResizeConvolutionParameters(in_size, grad_size, align_corners); // To form the backward convolution, we keep the kernel unchanged (it is // already symmetric) and swap the roles of strides and LHS dilation. @@ -214,26 +340,63 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder, } dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims); dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1); - xla::XlaOp kernel = - MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + xla::XlaOp output; + if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) { + xla::XlaOp kernel = + MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + + // Broadcast the input kernel where the forward op expanded from a size == 1 + // dimension to a size > 1 dimension. This has the effect of summing the + // gradient contributions in that dimension. + for (int i = 0; i < num_spatial_dims; ++i) { + if (in_size[i] == 1 && grad_size[i] > 1) { + kernel = + xla::Add(kernel, xla::ConstantR1(builder, grad_size[i], 0), + /*broadcast_dimensions=*/{i}); + } + } - // Broadcast the input kernel where the forward op expanded from a size == 1 - // dimension to a size > 1 dimension. This has the effect of summing the - // gradient contributions in that dimension. 
- for (int i = 0; i < num_spatial_dims; ++i) { - if (in_size[i] == 1 && grad_size[i] > 1) { - kernel = builder->Add(kernel, builder->ConstantR1(grad_size[i], 0), - /*broadcast_dimensions=*/{i}); + output = xla::ConvGeneralDilated( + grad, kernel, /*window_strides=*/dims.kernel_size, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, + {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/dims.stride, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } else { + xla::XlaOp kernel0 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0); + xla::XlaOp kernel1 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1); + + // Broadcast the input kernel where the forward op expanded from a size == 1 + // dimension to a size > 1 dimension. This has the effect of summing the + // gradient contributions in that dimension. + if (in_size[0] == 1 && grad_size[0] > 1) { + kernel0 = + xla::Add(kernel0, xla::ConstantR1(builder, grad_size[0], 0), + /*broadcast_dimensions=*/{0}); + } + if (in_size[1] == 1 && grad_size[1] > 1) { + kernel1 = + xla::Add(kernel0, xla::ConstantR1(builder, grad_size[1], 0), + /*broadcast_dimensions=*/{1}); } - } - xla::XlaOp output = builder->ConvGeneralDilated( - grad, kernel, /*window_strides=*/dims.kernel_size, - /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, - {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, - /*lhs_dilation=*/dims.stride, - /*rhs_dilation=*/{1, 1}, dimension_numbers); + output = xla::ConvGeneralDilated( + grad, kernel0, /*window_strides=*/{dims.kernel_size[0], 1}, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}}, + /*lhs_dilation=*/{dims.stride[0], 1}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + + output = xla::ConvGeneralDilated( + output, kernel1, /*window_strides=*/{1, dims.kernel_size[1]}, + /*padding=*/ + {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/{1, 
dims.stride[1]}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i. // Opposite of the slice performed by the forward op. @@ -246,7 +409,7 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder, } } if (pad_output) { - output = builder->Pad(output, builder->ConstantR0(0.0f), padding); + output = xla::Pad(output, xla::ConstantR0(builder, 0.0f), padding); } return output; } @@ -255,10 +418,6 @@ class ResizeBilinearOp : public XlaOpKernel { public: explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_)); - OP_REQUIRES( - ctx, align_corners_ == true, - errors::Unimplemented( - "ResizeBilinear with align_corners=False is not yet implemented")); } void Compile(XlaOpKernelContext* ctx) override { @@ -291,24 +450,23 @@ class ResizeBilinearOp : public XlaOpKernel { // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in // dimension i. - std::vector slice_size = in_size; bool slice_input = false; for (int i = 0; i < num_spatial_dims; ++i) { if (in_size[i] > 1 && out_size[i] == 1) { // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first // entry before resizing. slice_input = true; - slice_size[i] = 1; + in_size[i] = 1; } } if (slice_input) { - input = b->Slice(input, {0, 0, 0, 0}, - {batch, slice_size[0], slice_size[1], channels}, - {1, 1, 1, 1}); + input = + xla::Slice(input, {0, 0, 0, 0}, + {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1}); } // Output is always type float. - input = b->ConvertElementType(input, xla::F32); + input = xla::ConvertElementType(input, xla::F32); // Special Case: // Instead of doing a ResizeUsingDilationAndConvolution directly, @@ -320,6 +478,9 @@ class ResizeBilinearOp : public XlaOpKernel { // operations along different dimensions. // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize // from image of size axb -> cxd is same as resizing axb -> exf -> cxd. 
+ // This does not work in the case of align_corners_=false because of special + // padding requirements that cause multiple resizes to be very different + // from a single resize. // // This makes the convolutions kernels smaller and the operation faster. xla::XlaOp output = input; @@ -329,21 +490,24 @@ class ResizeBilinearOp : public XlaOpKernel { (static_cast(out_size[0]) - 1) / ((in_size[0] - 1) * 2), (static_cast(out_size[1]) - 1) / ((in_size[1] - 1) * 2)}; if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) && - k[0] > 1 && k[1] > 1) { + k[0] > 1 && k[1] > 1 && align_corners_) { std::vector next_out_size = {(in_size[0] - 1) * 2 + 1, (in_size[1] - 1) * 2 + 1}; - output = ResizeUsingDilationAndConvolution( - b, input, num_spatial_dims, in_size, next_out_size, channels); + output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, + in_size, next_out_size, + channels, align_corners_); input = output; in_size = next_out_size; } else { - output = ResizeUsingDilationAndConvolution( - b, input, num_spatial_dims, in_size, out_size, channels); + output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, + in_size, out_size, + channels, align_corners_); in_size = out_size; } } else { output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims, - in_size, out_size, channels); + in_size, out_size, channels, + align_corners_); in_size = out_size; } } @@ -423,22 +587,25 @@ class ResizeBilinearGradOp : public XlaOpKernel { std::vector next_grad_size = {(in_size[0] - 1) * 2 + 1, (in_size[1] - 1) * 2 + 1}; output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, next_grad_size, channels); + b, grad, num_spatial_dims, in_size, next_grad_size, channels, + align_corners_); grad = output; in_size = next_grad_size; } else { output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, grad_size, channels); + b, grad, num_spatial_dims, in_size, grad_size, channels, + align_corners_); 
in_size = grad_size; } } else { output = ResizeUsingDilationAndConvolutionGradOp( - b, grad, num_spatial_dims, in_size, grad_size, channels); + b, grad, num_spatial_dims, in_size, grad_size, channels, + align_corners_); in_size = grad_size; } } - output = b->ConvertElementType(output, output_type_); + output = xla::ConvertElementType(output, output_type_); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc index 36eb4c75454ed82804c40b82e5dbaec2eef0a719..f3964748587c1b31cf8b1b76643ff19a9044bf44 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc @@ -60,19 +60,15 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) { input_shape.DebugString())); DataType index_type = output_type(0); + xla::PrimitiveType index_xla_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(index_type, &index_xla_type)); - xla::XlaBuilder* b = ctx->builder(); xla::XlaOp input = ctx->Input(0); - xla::XlaOp output; if (is_min_) { - OP_REQUIRES_OK(ctx, - XlaHelpers::ArgMin(b, ctx, input, input_shape, input_type(0), - index_type, axis, &output)); + output = XlaHelpers::ArgMin(input, index_xla_type, axis); } else { - OP_REQUIRES_OK(ctx, - XlaHelpers::ArgMax(b, ctx, input, input_shape, input_type(0), - index_type, axis, &output)); + output = XlaHelpers::ArgMax(input, index_xla_type, axis); } ctx->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc index 2c2d88486fda99d2380382a3e2f633f5bdc7478c..22a45b2a11e8ecb688f8e773ef4b286eafe68f4f 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -76,14 +77,15 @@ class ArgMaxCustomCallOp : public XlaOpKernel { // XLA passes to the function, so it is not included here. std::vector args; args.push_back(ctx->Input(0)); - args.push_back(b.ConstantLiteral( - *xla::Literal::CreateR1(input_shape.dim_sizes()))); + args.push_back(xla::ConstantLiteral( + &b, *xla::LiteralUtil::CreateR1(input_shape.dim_sizes()))); if (input_shape.dims() > 1) { // Don't bother passing the output shape and dim for the 1d case, since // the shape is always a scalar and the dim is always 0. - args.push_back(b.ConstantLiteral( - *xla::Literal::CreateR1(output_shape.dim_sizes()))); - args.push_back(b.ConstantLiteral(*xla::Literal::CreateR0(dim))); + args.push_back(xla::ConstantLiteral( + &b, *xla::LiteralUtil::CreateR1(output_shape.dim_sizes()))); + args.push_back( + xla::ConstantLiteral(&b, *xla::LiteralUtil::CreateR0(dim))); } xla::Shape xla_shape = @@ -94,10 +96,12 @@ class ArgMaxCustomCallOp : public XlaOpKernel { xla::XlaOp output; switch (input_shape.dims()) { case 1: - output = b.CustomCall("argmax_float_1d_xla_impl", args, xla_shape); + output = + xla::CustomCall(&b, "argmax_float_1d_xla_impl", args, xla_shape); break; case 2: - output = b.CustomCall("argmax_float_2d_xla_impl", args, xla_shape); + output = + xla::CustomCall(&b, "argmax_float_2d_xla_impl", args, xla_shape); break; default: OP_REQUIRES(ctx, false, diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc index 1decf7d72d72bb697477e7f841ced2a1a0d5fbe9..f028e361bccd51de0bd69a1d2227c7afaed53455 100644 --- 
a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/no_op.h" @@ -39,12 +39,12 @@ class L2LossOp : public XlaOpKernel { const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype); auto t = XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type); - auto square = b->Mul(t, t); - auto reduce = b->Reduce(square, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), dims); + auto square = xla::Mul(t, t); + auto reduce = xla::Reduce(square, XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), dims); auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype); auto two = XlaHelpers::IntegerLiteral(b, dtype, 2); - ctx->SetOutput(0, b->Div(deconverted, two)); + ctx->SetOutput(0, xla::Div(deconverted, two)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc index 0388b4c830702ea00ec69fc42c6468326c88cf38..a11bbe918f7f8eb050aaa40d4344f9cc9e9a10a4 100644 --- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/lib/core/errors.h" @@ -90,8 +91,10 @@ class ListDiffOp : public XlaOpKernel { idx_output.push_back(i); } - context->SetOutput(0, context->builder()->ConstantR1(val_output)); - context->SetOutput(1, context->builder()->ConstantR1(idx_output)); + context->SetOutput(0, + xla::ConstantR1(context->builder(), val_output)); + context->SetOutput(1, + xla::ConstantR1(context->builder(), idx_output)); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc index 39fbf98a6274918840e9e351470f04c2d80c5d01..87ee2d3aede50eb24e65570f106d49030e1d4236 100644 --- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -50,8 +51,8 @@ class LRNOp : public XlaOpKernel { auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); auto converted = XlaHelpers::ConvertElementType(builder, input, accumulation_type); - auto squared = builder->Mul(converted, converted); - auto reduce = builder->ReduceWindow( + auto squared = xla::Mul(converted, converted); + auto reduce = xla::ReduceWindow( squared, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, @@ -59,12 +60,12 @@ class LRNOp : public XlaOpKernel { auto sqr_sum = XlaHelpers::ConvertElementType(builder, reduce, input_type(0)); - auto scale = builder->Pow( - builder->Add(builder->ConstantR0(bias_), - builder->Mul(builder->ConstantR0(alpha_), sqr_sum)), - builder->ConstantR0(-beta_)); + auto scale = xla::Pow( + xla::Add(xla::ConstantR0(builder, bias_), + xla::Mul(xla::ConstantR0(builder, alpha_), sqr_sum)), + xla::ConstantR0(builder, -beta_)); - ctx->SetOutput(0, builder->Mul(input, scale)); + ctx->SetOutput(0, xla::Mul(input, scale)); } private: @@ -138,8 +139,8 @@ class LRNGradOp : public XlaOpKernel { auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); auto converted = XlaHelpers::ConvertElementType(builder, in_image, accumulation_type); - auto squared = builder->Mul(converted, converted); - auto reduce = builder->ReduceWindow( + auto squared = xla::Mul(converted, converted); + auto reduce = xla::ReduceWindow( squared, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, @@ -148,17 +149,17 @@ class LRNGradOp : public 
XlaOpKernel { XlaHelpers::ConvertElementType(builder, reduce, input_type(0)); auto norm = - builder->Add(builder->ConstantR0(bias_), - builder->Mul(builder->ConstantR0(alpha_), sqr_sum)); + xla::Add(xla::ConstantR0(builder, bias_), + xla::Mul(xla::ConstantR0(builder, alpha_), sqr_sum)); - auto dy = builder->Mul( - builder->Mul(builder->ConstantR0(-2.0f * alpha_ * beta_), - builder->Div(out_image, norm)), + auto dy = xla::Mul( + xla::Mul(xla::ConstantR0(builder, -2.0f * alpha_ * beta_), + xla::Div(out_image, norm)), in_grads); auto converted_dy = XlaHelpers::ConvertElementType(builder, dy, accumulation_type); - auto dy_reduce = builder->ReduceWindow( + auto dy_reduce = xla::ReduceWindow( converted_dy, XlaHelpers::Zero(builder, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type), /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1}, @@ -166,10 +167,10 @@ class LRNGradOp : public XlaOpKernel { auto dy_reduced = XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0)); - xla::XlaOp gradients = builder->Add( - builder->Mul(in_image, dy_reduced), - builder->Mul(in_grads, - builder->Pow(norm, builder->ConstantR0(-beta_)))); + xla::XlaOp gradients = xla::Add( + xla::Mul(in_image, dy_reduced), + xla::Mul(in_grads, + xla::Pow(norm, xla::ConstantR0(builder, -beta_)))); ctx->SetOutput(0, gradients); } diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index 6949b296f4b9afe4a0c9152c763a9ad233b9f595..6440770c29894c951f010f6c1deb929f4fe79bbf 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -53,10 +54,14 @@ class MatMulOp : public XlaOpKernel { const TensorShape b_shape = ctx->InputShape(1); // Check that the dimensions of the two matrices are valid. - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_shape), - errors::InvalidArgument("In[0] is not a matrix")); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b_shape), - errors::InvalidArgument("In[1] is not a matrix")); + OP_REQUIRES( + ctx, TensorShapeUtils::IsMatrix(a_shape), + errors::InvalidArgument("In[0] is not a matrix. Instead it has shape ", + a_shape.DebugString())); + OP_REQUIRES( + ctx, TensorShapeUtils::IsMatrix(b_shape), + errors::InvalidArgument("In[1] is not a matrix. Instead it has shape ", + b_shape.DebugString())); int first_index = transpose_a_ ? 0 : 1; int second_index = transpose_b_ ? 1 : 0; @@ -70,15 +75,15 @@ class MatMulOp : public XlaOpKernel { xla::XlaOp b = ctx->Input(1); if (is_sparse_) { if (a_type_ == DT_BFLOAT16) { - a = ctx->builder()->ConvertElementType(a, xla::F32); + a = xla::ConvertElementType(a, xla::F32); } if (b_type_ == DT_BFLOAT16) { - b = ctx->builder()->ConvertElementType(b, xla::F32); + b = xla::ConvertElementType(b, xla::F32); } } - auto lhs = (transpose_a_) ? ctx->builder()->Transpose(a, {1, 0}) : a; - auto rhs = (transpose_b_) ? ctx->builder()->Transpose(b, {1, 0}) : b; - ctx->SetOutput(0, ctx->builder()->Dot(lhs, rhs)); + auto lhs = (transpose_a_) ? xla::Transpose(a, {1, 0}) : a; + auto rhs = (transpose_b_) ? 
xla::Transpose(b, {1, 0}) : b; + ctx->SetOutput(0, xla::Dot(lhs, rhs)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc index fbd5dc0fdad4483aadbe9bc263cc1f7a034cee09..8dfd7de591c4a3c4768dd60b41e03d294ad49397 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { @@ -50,6 +52,7 @@ class MatrixBandPartOp : public XlaOpKernel { xla::XlaOp num_upper = context->Input(2); DataType input_type = context->input_type(0); DataType index_type = context->input_type(1); + xla::PrimitiveType index_xla_type = context->input_xla_type(1); TensorShape batch_shape = input_shape; batch_shape.RemoveLastDims(2); @@ -58,33 +61,29 @@ class MatrixBandPartOp : public XlaOpKernel { // Compute 'offset', which is how many diagonals we are above/below the // diagonal. - xla::XlaOp iota_m; - OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m)); + xla::XlaOp iota_m = xla::Iota(builder, index_xla_type, m); + xla::XlaOp iota_n = xla::Iota(builder, index_xla_type, n); - xla::XlaOp iota_n; - OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n)); - - auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m, - /*broadcast_dimensions=*/{0}); + auto offset = xla::Sub(xla::Broadcast(iota_n, {m}), iota_m, + /*broadcast_dimensions=*/{0}); // If num_lower or num_upper are negative, include all lower/upper // diagonals. 
auto zero_index = XlaHelpers::Zero(builder, index_type); - num_lower = builder->Select( - builder->Lt(num_lower, zero_index), - XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower); - num_upper = builder->Select( - builder->Lt(num_upper, zero_index), - XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper); + num_lower = xla::Select(xla::Lt(num_lower, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, m), + num_lower); + num_upper = xla::Select(xla::Lt(num_upper, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, n), + num_upper); - auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset), - builder->Le(offset, num_upper)); - indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + auto indicator = xla::And(xla::Le(xla::Neg(num_lower), offset), + xla::Le(offset, num_upper)); + indicator = xla::Broadcast(indicator, batch_shape.dim_sizes()); auto zero_input = XlaHelpers::Zero(builder, input_type); - auto output = builder->Select( - indicator, input, - builder->Broadcast(zero_input, input_shape.dim_sizes())); + auto output = xla::Select( + indicator, input, xla::Broadcast(zero_input, input_shape.dim_sizes())); context->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc index db53f6fef8d6bf901c8281f50791ca6766c46efd..c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -16,6 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { @@ -61,14 +63,11 @@ class MatrixSetDiagOp : public XlaOpKernel { auto zero = XlaHelpers::Zero(builder, context->input_type(0)); // Create an indicator tensor that is true only on the diagonal. - xla::XlaOp iota_m; - OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m)); - xla::XlaOp iota_n; - OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n)); - auto indicator = builder->Eq(iota_m, - builder->Broadcast(iota_n, {m}), - /*broadcast_dimensions=*/{0}); - indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + xla::XlaOp iota_m = xla::Iota(builder, xla::S32, m); + xla::XlaOp iota_n = xla::Iota(builder, xla::S32, n); + auto indicator = xla::Eq(iota_m, xla::Broadcast(iota_n, {m}), + /*broadcast_dimensions=*/{0}); + indicator = xla::Broadcast(indicator, batch_shape.dim_sizes()); // Broadcast diag up to the input shape. Use an implicit broadcast (Add) // because we need to broadcast on the right. 
@@ -77,10 +76,10 @@ class MatrixSetDiagOp : public XlaOpKernel { if (min_dim != m) { diag_broadcast_dims.back() = rank - 1; } - diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()), - /*broadcast_dimensions=*/diag_broadcast_dims); + diag = xla::Add(diag, xla::Broadcast(zero, input_shape.dim_sizes()), + /*broadcast_dimensions=*/diag_broadcast_dims); - auto output = builder->Select(indicator, diag, input); + auto output = xla::Select(indicator, diag, input); context->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc index eaed93146460de5a6e8328432302cc75bf36a534..f4def11d08c31513aec5aad15187016a7294c2fd 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc @@ -30,13 +30,9 @@ class MatrixTriangularSolveOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { auto result = TriangularSolve( - ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true, + ctx->Input(0), ctx->Input(1), /*left_side=*/true, /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_); - if (!result.ok()) { - ctx->SetStatus(result.status()); - return; - } - ctx->SetOutput(0, result.ValueOrDie()); + ctx->SetOutput(0, result); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index 7e9de3ef9b245c113cc143128fe58e7e017a361c..2a42eeaf76ab3aa88ff3a93ef7eb7ab217964bb6 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/mirror_pad_mode.h" namespace tensorflow { @@ -27,21 +28,38 @@ class MirrorPadOp : public XlaOpKernel { xla::StatusOr DoMirrorPad(const xla::XlaOp& t, const xla::Shape& original_shape, - const xla::Literal& pad_literal, + const xla::LiteralSlice& pad_literal, + const MirrorPadMode mode, xla::XlaBuilder* b) { + // The difference in the semantics of REFLECT and SYMMETRIC is that REFLECT + // will not mirror the border values while symmetric does. + // e.g. input is [1, 2, 3] and paddings is [0, 2], then the output is: + // - [1, 2, 3, 2, 1] in reflect mode + // - [1, 2, 3, 3, 2] in symmetric mode. + int64 excluded_edges = mode == MirrorPadMode::REFLECT ? 1 : 0; xla::XlaOp accum = t; for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0; --dimno) { - auto t_rev = b->Rev(accum, {dimno}); + auto t_rev = xla::Rev(accum, {dimno}); TF_ASSIGN_OR_RETURN(int64 lhs_padding, pad_literal.GetIntegralAsS64({dimno, 0})); TF_ASSIGN_OR_RETURN(int64 rhs_padding, pad_literal.GetIntegralAsS64({dimno, 1})); int64 dim_size = original_shape.dimensions(dimno); - auto lhs_pad = b->SliceInDim(t_rev, dim_size - 1 - lhs_padding, - dim_size - 1, 1, dimno); - auto rhs_pad = b->SliceInDim(t_rev, 1, 1 + rhs_padding, 1, dimno); - accum = b->ConcatInDim({lhs_pad, accum, rhs_pad}, dimno); + + // Padding amounts on each side must be no more than the size of the + // original shape. 
+ TF_RET_CHECK(lhs_padding >= 0 && + lhs_padding <= dim_size - excluded_edges); + TF_RET_CHECK(rhs_padding >= 0 && + rhs_padding <= dim_size - excluded_edges); + + auto lhs_pad = + xla::SliceInDim(t_rev, dim_size - excluded_edges - lhs_padding, + dim_size - excluded_edges, 1, dimno); + auto rhs_pad = xla::SliceInDim(t_rev, excluded_edges, + excluded_edges + rhs_padding, 1, dimno); + accum = xla::ConcatInDim(b, {lhs_pad, accum, rhs_pad}, dimno); } return accum; } @@ -52,9 +70,10 @@ class MirrorPadOp : public XlaOpKernel { MirrorPadMode mode; OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode)); - OP_REQUIRES(ctx, mode == MirrorPadMode::REFLECT, - xla::Unimplemented( - "Only REFLECT MirrorPad mode is currently supported")); + OP_REQUIRES( + ctx, mode == MirrorPadMode::REFLECT || mode == MirrorPadMode::SYMMETRIC, + xla::Unimplemented("Unsupported MirrorPad mode. Only SYMMETRIC and " + "REFLECT modes are currently supported")); const int dims = input_shape.dims(); OP_REQUIRES( @@ -82,7 +101,7 @@ class MirrorPadOp : public XlaOpKernel { xla::StatusOr in0_shape = b->GetShape(in0); OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status()); xla::StatusOr accum_status = - DoMirrorPad(in0, in0_shape.ValueOrDie(), pad_literal, b); + DoMirrorPad(in0, in0_shape.ValueOrDie(), pad_literal, mode, b); OP_REQUIRES_OK(ctx, accum_status.status()); diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc index aecaabb6dcf46bdd6ae3da929448d6370acb989b..a9b519d8928cc2807831fd6b4f12e60b7d58ea55 100644 --- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -76,11 +77,10 @@ class PackOp : public XlaOpKernel { for (int i = 0; i < num; ++i) { // Reshape the inputs to have an extra dimension of size 1. - reshaped_inputs[i] = - ctx->builder()->Reshape(values[i], child_shape.dim_sizes()); + reshaped_inputs[i] = xla::Reshape(values[i], child_shape.dim_sizes()); } - ctx->SetOutput(0, ctx->builder()->ConcatInDim(reshaped_inputs, axis)); + ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), reshaped_inputs, axis)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 7c95475e7b1f02183e44f73f116a4aeb25f05c09..e5937b56c17d01892928b073da09f38941ea1bbb 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" @@ -63,8 +64,8 @@ class PadOp : public XlaOpKernel { int before = pad_literal.Get({i, 0}); int after = pad_literal.Get({i, 1}); OP_REQUIRES(ctx, before >= 0 && after >= 0, - errors::InvalidArgument("Paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "Paddings must be non-negative: ", before, " ", after)); dim->set_edge_padding_low(before); dim->set_edge_padding_high(after); } @@ -74,11 +75,10 @@ class PadOp : public XlaOpKernel { if (ctx->num_inputs() == 3) { OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)), errors::InvalidArgument("constant_values must be a scalar.")); - ctx->SetOutput(0, - ctx->builder()->Pad(ctx->Input(0), ctx->Input(2), config)); + ctx->SetOutput(0, xla::Pad(ctx->Input(0), ctx->Input(2), config)); } else { auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0)); - ctx->SetOutput(0, ctx->builder()->Pad(ctx->Input(0), zero, config)); + ctx->SetOutput(0, xla::Pad(ctx->Input(0), zero, config)); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index f8e7b48a0fd94835964aea033ad33523150067b4..f6f158a73be42ea2602811ad64a2a2c655dab088 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -20,7 +20,11 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -61,63 +65,60 @@ class PoolingOp : public XlaOpKernel { Padding padding; OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding)); padding_ = (padding == VALID) ? xla::Padding::kValid : xla::Padding::kSame; + + OP_REQUIRES_OK( + ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_)); } int num_dims() const { return num_spatial_dims_ + 2; } - // Method that builds an initial value to use in reductions. - virtual xla::XlaOp InitValue(xla::XlaBuilder* b) = 0; - - // The reduction operation to apply to each window. - virtual const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) = 0; - - // A post-processing operation to apply on the outputs of the ReduceWindow. - virtual xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) = 0; - - void Compile(XlaOpKernelContext* ctx) override { - std::vector ksize = ksize_; - std::vector stride = stride_; - if (ctx->num_inputs() != 1) { - const TensorShape ksize_shape = ctx->InputShape(1); - // Validate input sizes. 
- OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), - errors::InvalidArgument("ksize must be a vector, not shape ", - ksize_shape.DebugString())); - OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - ksize.clear(); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize)); - - const TensorShape stride_shape = ctx->InputShape(2); - // Validate input sizes. - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), - errors::InvalidArgument("stride must be a vector, not shape ", - stride_shape.DebugString())); - OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(), - errors::InvalidArgument("Sliding window stride field must " - "specify ", - num_dims(), " dimensions")); - stride.clear(); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride)); + protected: + xla::StatusOr> GetKernelSize(XlaOpKernelContext* ctx) { + if (ctx->num_inputs() == 1) { + return ksize_; } - const TensorShape input_shape = ctx->InputShape(0); - OP_REQUIRES(ctx, input_shape.dims() == num_dims(), - errors::InvalidArgument("Input to ", type_string(), - " operator must have ", num_dims(), - " dimensions")); + const TensorShape ksize_shape = ctx->InputShape(1); + // Validate input sizes. 
+ if (!TensorShapeUtils::IsVector(ksize_shape)) { + return errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString()); + } + if (ksize_shape.num_elements() != num_dims()) { + return errors::InvalidArgument( + "Sliding window ksize field must " + "specify ", + num_dims(), " dimensions"); + } + std::vector ksize; + auto status = ctx->ConstantInputAsIntVector(1, &ksize); + if (!status.ok()) { + return status; + } + return ksize; + } - xla::XlaBuilder* const b = ctx->builder(); - auto input = - XlaHelpers::ConvertElementType(b, ctx->Input(0), reduction_type_); - auto reduce = ctx->builder()->ReduceWindow( - input, InitValue(b), *Reduction(ctx), ksize, stride, padding_); - auto pooled = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); - ctx->SetOutput(0, - PostProcessOutput(ctx, pooled, input_type(0), input_shape)); + xla::StatusOr> GetStride(XlaOpKernelContext* ctx) { + if (ctx->num_inputs() == 1) { + return stride_; + } + const TensorShape stride_shape = ctx->InputShape(2); + // Validate input sizes. + if (!TensorShapeUtils::IsVector(stride_shape)) { + return errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString()); + } + if (stride_shape.num_elements() != num_dims()) { + return errors::InvalidArgument( + "Sliding window stride field must " + "specify ", + num_dims(), " dimensions"); + } + std::vector stride; + auto status = ctx->ConstantInputAsIntVector(2, &stride); + if (!status.ok()) { + return status; + } + return stride; } protected: @@ -127,26 +128,51 @@ class PoolingOp : public XlaOpKernel { xla::Padding padding_; TensorFormat data_format_ = FORMAT_NHWC; DataType reduction_type_; + xla::PrimitiveType xla_reduction_type_; }; +// Converts the tensor data format to the one required by the XLA pooling +// library. 
+xla::TensorFormat XlaTensorFormat(tensorflow::TensorFormat data_format, + int num_spatial_dims) { + int num_dims = num_spatial_dims + 2; + int batch_dimension = GetTensorBatchDimIndex(num_dims, data_format); + int feature_dimension = GetTensorFeatureDimIndex(num_dims, data_format); + gtl::InlinedVector spatial_dimensions(num_spatial_dims); + for (int spatial_dim = 0; spatial_dim < num_spatial_dims; ++spatial_dim) { + spatial_dimensions[spatial_dim] = + GetTensorSpatialDimIndex(num_dims, data_format, spatial_dim); + } + return xla::TensorFormat(/*batch_dimension=*/batch_dimension, + /*feature_dimension=*/feature_dimension, + /*spatial_dimensions=*/spatial_dimensions); +} + class MaxPoolOp : public PoolingOp { public: MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ctx->input_type(0)) {} - xla::XlaOp InitValue(xla::XlaBuilder* b) override { - return XlaHelpers::MinValue(b, reduction_type_); - } + void Compile(XlaOpKernelContext* ctx) override { + auto ksize_or_error = GetKernelSize(ctx); + OP_REQUIRES_OK(ctx, ksize_or_error.status()); + std::vector ksize = ksize_or_error.ValueOrDie(); - const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { - return ctx->GetOrCreateMax(reduction_type_); - } + auto stride_or_error = GetStride(ctx); + OP_REQUIRES_OK(ctx, stride_or_error.status()); + std::vector stride = stride_or_error.ValueOrDie(); - xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) override { - return output; + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("Input to ", type_string(), + " operator must have ", num_dims(), + " dimensions")); + + auto pooling = + xla::MaxPool(ctx->Input(0), ksize, stride, padding_, + XlaTensorFormat(data_format_, input_shape.dims() - 2)); + ctx->SetOutput(0, pooling); } 
}; @@ -173,60 +199,6 @@ class MaxPool3DOp : public MaxPoolOp { }; REGISTER_XLA_OP(Name("MaxPool3D"), MaxPool3DOp); -// Common computation shared between AvgPool and AvgPoolGrad. Divide each -// element of an image by the count of elements that contributed to that -// element during pooling. -static xla::XlaOp AvgPoolDivideByCount( - XlaOpKernelContext* ctx, const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape, xla::Padding padding, - const std::vector& ksize, const std::vector& stride, - int num_spatial_dims, TensorFormat data_format) { - if (padding == xla::Padding::kValid) { - // In VALID padding, all windows have the same number of elements - // contributing to each average. Divide by the window size everywhere to - // get the average. - int64 window_size = std::accumulate(ksize.begin(), ksize.end(), 1, - [](int64 a, int64 b) { return a * b; }); - - auto divisor = - XlaHelpers::IntegerLiteral(ctx->builder(), dtype, window_size); - return ctx->builder()->Div(output, divisor); - } else { - // For SAME padding, the padding shouldn't be included in the - // counts. We use another ReduceWindow to find the right counts. - - // TODO(phawkins): use a less brute-force way to compute this. Only - // the boundary regions will have interesting values here. - - std::vector input_dim_sizes(num_spatial_dims); - std::vector window_dims(num_spatial_dims); - std::vector window_ksize(num_spatial_dims); - std::vector window_stride(num_spatial_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - int dim = GetTensorSpatialDimIndex(num_spatial_dims + 2, data_format, i); - input_dim_sizes[i] = input_shape.dim_size(dim); - window_dims[i] = dim; - window_ksize[i] = ksize[dim]; - window_stride[i] = stride[dim]; - } - - // Build a matrix of all 1s, with the same width/height as the input. 
- const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype); - auto ones = ctx->builder()->Broadcast( - XlaHelpers::One(ctx->builder(), accumulation_type), input_dim_sizes); - - // Perform a ReduceWindow with the same window size, strides, and padding - // to count the number of contributions to each result element. - auto reduce = ctx->builder()->ReduceWindow( - ones, XlaHelpers::Zero(ctx->builder(), accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), window_ksize, window_stride, - xla::Padding::kSame); - auto counts = XlaHelpers::ConvertElementType(ctx->builder(), reduce, dtype); - - return ctx->builder()->Div(output, counts, window_dims); - } -} - class AvgPoolOp : public PoolingOp { public: AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) @@ -234,20 +206,34 @@ class AvgPoolOp : public PoolingOp { /*reduction_type=*/ XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::XlaOp InitValue(xla::XlaBuilder* b) override { - return XlaHelpers::Zero(b, reduction_type_); - } + void Compile(XlaOpKernelContext* ctx) override { + auto ksize_or_error = GetKernelSize(ctx); + OP_REQUIRES_OK(ctx, ksize_or_error.status()); + std::vector ksize = ksize_or_error.ValueOrDie(); - const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { - return ctx->GetOrCreateAdd(reduction_type_); - } + auto stride_or_error = GetStride(ctx); + OP_REQUIRES_OK(ctx, stride_or_error.status()); + std::vector stride = stride_or_error.ValueOrDie(); + + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("Input to ", type_string(), + " operator must have ", num_dims(), + " dimensions")); - xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) override { - return AvgPoolDivideByCount(ctx, output, dtype, input_shape, padding_, - ksize_, stride_, num_spatial_dims_, - data_format_); + auto 
xla_data_format = + XlaTensorFormat(data_format_, input_shape.dims() - 2); + auto spatial_padding = MakeSpatialPadding( + input_shape.dim_sizes(), ksize, stride, padding_, xla_data_format); + + // Convert the input to the reduction type. + auto converted_input = + ConvertElementType(ctx->Input(0), xla_reduction_type_); + auto pooling = + xla::AvgPool(converted_input, ksize, stride, spatial_padding, + xla_data_format, padding_ == xla::Padding::kValid); + // Convert the pooling result back to the input type before returning it. + ctx->SetOutput(0, ConvertElementType(pooling, ctx->input_xla_type(0))); } }; @@ -347,9 +333,9 @@ class MaxPoolGradOp : public XlaOpKernel { xla::XlaOp init_value = XlaHelpers::Zero(ctx->builder(), input_type(2)); auto select = CreateScalarGeComputation(element_type, ctx->builder()); auto scatter = CreateScalarAddComputation(element_type, ctx->builder()); - xla::XlaOp gradients = ctx->builder()->SelectAndScatter( - input, select, ksize_, stride_, xla_padding, out_backprop, init_value, - scatter); + xla::XlaOp gradients = + xla::SelectAndScatter(input, select, ksize_, stride_, xla_padding, + out_backprop, init_value, scatter); ctx->SetOutput(0, gradients); } @@ -424,78 +410,31 @@ class AvgPoolGradOp : public XlaOpKernel { errors::InvalidArgument("out_backprop must be ", num_dims(), "-dimensional")); - int depth_dim = GetTensorFeatureDimIndex(num_dims(), data_format_); - int64 depth = out_backprop_shape.dim_size(depth_dim); - - // We can think of average-pooling as: - // * a convolution with a kernel consisting entirely of 1s, where the - // input feature and output feature are equal, and 0s everywhere else. - // * followed by dividing by the counts. - // - // This then gives us an algorithm to build the gradient: - // * divide out_backprop by the counts, followed by - // * Conv2DBackpropInput specialized for that kernel, which simplifies to - // a Pad and a ReduceWindow. 
- // - // For an explanation of backpropagation for convolution, see the comments - // in third_party/tensorflow/core/kernels/conv_grad_ops.h - - // TF filter shape is [ H, W, ..., inC, outC ] - std::vector filter_dims(num_dims()); - for (int i = 0; i < num_spatial_dims_; ++i) { - int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); - filter_dims[i] = ksize_[dim]; - } - filter_dims[num_dims() - 2] = depth; - filter_dims[num_dims() - 1] = depth; - TensorShape filter_shape(filter_dims); - - // Reuse the logic from Conv2DBackpropInput to compute padding. - ConvBackpropDimensions dims; - OP_REQUIRES_OK( - ctx, ConvBackpropComputeDimensions( - type_string(), /*num_spatial_dims=*/num_spatial_dims_, - gradients_shape, filter_shape, out_backprop_shape, stride_, - padding_, data_format_, &dims)); - - // The input gradients are computed by a convolution of the output gradients - // and the filter, with some appropriate padding. See the comment at the top - // of conv_grad_ops.h for details. - xla::XlaBuilder* const b = ctx->builder(); auto out_backprop = ctx->Input(1); - auto dtype = input_type(1); + std::vector stride_int64s(stride_.begin(), stride_.end()); xla::Padding xla_padding = (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame; - - // Divide the out_backprop values by the counts for each spatial position. - std::vector stride_int64s(stride_.begin(), stride_.end()); - auto out_backprop_div = AvgPoolDivideByCount( - ctx, out_backprop, dtype, gradients_shape, xla_padding, ksize_, - stride_int64s, num_spatial_dims_, data_format_); - - // Pad the gradients in the spatial dimensions. We use the same padding - // as Conv2DBackpropInput. 
- xla::PaddingConfig padding_config = xla::MakeNoPaddingConfig(num_dims()); - for (int i = 0; i < num_spatial_dims_; ++i) { - int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i); - auto* padding = padding_config.mutable_dimensions(dim); - padding->set_edge_padding_low(dims.spatial_dims[i].pad_before); - padding->set_edge_padding_high(dims.spatial_dims[i].pad_after); - padding->set_interior_padding(dims.spatial_dims[i].stride - 1); - } - - auto zero = XlaHelpers::Zero(b, dtype); - auto padded_gradients = b->Pad(out_backprop_div, zero, padding_config); - - // in_backprop = padded_gradients ones - std::vector ones(num_dims(), 1LL); - auto accumulation_type = XlaHelpers::SumAccumulationType(dtype); - auto in_backprop = b->ReduceWindow( - XlaHelpers::ConvertElementType(b, padded_gradients, accumulation_type), - XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), ksize_, - /* window_strides=*/ones, xla::Padding::kValid); - ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, in_backprop, dtype)); + xla::PrimitiveType xla_reduction_type; + auto reduction_type = XlaHelpers::SumAccumulationType(ctx->input_type(1)); + OP_REQUIRES_OK( + ctx, DataTypeToPrimitiveType(reduction_type, &xla_reduction_type)); + auto converted_out_backprop = + xla::ConvertElementType(out_backprop, xla_reduction_type); + auto xla_data_format = + XlaTensorFormat(data_format_, gradients_shape.dims() - 2); + auto padding_values = + MakeSpatialPadding(gradients_shape.dim_sizes(), ksize_, stride_int64s, + xla_padding, xla_data_format); + auto in_backprop = + xla::AvgPoolGrad(converted_out_backprop, gradients_shape.dim_sizes(), + ksize_, stride_int64s, padding_values, xla_data_format, + /*counts_include_padding=*/padding_ == VALID); + // Convert the pooling result back to the input type before returning it. 
+ xla::PrimitiveType xla_out_backprop_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(1), + &xla_out_backprop_type)); + ctx->SetOutput(0, + xla::ConvertElementType(in_backprop, xla_out_backprop_type)); } protected: @@ -614,58 +553,61 @@ class MaxPoolGradGradOp : public XlaOpKernel { auto b = ctx->builder(); - auto sixteen = b->ConstantR0(16); + auto sixteen = xla::ConstantR0(b, 16); // in (f32) -> round to bf16 -> f32 for correct bitwidth -> 16-high-bit u32 - auto in_hi = b->BitcastConvertType( - b->ConvertElementType(b->ConvertElementType(input, xla::BF16), - xla::F32), + auto in_hi = xla::BitcastConvertType( + xla::ConvertElementType(xla::ConvertElementType(input, xla::BF16), + xla::F32), xla::U32); - auto bp_int = b->BitcastConvertType(out_backprop, xla::U32); - auto bp_hi = b->ShiftRightLogical(bp_int, sixteen); - auto bp_lo = b->ShiftRightLogical(b->ShiftLeft(bp_int, sixteen), sixteen); - auto in_hi_bp_hi = b->Add(in_hi, bp_hi); // Want an unsigned add. - auto in_hi_bp_lo = b->Add(in_hi, bp_lo); // Want an unsigned add. - - auto init_value = XlaHelpers::MinValue(b, DT_FLOAT); + auto bp_int = xla::BitcastConvertType(out_backprop, xla::U32); + auto bp_hi = xla::ShiftRightLogical(bp_int, sixteen); + auto bp_lo = + xla::ShiftRightLogical(xla::ShiftLeft(bp_int, sixteen), sixteen); + auto in_hi_bp_hi = xla::Add(in_hi, bp_hi); // Want an unsigned add. + auto in_hi_bp_lo = xla::Add(in_hi, bp_lo); // Want an unsigned add. + + auto init_value = xla::MinValue(b, xla::F32); // We will reduce by taking the maximal value up to 16 bits (ignoring the lo // 16 bits of packed-in hi/lo backprop value). auto rb = b->CreateSubBuilder("GreaterOrEqOf_ByFirst16Bits"); { // F32 parameters to satisfy lowering type restriction for reduce opcode. 
const xla::Shape scalar = xla::ShapeUtil::MakeShape(xla::F32, {}); - auto lhs = rb->Parameter(0, scalar, "lhs"); - auto rhs = rb->Parameter(1, scalar, "rhs"); - auto sixteen = rb->ConstantR0(16); - auto lhs_criteria = rb->ShiftLeft( - rb->ShiftRightLogical(rb->BitcastConvertType(lhs, xla::S32), sixteen), - sixteen); - auto rhs_criteria = rb->ShiftLeft( - rb->ShiftRightLogical(rb->BitcastConvertType(rhs, xla::S32), sixteen), - sixteen); + auto lhs = xla::Parameter(rb.get(), 0, scalar, "lhs"); + auto rhs = xla::Parameter(rb.get(), 1, scalar, "rhs"); + auto sixteen = xla::ConstantR0(rb.get(), 16); + auto lhs_criteria = + xla::ShiftLeft(xla::ShiftRightLogical( + xla::BitcastConvertType(lhs, xla::S32), sixteen), + sixteen); + auto rhs_criteria = + xla::ShiftLeft(xla::ShiftRightLogical( + xla::BitcastConvertType(rhs, xla::S32), sixteen), + sixteen); // Must use a F32 comparison, because S32 would not work for negatives. - rb->Select(rb->Ge(rb->BitcastConvertType(lhs_criteria, xla::F32), - rb->BitcastConvertType(rhs_criteria, xla::F32)), - lhs, rhs); + xla::Select(xla::Ge(xla::BitcastConvertType(lhs_criteria, xla::F32), + xla::BitcastConvertType(rhs_criteria, xla::F32)), + lhs, rhs); } auto reduce = rb->BuildAndNoteError(); xla::Padding xla_padding = (padding_ == VALID) ? 
xla::Padding::kValid : xla::Padding::kSame; auto pooled_hi = - b->ReduceWindow(b->BitcastConvertType(in_hi_bp_hi, xla::F32), - init_value, reduce, ksize_, stride_, xla_padding); + xla::ReduceWindow(xla::BitcastConvertType(in_hi_bp_hi, xla::F32), + init_value, reduce, ksize_, stride_, xla_padding); auto pooled_lo = - b->ReduceWindow(b->BitcastConvertType(in_hi_bp_lo, xla::F32), - init_value, reduce, ksize_, stride_, xla_padding); + xla::ReduceWindow(xla::BitcastConvertType(in_hi_bp_lo, xla::F32), + init_value, reduce, ksize_, stride_, xla_padding); auto grads_hi = - b->ShiftLeft(b->BitcastConvertType(pooled_hi, xla::U32), sixteen); - auto grads_lo = b->ShiftRightLogical( - b->ShiftLeft(b->BitcastConvertType(pooled_lo, xla::U32), sixteen), + xla::ShiftLeft(xla::BitcastConvertType(pooled_hi, xla::U32), sixteen); + auto grads_lo = xla::ShiftRightLogical( + xla::ShiftLeft(xla::BitcastConvertType(pooled_lo, xla::U32), sixteen), sixteen); - auto grads = b->Add(grads_hi, grads_lo); // Want an unsigned add. + auto grads = xla::Add(grads_hi, grads_lo); // Want an unsigned add. 
xla::PrimitiveType element_type; OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(2), &element_type)); - ctx->SetOutput(0, b->BitcastConvertType(grads, element_type)); + ctx->SetOutput(0, xla::BitcastConvertType(grads, element_type)); } protected: @@ -694,5 +636,18 @@ REGISTER_XLA_OP(Name("MaxPoolGradGradV2") .CompileTimeConstInput("strides"), MaxPool2DGradGradOp); +class MaxPool3DGradGradOp : public MaxPoolGradGradOp { + public: + explicit MaxPool3DGradGradOp(OpKernelConstruction* ctx) + : MaxPoolGradGradOp(ctx, /*num_spatial_dims=*/3) { + string data_format; + OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format)); + OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + } +}; +REGISTER_XLA_OP(Name("MaxPool3DGradGrad").TypeConstraint("T", DT_FLOAT), + MaxPool3DGradGradOp); + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/qr_op.cc b/tensorflow/compiler/tf2xla/kernels/qr_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ea0afc1f53cbe4cfcc3f6121a4ecd55864c1b52 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/qr_op.cc @@ -0,0 +1,47 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/qr.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +class QROp : public XlaOpKernel { + public: + explicit QROp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("full_matrices", &full_matrices_)); + } + void Compile(XlaOpKernelContext* ctx) override { + auto result = QRDecomposition(ctx->Input(0), full_matrices_); + if (!result.ok()) { + ctx->SetStatus(result.status()); + return; + } + ctx->SetOutput(0, result.ValueOrDie().q); + ctx->SetOutput(1, result.ValueOrDie().r); + } + + private: + // If true, compute full-sized q and r. If false, compute only the leading P + // columns of q. + bool full_matrices_; +}; + +REGISTER_XLA_OP(Name("Qr").TypeConstraint("T", kFloatTypes), QROp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index 661cd5923e1023eaf89a6bc4f56fcc362c8bcfb6..6f4ed496a1774dde68dd9d5fbd37995d615b678c 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { @@ -28,82 +32,115 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits_)); - OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63), - errors::InvalidArgument("num_bits is out of range: ", num_bits_, - " with signed_input_ ", signed_input_)); } void Compile(XlaOpKernelContext* ctx) override { xla::XlaOp input = ctx->Input(0); const DataType data_type = ctx->input_type(0); - // Comments taken from semantics description at - // https://www.tensorflow.org/versions/r1.0/api_docs/cc/class/tensorflow/ops/quantize-and-dequantize - // - // ... we find m such that - // - // m = max(abs(input_min), abs(input_max)) if range_given is true, - // m = max(abs(min_elem(input)), - // abs(max_elem(input))) otherwise. + xla::PrimitiveType xla_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(data_type, &xla_type)); + xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp input_min, input_max; + + // The implementation follows + // tensorflow/core/kernels/quantize_and_dequantize_op.h closely. 
+ xla::XlaOp min_range, max_range; if (range_given_) { - double input_min_value, input_max_value; - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(1, &input_min_value)); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(2, &input_max_value)); - input_min = XlaHelpers::FloatLiteral(b, data_type, input_min_value); - input_max = XlaHelpers::FloatLiteral(b, data_type, input_max_value); + min_range = ctx->Input(1); + max_range = ctx->Input(2); } else { const xla::XlaComputation* fmax = ctx->GetOrCreateMax(data_type); const xla::XlaComputation* fmin = ctx->GetOrCreateMin(data_type); - input_min = - b->ReduceAll(input, XlaHelpers::MaxValue(b, data_type), *fmin); - input_max = - b->ReduceAll(input, XlaHelpers::MinValue(b, data_type), *fmax); + min_range = ReduceAll(input, xla::MaxValue(b, xla_type), *fmin); + max_range = ReduceAll(input, xla::MinValue(b, xla_type), *fmax); } - xla::XlaOp m = b->Max(b->Abs(input_min), b->Abs(input_max)); - - // Next, we choose our fixed-point quantization buckets, [min_fixed, - // max_fixed]. If signed_input is true, this is - // - // [min_fixed, max_fixed ] = [-((1 << (num_bits - 1)) - 1), - // (1 << (num_bits - 1)) - 1]. - // - // Otherwise, if signed_input is false, the fixed-point range is - // - // [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]. - int64 min_fixed, max_fixed; + + xla::XlaOp num_bits; + if (num_bits_ < 0) { + OP_REQUIRES( + ctx, ctx->num_inputs() == 4, + errors::Internal("Expected 4 inputs to QuantizeAndDequantize")); + num_bits = ctx->Input(3); + } else { + num_bits = xla::ConstantR0(b, num_bits_); + } + + const xla::XlaOp zero = XlaHelpers::Zero(b, data_type); + const xla::XlaOp one = XlaHelpers::One(b, data_type); + const xla::XlaOp two = XlaHelpers::FloatLiteral(b, data_type, 2.0); + const xla::XlaOp half = XlaHelpers::FloatLiteral(b, data_type, 0.5); + + // Calculate the range for the simulated integer quantization: + // e.g. 
[-128,127] for signed = true, num_bits = 8, + // or [0, 255] for signed = false, num_bits = 8. + // We do this in floating point for hardware that does not have 64-bit + // integer support. + xla::XlaOp min_quantized, max_quantized; if (signed_input_) { - min_fixed = -((1LL << (num_bits_ - 1)) - 1); - max_fixed = (1LL << (num_bits_ - 1)) - 1; + min_quantized = + -Pow(two, ConvertElementType(num_bits - xla::ConstantR0(b, 1), + xla_type)); + max_quantized = + Pow(two, ConvertElementType(num_bits - xla::ConstantR0(b, 1), + xla_type)) - + one; } else { - min_fixed = 0; - max_fixed = (1LL << num_bits_) - 1; + min_quantized = zero; + max_quantized = Pow(two, ConvertElementType(num_bits, xla_type)) - one; } - // From this we compute our scaling factor, s: - // - // s = (max_fixed - min_fixed) / (2 * m). - xla::XlaOp s = - b->Div(XlaHelpers::FloatLiteral(b, data_type, max_fixed - min_fixed), - b->Mul(XlaHelpers::FloatLiteral(b, data_type, 2.0), m)); + // Determine the maximum scaling factor that would scale + // [min_range, max_range] to not exceed [min_quantized, max_quantized], + // while keeping 0 unchanged. + xla::XlaOp scale_from_min_side = + Select(Gt(min_quantized * min_range, zero), min_quantized / min_range, + xla::MaxFiniteValue(b, xla_type)); + xla::XlaOp scale_from_max_side = + Select(Gt(max_quantized * max_range, zero), max_quantized / max_range, + xla::MaxFiniteValue(b, xla_type)); - // Now we can quantize and dequantize the elements of our tensor. An element - // e is transformed into e': - // - // e' = (e * s).round_to_nearest() / s. - xla::XlaOp result = b->Div(b->Round(b->Mul(input, s)), s); + // Note: Avoids changing the side of the range that determines scale. 
+ xla::XlaOp cond = Lt(scale_from_min_side, scale_from_max_side); + xla::XlaOp scale = Select(cond, scale_from_min_side, scale_from_max_side); + xla::XlaOp inverse_scale = + Select(cond, min_range / min_quantized, max_range / max_quantized); + min_range = Select(cond, min_range, min_quantized * inverse_scale); + max_range = Select(cond, max_quantized * inverse_scale, max_range); + if (range_given_) { + // Note: The clamping here is to avoid overflow in the quantized type. + // The semantics of the op does not guarantee to clamp to the specified + // min_range and max_range - because we may have changed either min_range + // or max_range. + // No need to clamp to min_range and max_range if range_given_ == false as + // in that case they were measured from the tensor. + input = Clamp(min_range, input, max_range); + } + xla::XlaOp result = + Floor((input - min_range) * scale + half) * inverse_scale + min_range; ctx->SetOutput(0, result); } - int64 num_bits_; + protected: + int64 num_bits_ = -1; bool signed_input_; bool range_given_; }; -REGISTER_XLA_OP(Name("QuantizeAndDequantizeV2"), QuantizeAndDequantizeOp); +class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp { + public: + explicit QuantizeAndDequantizeV2Op(OpKernelConstruction* ctx) + : QuantizeAndDequantizeOp(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits_)); + OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 
62 : 63), + errors::InvalidArgument("num_bits is out of range: ", num_bits_, + " with signed_input_ ", signed_input_)); + } +}; + +REGISTER_XLA_OP(Name("QuantizeAndDequantizeV2"), QuantizeAndDequantizeV2Op); +REGISTER_XLA_OP(Name("QuantizeAndDequantizeV3"), QuantizeAndDequantizeOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 5f5bd586376ab368e443671ac8a5de23a5fd604b..afd5986846705f66eb4c7ced9dbe2f4757f5af7f 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -17,11 +17,17 @@ limitations under the License. // TODO(misard,phawkins): handle random number generator seeds/states correctly. // TODO(misard,phawkins): add tests. +#include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" +#include "tensorflow/compiler/tf2xla/lib/random.h" +#include "tensorflow/compiler/tf2xla/lib/util.h" +#include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -42,8 +48,8 @@ class RandomUniformOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp result = b->RngUniform(XlaHelpers::Zero(b, dtype), - XlaHelpers::One(b, dtype), xla_shape); + xla::XlaOp result = xla::RngUniform(XlaHelpers::Zero(b, dtype), + XlaHelpers::One(b, dtype), xla_shape); ctx->SetOutput(0, result); } @@ -55,6 +61,143 
@@ class RandomUniformOp : public XlaOpKernel { REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstInput("shape"), RandomUniformOp); +class RandomShuffleOp : public XlaOpKernel { + public: + explicit RandomShuffleOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + auto builder = ctx->builder(); + xla::XlaOp input = ctx->Input(0); + TensorShape input_shape = ctx->InputShape(0); + const int64 n = input_shape.dim_size(0); + int64 num_elements = 1; + for (tensorflow::TensorShapeDim dimension : input_shape) { + num_elements *= dimension.size; + } + + if (num_elements <= 1 || n <= 1) { + // No shuffling is required, so copy input directly to output + ctx->SetOutput(0, input); + return; + } + + if (input_shape.dims() == 1) { + // For R1s, shuffle values by sorting instead of the obvious Fisher-Yates + // algorithm. Fisher-Yates is simple to implement and correct, but not + // easily parallelizable. For a sufficiently parallel architecture, it is + // faster to sort many times, than Fisher-Yates shuffle once. + + // Shuffle values by assigning each value a random key and sorting the + // keys. Keys can collide causing detectable patterns in the shuffled + // output. Collisions translates into more ascending sub-sequences in the + // shuffled output than would be expected by chance. To avoid collisions, + // the number of possible key values must be sufficiently large. + + // How are more than 2^32 keys created? In each loop iteration, the + // algorithm sorts by random keys. Conceptually, the earlier iterations + // are sorting on the lower-order bits of larger keys that are never + // actually assembled. + + // The expected number of collisions is n - d + d(1 - 1/d)^n, where d is + // the number of possible keys and n is the number of values. If d = n^2, + // then the limit as n goes to infinity is 1/2. If d = n^3, then the limit + // as n goes to infinity is zero. 
+ + // This implementation ensures that the key-space is greater than or equal + // to the cube of the number of values. The risk of collisions can be + // further reduced by increasing Exponent at the expense of + // performance. + + // For Exponent = 2, the expected number of collisions per shuffle is + // maximized at n = floor((2^32-1)^(1/2)) = 65535 where the expectation is + // about 1/2. + + // For Exponent = 3, the expected number of collisions per shuffle is + // maximized at n = floor((2^32-1)^(1/3)) = 1625 where the expectation is + // about 1/3255. + + // For Exponent = 4, the expected number of collisions per shuffle is + // maximized at n = floor((2^32-1)^(1/4)) = 255 where the expectation is + // about 1/132622. + constexpr int Exponent = 3; + const int rounds = static_cast( + std::ceil(Exponent * std::log(num_elements) / std::log(kuint32max))); + + const xla::Shape key_shape = + xla::ShapeUtil::MakeShape(xla::U32, {num_elements}); + xla::XlaOp zero = xla::ConstantR0(builder, 0U); + + // Unfortunately, xla::RngUniform gives values in the half open interval + // rather than the closed interval, so instead of 2^32 possible keys there + // are only 2^32 - 1 (kuint32max). + xla::XlaOp max_value = xla::ConstantR0(builder, kuint32max); + + xla::XlaOp curr = input; + for (int i = 0; i < rounds; ++i) { + xla::XlaOp keys = xla::RngUniform(zero, max_value, key_shape); + xla::XlaOp sorted = xla::Sort(keys, curr); + curr = xla::GetTupleElement(sorted, 1); + } + + ctx->SetOutput(0, curr); + return; + } + + // The Fisher-Yates algorithm. + + // Generate the random swaps for the indices. + auto swaps_shape = xla::ShapeUtil::MakeShape(xla::S32, {n}); + auto swaps = + xla::RngUniform(xla::ConstantR0(builder, 0), + xla::ConstantR0(builder, n), swaps_shape); + + // Generate range(n) as the initial value for the indices to be swapped. + xla::XlaOp indices = xla::Iota(builder, xla::S32, n); + + // Swap the indices at i and swaps[i]. 
+ auto swap_body_fn = [&](xla::XlaOp i, + absl::Span loop_vars, + xla::XlaBuilder* builder) + -> xla::StatusOr> { + auto swaps = loop_vars[0]; + auto indices = loop_vars[1]; + i = xla::Reshape(i, {1}); + // temp = indices[i] + auto temp = xla::DynamicSlice(indices, i, {1}); + // swap_index = swaps[i] + auto swap_index = xla::DynamicSlice(swaps, i, {1}); + // swap_value = indices[swaps[i]] + auto swap_value = xla::DynamicSlice(indices, swap_index, {1}); + // indices[i] = indices[swaps[i]] + indices = xla::DynamicUpdateSlice(indices, swap_value, i); + // indices[swaps[i]] = temp + indices = xla::DynamicUpdateSlice(indices, temp, swap_index); + return std::vector{swaps, indices}; + }; + // for i in range(n): + auto swap_loop_result = + XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices}, + "indices_swap_loop", builder) + .ValueOrDie(); + auto swapped_indices = swap_loop_result[1]; + + // Gather the data using the swapped indices as the shuffled order. + auto indices_tensor_shape = TensorShape({n}); + DataType type = ctx->expected_output_dtype(0); + xla::XlaOp gather; + OP_REQUIRES_OK(ctx, XlaGather(input, input_shape, swapped_indices, + indices_tensor_shape, + /*axis=*/0, /*indices_are_nd=*/false, type, + DT_INT32, builder, &gather)); + ctx->SetOutput(0, gather); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleOp); +}; + +REGISTER_XLA_OP(Name("RandomShuffle"), RandomShuffleOp); + class RandomUniformIntOp : public XlaOpKernel { public: explicit RandomUniformIntOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} @@ -77,7 +220,7 @@ class RandomUniformIntOp : public XlaOpKernel { auto minval = ctx->Input(1); auto maxval = ctx->Input(2); - ctx->SetOutput(0, ctx->builder()->RngUniform(minval, maxval, xla_shape)); + ctx->SetOutput(0, xla::RngUniform(minval, maxval, xla_shape)); } private: @@ -103,8 +246,8 @@ class RandomStandardNormalOp : public XlaOpKernel { xla::XlaBuilder* b = ctx->builder(); // Normal distribution with a mean of 0 and a standard 
deviation of 1: - xla::XlaOp result = b->RngNormal(XlaHelpers::Zero(b, dtype), - XlaHelpers::One(b, dtype), xla_shape); + xla::XlaOp result = xla::RngNormal(XlaHelpers::Zero(b, dtype), + XlaHelpers::One(b, dtype), xla_shape); ctx->SetOutput(0, result); } @@ -127,63 +270,21 @@ class TruncatedNormalOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); xla::Shape xla_shape; OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); - xla::Shape xla_element_shape = - xla::ShapeUtil::MakeShape(xla_shape.element_type(), {}); xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp mean = XlaHelpers::Zero(b, dtype); - xla::XlaOp stddev = XlaHelpers::One(b, dtype); - xla::XlaOp candidate = b->RngNormal(mean, stddev, xla_shape); - - auto two_sd = [dtype](bool negate, xla::XlaBuilder* b) { - return XlaHelpers::FloatLiteral(b, dtype, negate ? -2.0 : 2.0); - }; - auto out_of_range_mask = [two_sd](xla::XlaOp candidate, - xla::XlaBuilder* b) { - xla::XlaOp too_large = b->Gt(candidate, two_sd(false, b)); - xla::XlaOp too_small = b->Lt(candidate, two_sd(true, b)); - return b->Or(too_large, too_small); - }; - // The algorithm we're using is roughly: - // - // while (any(candidate < mean-2*sd || candidate > mean+2*sd)) { - // out_of_range_mask := candidate < mean-2*sd || candidate > mean+2*sd - // candidate = select(out_of_range_mask, rng_normal(), candidate) - // } - std::unique_ptr test_builder = - b->CreateSubBuilder("truncated_normal_test"); - { - auto* b = test_builder.get(); - xla::XlaOp candidate = b->Parameter(0, xla_shape, "candidate"); - out_of_range_mask(candidate, b); - OP_REQUIRES_OK(ctx, Any(out_of_range_mask(candidate, b), b).status()); - } - - std::unique_ptr body_builder = - b->CreateSubBuilder("truncated_normal_body"); - { - auto* b = body_builder.get(); - xla::XlaOp candidate = b->Parameter(0, xla_shape, "candidate"); - xla::XlaOp to_resample = out_of_range_mask(candidate, b); - xla::XlaOp mean = XlaHelpers::Zero(b, dtype); - 
xla::XlaOp stddev = XlaHelpers::One(b, dtype); - b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape), candidate); - } - - xla::StatusOr test_computation = test_builder->Build(); - OP_REQUIRES_OK(ctx, test_computation.status()); - xla::StatusOr body_computation = body_builder->Build(); - OP_REQUIRES_OK(ctx, body_computation.status()); - xla::XlaOp result = b->While(test_computation.ValueOrDie(), - body_computation.ValueOrDie(), candidate); - - ctx->SetOutput(0, result); + xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype, 1.0); + xla::XlaOp min_positive = + XlaHelpers::FloatLiteral(b, dtype, std::numeric_limits::min()); + auto uniform = xla::RngUniform(min_positive, one, xla_shape); + ctx->SetOutput(0, TruncatedNormal(uniform)); } }; -REGISTER_XLA_OP(Name("TruncatedNormal").CompileTimeConstInput("shape"), +REGISTER_XLA_OP(Name("TruncatedNormal") + .CompileTimeConstInput("shape") + .TypeConstraint("dtype", DT_FLOAT), TruncatedNormalOp); -} // anonymous namespace +} // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc index 08894489ac77bbbe4ddb067c06a6d031a537697d..8102faad28db71075fb8da269c55edbdb667193e 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" @@ -30,41 +32,30 @@ class ReduceWindowOp : public XlaOpKernel { explicit ReduceWindowOp(OpKernelConstruction* context) : XlaOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("computation", &computation_)); - OP_REQUIRES_OK(context, - context->GetAttr("window_dimensions", &window_dimensions_)); - OP_REQUIRES_OK(context, - context->GetAttr("window_strides", &window_strides_)); - OP_REQUIRES_OK(context, context->GetAttr("padding_low", &padding_low_)); - OP_REQUIRES_OK(context, context->GetAttr("padding_high", &padding_high_)); } void Compile(XlaOpKernelContext* context) override { const TensorShape input_shape = context->InputShape(0); const DataType dtype = context->input_type(0); + std::vector window_dimensions; + std::vector window_strides; + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector( + "window_dimensions", &window_dimensions)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides", + &window_strides)); + const int rank = input_shape.dims(); - OP_REQUIRES(context, rank == window_dimensions_.size(), + OP_REQUIRES(context, rank == window_dimensions.size(), errors::InvalidArgument( "The size of window_dimensions must be equal to the input " "rank (", - window_dimensions_.size(), " vs. ", rank, ")")); - OP_REQUIRES(context, rank == window_strides_.size(), + window_dimensions.size(), " vs. ", rank, ")")); + OP_REQUIRES(context, rank == window_strides.size(), errors::InvalidArgument( "The size of window_strides must be equal to the input " "rank (", - window_strides_.size(), " vs. 
", rank, ")")); - OP_REQUIRES(context, rank == padding_low_.size(), - errors::InvalidArgument( - "The size of padding_low must be equal to the input " - "rank (", - padding_low_.size(), " vs. ", rank, ")")); - OP_REQUIRES(context, rank == padding_high_.size(), - errors::InvalidArgument( - "The size of padding_high must be equal to the input " - "rank (", - padding_high_.size(), " vs. ", rank, ")")); - - xla::XlaBuilder* builder = context->builder(); + window_strides.size(), " vs. ", rank, ")")); // Build the reducer function. XlaCompiler::Argument reducer_arg; @@ -76,6 +67,7 @@ class ReduceWindowOp : public XlaOpKernel { compile_options.use_tuple_arg = false; compile_options.resolve_compile_time_constants = false; compile_options.is_entry_computation = false; + compile_options.always_return_tuple = false; XlaCompiler::CompilationResult reducer; OP_REQUIRES_OK(context, context->compiler()->CompileFunction( compile_options, *computation_, @@ -84,51 +76,47 @@ class ReduceWindowOp : public XlaOpKernel { xla::Shape scalar_shape; OP_REQUIRES_OK(context, TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape)); + OP_REQUIRES( + context, + xla::ShapeUtil::Compatible(reducer.xla_output_shape, scalar_shape), + errors::InvalidArgument( + "Invalid output shape of ReduceWindow reducer. Expected ", + xla::ShapeUtil::HumanString(scalar_shape), " got ", + xla::ShapeUtil::HumanString(reducer.xla_output_shape))); + + const TensorShape padding_shape = context->InputShape("padding"); OP_REQUIRES(context, - xla::ShapeUtil::Compatible( - reducer.xla_output_shape, - xla::ShapeUtil::MakeTupleShape({scalar_shape})), + TensorShapeUtils::IsMatrix(padding_shape) && + padding_shape.dim_size(1) == 2, errors::InvalidArgument( - "Invalid output shape of ReduceWindow reducer. Expected ", - xla::ShapeUtil::HumanString(scalar_shape), " got ", - xla::ShapeUtil::HumanString(reducer.xla_output_shape))); - - // Wraps the reducer in a computation that unpacks the output tuple. 
- xla::XlaComputation wrapper; - { - std::unique_ptr cb = - builder->CreateSubBuilder("wrapper"); - auto x = cb->Parameter(0, scalar_shape, "x"); - auto y = cb->Parameter(1, scalar_shape, "y"); - auto outputs = cb->Call(*reducer.computation, {x, y}); - cb->GetTupleElement(outputs, 0); - xla::StatusOr result = cb->Build(); - OP_REQUIRES_OK(context, result.status()); - wrapper = std::move(result.ValueOrDie()); - } - - std::vector> padding(rank); - for (int i = 0; i < rank; ++i) { - padding[i] = {padding_low_[i], padding_high_[i]}; + "padding must be a matrix with minor dimension 2, got ", + padding_shape.DebugString())); + xla::Literal padding_literal; + OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal( + "padding", &padding_literal)); + std::vector> padding(padding_shape.dim_size(0)); + for (int i = 0; i < padding.size(); ++i) { + padding[i] = {padding_literal.Get({i, 0}), + padding_literal.Get({i, 1})}; } - xla::XlaOp output = builder->ReduceWindowWithGeneralPadding( - context->Input(0), context->Input(1), wrapper, window_dimensions_, - window_strides_, padding); + xla::XlaOp output = xla::ReduceWindowWithGeneralPadding( + context->Input(0), context->Input(1), *reducer.computation, + window_dimensions, window_strides, padding); context->SetOutput(0, output); } private: const NameAttrList* computation_; - std::vector window_dimensions_; - std::vector window_strides_; - std::vector padding_low_; - std::vector padding_high_; TF_DISALLOW_COPY_AND_ASSIGN(ReduceWindowOp); }; -REGISTER_XLA_OP(Name("XlaReduceWindow"), ReduceWindowOp); +REGISTER_XLA_OP(Name("XlaReduceWindow") + .CompileTimeConstInput("window_dimensions") + .CompileTimeConstInput("window_strides") + .CompileTimeConstInput("padding"), + ReduceWindowOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index 0f425637795e9633a8e36f921000ee2f5e25813a..0d260fa8fcaa513d7854c1e9215952404d555c70 
100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -19,7 +19,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -31,11 +33,11 @@ class SumOp : public XlaReductionOp { : XlaReductionOp(ctx, XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return XlaHelpers::Zero(builder, reduction_type_); + return xla::Zero(builder, xla_reduction_type_); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->Add(scalar_lhs, scalar_rhs); + xla::Add(scalar_lhs, scalar_rhs); } }; @@ -48,12 +50,12 @@ class ProdOp : public XlaReductionOp { XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return XlaHelpers::One(builder, reduction_type_); + return xla::One(builder, xla_reduction_type_); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->Mul(scalar_lhs, scalar_rhs); + xla::Mul(scalar_lhs, scalar_rhs); } }; @@ -66,12 +68,12 @@ class MinOp : public XlaReductionOp { : XlaReductionOp(ctx, ctx->input_type(0)) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return XlaHelpers::MaxValue(builder, reduction_type_); + return xla::MaxValue(builder, xla_reduction_type_); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - 
builder->Min(scalar_lhs, scalar_rhs); + xla::Min(scalar_lhs, scalar_rhs); } }; @@ -83,12 +85,12 @@ class MaxOp : public XlaReductionOp { : XlaReductionOp(ctx, ctx->input_type(0)) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return XlaHelpers::MinValue(builder, reduction_type_); + return xla::MinValue(builder, xla_reduction_type_); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->Max(scalar_lhs, scalar_rhs); + xla::Max(scalar_lhs, scalar_rhs); } }; @@ -101,11 +103,11 @@ class MeanOp : public XlaReductionOp { XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return XlaHelpers::Zero(builder, reduction_type_); + return xla::Zero(builder, xla_reduction_type_); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->Add(scalar_lhs, scalar_rhs); + xla::Add(scalar_lhs, scalar_rhs); } xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, @@ -113,7 +115,7 @@ class MeanOp : public XlaReductionOp { int64 num_elements_reduced) override { auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0), num_elements_reduced); - return builder->Div(reduce_output, divisor); + return reduce_output / divisor; } }; @@ -126,12 +128,12 @@ class AllOp : public XlaReductionOp { : XlaReductionOp(ctx, ctx->input_type(0)) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return builder->ConstantR0(true); + return xla::ConstantR0(builder, true); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->And(scalar_lhs, scalar_rhs); + xla::And(scalar_lhs, scalar_rhs); } }; @@ -143,12 +145,12 @@ class AnyOp : public XlaReductionOp { : XlaReductionOp(ctx, ctx->input_type(0)) {} xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { - return builder->ConstantR0(false); + 
return xla::ConstantR0(builder, false); } void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, const xla::XlaOp& scalar_rhs) override { - builder->Or(scalar_lhs, scalar_rhs); + xla::Or(scalar_lhs, scalar_rhs); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index 2ecfb854a1c8625524d4f1199af3927edd204926..466e79828d111ee7cadcf713703e8f252c63e62c 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -19,7 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -64,6 +64,7 @@ class XlaReductionOp : public XlaOpKernel { protected: DataType reduction_type_; + xla::PrimitiveType xla_reduction_type_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 4fd5bfd03999a7f8b7bb081cc4b03aa1434d4c3d..598248563bb93146e6dea3016822d26b8bf368e7 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -15,11 +15,14 @@ limitations under the License. // XLA-specific reduction Ops. 
+#include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/kernels/reduction_ops.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -27,10 +30,9 @@ namespace tensorflow { XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type) : XlaOpKernel(ctx), reduction_type_(reduction_type) { - const DataType dt = BaseType(input_type(0)); - OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt})); - OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_)); + OP_REQUIRES_OK( + ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_)); } // Unless BuildFinalizer is overridden the reduction has no @@ -54,20 +56,24 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { return; } + OP_REQUIRES(ctx, axes_tensor_shape.dims() <= 1, + errors::InvalidArgument( + "Expected scalar or vector as index argument, got ", + axes_tensor_shape.DebugString())); + // Evaluate the constant, reshaping to a 1-vector if it is a scalar. 
+ std::vector axes; xla::Literal axes_literal; - OP_REQUIRES_OK(ctx, - ctx->ConstantInputReshaped( - 1, {axes_tensor_shape.num_elements()}, &axes_literal)); + OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector(1, &axes)); VLOG(1) << "data shape: " << data_shape.DebugString(); - VLOG(1) << "axes : " << axes_literal.ToString(); + VLOG(1) << "axes : " << absl::StrJoin(axes, ","); gtl::InlinedVector bitmap(data_shape.dims(), false); std::vector xla_axes; int64 num_elements_reduced = 1LL; for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) { - int32 index = axes_literal.Get({i}); + int64 index = axes[i]; OP_REQUIRES(ctx, !(index < -data_shape.dims() || index >= data_shape.dims()), errors::InvalidArgument("Invalid reduction dimension (", index, @@ -101,20 +107,20 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type)); - auto data = b->ConvertElementType(ctx->Input(0), type); + auto data = xla::ConvertElementType(ctx->Input(0), type); // Call virtual method to get the initial value. - auto initial = b->ConvertElementType(InitialValue(b), type); + auto initial = xla::ConvertElementType(InitialValue(b), type); // Make two scalar parameters of the desired type for the lambda. - auto rx = r.Parameter(0, xla::ShapeUtil::MakeShape(type, {}), "x"); - auto ry = r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y"); + auto rx = xla::Parameter(&r, 0, xla::ShapeUtil::MakeShape(type, {}), "x"); + auto ry = xla::Parameter(&r, 1, xla::ShapeUtil::MakeShape(type, {}), "y"); // Call virtual method to build the reduction lambda. 
BuildReducer(&r, rx, ry); xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie(); - auto reduce = b->Reduce(data, initial, reduction_computation, xla_axes); + auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes); auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced); - auto result = keep_dims_ ? b->Reshape(finalized, final_shape) : finalized; + auto result = keep_dims_ ? xla::Reshape(finalized, final_shape) : finalized; ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc index ba7d484d53d7258edaa5bc42fa116cf16e94835b..d35777ccb1271ec6a7c9972c714d06b2415d9c34 100644 --- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc @@ -18,8 +18,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/no_op.h" @@ -34,7 +34,7 @@ class ReluOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::XlaBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); - ctx->SetOutput(0, builder->Max(zero, ctx->Input(0))); + ctx->SetOutput(0, xla::Max(zero, ctx->Input(0))); } }; @@ -46,7 +46,7 @@ class Relu6Op : public XlaOpKernel { xla::XlaBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); auto six = XlaHelpers::IntegerLiteral(builder, 
input_type(0), 6); - ctx->SetOutput(0, builder->Clamp(zero, ctx->Input(0), six)); + ctx->SetOutput(0, xla::Clamp(zero, ctx->Input(0), six)); } }; @@ -59,9 +59,9 @@ class ReluGradOp : public XlaOpKernel { xla::XlaBuilder* b = ctx->builder(); const TensorShape shape = ctx->InputShape(0); const auto zero = - b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); - const auto pred = b->Gt(ctx->Input(1), zero); - ctx->SetOutput(0, b->Select(pred, ctx->Input(0), zero)); + xla::Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto pred = xla::Gt(ctx->Input(1), zero); + ctx->SetOutput(0, xla::Select(pred, ctx->Input(0), zero)); } }; @@ -74,12 +74,12 @@ class Relu6GradOp : public XlaOpKernel { xla::XlaBuilder* b = ctx->builder(); const TensorShape shape = ctx->InputShape(0); const auto zero = - b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); - const auto six = b->Broadcast( + xla::Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto six = xla::Broadcast( XlaHelpers::IntegerLiteral(b, input_type(0), 6), shape.dim_sizes()); - auto out = - b->Select(b->And(b->Lt(ctx->Input(1), six), b->Gt(ctx->Input(1), zero)), - ctx->Input(0), zero); + auto out = xla::Select( + xla::And(xla::Lt(ctx->Input(1), six), xla::Gt(ctx->Input(1), zero)), + ctx->Input(0), zero); ctx->SetOutput(0, out); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index af4d64b159c09ed7e01017f25a2b23e58542dc3c..366ce42866e9f1375ee0ff6f4985c8f461fc0885 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -19,7 +19,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -40,8 +41,8 @@ class ReshapeOp : public XlaOpKernel { sizes_shape.DebugString())); const int64 num_dims = sizes_shape.num_elements(); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal)); + std::vector shape_input; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &shape_input)); // Compute the output shape. Determine product of specified // dimensions, and find the index of the unspecified one if there @@ -50,7 +51,7 @@ class ReshapeOp : public XlaOpKernel { int64 product = 1; int unknown_index = -1; for (int d = 0; d < num_dims; ++d) { - const int32 size = literal.Get({d}); + const int32 size = shape_input[d]; if (size == -1) { OP_REQUIRES( ctx, unknown_index == -1, @@ -90,8 +91,7 @@ class ReshapeOp : public XlaOpKernel { VLOG(1) << "Reshape " << input_shape.DebugString() << " " << shape.DebugString(); - ctx->SetOutput(0, - ctx->builder()->Reshape(ctx->Input(0), shape.dim_sizes())); + ctx->SetOutput(0, xla::Reshape(ctx->Input(0), shape.dim_sizes())); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index a711278638444be01fb865561957702368b75114..e172c649325adb6f7761ce0be141f21e8d545bc1 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -16,7 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -47,6 +48,15 @@ class RetvalOp : public XlaOpKernel { } else { xla::XlaOp input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); + DataType input_type = ctx->input_type(0); + XlaContext& tc = XlaContext::Get(ctx); + + if (input_type == DT_RESOURCE) { + XlaResource* resource; + OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); + ctx->SetStatus(tc.AddResourceRetval(index_, resource)); + return; + } auto is_constant = ctx->builder()->IsConstant(input); if (!is_constant.ok()) { @@ -54,7 +64,6 @@ class RetvalOp : public XlaOpKernel { return; } - XlaContext& tc = XlaContext::Get(ctx); if (tc.resolve_compile_time_constants() && (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) { xla::Literal literal; @@ -62,15 +71,24 @@ class RetvalOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal)); } else { TensorShape shape = ctx->InputShape(0); - TensorShape representation_shape = - tc.is_entry_computation() - ? 
tc.RepresentationShape(shape, ctx->input_type(0)) - : shape; + ctx->SetStatus(is_constant.status()); + TensorShape representation_shape; + if (tc.is_entry_computation()) { + xla::StatusOr shape_or_status = + tc.RepresentationShape(shape, ctx->input_type(0)); + if (!shape_or_status.ok()) { + ctx->SetStatus(shape_or_status.status()); + return; + } else { + representation_shape = shape_or_status.ValueOrDie(); + } + } else { + representation_shape = shape; + } xla::XlaOp output = input; if (tc.is_entry_computation()) { - output = - ctx->builder()->Reshape(input, representation_shape.dim_sizes()); + output = xla::Reshape(input, representation_shape.dim_sizes()); } else { // The core from which a return value is returned depends on the // device assignment of the input to the retval. Since we can't change @@ -78,8 +96,8 @@ class RetvalOp : public XlaOpKernel { // introduce an operator here, even if the shape does not change. // TODO(b/76097077): propagate device assignments onto arguments and // return values of functions, and then reshape unconditionally. - output = ctx->builder()->GetTupleElement( - ctx->builder()->Tuple({output}), 0); + output = + xla::GetTupleElement(xla::Tuple(ctx->builder(), {output}), 0); } tc.AddRetval(index_, dtype_, shape, output); } @@ -94,7 +112,8 @@ class RetvalOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp); }; -REGISTER_XLA_OP(Name("_Retval"), RetvalOp); +REGISTER_XLA_OP(Name("_Retval").AllowResourceTypes().CompilationOnly(), + RetvalOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc index 2872a3c4d49d0d269aa3d216887a5c32cd51f1c3..c0afccaa5b15dd33fcd016dfdd9bb18e244bf90a 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -19,7 +19,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -62,7 +63,7 @@ class ReverseOp : public XlaOpKernel { } } - ctx->SetOutput(0, ctx->builder()->Rev(ctx->Input(0), dimensions)); + ctx->SetOutput(0, xla::Rev(ctx->Input(0), dimensions)); } }; @@ -94,13 +95,27 @@ class ReverseV2Op : public XlaOpKernel { std::vector axes; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &axes)); + // witnessed_axes is used to ensure that the same axis is not marked to be + // reversed multiple times. + gtl::InlinedVector witnessed_axes(x_shape.dims(), false); + for (int d = 0; d < axes.size(); ++d) { - OP_REQUIRES(ctx, (0 <= axes[d]) && (axes[d] < x_shape.dims()), - errors::InvalidArgument(axes[d], " is out of range [0, ", - x_shape.dims(), ").")); + OP_REQUIRES( + ctx, (-x_shape.dims() <= axes[d]) && (axes[d] < x_shape.dims()), + errors::InvalidArgument(axes[d], " is out of range [-", + x_shape.dims(), ", ", x_shape.dims(), ").")); + // Axes can be negative and are shifted to the canonical index before + // being lowered to HLO. 
+ if (axes[d] < 0) { + axes[d] += x_shape.dims(); + } + OP_REQUIRES(ctx, !witnessed_axes[axes[d]], + errors::InvalidArgument("canonicalized axis ", axes[d], + " was repeated.")); + witnessed_axes[axes[d]] = true; } - ctx->SetOutput(0, ctx->builder()->Rev(ctx->Input(0), axes)); + ctx->SetOutput(0, xla::Rev(ctx->Input(0), axes)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 5d1c05268493f4f6404c40a4092a71f1e5b3f3b9..03a50ef8a059e5a005c4cc2e5e98acedfea8619a 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -17,6 +17,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { @@ -85,103 +87,96 @@ class ReverseSequenceOp : public XlaOpKernel { auto condition_builder = builder->CreateSubBuilder("reverse_sequence_condition"); { - auto param = condition_builder->Parameter(0, tuple_shape, "param"); - auto i = condition_builder->GetTupleElement(param, 0); - condition_builder->Lt( - i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type, - batch_size)); + auto param = + xla::Parameter(condition_builder.get(), 0, tuple_shape, "param"); + auto i = xla::GetTupleElement(param, 0); + xla::Lt(i, XlaHelpers::IntegerLiteral(condition_builder.get(), + seq_lens_type, batch_size)); } auto condition = condition_builder->Build(); OP_REQUIRES_OK(context, condition.status()); auto body_builder = builder->CreateSubBuilder("reverse_sequence_body"); { - auto param = body_builder->Parameter(0, tuple_shape, "param"); - auto i = body_builder->GetTupleElement(param, 0); - auto 
seq_lens = body_builder->GetTupleElement(param, 1); - auto output = body_builder->GetTupleElement(param, 2); + auto param = xla::Parameter(body_builder.get(), 0, tuple_shape, "param"); + auto i = xla::GetTupleElement(param, 0); + auto seq_lens = xla::GetTupleElement(param, 1); + auto output = xla::GetTupleElement(param, 2); // seq_len is the sequence length of the current batch element (rank 1) - auto seq_len = body_builder->DynamicSlice( - seq_lens, body_builder->Reshape(i, {1}), {1}); + auto seq_len = xla::DynamicSlice(seq_lens, xla::Reshape(i, {1}), {1}); // Indices is the offset of the batch element in the input. - auto batch_element_indices = body_builder->Broadcast( - XlaHelpers::Zero(body_builder.get(), seq_lens_type), - {input_shape.dims()}); - batch_element_indices = body_builder->DynamicUpdateSlice( - batch_element_indices, body_builder->Reshape(i, {1}), - body_builder->Reshape( - XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, - batch_dim_), - {1})); + auto batch_element_indices = + xla::Broadcast(XlaHelpers::Zero(body_builder.get(), seq_lens_type), + {input_shape.dims()}); + batch_element_indices = xla::DynamicUpdateSlice( + batch_element_indices, xla::Reshape(i, {1}), + xla::Reshape(XlaHelpers::IntegerLiteral(body_builder.get(), + seq_lens_type, batch_dim_), + {1})); // Slice out the current batch element and pad it out in the sequence // dimension. 
TensorShape slice_shape = input_shape; slice_shape.set_dim(batch_dim_, 1); slice_shape.set_dim(seq_dim_, max_seq_len); - auto slice = body_builder->DynamicSlice(output, batch_element_indices, - slice_shape.dim_sizes()); + auto slice = xla::DynamicSlice(output, batch_element_indices, + slice_shape.dim_sizes()); auto padding_config = xla::MakeNoPaddingConfig(slice_shape.dims()); padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high( slice_shape.dim_size(seq_dim_)); - slice = body_builder->Pad( - slice, XlaHelpers::Zero(body_builder.get(), input_type), - padding_config); + slice = xla::Pad(slice, XlaHelpers::Zero(body_builder.get(), input_type), + padding_config); // Now slice out the reversed sequence from its actual start. // sequence_start_indices is the offset of the start of the reversed // sequence in the input. The slice will go into the padding, however, we // will mask off these elements and replace them with elements from the // original input so their values do not matter. 
- auto sequence_start_indices = body_builder->Broadcast( - XlaHelpers::Zero(body_builder.get(), seq_lens_type), - {slice_shape.dims()}); - sequence_start_indices = body_builder->DynamicUpdateSlice( + auto sequence_start_indices = + xla::Broadcast(XlaHelpers::Zero(body_builder.get(), seq_lens_type), + {slice_shape.dims()}); + sequence_start_indices = xla::DynamicUpdateSlice( sequence_start_indices, - body_builder->Sub(XlaHelpers::IntegerLiteral( - body_builder.get(), seq_lens_type, max_seq_len), - seq_len), - body_builder->Reshape( - XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, - seq_dim_), - {1})); - slice = body_builder->DynamicSlice(slice, sequence_start_indices, - slice_shape.dim_sizes()); + xla::Sub(XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, + max_seq_len), + seq_len), + xla::Reshape(XlaHelpers::IntegerLiteral(body_builder.get(), + seq_lens_type, seq_dim_), + {1})); + slice = xla::DynamicSlice(slice, sequence_start_indices, + slice_shape.dim_sizes()); // Shift the reversed sequence to the left. 
- output = body_builder->DynamicUpdateSlice(output, slice, - batch_element_indices); + output = xla::DynamicUpdateSlice(output, slice, batch_element_indices); - body_builder->Tuple( - {body_builder->Add( - i, XlaHelpers::One(body_builder.get(), seq_lens_type)), + xla::Tuple( + body_builder.get(), + {xla::Add(i, XlaHelpers::One(body_builder.get(), seq_lens_type)), seq_lens, output}); } auto body = body_builder->Build(); OP_REQUIRES_OK(context, body.status()); - auto loop_output = builder->While( + auto loop_output = xla::While( condition.ValueOrDie(), body.ValueOrDie(), - builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens, - builder->Rev(input, {seq_dim_})})); - auto output = builder->GetTupleElement(loop_output, 2); + xla::Tuple(builder, {XlaHelpers::Zero(builder, seq_lens_type), seq_lens, + xla::Rev(input, {seq_dim_})})); + auto output = xla::GetTupleElement(loop_output, 2); // Mask out elements after the sequence length. - xla::XlaOp iota; - OP_REQUIRES_OK( - context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota)); + xla::XlaOp iota = + xla::Iota(builder, seq_lens_xla_shape.element_type(), max_seq_len); std::vector dims(input_shape.dims(), 1); dims[batch_dim_] = batch_size; - auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_}); + auto mask = xla::Lt(iota, xla::Reshape(seq_lens, dims), {seq_dim_}); // Broadcast the mask up to the input shape. 
- mask = - builder->Or(mask, builder->Broadcast(builder->ConstantR0(false), - input_shape.dim_sizes())); + mask = xla::Or(mask, xla::Broadcast(xla::ConstantR0(builder, false), + input_shape.dim_sizes())); - output = builder->Select(mask, output, input); + output = xla::Select(mask, output, input); context->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index 1819fb543317eed15b2fe0518d74aba5c564697d..ab094d7dd1ce9856a3c2854fd2776827d6c4b76f 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -20,7 +20,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/register_types.h" @@ -100,7 +102,7 @@ class ScanOp : public XlaOpKernel { init = XlaHelpers::One(builder, dtype); reducer = ctx->GetOrCreateMul(dtype); } - auto output = builder->ReduceWindowWithGeneralPadding( + auto output = xla::ReduceWindowWithGeneralPadding( XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init, *reducer, window_dims, window_strides, padding); output = @@ -110,12 +112,12 @@ class ScanOp : public XlaOpKernel { // of all the input elements. Slice off this extra "last" element. 
if (exclusive_) { if (reverse_) { - output = builder->SliceInDim(output, 1, input_shape.dim_size(axis) + 1, - 1, axis); + output = + xla::SliceInDim(output, 1, input_shape.dim_size(axis) + 1, 1, axis); } else { output = - builder->SliceInDim(output, 0, input_shape.dim_size(axis), 1, axis); + xla::SliceInDim(output, 0, input_shape.dim_size(axis), 1, axis); } } ctx->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index f2c63b4f9083ad3c7dd7cf318dc22def1e99fa9f..f1f32699fee5f03f603f830722fe65622dee5d3e 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -103,8 +104,8 @@ class ScatterNdOp : public XlaOpKernel { updates_shape)); xla::XlaBuilder* builder = context->builder(); - auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype), - buffer_shape.dim_sizes()); + auto buffer = xla::Broadcast(XlaHelpers::Zero(builder, dtype), + buffer_shape.dim_sizes()); auto indices = context->Input(0); auto updates = context->Input(1); auto result = diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc index 664078ca16c6d5d4b57c4a8c661ad0848f30dd7d..b22ecb7c6dbb42a33a4f4d90b18b20816df16a50 100644 --- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -14,20 +14,30 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/tf2xla/lib/scatter.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { namespace { -class UnsortedSegmentSum : public XlaOpKernel { +class UnsortedSegmentReduce : public XlaOpKernel { public: - explicit UnsortedSegmentSum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + explicit UnsortedSegmentReduce(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + DataType dtype; + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype)); + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &type_)); } + // The initial value to initialize elements of the output to. + virtual xla::XlaOp InitialValue(xla::XlaBuilder* builder) = 0; + + // A function to combine two scalars with the same index (e.g., sum). + virtual xla::XlaOp Combine(xla::XlaOp a, xla::XlaOp b) = 0; + void Compile(XlaOpKernelContext* ctx) override { // output = unsorted_segment_sum(data, indices, num_segments) // Compute a tensor such that: @@ -50,28 +60,28 @@ class UnsortedSegmentSum : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &num_segments)); OP_REQUIRES(ctx, data_shape.dims() >= indices_shape.dims(), - errors::InvalidArgument( - "UnsortedSegmentSum requires that indices' rank be" - " less than or equal to data's rank.")); + errors::InvalidArgument(type_string(), + " requires that indices' rank be" + " less than or equal to data's rank.")); // Validate that indices.shape is a prefix of data.shape. 
for (int d = 0; d < indices_shape.dims(); ++d) { - OP_REQUIRES(ctx, (data_shape.dim_size(d) == indices_shape.dim_size(d)), - errors::InvalidArgument( - "UnsortedSegmentSum requires indices shape to be prefix" - " of data_shape, but dimension ", - d, " differs ", data_shape.dim_size(d), " vs. ", - indices_shape.dim_size(d))); + OP_REQUIRES( + ctx, (data_shape.dim_size(d) == indices_shape.dim_size(d)), + errors::InvalidArgument(type_string(), + " requires indices shape to be prefix" + " of data_shape, but dimension ", + d, " differs ", data_shape.dim_size(d), + " vs. ", indices_shape.dim_size(d))); } xla::XlaBuilder* builder = ctx->builder(); TensorShape buffer_shape = data_shape; buffer_shape.RemoveDimRange(0, indices_shape.dims()); buffer_shape.InsertDim(0, num_segments); - auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype_), - buffer_shape.dim_sizes()); + auto buffer = + xla::Broadcast(InitialValue(builder), buffer_shape.dim_sizes()); - auto combiner = [](xla::XlaOp a, xla::XlaOp b, xla::XlaBuilder* builder) { - return builder->Add(a, b); - }; + auto combiner = [this](xla::XlaOp a, xla::XlaOp b, + xla::XlaBuilder* builder) { return Combine(a, b); }; auto result = XlaScatter(buffer, /*updates=*/data, indices, /*indices_are_vectors=*/false, combiner, builder); @@ -79,13 +89,73 @@ class UnsortedSegmentSum : public XlaOpKernel { ctx->SetOutput(0, result.ValueOrDie()); } - private: - DataType dtype_; + protected: + xla::PrimitiveType type_; +}; + +class UnsortedSegmentSum : public UnsortedSegmentReduce { + public: + explicit UnsortedSegmentSum(OpKernelConstruction* ctx) + : UnsortedSegmentReduce(ctx) {} + + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { + return xla::Zero(builder, type_); + }; + xla::XlaOp Combine(xla::XlaOp a, xla::XlaOp b) override { return a + b; }; }; REGISTER_XLA_OP( Name("UnsortedSegmentSum").CompileTimeConstInput("num_segments"), UnsortedSegmentSum); +class UnsortedSegmentProd : public UnsortedSegmentReduce { + 
public: + explicit UnsortedSegmentProd(OpKernelConstruction* ctx) + : UnsortedSegmentReduce(ctx) {} + + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { + return xla::One(builder, type_); + }; + xla::XlaOp Combine(xla::XlaOp a, xla::XlaOp b) override { return a * b; }; +}; + +REGISTER_XLA_OP( + Name("UnsortedSegmentProd").CompileTimeConstInput("num_segments"), + UnsortedSegmentProd); + +class UnsortedSegmentMin : public UnsortedSegmentReduce { + public: + explicit UnsortedSegmentMin(OpKernelConstruction* ctx) + : UnsortedSegmentReduce(ctx) {} + + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { + return xla::MaxFiniteValue(builder, type_); + }; + xla::XlaOp Combine(xla::XlaOp a, xla::XlaOp b) override { + return xla::Min(a, b); + }; +}; + +REGISTER_XLA_OP( + Name("UnsortedSegmentMin").CompileTimeConstInput("num_segments"), + UnsortedSegmentMin); + +class UnsortedSegmentMax : public UnsortedSegmentReduce { + public: + explicit UnsortedSegmentMax(OpKernelConstruction* ctx) + : UnsortedSegmentReduce(ctx) {} + + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { + return xla::MinFiniteValue(builder, type_); + }; + xla::XlaOp Combine(xla::XlaOp a, xla::XlaOp b) override { + return xla::Max(a, b); + }; +}; + +REGISTER_XLA_OP( + Name("UnsortedSegmentMax").CompileTimeConstInput("num_segments"), + UnsortedSegmentMax); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index f9f48164d63492b057d4950abfc2ca6153e44870..9e4c57c9bf73369662274f6b783418e18ff860c2 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -40,8 +41,6 @@ class SelectOp : public XlaOpKernel { "'then' and 'else' must have the same size. but received: ", then_shape.DebugString(), " vs. ", else_shape.DebugString())); - xla::XlaBuilder* builder = ctx->builder(); - auto cond_handle = ctx->Input(0); auto then_handle = ctx->Input(1); auto else_handle = ctx->Input(2); @@ -67,16 +66,16 @@ class SelectOp : public XlaOpKernel { // XLA. It seems we have to broadcast on the left and then Reshape // to get the dimensions in the right order. const auto dim_sizes = then_shape.dim_sizes(); - gtl::ArraySlice bdims = dim_sizes; - bdims.pop_front(); - cond_handle = builder->Broadcast(cond_handle, bdims); + absl::Span bdims = dim_sizes; + bdims.remove_prefix(1); + cond_handle = xla::Broadcast(cond_handle, bdims); std::vector dim_order(then_shape.dims()); dim_order[0] = then_shape.dims() - 1; std::iota(dim_order.begin() + 1, dim_order.end(), 0); - cond_handle = builder->Transpose(cond_handle, dim_order); + cond_handle = xla::Transpose(cond_handle, dim_order); } - ctx->SetOutput(0, builder->Select(cond_handle, then_handle, else_handle)); + ctx->SetOutput(0, xla::Select(cond_handle, then_handle, else_handle)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc index 9ce01d0d44509bbcbea18afdb4210a675834bb6d..a7f5a8f1698b9d02560de427d356e9e6be5caa7c 100644 --- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" @@ -45,7 +45,7 @@ void SendOp::Compile(XlaOpKernelContext* ctx) { XlaCompiler* compiler = XlaContext::Get(ctx).compiler(); xla::ChannelHandle channel; OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel)); - ctx->builder()->Send(ctx->Input(0), channel); + xla::Send(ctx->Input(0), channel); } REGISTER_XLA_OP(Name("XlaSend"), SendOp); @@ -76,7 +76,7 @@ void RecvOp::Compile(XlaOpKernelContext* ctx) { XlaCompiler* compiler = XlaContext::Get(ctx).compiler(); xla::ChannelHandle channel; OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel)); - ctx->SetOutput(0, ctx->builder()->Recv(shape_, channel)); + ctx->SetOutput(0, xla::Recv(ctx->builder(), shape_, channel)); } REGISTER_XLA_OP(Name("XlaRecv"), RecvOp); diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 2c31f8d90891924f6f86a54ccf548de4df87f3bd..25a5bcbe1dd27d741ce3b74125ba9ce425ee78f3 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -55,9 +55,10 @@ Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) { // The type-specific part of the implementation of Range. template -Status CreateRangeTensor(const xla::Literal& start_literal, - const xla::Literal& limit_literal, - const xla::Literal& delta_literal, Tensor* output) { +Status CreateRangeTensor(const xla::LiteralSlice& start_literal, + const xla::LiteralSlice& limit_literal, + const xla::LiteralSlice& delta_literal, + Tensor* output) { T start = start_literal.Get({}); T limit = limit_literal.Get({}); T delta = delta_literal.Get({}); @@ -67,13 +68,13 @@ Status CreateRangeTensor(const xla::Literal& start_literal, } if (delta > 0) { if (start > limit) { - return errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit); } } else { if (start < limit) { - return errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit); } } int64 size = diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 05354bca5bb089703fdcceb6f44648bbb98d004b..4e0cf99d8e7ff45ed9145981b5e2e637ce4d4e4b 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -43,7 +44,7 @@ class ShapeOp : public XlaOpKernel { DataType out_dtype_; }; -REGISTER_XLA_OP(Name("Shape"), ShapeOp); +REGISTER_XLA_OP(Name("Shape").CompilationOnly(), ShapeOp); class ShapeNOp : public XlaOpKernel { public: @@ -65,7 +66,7 @@ class ShapeNOp : public XlaOpKernel { private: DataType out_dtype_; }; -REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp); +REGISTER_XLA_OP(Name("ShapeN").CompilationOnly(), ShapeNOp); class RankOp : public XlaOpKernel { public: @@ -81,7 +82,7 @@ class RankOp : public XlaOpKernel { } }; -REGISTER_XLA_OP(Name("Rank"), RankOp); +REGISTER_XLA_OP(Name("Rank").CompilationOnly(), RankOp); class SizeOp : public XlaOpKernel { public: @@ -100,7 +101,7 @@ class SizeOp : public XlaOpKernel { } }; -REGISTER_XLA_OP(Name("Size"), SizeOp); +REGISTER_XLA_OP(Name("Size").CompilationOnly(), SizeOp); class ExpandDimsOp : public XlaOpKernel { public: @@ -147,7 +148,7 @@ class ExpandDimsOp : public XlaOpKernel { dim = std::min(dim, existing_dims_size); new_shape.emplace(new_shape.begin() + dim, 1); - ctx->SetOutput(0, ctx->builder()->Reshape(ctx->Input(0), new_shape)); + ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape)); } }; REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstInput("dim"), ExpandDimsOp); @@ -189,10 +190,9 @@ class SqueezeOp : public XlaOpKernel { if (!wrapped_squeeze_dims.empty()) { if (wrapped_squeeze_dims.count(i) > 0) { OP_REQUIRES(ctx, existing_dim == 1, - errors::InvalidArgument("Tried to explicitly squeeze " - "dimension ", - i, " but dimension was not 1: ", - existing_dim)); + errors::InvalidArgument( + "Tried to explicitly squeeze dimension ", i, + " but dimension was not 1: ", existing_dim)); } else 
{ // This dimension is not being squeezed. new_shape.push_back(existing_dim); @@ -205,7 +205,7 @@ class SqueezeOp : public XlaOpKernel { } } - ctx->SetOutput(0, ctx->builder()->Reshape(ctx->Input(0), new_shape)); + ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape)); } private: @@ -222,7 +222,7 @@ class ZerosLikeOp : public XlaOpKernel { const TensorShape input_shape = ctx->InputShape(0); auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0)); - ctx->SetOutput(0, ctx->builder()->Broadcast(zero, input_shape.dim_sizes())); + ctx->SetOutput(0, xla::Broadcast(zero, input_shape.dim_sizes())); } }; @@ -236,7 +236,7 @@ class OnesLikeOp : public XlaOpKernel { const TensorShape input_shape = ctx->InputShape(0); auto one = XlaHelpers::One(ctx->builder(), input_type(0)); - ctx->SetOutput(0, ctx->builder()->Broadcast(one, input_shape.dim_sizes())); + ctx->SetOutput(0, xla::Broadcast(one, input_shape.dim_sizes())); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index be1e97bf26fa4cde1b741c8d0b843a85ce33a59c..537b71f3c0cf3622a8a45a717ac406da69f5c3c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -15,16 +15,17 @@ limitations under the License. // XLA-specific Slice Op. 
+#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mem.h" namespace tensorflow { @@ -92,8 +93,7 @@ class SliceOp : public XlaOpKernel { limits.push_back(begin[i] + size[i]); } std::vector strides(begin.size(), 1); - ctx->SetOutput( - 0, ctx->builder()->Slice(ctx->Input(0), begin, limits, strides)); + ctx->SetOutput(0, xla::Slice(ctx->Input(0), begin, limits, strides)); } else { // `begin` is not a compile-time constant. for (int i = 0; i < input_dims; ++i) { @@ -106,8 +106,7 @@ class SliceOp : public XlaOpKernel { input_shape.dim_size(i), "], but ", "got ", size[i])); } - ctx->SetOutput( - 0, ctx->builder()->DynamicSlice(ctx->Input(0), ctx->Input(1), size)); + ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), ctx->Input(1), size)); } } }; diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index bbf5ee8b12186a582666121b1df5d8b7d881863e..d6bd927135c013ac1ec3f6547aef358dc2741896 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -15,13 +15,17 @@ limitations under the License. // XLA-specific Ops for softmax. 
+#include "absl/strings/match.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace tensorflow { namespace { @@ -29,18 +33,23 @@ namespace { class SoftmaxOp : public XlaOpKernel { public: explicit SoftmaxOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - log_ = str_util::StartsWith(type_string(), "Log"); + log_ = absl::StartsWith(type_string(), "Log"); } void Compile(XlaOpKernelContext* ctx) override { const TensorShape logits_shape = ctx->InputShape(0); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), - errors::InvalidArgument("logits must be 2-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(logits_shape), + errors::InvalidArgument("logits must have >= 1 dimension, got ", + logits_shape.DebugString())); - const int kBatchDim = 0; - const int kClassDim = 1; + // Major dimensions are batch dimensions, minor dimension is the class + // dimension. 
+ std::vector batch_dims(logits_shape.dims() - 1); + std::iota(batch_dims.begin(), batch_dims.end(), 0); + const int kClassDim = logits_shape.dims() - 1; const DataType type = input_type(0); + const xla::PrimitiveType xla_type = ctx->input_xla_type(0); auto logits = ctx->Input(0); xla::XlaBuilder* const b = ctx->builder(); @@ -48,24 +57,27 @@ class SoftmaxOp : public XlaOpKernel { // Find the max in each batch, resulting in a tensor of shape [batch] auto logits_max = - b->Reduce(logits, XlaHelpers::MinValue(b, type), max_func, {kClassDim}); + xla::Reduce(logits, xla::MinValue(b, xla_type), max_func, {kClassDim}); // Subtract the max in batch b from every element in batch b. Broadcasts // along the batch dimension. - auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim}); - auto exp_shifted = b->Exp(shifted_logits); + auto shifted_logits = xla::Sub(logits, logits_max, batch_dims); + auto exp_shifted = xla::Exp(shifted_logits); const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); + xla::PrimitiveType xla_accumulation_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(accumulation_type, + &xla_accumulation_type)); auto converted = - XlaHelpers::ConvertElementType(b, exp_shifted, accumulation_type); + xla::ConvertElementType(exp_shifted, xla_accumulation_type); auto reduce = - b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); + xla::Reduce(converted, xla::Zero(b, xla_accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); auto sum = XlaHelpers::ConvertElementType(b, reduce, type); auto softmax = log_ // softmax = shifted_logits - log(sum(exp(shifted_logits))) - ? b->Sub(shifted_logits, b->Log(sum), {kBatchDim}) + ? 
xla::Sub(shifted_logits, xla::Log(sum), batch_dims) // softmax = exp(shifted_logits) / sum(exp(shifted_logits)) - : b->Div(exp_shifted, sum, {kBatchDim}); + : xla::Div(exp_shifted, sum, batch_dims); ctx->SetOutput(0, softmax); } @@ -77,8 +89,8 @@ REGISTER_XLA_OP(Name("Softmax"), SoftmaxOp); REGISTER_XLA_OP(Name("LogSoftmax"), SoftmaxOp); std::pair CrossEntropyWithLogits( - XlaOpKernelContext* ctx, DataType type, const xla::XlaOp& logits, - const xla::XlaOp& labels) { + XlaOpKernelContext* ctx, DataType type, xla::PrimitiveType xla_type, + xla::XlaOp logits, xla::XlaOp labels) { const xla::XlaComputation& max_func = *ctx->GetOrCreateMax(type); const int kBatchDim = 0; @@ -87,43 +99,44 @@ std::pair CrossEntropyWithLogits( xla::XlaBuilder* b = ctx->builder(); // Find the max in each batch, resulting in a tensor of shape [batch] auto logits_max = - b->Reduce(logits, XlaHelpers::MinValue(b, type), max_func, {kClassDim}); + xla::Reduce(logits, xla::MinValue(b, xla_type), max_func, {kClassDim}); // Subtract the max in batch b from every element in batch b. // Broadcasts along the batch dimension. 
- auto shifted_logits = b->Sub(logits, logits_max, {kBatchDim}); + auto shifted_logits = xla::Sub(logits, logits_max, {kBatchDim}); // exp(logits - max_logits) - auto exp_shifted_logits = b->Exp(shifted_logits); + auto exp_shifted_logits = xla::Exp(shifted_logits); // sum_{class} (exp(logits - max_logits)) const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); auto converted = XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type); - auto reduce = b->Reduce(converted, XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); + auto reduce = + xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type); // log(sum(exp(logits - max_logits))) - auto log_sum_exp = b->Log(sum_exp); + auto log_sum_exp = xla::Log(sum_exp); // sum(-labels * // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) // along classes // (The subtraction broadcasts along the batch dimension.) 
- auto sub = b->Sub(shifted_logits, log_sum_exp, {kBatchDim}); - auto mul = b->Mul(b->Neg(labels), sub); + auto sub = xla::Sub(shifted_logits, log_sum_exp, {kBatchDim}); + auto mul = xla::Mul(xla::Neg(labels), sub); auto sum = - b->Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type), - XlaHelpers::Zero(b, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); + xla::Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type), + XlaHelpers::Zero(b, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), {kClassDim}); auto loss = XlaHelpers::ConvertElementType(b, sum, type); // backprop: prob - labels, where // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) // (where the division broadcasts along the batch dimension) xla::XlaOp backprop = - b->Sub(b->Div(exp_shifted_logits, sum_exp, {kBatchDim}), labels); + xla::Sub(xla::Div(exp_shifted_logits, sum_exp, {kBatchDim}), labels); return {loss, backprop}; } @@ -146,12 +159,13 @@ class SoftmaxXentWithLogitsOp : public XlaOpKernel { // check that "labels" is a matrix too. 
const DataType type = input_type(0); + const xla::PrimitiveType xla_type = ctx->input_xla_type(0); auto logits = ctx->Input(0); auto labels = ctx->Input(1); xla::XlaOp loss, backprop; std::tie(loss, backprop) = - CrossEntropyWithLogits(ctx, type, logits, labels); + CrossEntropyWithLogits(ctx, type, xla_type, logits, labels); ctx->SetOutput(0, loss); ctx->SetOutput(1, backprop); } @@ -187,8 +201,9 @@ class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { int64 batch_size = logits_shape.dim_size(0); int64 depth = logits_shape.dim_size(1); - DataType logits_type = input_type(0); - DataType indices_type = input_type(1); + const DataType logits_type = input_type(0); + const xla::PrimitiveType xla_logits_type = ctx->input_xla_type(0); + const DataType indices_type = input_type(1); xla::XlaOp indices = ctx->Input(1); @@ -206,20 +221,18 @@ class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { // Builds a vector of {batch_size} that is 0 if the index is in range, or // NaN otherwise; then add that vector to the labels to force out-of-range // values to NaNs. 
- xla::XlaOp nan_or_zero = builder->Select( - builder->And( - builder->Le(XlaHelpers::Zero(builder, indices_type), indices), - builder->Lt(indices, XlaHelpers::IntegerLiteral( - builder, indices_type, depth))), - builder->Broadcast(XlaHelpers::Zero(builder, logits_type), - {batch_size}), - builder->Broadcast(XlaHelpers::FloatLiteral(builder, logits_type, NAN), - {batch_size})); - labels = builder->Add(labels, nan_or_zero, {0}); + xla::XlaOp nan_or_zero = xla::Select( + xla::And(xla::Le(XlaHelpers::Zero(builder, indices_type), indices), + xla::Lt(indices, XlaHelpers::IntegerLiteral( + builder, indices_type, depth))), + xla::Broadcast(XlaHelpers::Zero(builder, logits_type), {batch_size}), + xla::Broadcast(XlaHelpers::FloatLiteral(builder, logits_type, NAN), + {batch_size})); + labels = xla::Add(labels, nan_or_zero, {0}); xla::XlaOp loss, backprop; - std::tie(loss, backprop) = - CrossEntropyWithLogits(ctx, logits_type, ctx->Input(0), labels); + std::tie(loss, backprop) = CrossEntropyWithLogits( + ctx, logits_type, xla_logits_type, ctx->Input(0), labels); ctx->SetOutput(0, loss); ctx->SetOutput(1, backprop); } diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..aaeeae01ccb303091a6d37d1aeb4b2a3377dc638 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" + +namespace tensorflow { +namespace { + +class XlaSortOp : public XlaOpKernel { + public: + explicit XlaSortOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + context->SetOutput(0, xla::Sort(context->Input(0))); + } +}; + +REGISTER_XLA_OP(Name("XlaSort"), XlaSortOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc index ec077924b5b5af4a573c86c8d9aeb8623bd7f801..b7b4f3a5465c8eea832ef940b7c84a7435edc38c 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc @@ -16,13 +16,14 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { namespace { void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, DataType input_dtype, const TensorShape& input_tensor_shape, - gtl::ArraySlice block_shape, + absl::Span block_shape, const xla::Literal& paddings) { const int input_rank = input_tensor_shape.dims(); const gtl::InlinedVector input_shape = @@ -33,7 +34,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, ctx, input_rank >= 1 + block_rank, errors::InvalidArgument("input rank should be >= ", 1 + block_rank, " instead of ", input_rank)); - gtl::ArraySlice remainder_shape(input_shape); + absl::Span remainder_shape(input_shape); remainder_shape.remove_prefix(1 + block_rank); OP_REQUIRES( @@ -73,7 +74,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, "The product of the block dimensions must be positive")); xla::XlaOp padded = - b->Pad(input, XlaHelpers::Zero(b, input_dtype), padding_config); + xla::Pad(input, XlaHelpers::Zero(b, input_dtype), padding_config); // 2. Reshape `padded` to `reshaped_padded` of shape: // @@ -100,7 +101,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, std::copy(remainder_shape.begin(), remainder_shape.end(), reshaped_padded_shape.begin() + 1 + 2 * block_rank); - xla::XlaOp reshaped_padded = b->Reshape(padded, reshaped_padded_shape); + xla::XlaOp reshaped_padded = xla::Reshape(padded, reshaped_padded_shape); // 3. 
Permute dimensions of `reshaped_padded` to produce // `permuted_reshaped_padded` of shape: @@ -120,7 +121,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(), 1 + block_rank * 2); xla::XlaOp permuted_reshaped_padded = - b->Transpose(reshaped_padded, permutation); + xla::Transpose(reshaped_padded, permutation); // 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the // batch dimension, producing an output tensor of shape: @@ -140,7 +141,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, std::copy(remainder_shape.begin(), remainder_shape.end(), output_shape.begin() + 1 + block_rank); - xla::XlaOp output = b->Reshape(permuted_reshaped_padded, output_shape); + xla::XlaOp output = xla::Reshape(permuted_reshaped_padded, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc index 4c5886ee2a0f63d609f79fc690f457d93e284e3e..4493539fe34f0ce635fdc58660d4ff90af9c9379 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { @@ -50,7 +51,6 @@ class SpaceToDepthOp : public XlaOpKernel { const gtl::InlinedVector input_shape = input_tensor_shape.dim_sizes(); - xla::XlaBuilder* b = ctx->builder(); xla::XlaOp input = ctx->Input(0); int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_); @@ -135,7 +135,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[1] / block_size_, block_size_, // input_shape[2] / block_size_, block_size_, // depth] - xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = xla::Reshape(input, reshaped_shape); // 2. Permute dimensions of `reshaped` to produce // `permuted_reshaped` of shape: @@ -145,7 +145,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[2] / block_size_, // block_size_, block_size_, // depth] - xla::XlaOp permuted_reshaped = b->Transpose(reshaped, transpose_order); + xla::XlaOp permuted_reshaped = xla::Transpose(reshaped, transpose_order); // 3. 
Reshape `permuted_reshaped` to flatten `block_shape` into the // batch dimension, producing an output tensor of shape: @@ -155,7 +155,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[2] / block_size_, // block_size_ * block_size_ * depth] // - xla::XlaOp output = b->Reshape(permuted_reshaped, output_shape); + xla::XlaOp output = xla::Reshape(permuted_reshaped, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e831dc30a9d3c27ec3b1494e7d8a6de836ff2a11 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc @@ -0,0 +1,88 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/scatter.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +// Operator to convert sparse representations to dense. 
+class SparseToDenseOp : public XlaOpKernel { + public: + explicit SparseToDenseOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + // sparse_indices + const TensorShape indices_shape = context->InputShape(0); + OP_REQUIRES(context, indices_shape.dims() <= 2, + errors::InvalidArgument( + "sparse_indices should be a scalar, vector, or matrix, " + "got shape ", + indices_shape.DebugString())); + const int64 num_elems = + indices_shape.dims() > 0 ? indices_shape.dim_size(0) : 1; + const int64 num_dims = + indices_shape.dims() > 1 ? indices_shape.dim_size(1) : 1; + + // output_shape + TensorShape output_shape; + OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape)); + OP_REQUIRES(context, output_shape.dims() == num_dims, + errors::InvalidArgument( + "output_shape has incorrect number of elements: ", + output_shape.num_elements(), " should be: ", num_dims)); + + // sparse_values + const TensorShape sparse_values_shape = context->InputShape(2); + const int64 num_values = sparse_values_shape.num_elements(); + OP_REQUIRES( + context, + sparse_values_shape.dims() == 0 || + (sparse_values_shape.dims() == 1 && num_values == num_elems), + errors::InvalidArgument("sparse_values has incorrect shape ", + sparse_values_shape.DebugString(), + ", should be [] or [", num_elems, "]")); + + // default_value + const TensorShape default_value_shape = context->InputShape(3); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(default_value_shape), + errors::InvalidArgument("default_value should be a scalar.")); + + xla::XlaOp indices = context->Input(0); + xla::XlaOp sparse_values = context->Input(2); + xla::XlaOp default_value = context->Input(3); + + if (sparse_values_shape.dims() == 0 && num_elems != 1) { + sparse_values = Broadcast(sparse_values, {num_elems}); + } + xla::XlaBuilder* builder = context->builder(); + auto buffer = Broadcast(default_value, output_shape.dim_sizes()); + + auto result = 
XlaScatter(buffer, sparse_values, indices, + /*indices_are_vectors=*/num_dims > 1, + /*combiner=*/{}, builder); + context->SetOutput(0, builder->ReportErrorOrReturn(result)); + } +}; + +REGISTER_XLA_OP(Name("SparseToDense").CompileTimeConstInput("output_shape"), + SparseToDenseOp); + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 8958b2e7701e62d802e37a895c14b662ecf9786a..93fc14e9efca868e84444dd0e07d7f0dfa84c042 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -19,7 +19,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -98,7 +99,7 @@ class SplitOp : public XlaOpKernel { // Slice out the ith split from the split dimension. begin[split_dim] = i * slice_size; limits[split_dim] = (i + 1) * slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); + ctx->SetOutput(i, xla::Slice(input, begin, limits, strides)); } } }; @@ -134,7 +135,7 @@ class SplitVOp : public XlaOpKernel { errors::InvalidArgument( "Number of ways to split should be > 0, but got ", num_split)); - // check that sizes are correct + // Check that sizes are correct. int total_split_size = 0; int neg_one_dim = -1; std::vector split_sizes_vec(num_split, -1); @@ -148,7 +149,7 @@ class SplitVOp : public XlaOpKernel { " number of elements as the output. 
Got ", split_size_shape.dims(), "-D and ", split_size_shape.num_elements(), " elements")); - // get the dimension of this split + // Get the dimension of this split. xla::Literal split_size_literal; OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &split_size_literal)); @@ -199,7 +200,7 @@ class SplitVOp : public XlaOpKernel { // Slice out the ith split from the split dimension. limits[split_dim] = begin[split_dim] + slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); + ctx->SetOutput(i, xla::Slice(input, begin, limits, strides)); begin[split_dim] = limits[split_dim]; } } diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index 0fb05a2be7b1034d6c2e864643b69647d622ede7..df91900570107609c0f1c2281faaab8a5e65b98b 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/register_types.h" @@ -144,24 +144,25 @@ class StackPushOp : public XlaOpKernel { // Initializes the Stack, if the element shape was not already known. OP_REQUIRES_OK(ctx, MaybeInitializeStack(b, resource, dtype_, elem_shape)); - xla::XlaOp ta = b->GetTupleElement(resource->value(), 0); - xla::XlaOp index = b->GetTupleElement(resource->value(), 1); + xla::XlaOp ta = xla::GetTupleElement(resource->value(), 0); + xla::XlaOp index = xla::GetTupleElement(resource->value(), 1); xla::XlaOp value = ctx->Input(1); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
auto start_indices = - b->Pad(b->Reshape(index, {1}), b->ConstantR0(0), - xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); + xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0(b, 0), + xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); TensorShape slice_shape = elem_shape; slice_shape.InsertDim(0, 1LL); - auto update = b->Reshape(value, slice_shape.dim_sizes()); + auto update = xla::Reshape(value, slice_shape.dim_sizes()); // TODO(phawkins): We don't check the index is in bounds --- there is no // error mechanism in XLA. - OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple( - {b->DynamicUpdateSlice(ta, update, start_indices), - b->Add(index, b->ConstantR0(1))}))); + OP_REQUIRES_OK(ctx, + resource->SetValue(xla::Tuple( + b, {xla::DynamicUpdateSlice(ta, update, start_indices), + xla::Add(index, xla::ConstantR0(b, 1))}))); ctx->SetOutput(0, value); } @@ -197,27 +198,27 @@ class StackPopOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, GetStackShape(b, resource, &stack_shape)); xla::XlaOp state = resource->value(); - xla::XlaOp ta = b->GetTupleElement(state, 0); - xla::XlaOp index = b->GetTupleElement(state, 1); + xla::XlaOp ta = xla::GetTupleElement(state, 0); + xla::XlaOp index = xla::GetTupleElement(state, 1); - index = b->Sub(index, b->ConstantR0(1)); - OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index}))); + index = Sub(index, xla::ConstantR0(b, 1)); + OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index}))); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. auto start_indices = - b->Pad(b->Reshape(index, {1}), b->ConstantR0(0), - xla::MakeEdgePaddingConfig({{0, stack_shape.dims() - 1}})); + xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0(b, 0), + xla::MakeEdgePaddingConfig({{0, stack_shape.dims() - 1}})); auto slice_shape = stack_shape.dim_sizes(); slice_shape[0] = 1LL; // TODO(phawkins): We don't check the index is in bounds --- there is no // error mechanism in XLA. 
- xla::XlaOp read = b->DynamicSlice(ta, start_indices, slice_shape); + xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape); // Remove the leading '1' dimension. std::vector value_shape(slice_shape.begin() + 1, slice_shape.end()); - ctx->SetOutput(0, b->Reshape(read, value_shape)); + ctx->SetOutput(0, xla::Reshape(read, value_shape)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index a99d4ddc7c4956f7144512a9bdf6f4c2eb0f944f..5412e135478361d08965e4621ec52cfb4a792f1d 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -15,11 +15,16 @@ limitations under the License. #include +#include "tensorflow/compiler/tf2xla/lib/random.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -29,187 +34,6 @@ limitations under the License. namespace tensorflow { namespace { -// Rotates a 32-bit integer 'v' left by 'distance' bits. 
-xla::XlaOp RotateLeftS32(xla::XlaBuilder* builder, const xla::XlaOp& v, - int distance) { - return builder->Or( - builder->ShiftLeft(v, builder->ConstantR0(distance)), - builder->ShiftRightLogical(v, builder->ConstantR0(32 - distance))); -} - -// TODO(b/65209188): add a primitive XOR to XLA and call it here, rather than -// building XOR out of other bitwise operators. -xla::XlaOp BitwiseXor(xla::XlaBuilder* builder, const xla::XlaOp& x, - const xla::XlaOp& y) { - return builder->Or(builder->And(x, builder->Not(y)), - builder->And(builder->Not(x), y)); -} - -using ThreeFry2x32State = std::array; - -// Implements the ThreeFry counter-based PRNG algorithm. -// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. -// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf -ThreeFry2x32State ThreeFry2x32(xla::XlaBuilder* builder, - ThreeFry2x32State input, ThreeFry2x32State key) { - // Rotation distances specified by the Threefry2x32 algorithm. - constexpr std::array rotations = {13, 15, 26, 6, 17, 29, 16, 24}; - ThreeFry2x32State x; - - std::array ks; - // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm. - ks[2] = builder->ConstantR0(0x1BD11BDA); - for (int i = 0; i < 2; ++i) { - ks[i] = key[i]; - x[i] = input[i]; - ks[2] = BitwiseXor(builder, ks[2], key[i]); - } - - x[0] = builder->Add(x[0], ks[0]); - x[1] = builder->Add(x[1], ks[1]); - - // Performs a single round of the Threefry2x32 algorithm, with a rotation - // amount 'rotation'. - auto round = [builder](ThreeFry2x32State v, int rotation) { - v[0] = builder->Add(v[0], v[1]); - v[1] = RotateLeftS32(builder, v[1], rotation); - v[1] = BitwiseXor(builder, v[0], v[1]); - return v; - }; - - // There are no known statistical flaws with 13 rounds of Threefry2x32. - // We are conservative and use 20 rounds. 
- x = round(x, rotations[0]); - x = round(x, rotations[1]); - x = round(x, rotations[2]); - x = round(x, rotations[3]); - x[0] = builder->Add(x[0], ks[1]); - x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0(1)); - - x = round(x, rotations[4]); - x = round(x, rotations[5]); - x = round(x, rotations[6]); - x = round(x, rotations[7]); - x[0] = builder->Add(x[0], ks[2]); - x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0(2)); - - x = round(x, rotations[0]); - x = round(x, rotations[1]); - x = round(x, rotations[2]); - x = round(x, rotations[3]); - x[0] = builder->Add(x[0], ks[0]); - x[1] = builder->Add(builder->Add(x[1], ks[1]), builder->ConstantR0(3)); - - x = round(x, rotations[4]); - x = round(x, rotations[5]); - x = round(x, rotations[6]); - x = round(x, rotations[7]); - x[0] = builder->Add(x[0], ks[1]); - x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0(4)); - - x = round(x, rotations[0]); - x = round(x, rotations[1]); - x = round(x, rotations[2]); - x = round(x, rotations[3]); - x[0] = builder->Add(x[0], ks[2]); - x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0(5)); - - return x; -} - -// Returns a tensor of 'shape' random values uniformly distributed in the range -// [minval, maxval) -xla::XlaOp RandomUniform(xla::XlaBuilder* builder, const xla::XlaOp& seed, - const TensorShape& shape, double minval, - double maxval) { - // Split the seed into two 32-bit scalars to form a key. - auto seed0 = builder->Reshape(builder->Slice(seed, {0}, {1}, {1}), {}); - auto seed1 = builder->Reshape(builder->Slice(seed, {1}, {2}, {1}), {}); - ThreeFry2x32State key = {seed0, seed1}; - const int64 size = shape.num_elements(); - - const int64 half_size = MathUtil::CeilOfRatio(size, 2); - const bool size_is_odd = (half_size * 2 != size); - - // Fill the generator inputs with unique counter values. 
- ThreeFry2x32State inputs; - TF_CHECK_OK(XlaHelpers::Iota(builder, DT_INT32, half_size, &inputs[0])); - inputs[1] = builder->Add(inputs[0], builder->ConstantR0(half_size)); - ThreeFry2x32State outputs = ThreeFry2x32(builder, inputs, key); - - if (size_is_odd) { - outputs[1] = builder->Slice(outputs[1], {0}, {half_size - 1}, {1}); - } - - auto bits = - builder->Reshape(builder->ConcatInDim(outputs, 0), shape.dim_sizes()); - - // Form 22 random mantissa bits, with a leading 1 bit. The leading 1 bit - // forces the random bits into the mantissa. - constexpr int kFloatBits = 32; - constexpr int kMantissaBits = 23; - bits = builder->Or( - builder->ShiftRightLogical( - bits, builder->ConstantR0(kFloatBits - kMantissaBits)), - builder->ConstantR0(bit_cast(1.0f))); - auto floats = builder->BitcastConvertType(bits, xla::F32); - - // We have a floating point number in the range [1.0, 2.0). - // Subtract 1.0f to shift to the range [0.0, 1.0) - floats = builder->Sub(floats, builder->ConstantR0(1.0f)); - // Multiply and add to shift to the range [minval, maxval). - floats = builder->Mul(floats, builder->ConstantR0(maxval - minval)); - floats = builder->Add(floats, builder->ConstantR0(minval)); - return floats; -} - -// Approximation for the inverse error function from -// Giles, M., "Approximating the erfinv function". 
-// The approximation has the form: -// w = -log((1 - x) * (1 + x)) -// if ( w < 5 ) { -// w = w - 2.5 -// p = sum_{i=1}^n lq[i]*w^i -// } else { -// w = sqrt(w) - 3 -// p = sum_{i=1}^n gq[i]*w^i -// } -// return p*x -xla::XlaOp ErfInvF32(xla::XlaBuilder* b, const xla::XlaOp& x, - const TensorShape& shape) { - constexpr int kDegree = 9; - constexpr std::array w_less_than_5_constants = { - 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, - -4.39150654e-06f, 0.00021858087f, -0.00125372503f, - -0.00417768164f, 0.246640727f, 1.50140941f}; - constexpr std::array w_greater_than_5_constants = { - -0.000200214257f, 0.000100950558f, 0.00134934322f, - -0.00367342844f, 0.00573950773f, -0.0076224613f, - 0.00943887047f, 1.00167406f, 2.83297682f}; - - auto one = b->ConstantR0(1.0); - auto w = b->Neg(b->Log(b->Mul(b->Sub(one, x), b->Add(one, x)))); - - auto lt = b->Lt(w, b->ConstantR0(5.0)); - auto coefficient = [&](int i) { - return b->Select( - lt, - b->Broadcast(b->ConstantR0(w_less_than_5_constants[i]), - shape.dim_sizes()), - b->Broadcast(b->ConstantR0(w_greater_than_5_constants[i]), - shape.dim_sizes())); - }; - w = b->Select(lt, b->Sub(w, b->ConstantR0(2.5f)), - b->Sub(b->SqrtF32(w), b->ConstantR0(3.0f))); - auto p = coefficient(0); - for (int i = 1; i < kDegree; ++i) { - p = b->Add(coefficient(i), b->Mul(p, w)); - } - return b->Mul(p, x); -} - -} // namespace - class StatelessRandomUniformOp : public XlaOpKernel { public: explicit StatelessRandomUniformOp(OpKernelConstruction* ctx) @@ -226,7 +50,17 @@ class StatelessRandomUniformOp : public XlaOpKernel { errors::InvalidArgument("seed must have shape [2], not ", seed_shape.DebugString())); xla::XlaOp seed = ctx->Input(1); - ctx->SetOutput(0, RandomUniform(builder, seed, shape, 0.0, 1.0)); + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape)); + + auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + 
+ auto uniform = xla::StatelessRngUniform( + {seed0, seed1}, xla_shape, xla::ConstantR0(builder, 0.0), + xla::ConstantR0(builder, 1.0)); + ctx->SetOutput(0, uniform); } private: @@ -255,12 +89,20 @@ class StatelessRandomNormalOp : public XlaOpKernel { seed_shape.DebugString())); xla::XlaOp seed = ctx->Input(1); xla::XlaBuilder* builder = ctx->builder(); - auto uniform = - RandomUniform(builder, seed, shape, std::nextafter(-1.0f, 0.0f), 1.0); + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape)); + + auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + + auto uniform = xla::StatelessRngUniform( + {seed0, seed1}, xla_shape, + xla::ConstantR0(builder, std::nextafter(-1.0f, 0.0f)), + xla::ConstantR0(builder, 1.0)); // Convert uniform distribution to normal distribution by computing // sqrt(2) * erfinv(x) - auto normal = builder->Mul(builder->ConstantR0(std::sqrt(2.0)), - ErfInvF32(builder, uniform, shape)); + auto normal = + xla::ScalarLike(uniform, std::sqrt(2.0)) * xla::ErfInv(uniform); ctx->SetOutput(0, normal); } @@ -275,4 +117,44 @@ REGISTER_XLA_OP(Name("StatelessRandomNormal") .TypeConstraint("Tseed", DT_INT32), StatelessRandomNormalOp); +class StatelessTruncatedNormalOp : public XlaOpKernel { + public: + explicit StatelessTruncatedNormalOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + TensorShape seed_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, seed_shape == TensorShape({2}), + errors::InvalidArgument("seed must have shape [2], not ", + seed_shape.DebugString())); + xla::XlaOp seed = ctx->Input(1); + xla::XlaBuilder* builder = ctx->builder(); + + auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + + xla::Shape 
xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape)); + auto uniform = xla::StatelessRngUniform( + {seed0, seed1}, xla_shape, + xla::ConstantR0(builder, std::numeric_limits::min()), + xla::ConstantR0(builder, 1.0)); + + ctx->SetOutput(0, TruncatedNormal(uniform)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp); +}; + +REGISTER_XLA_OP(Name("StatelessTruncatedNormal") + .CompileTimeConstInput("shape") + .TypeConstraint("dtype", DT_FLOAT) + .TypeConstraint("Tseed", DT_INT32), + StatelessTruncatedNormalOp); + +} // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 55254c746e5ebaf6b468c24ab59b968bf0d6260b..472d4744d7d9cec65645c3259b0c097f0c756bac 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -14,17 +14,18 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/util/strided_slice_op.h" +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mem.h" namespace tensorflow { @@ -92,12 +93,12 @@ class StridedSliceOp : public XlaOpKernel { xla::XlaOp slice = ctx->Input(0); if (!dimensions_to_reverse.empty()) { - slice = ctx->builder()->Rev(slice, dimensions_to_reverse); + slice = xla::Rev(slice, dimensions_to_reverse); } - slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides); + slice = xla::Slice(slice, slice_begin, slice_end, slice_strides); - slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes()); + slice = xla::Reshape(slice, final_shape.dim_sizes()); ctx->SetOutput(0, slice); } @@ -171,7 +172,7 @@ class StridedSliceGradOp : public XlaOpKernel { xla::XlaOp grad = ctx->Input(4); // Undo any new/shrink axes. - grad = ctx->builder()->Reshape(grad, processing_shape.dim_sizes()); + grad = xla::Reshape(grad, processing_shape.dim_sizes()); // Pad the input gradients. 
gtl::InlinedVector dimensions_to_reverse; @@ -204,9 +205,9 @@ class StridedSliceGradOp : public XlaOpKernel { } } if (!dimensions_to_reverse.empty()) { - grad = ctx->builder()->Rev(grad, dimensions_to_reverse); + grad = xla::Rev(grad, dimensions_to_reverse); } - grad = ctx->builder()->Pad(grad, zero, padding_config); + grad = xla::Pad(grad, zero, padding_config); ctx->SetOutput(0, grad); } @@ -306,17 +307,17 @@ class StridedSliceAssignOp : public XlaOpKernel { } if (!dimensions_to_reverse.empty()) { - rhs = ctx->builder()->Rev(rhs, dimensions_to_reverse); + rhs = xla::Rev(rhs, dimensions_to_reverse); } - rhs = ctx->builder()->Reshape(rhs, slice_dims); + rhs = xla::Reshape(rhs, slice_dims); if (lhs_shape.dims() == 0) { // TODO(b/38323843): DynamicUpdateSlice crashes on rank 0 inputs. Fix // and remove this workaround. lhs = rhs; } else { - lhs = ctx->builder()->DynamicUpdateSlice( - lhs, rhs, ctx->builder()->ConstantR1(slice_begin)); + lhs = xla::DynamicUpdateSlice( + lhs, rhs, xla::ConstantR1(ctx->builder(), slice_begin)); } OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 9adee78a1fd1fb9a12afae83197425c328b5fe7e..bb114d1aedd57c7de992a05b37ad53443489596f 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -25,7 +25,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/tf2xla/xla_resource.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/register_types.h" @@ -121,12 +122,11 @@ Status GetTensorArrayShape(const XlaResource* resource, // relevant slice of 'operand'. xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand, const xla::XlaOp& update, - const gtl::ArraySlice& update_dims, + absl::Span update_dims, const xla::XlaOp& start_indices) { - xla::XlaOp current = - builder->DynamicSlice(operand, start_indices, update_dims); - xla::XlaOp sum = builder->Add(current, update); - return builder->DynamicUpdateSlice(operand, sum, start_indices); + xla::XlaOp current = xla::DynamicSlice(operand, start_indices, update_dims); + xla::XlaOp sum = xla::Add(current, update); + return xla::DynamicUpdateSlice(operand, sum, start_indices); } class TensorArrayOp : public XlaOpKernel { @@ -162,7 +162,7 @@ class TensorArrayOp : public XlaOpKernel { ta_shape.AddDim(size); ta_shape.AppendShape(shape); xla::XlaOp zero = XlaHelpers::Zero(b, dtype_); - value = b->Broadcast(zero, ta_shape.dim_sizes()); + value = xla::Broadcast(zero, ta_shape.dim_sizes()); } XlaContext& xc = XlaContext::Get(ctx); @@ -215,12 +215,12 @@ class TensorArrayWriteOp : public XlaOpKernel { // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
auto start_indices = - b->Pad(b->Reshape(index, {1}), b->ConstantR0(0), - xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); + xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0(b, 0), + xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); TensorShape slice_shape = elem_shape; slice_shape.InsertDim(0, 1LL); - auto update = b->Reshape(value, slice_shape.dim_sizes()); + auto update = xla::Reshape(value, slice_shape.dim_sizes()); xla::XlaOp written = DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); @@ -259,17 +259,17 @@ class TensorArrayReadOp : public XlaOpKernel { // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. auto start_indices = - b->Pad(b->Reshape(index, {1}), b->ConstantR0(0), - xla::MakeEdgePaddingConfig({{0, ta_shape.dims() - 1}})); + xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0(b, 0), + xla::MakeEdgePaddingConfig({{0, ta_shape.dims() - 1}})); auto slice_shape = ta_shape.dim_sizes(); slice_shape[0] = 1LL; - xla::XlaOp read = b->DynamicSlice(ta, start_indices, slice_shape); + xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape); // Remove the leading '1' dimension. std::vector value_shape(slice_shape.begin() + 1, slice_shape.end()); - ctx->SetOutput(0, b->Reshape(read, value_shape)); + ctx->SetOutput(0, xla::Reshape(read, value_shape)); } private: @@ -326,7 +326,7 @@ class TensorArrayGatherOp : public XlaOpKernel { for (auto i = 1; i < ta_shape.dims(); i++) { end[i] = ta_shape.dim_size(i); } - ctx->SetOutput(0, b->Slice(ta, begin, end, strides)); + ctx->SetOutput(0, xla::Slice(ta, begin, end, strides)); return; } } @@ -391,7 +391,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } if (scatter_all_elements_in_order) { - ta = b->Add(ta, value); + ta = xla::Add(ta, value); } else { auto slice_dims = value_shape.dim_sizes(); slice_dims[0] = 1LL; @@ -407,13 +407,13 @@ class TensorArrayScatterOp : public XlaOpKernel { // Slice out part of the value. 
value_starts[0] = i; value_ends[0] = i + 1; - auto slice = b->Slice(value, value_starts, value_ends, value_strides); + auto slice = xla::Slice(value, value_starts, value_ends, value_strides); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. - auto index = b->Slice(indices, {i}, {i + 1}, {1}); + auto index = xla::Slice(indices, {i}, {i + 1}, {1}); auto start_indices = - b->Pad(b->Reshape(index, {1}), b->ConstantR0(0), - xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); + xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0(b, 0), + xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}})); ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices); } } @@ -452,7 +452,7 @@ class TensorArrayConcatOp : public XlaOpKernel { auto ta_dims = ta_shape.dim_sizes(); std::vector shape(ta_dims.begin() + 1, ta_dims.end()); shape[0] *= ta_shape.dim_size(0); - ctx->SetOutput(0, b->Reshape(ta, shape)); + ctx->SetOutput(0, xla::Reshape(ta, shape)); Tensor lengths(DT_INT64, {ta_dims[0]}); auto lengths_vec = lengths.vec(); @@ -522,8 +522,8 @@ class TensorArraySplitOp : public XlaOpKernel { value_shape.DebugString(), " vs. ", ta_shape.DebugString())); - OP_REQUIRES_OK(ctx, resource->SetValue(b->Add( - ta, b->Reshape(value, ta_shape.dim_sizes())))); + OP_REQUIRES_OK(ctx, resource->SetValue(xla::Add( + ta, xla::Reshape(value, ta_shape.dim_sizes())))); ctx->SetOutput(0, flow); } diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc index e91075196bd8414939888e22b5483ad637487af6..93d5996b5eaf10221b1d7067e7650b78cd6b8fef 100644 --- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc @@ -16,16 +16,17 @@ limitations under the License. // XLA-specific Tile Op. 
#include +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/type_index.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { @@ -69,7 +70,7 @@ class TileOp : public XlaOpKernel { bool one_dimension_is_broadcasted_without_multiple = true; for (int i = 0; i < input_dims; ++i) { int multiple = literal.Get({i}); - OP_REQUIRES(ctx, multiple, + OP_REQUIRES(ctx, multiple >= 0, errors::InvalidArgument("Expected multiples[", i, "] >= 0, but got ", multiple)); int64 new_dim = input_shape.dim_size(i) * multiple; @@ -93,9 +94,9 @@ class TileOp : public XlaOpKernel { if (one_dimension_is_broadcasted_without_multiple) { // Create a constant Zero the size of the output shape to leverage binary // operation broadcast semantics. - auto broadcasted_zero = ctx->builder()->Broadcast( + auto broadcasted_zero = xla::Broadcast( XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)), output_shape); - ctx->SetOutput(0, ctx->builder()->Add(broadcasted_zero, input)); + ctx->SetOutput(0, xla::Add(broadcasted_zero, input)); return; } @@ -103,7 +104,7 @@ class TileOp : public XlaOpKernel { // dimension. This prepends the broadcasted dimensions, so an // input of shape [2,3,1] broadcast with multiples [5,4,3] will // end up with shape [5,4,3,2,3,1]. - auto broadcasted = ctx->builder()->Broadcast(input, multiples_array); + auto broadcasted = xla::Broadcast(input, multiples_array); // Now flatten and reshape. 
The broadcasted dimensions are // paired with the original dimensions so in the above example // we flatten [0,3,1,4,2,5] then reshape to [10,12,3]. @@ -112,8 +113,7 @@ class TileOp : public XlaOpKernel { flattened.push_back(i); flattened.push_back(i + output_shape.size()); } - xla::XlaOp output = - ctx->builder()->Reshape(broadcasted, flattened, output_shape); + xla::XlaOp output = xla::Reshape(broadcasted, flattened, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..183879c7602ccbbd74fca6cb9fa3fc94c066c37d --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc @@ -0,0 +1,67 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/no_op.h" + +namespace tensorflow { +namespace { + +class TopKOp : public XlaOpKernel { + public: + explicit TopKOp(OpKernelConstruction* context) : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("sorted", &sorted_)); + } + + void Compile(XlaOpKernelContext* context) override { + int64 k; + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(1, &k)); + OP_REQUIRES(context, k >= 0, + errors::InvalidArgument("Need k >= 0, got ", k)); + const TensorShape input_shape = context->InputShape(0); + OP_REQUIRES(context, input_shape.dims() >= 1, + errors::InvalidArgument("input must be >= 1-D, got shape ", + input_shape.DebugString())); + int last_dim = input_shape.dims() - 1; + int last_dim_size = input_shape.dim_size(last_dim); + OP_REQUIRES( + context, last_dim_size >= k, + errors::InvalidArgument("input must have at least k columns. 
Had ", + last_dim_size, ", needed ", k)); + if (last_dim_size < k) { + k = last_dim_size; + } + xla::XlaOp output_tuple = TopK(context->Input(0), k); + context->SetOutput(0, xla::GetTupleElement(output_tuple, 0)); + context->SetOutput(1, xla::GetTupleElement(output_tuple, 1)); + } + + private: + bool sorted_; +}; + +REGISTER_XLA_OP(Name("TopKV2").CompileTimeConstInput("k").TypeConstraint( + "T", {DT_UINT32, DT_INT32, DT_FLOAT, DT_BFLOAT16}), + TopKOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index 34caefa050c0d58f5f7bad557286b6ed64b996ad..7077c2e3a546e198bdb4ff944ea531f3158810f2 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -16,8 +16,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/no_op.h" @@ -31,7 +33,6 @@ class ResourceApplyGradientDescent : public XlaOpKernel { : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { xla::XlaOp handle; - xla::XlaBuilder* b = ctx->builder(); DataType type = ctx->input_type(1); TensorShape var_shape; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle)); @@ -48,7 +49,7 @@ class ResourceApplyGradientDescent : public XlaOpKernel { var_shape.DebugString(), " vs ", 
delta_shape.DebugString())); - handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2))); + handle = handle - ctx->Input(1) * ctx->Input(2); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; @@ -56,6 +57,64 @@ REGISTER_XLA_OP( Name("ResourceApplyGradientDescent").TypeConstraint("T", kFloatTypes), ResourceApplyGradientDescent); +xla::XlaOp ProximalGradientDescentUpdate(xla::XlaOp var, xla::XlaOp lr, + xla::XlaOp l1, xla::XlaOp l2, + xla::XlaOp grad) { + xla::XlaOp one = xla::ScalarLike(lr, 1.0); + xla::XlaOp zero = xla::ScalarLike(lr, 0.0); + xla::XlaOp prox_var = var - grad * lr; + xla::XlaOp l1_gt_zero = xla::Sign(prox_var) * + xla::Max(xla::Abs(prox_var) - lr * l1, zero) / + (one + lr * l2); + xla::XlaOp l1_le_zero = prox_var / (one + lr * l2); + return xla::Select(xla::Gt(l1, zero), l1_gt_zero, l1_le_zero); +} + +class ResourceApplyProximalGradientDescent : public XlaOpKernel { + public: + explicit ResourceApplyProximalGradientDescent(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp var; + TensorShape var_shape; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + + TensorShape alpha_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("alpha is not a scalar: ", + alpha_shape.DebugString())); + TensorShape l1_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("l1 is not a scalar: ", + l1_shape.DebugString())); + TensorShape l2_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("l2 is not a scalar: ", + l2_shape.DebugString())); + TensorShape delta_shape = ctx->InputShape(4); + OP_REQUIRES( + ctx, var_shape.IsSameSize(delta_shape), + errors::InvalidArgument("var and delta do not have the same shape: ", + 
var_shape.DebugString(), " vs ", + delta_shape.DebugString())); + xla::XlaOp alpha = ctx->Input(1); + xla::XlaOp l1 = ctx->Input(2); + xla::XlaOp l2 = ctx->Input(3); + xla::XlaOp delta = ctx->Input(4); + var = ProximalGradientDescentUpdate(var, alpha, l1, l2, delta); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyProximalGradientDescent") + .TypeConstraint("T", kFloatTypes), + ResourceApplyProximalGradientDescent); + class ResourceApplyMomentum : public XlaOpKernel { public: explicit ResourceApplyMomentum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { @@ -63,8 +122,6 @@ class ResourceApplyMomentum : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* b = ctx->builder(); - DataType type = ctx->input_type(2); TensorShape var_shape, accum_shape; @@ -97,14 +154,13 @@ class ResourceApplyMomentum : public XlaOpKernel { xla::XlaOp grad = ctx->Input(3); xla::XlaOp momentum = ctx->Input(4); - accum = b->Add(b->Mul(accum, momentum), grad); + accum = accum * momentum + grad; if (use_nesterov_) { // See https://github.com/tensorflow/tensorflow/pull/2798 for an // explanation of the reparameterization used here. 
- var = b->Sub( - var, b->Add(b->Mul(grad, lr), b->Mul(b->Mul(accum, momentum), lr))); + var = var - (grad * lr + accum * momentum * lr); } else { - var = b->Sub(var, b->Mul(accum, lr)); + var = var - accum * lr; } OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum)); @@ -121,8 +177,6 @@ class ResourceApplyAdagrad : public XlaOpKernel { explicit ResourceApplyAdagrad(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* b = ctx->builder(); - DataType type = ctx->input_type(2); TensorShape var_shape, accum_shape; @@ -149,10 +203,8 @@ class ResourceApplyAdagrad : public XlaOpKernel { xla::XlaOp lr = ctx->Input(2); xla::XlaOp grad = ctx->Input(3); - accum = b->Add(accum, b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0))); - var = b->Sub( - var, b->Mul(b->Mul(grad, lr), - b->Pow(accum, XlaHelpers::FloatLiteral(b, type, -0.5)))); + accum = accum + xla::Square(grad); + var = var - grad * lr * xla::Rsqrt(accum); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum)); } @@ -160,6 +212,139 @@ class ResourceApplyAdagrad : public XlaOpKernel { REGISTER_XLA_OP(Name("ResourceApplyAdagrad").TypeConstraint("T", kFloatTypes), ResourceApplyAdagrad); +class ResourceApplyProximalAdagrad : public XlaOpKernel { + public: + explicit ResourceApplyProximalAdagrad(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape var_shape, accum_shape; + xla::XlaOp var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(1, dtype_, &accum_shape, &accum)); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", 
accum_shape.DebugString())); + + TensorShape lr_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + TensorShape l1_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l1_shape), + errors::InvalidArgument("l1 is not a scalar: ", + l1_shape.DebugString())); + TensorShape l2_shape = ctx->InputShape(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2_shape), + errors::InvalidArgument("l2 is not a scalar: ", + l2_shape.DebugString())); + TensorShape grad_shape = ctx->InputShape(5); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape: ", + var_shape.DebugString(), " vs ", grad_shape.DebugString())); + + xla::XlaOp lr = ctx->Input(2); + xla::XlaOp l1 = ctx->Input(3); + xla::XlaOp l2 = ctx->Input(4); + xla::XlaOp grad = ctx->Input(5); + accum = accum + xla::Square(grad); + // Adagrad learning rate. 
+ xla::XlaOp adagrad_lr = lr * xla::Rsqrt(accum); + var = ProximalGradientDescentUpdate(var, adagrad_lr, l1, l2, grad); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP( + Name("ResourceApplyProximalAdagrad").TypeConstraint("T", kFloatTypes), + ResourceApplyProximalAdagrad); + +class ResourceApplyAdagradDA : public XlaOpKernel { + public: + explicit ResourceApplyAdagradDA(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape var_shape, accum_shape, squared_accum_shape; + xla::XlaOp var, accum, squared_accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(1, dtype_, &accum_shape, &accum)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &squared_accum_shape, + &squared_accum)); + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + var_shape.DebugString(), " ", accum_shape.DebugString())); + OP_REQUIRES( + ctx, var_shape.IsSameSize(squared_accum_shape), + errors::InvalidArgument( + "var and squared accum do not have the same shape", + var_shape.DebugString(), " ", squared_accum_shape.DebugString())); + + TensorShape grad_shape = ctx->InputShape(3); + TensorShape lr_shape = ctx->InputShape(4); + TensorShape l1_shape = ctx->InputShape(5); + TensorShape l2_shape = ctx->InputShape(6); + TensorShape global_step_shape = ctx->InputShape(7); + + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(l1_shape), + errors::InvalidArgument("l1 is not a scalar: ", + l1_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(l2_shape), + errors::InvalidArgument("l2 is not a scalar: ", + l2_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step_shape), + errors::InvalidArgument("global step is not a scalar: ", + global_step_shape.DebugString())); + + xla::XlaOp grad = ctx->Input(3); + xla::XlaOp lr = ctx->Input(4); + xla::XlaOp l1 = ctx->Input(5); + xla::XlaOp l2 = ctx->Input(6); + xla::XlaBuilder* const b = ctx->builder(); + xla::XlaOp global_step = + XlaHelpers::ConvertElementType(b, ctx->Input(7), dtype_); + + accum = accum + grad; + squared_accum = squared_accum + xla::Square(grad); + xla::XlaOp zero = xla::ScalarLike(lr, 0.0); + xla::XlaOp denominator = global_step * lr * l2 + xla::Sqrt(squared_accum); + xla::XlaOp l1_le_zero = -lr * accum / denominator; + xla::XlaOp l1_gt_zero = -lr * xla::Sign(accum) * + xla::Max(xla::Abs(accum) - global_step * l1, zero) / + denominator; + + var = xla::Select(xla::Gt(l1, zero), l1_gt_zero, l1_le_zero); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, squared_accum)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyAdagradDA").TypeConstraint("T", kFloatTypes), + ResourceApplyAdagradDA); + class ResourceApplyAdam : public XlaOpKernel { public: explicit ResourceApplyAdam(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { @@ -227,17 +412,12 @@ class ResourceApplyAdam : public XlaOpKernel { // variable <- variable - alpha * m_t / (sqrt(v_t) + epsilon) xla::XlaBuilder* b = ctx->builder(); - xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5); xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0); - xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); - xla::XlaOp alpha = - b->Div(b->Mul(lr, 
b->Pow(b->Sub(one, beta2_power), half)), - b->Sub(one, beta1_power)); - m = b->Add(m, b->Mul(b->Sub(grad, m), b->Sub(one, beta1))); - v = b->Add(v, b->Mul(b->Sub(b->Pow(grad, two), v), b->Sub(one, beta2))); - var = - b->Sub(var, b->Div(b->Mul(m, alpha), b->Add(b->Pow(v, half), epsilon))); + xla::XlaOp alpha = lr * xla::Sqrt(one - beta2_power) / (one - beta1_power); + m = m + (grad - m) * (one - beta1); + v = v + (xla::Square(grad) - v) * (one - beta2); + var = var - m * alpha / (xla::Sqrt(v) + epsilon); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m)); @@ -250,38 +430,112 @@ class ResourceApplyAdam : public XlaOpKernel { REGISTER_XLA_OP(Name("ResourceApplyAdam").TypeConstraint("T", kFloatTypes), ResourceApplyAdam); -class ResourceApplyRMSProp : public XlaOpKernel { +class ResourceApplyAdaMax : public XlaOpKernel { public: - explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + explicit ResourceApplyAdaMax(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } void Compile(XlaOpKernelContext* ctx) override { - xla::XlaBuilder* b = ctx->builder(); + TensorShape var_shape, m_shape, v_shape; + xla::XlaOp var, m, v; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v)); - DataType type = ctx->input_type(3); + TensorShape beta1_power_shape = ctx->InputShape(3); + TensorShape lr_shape = ctx->InputShape(4); + TensorShape beta1_shape = ctx->InputShape(5); + TensorShape beta2_shape = ctx->InputShape(6); + TensorShape epsilon_shape = ctx->InputShape(7); + TensorShape grad_shape = ctx->InputShape(8); - TensorShape var_shape, ms_shape, mom_shape; - xla::XlaOp var, ms, mom; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); - OP_REQUIRES_OK(ctx, 
ctx->ReadVariableInput(1, type, &ms_shape, &ms)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom)); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_shape), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar : ", + lr_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_shape), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_shape), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2_shape.DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(m_shape), + errors::InvalidArgument("var and m do not have the same shape", + var_shape.DebugString(), " ", + m_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(v_shape), + errors::InvalidArgument("var and v do not have the same shape", + var_shape.DebugString(), " ", + v_shape.DebugString())); + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); - TensorShape lr_shape = ctx->InputShape(3); + xla::XlaOp beta1_power = ctx->Input(3); + xla::XlaOp lr = ctx->Input(4); + xla::XlaOp beta1 = ctx->Input(5); + xla::XlaOp beta2 = ctx->Input(6); + xla::XlaOp epsilon = ctx->Input(7); + xla::XlaOp grad = ctx->Input(8); + + xla::XlaOp one = xla::ScalarLike(lr, 1.0); + m = beta1 * m + (one - beta1) * grad; + v = xla::Max(beta2 * v, xla::Abs(grad)); + var = var - lr / (one - beta1_power) * (m / (v + epsilon)); + + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m)); + OP_REQUIRES_OK(ctx, 
ctx->AssignVariable(2, dtype_, v)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyAdaMax").TypeConstraint("T", kFloatTypes), + ResourceApplyAdaMax); + +class ResourceApplyRMSProp : public XlaOpKernel { + public: + explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape var_shape, ms_shape, mom_shape, mg_shape; + xla::XlaOp var, ms, mom, mg; + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput("var", dtype_, &var_shape, &var)); + if (centered_) { + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput("mg", dtype_, &mg_shape, &mg)); + } + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput("ms", dtype_, &ms_shape, &ms)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput("mom", dtype_, &mom_shape, &mom)); + + TensorShape lr_shape = ctx->InputShape("lr"); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), errors::InvalidArgument("lr is not a scalar: ", lr_shape.DebugString())); - TensorShape rho_shape = ctx->InputShape(4); + TensorShape rho_shape = ctx->InputShape("rho"); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho_shape), errors::InvalidArgument("rho is not a scalar: ", rho_shape.DebugString())); - TensorShape momentum_shape = ctx->InputShape(5); + TensorShape momentum_shape = ctx->InputShape("momentum"); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum_shape), errors::InvalidArgument("momentum is not a scalar: ", momentum_shape.DebugString())); - TensorShape epsilon_shape = ctx->InputShape(6); + TensorShape epsilon_shape = ctx->InputShape("epsilon"); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), errors::InvalidArgument("epsilon is not a scalar: ", epsilon_shape.DebugString())); - TensorShape grad_shape = ctx->InputShape(7); + TensorShape grad_shape = ctx->InputShape("grad"); // var should be the same shape as mom and ms. 
OP_REQUIRES(ctx, var_shape.IsSameSize(ms_shape), @@ -297,11 +551,11 @@ class ResourceApplyRMSProp : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::XlaOp lr = ctx->Input(3); - xla::XlaOp rho = ctx->Input(4); - xla::XlaOp momentum = ctx->Input(5); - xla::XlaOp epsilon = ctx->Input(6); - xla::XlaOp grad = ctx->Input(7); + xla::XlaOp lr = ctx->Input("lr"); + xla::XlaOp rho = ctx->Input("rho"); + xla::XlaOp momentum = ctx->Input("momentum"); + xla::XlaOp epsilon = ctx->Input("epsilon"); + xla::XlaOp grad = ctx->Input("grad"); // ms <- rho * ms_{t-1} + (1-rho) * grad * grad // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) @@ -320,25 +574,46 @@ class ResourceApplyRMSProp : public XlaOpKernel { // ms <- grad**2 (1 - rho) + ms * rho // // Which is the equation listed above. - xla::XlaOp new_ms = b->Add( - ms, - b->Mul(b->Sub(b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0)), ms), - b->Sub(XlaHelpers::FloatLiteral(b, type, 1.0), rho))); - xla::XlaOp new_mom = - b->Add(b->Mul(mom, momentum), - b->Mul(b->Mul(grad, lr), - b->Pow(b->Add(new_ms, epsilon), - XlaHelpers::FloatLiteral(b, type, -0.5)))); - xla::XlaOp new_var = b->Sub(var, new_mom); - - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, new_var)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, new_ms)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, type, new_mom)); + xla::XlaOp one = xla::ScalarLike(ms, 1.0); + xla::XlaOp new_ms = xla::Square(grad) * (one - rho) + ms * rho; + xla::XlaOp denominator; + if (centered_) { + mg = grad * (one - rho) + mg * rho; + denominator = new_ms - xla::Square(mg) + epsilon; + } else { + denominator = new_ms + epsilon; + } + xla::XlaOp new_mom = mom * momentum + grad * lr * xla::Rsqrt(denominator); + xla::XlaOp new_var = var - new_mom; + + OP_REQUIRES_OK(ctx, ctx->AssignVariable("var", dtype_, new_var)); + if (centered_) { + OP_REQUIRES_OK(ctx, ctx->AssignVariable("mg", dtype_, mg)); + } 
+ OP_REQUIRES_OK(ctx, ctx->AssignVariable("ms", dtype_, new_ms)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable("mom", dtype_, new_mom)); } + + protected: + bool centered_ = false; + + private: + DataType dtype_; }; REGISTER_XLA_OP(Name("ResourceApplyRMSProp").TypeConstraint("T", kFloatTypes), ResourceApplyRMSProp); +class ResourceApplyCenteredRMSProp : public ResourceApplyRMSProp { + public: + explicit ResourceApplyCenteredRMSProp(OpKernelConstruction* ctx) + : ResourceApplyRMSProp(ctx) { + centered_ = true; + } +}; +REGISTER_XLA_OP( + Name("ResourceApplyCenteredRMSProp").TypeConstraint("T", kFloatTypes), + ResourceApplyCenteredRMSProp); + void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, bool has_l2_shrinkage) { xla::XlaBuilder* b = ctx->builder(); @@ -413,7 +688,7 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, } // grad_to_use = grad + 2 * l2_shrinkage * var - // new_accum = accum + grad_to_use * grad_to_use + // new_accum = accum + grad * grad // linear += grad_to_use - // (new_accum^(-lr_power) - accum^(-lr_power)) / lr * var // quadratic = (new_accum^(-lr_power) / lr) + 2 * l2 @@ -424,21 +699,18 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype, 2.0); xla::XlaOp grad_to_use; if (has_l2_shrinkage) { - grad_to_use = b->Add(grad, b->Mul(two, b->Mul(l2_shrinkage, var))); + grad_to_use = grad + two * l2_shrinkage * var; } else { grad_to_use = grad; } - xla::XlaOp new_accum = b->Add(accum, b->Pow(grad_to_use, two)); - xla::XlaOp new_accum_lr_pow = b->Pow(new_accum, b->Neg(lr_power)); - xla::XlaOp accum_lr_pow = b->Pow(accum, b->Neg(lr_power)); - linear = b->Add( - linear, - b->Sub(grad_to_use, - b->Mul(b->Div(b->Sub(new_accum_lr_pow, accum_lr_pow), lr), var))); - xla::XlaOp linear_clipped = b->Clamp(b->Neg(l1), linear, l1); - xla::XlaOp quadratic = b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2)); - var = b->Div(b->Sub(linear_clipped, linear), quadratic); + xla::XlaOp new_accum = 
accum + xla::Square(grad); + xla::XlaOp new_accum_lr_pow = xla::Pow(new_accum, -lr_power); + xla::XlaOp accum_lr_pow = xla::Pow(accum, -lr_power); + linear = linear + grad_to_use - (new_accum_lr_pow - accum_lr_pow) / lr * var; + xla::XlaOp linear_clipped = xla::Clamp(-l1, linear, l1); + xla::XlaOp quadratic = new_accum_lr_pow / lr + two * l2; + var = (linear_clipped - linear) / quadratic; accum = new_accum; OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype, var)); @@ -478,5 +750,176 @@ class ResourceApplyFtrlV2 : public XlaOpKernel { REGISTER_XLA_OP(Name("ResourceApplyFtrlV2").TypeConstraint("T", kFloatTypes), ResourceApplyFtrlV2); +class ResourceApplyAdadelta : public XlaOpKernel { + public: + explicit ResourceApplyAdadelta(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape var_shape, accum_shape, accum_update_shape; + xla::XlaOp var, accum, accum_update; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(1, dtype_, &accum_shape, &accum)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &accum_update_shape, + &accum_update)); + + TensorShape lr_shape = ctx->InputShape(3); + TensorShape rho_shape = ctx->InputShape(4); + TensorShape epsilon_shape = ctx->InputShape(5); + TensorShape grad_shape = ctx->InputShape(6); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho_shape), + errors::InvalidArgument("rho is not a scalar: ", + rho_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon_shape.DebugString())); + + OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), + errors::InvalidArgument( + "var and accum do not have the same shape", + 
var_shape.DebugString(), " ", accum_shape.DebugString())); + + OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + + xla::XlaOp lr = ctx->Input(3); + xla::XlaOp rho = ctx->Input(4); + xla::XlaOp epsilon = ctx->Input(5); + xla::XlaOp grad = ctx->Input(6); + + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp neg_half = XlaHelpers::FloatLiteral(b, dtype_, -0.5); + xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5); + xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0); + xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); + + accum = rho * accum + (one - rho) * xla::Pow(grad, two); + xla::XlaOp update = xla::Pow(accum_update + epsilon, half) * + xla::Pow(accum + epsilon, neg_half) * grad; + accum_update = rho * accum_update + (one - rho) * xla::Pow(update, two); + var = var - update * lr; + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, accum_update)); + } + + private: + DataType dtype_; +}; +REGISTER_XLA_OP(Name("ResourceApplyAdadelta").TypeConstraint("T", kFloatTypes), + ResourceApplyAdadelta); + +class ResourceApplySignBase : public XlaOpKernel { + public: + explicit ResourceApplySignBase(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape var_shape, m_shape; + xla::XlaOp var, m; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m)); + OP_REQUIRES(ctx, var_shape.IsSameSize(m_shape), + errors::InvalidArgument("var and m do not have the same shape", + var_shape.DebugString(), " ", + m_shape.DebugString())); + TensorShape grad_shape = ctx->InputShape(6); + OP_REQUIRES(ctx, 
var_shape.IsSameSize(grad_shape), + errors::InvalidArgument( + "var and grad do not have the same shape", + var_shape.DebugString(), " ", grad_shape.DebugString())); + CheckScalarParams(ctx); + + xla::XlaOp lr = ctx->Input(2); + xla::XlaOp alpha = ctx->Input(3); + xla::XlaOp sign_decay = ctx->Input(4); + xla::XlaOp beta = ctx->Input(5); + xla::XlaOp grad = ctx->Input(6); + + m = m * beta + grad * (xla::ScalarLike(beta, 1.0) - beta); + xla::XlaOp decay = xla::Sign(grad) * xla::Sign(m) * sign_decay; + + xla::XlaOp grad_scale = ComputeGradientScale(alpha, decay); + var = var - lr * grad_scale * grad; + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m)); + } + + virtual void CheckScalarParams(XlaOpKernelContext* ctx) { + TensorShape lr_shape = ctx->InputShape(2); + TensorShape sign_decay_shape = ctx->InputShape(4); + TensorShape beta_shape = ctx->InputShape(5); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), + errors::InvalidArgument("lr is not a scalar: ", + lr_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sign_decay_shape), + errors::InvalidArgument("sign_decay is not a scalar: ", + sign_decay_shape.DebugString())); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta_shape), + errors::InvalidArgument("beta is not a scalar: ", + beta_shape.DebugString())); + } + + virtual xla::XlaOp ComputeGradientScale(xla::XlaOp alpha, + xla::XlaOp decay) = 0; + + private: + DataType dtype_; +}; + +class ResourceApplyAddSign : public ResourceApplySignBase { + public: + explicit ResourceApplyAddSign(OpKernelConstruction* ctx) + : ResourceApplySignBase(ctx) {} + + void CheckScalarParams(XlaOpKernelContext* ctx) override { + ResourceApplySignBase::CheckScalarParams(ctx); + TensorShape alpha_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("alpha is not a scalar: ", + alpha_shape.DebugString())); + } + + xla::XlaOp 
ComputeGradientScale(xla::XlaOp alpha, xla::XlaOp decay) override { + return alpha + decay; + } +}; +REGISTER_XLA_OP(Name("ResourceApplyAddSign").TypeConstraint("T", kFloatTypes), + ResourceApplyAddSign); + +class ResourceApplyPowerSign : public ResourceApplySignBase { + public: + explicit ResourceApplyPowerSign(OpKernelConstruction* ctx) + : ResourceApplySignBase(ctx) {} + + void CheckScalarParams(XlaOpKernelContext* ctx) override { + ResourceApplySignBase::CheckScalarParams(ctx); + TensorShape logbase_shape = ctx->InputShape(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase_shape), + errors::InvalidArgument("logbase is not a scalar: ", + logbase_shape.DebugString())); + } + + xla::XlaOp ComputeGradientScale(xla::XlaOp alpha, xla::XlaOp decay) override { + return xla::Exp(alpha * decay); + } +}; +REGISTER_XLA_OP(Name("ResourceApplyPowerSign").TypeConstraint("T", kFloatTypes), + ResourceApplyPowerSign); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index c167642174b328a968d7f7ce1f0ad6e0ab8a7a68..f9148b394212777271f9eba51313ee17b19819af 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -32,7 +33,8 @@ namespace { class TransposeOp : public XlaOpKernel { public: - explicit TransposeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + explicit TransposeOp(OpKernelConstruction* ctx, bool conjugate = false) + : XlaOpKernel(ctx), conjugate_(conjugate) {} void Compile(XlaOpKernelContext* ctx) override { const TensorShape input_shape = ctx->InputShape(0); @@ -78,19 +80,37 @@ class TransposeOp : public XlaOpKernel { errors::InvalidArgument(i, " is missing from 'perm' argument.")); } + xla::XlaOp transposed; // 0-D, 1-D, and identity transposes do nothing. if (dims <= 1 || is_identity) { - ctx->SetOutput(0, ctx->Input(0)); - return; + transposed = ctx->Input(0); + } else { + transposed = xla::Transpose(ctx->Input(0), transposed_order); } - ctx->SetOutput(0, - ctx->builder()->Transpose(ctx->Input(0), transposed_order)); + // Conjugate the transposed result if this is ConjugateTransposeOp. + if (conjugate_) { + ctx->SetOutput(0, xla::Conj(transposed)); + } else { + ctx->SetOutput(0, transposed); + } } + + private: + const bool conjugate_; +}; + +class ConjugateTransposeOp : public TransposeOp { + public: + explicit ConjugateTransposeOp(OpKernelConstruction* ctx) + : TransposeOp(ctx, /*conjugate=*/true) {} }; REGISTER_XLA_OP(Name("Transpose").CompileTimeConstInput("perm"), TransposeOp); +REGISTER_XLA_OP(Name("ConjugateTranspose").CompileTimeConstInput("perm"), + ConjugateTransposeOp); + // InvertPermutation frequently forms part of the gradient of Transpose. 
// // inv = InvertPermutationOp(T p) takes a permutation of @@ -127,7 +147,7 @@ class InvertPermutationOp : public XlaOpKernel { output[d] = i; } - ctx->SetOutput(0, ctx->builder()->ConstantR1(output)); + ctx->SetOutput(0, xla::ConstantR1(ctx->builder(), output)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 71a9fd051bfc8db09738a4bfe8ddde447895ecf0..0bdfc05726105e2d18362a691cbe2aab00bf77f3 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -16,24 +16,26 @@ limitations under the License. // Native XLA implementations of simple unary Ops #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { namespace { -// A subclass of a TlaUnaryOp must build the lambda computation that -// describes the scalar->scalar function to apply to each element of -// the input. 
#define XLAJIT_MAKE_UNARY(NAME, COMPUTATION) \ class NAME##Op : public XlaOpKernel { \ public: \ explicit NAME##Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} \ void Compile(XlaOpKernelContext* ctx) { \ xla::XlaBuilder* b = ctx->builder(); \ + (void)b; \ xla::XlaOp x = ctx->Input(0); \ xla::XlaOp y = COMPUTATION; \ ctx->SetOutput(0, y); \ @@ -41,122 +43,74 @@ namespace { }; \ REGISTER_XLA_OP(Name(#NAME), NAME##Op); -XLAJIT_MAKE_UNARY(ComplexAbs, b->Abs(x)); +XLAJIT_MAKE_UNARY(ComplexAbs, xla::Abs(x)); -XLAJIT_MAKE_UNARY(Angle, b->Atan2(b->Imag(x), b->Real(x))); +XLAJIT_MAKE_UNARY(Angle, xla::Atan2(xla::Imag(x), xla::Real(x))); -XLAJIT_MAKE_UNARY(Conj, b->Conj(x)); +XLAJIT_MAKE_UNARY(Conj, xla::Conj(x)); // Return x if x>0, otherwise -x. -XLAJIT_MAKE_UNARY(Abs, b->Abs(x)); - -// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) -XLAJIT_MAKE_UNARY( - Acos, - b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), - b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), - b->Mul(x, x)), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), - b->Add(XlaHelpers::One(b, input_type(0)), x)))); - -// acosh(x) = log(x + sqrt(x^2 - 1)) -// = log(x + sqrt((x+1)*(x-1))) -XLAJIT_MAKE_UNARY( - Acosh, - b->Log(b->Add(x, - b->Pow(b->Mul(b->Add(x, XlaHelpers::One(b, input_type(0))), - b->Sub(x, XlaHelpers::One(b, input_type(0)))), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); - -// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) -XLAJIT_MAKE_UNARY( - Asin, - b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), - b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)), - b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), - b->Mul(x, x)), - XlaHelpers::FloatLiteral(b, input_type(0), - 0.5)))))); - -// asinh(x) = log(x + sqrt(x^2 + 1)) +XLAJIT_MAKE_UNARY(Abs, xla::Abs(x)); +XLAJIT_MAKE_UNARY(Acos, xla::Acos(x)); +XLAJIT_MAKE_UNARY(Acosh, xla::Acosh(x)); +XLAJIT_MAKE_UNARY(Asin, xla::Asin(x)) +XLAJIT_MAKE_UNARY(Asinh, xla::Asinh(x)); +XLAJIT_MAKE_UNARY(Atan, xla::Atan(x)); 
+XLAJIT_MAKE_UNARY(Atanh, xla::Atanh(x)); +XLAJIT_MAKE_UNARY(Ceil, xla::Ceil(x)); +XLAJIT_MAKE_UNARY(Cos, xla::Cos(x)); +XLAJIT_MAKE_UNARY(Cosh, xla::Cosh(x)); +XLAJIT_MAKE_UNARY(Sin, xla::Sin(x)); +XLAJIT_MAKE_UNARY(Exp, xla::Exp(x)); +XLAJIT_MAKE_UNARY(Expm1, xla::Expm1(x)); +XLAJIT_MAKE_UNARY(Floor, xla::Floor(x)); +XLAJIT_MAKE_UNARY(IsFinite, xla::IsFinite(x)); XLAJIT_MAKE_UNARY( - Asinh, - b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x), - XlaHelpers::One(b, input_type(0))), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); - -XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0)))); - -// atanh(x) = 0.5 * log((1 + x) / (1 - x)) -XLAJIT_MAKE_UNARY( - Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x), - b->Sub(XlaHelpers::One(b, input_type(0)), x))), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); -XLAJIT_MAKE_UNARY(Ceil, b->Ceil(x)); -XLAJIT_MAKE_UNARY(Cos, b->Cos(x)); -XLAJIT_MAKE_UNARY(Cosh, - b->Mul(b->Add(b->Exp(x), b->Exp(b->Neg(x))), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); -XLAJIT_MAKE_UNARY(Sin, b->Sin(x)); -XLAJIT_MAKE_UNARY(Exp, b->Exp(x)); - -XLAJIT_MAKE_UNARY(Expm1, b->Expm1(x)); - -XLAJIT_MAKE_UNARY(Floor, b->Floor(x)); -XLAJIT_MAKE_UNARY(IsFinite, b->IsFinite(x)); -XLAJIT_MAKE_UNARY(IsInf, b->Eq(b->Abs(x), - XlaHelpers::FloatLiteral( - b, input_type(0), - std::numeric_limits::infinity()))); -XLAJIT_MAKE_UNARY(IsNan, b->Ne(x, x)); + IsInf, + xla::Eq(xla::Abs(x), + xla::ScalarLike(x, std::numeric_limits::infinity()))); +XLAJIT_MAKE_UNARY(IsNan, xla::Ne(x, x)); // Return 1/x -XLAJIT_MAKE_UNARY(Inv, b->Div(XlaHelpers::One(b, input_type(0)), x)); -XLAJIT_MAKE_UNARY(Reciprocal, b->Div(XlaHelpers::One(b, input_type(0)), x)); -XLAJIT_MAKE_UNARY(Log, b->Log(x)); - -XLAJIT_MAKE_UNARY(Log1p, b->Log1p(x)); +XLAJIT_MAKE_UNARY(Inv, xla::ScalarLike(x, 1.0) / x); +XLAJIT_MAKE_UNARY(Reciprocal, xla::ScalarLike(x, 1.0) / x); +XLAJIT_MAKE_UNARY(Log, xla::Log(x)); +XLAJIT_MAKE_UNARY(Log1p, xla::Log1p(x)); 
-XLAJIT_MAKE_UNARY(Invert, b->Not(x)); -XLAJIT_MAKE_UNARY(LogicalNot, b->Not(x)); -XLAJIT_MAKE_UNARY(Neg, b->Neg(x)); +XLAJIT_MAKE_UNARY(Invert, xla::Not(x)); +XLAJIT_MAKE_UNARY(LogicalNot, xla::Not(x)); +XLAJIT_MAKE_UNARY(Neg, -x); // Implements Banker's rounding: numbers that are equidistant between two // integers are rounded towards even. -static xla::XlaOp Round(xla::XlaBuilder* b, DataType dtype, - const xla::XlaOp& x) { - auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); - auto one = XlaHelpers::FloatLiteral(b, dtype, 1.0); - auto two = XlaHelpers::FloatLiteral(b, dtype, 2.0); - - auto round_val = b->Floor(x); - auto fraction = b->Sub(x, round_val); - auto nearest_even_int = - b->Sub(round_val, b->Mul(two, b->Floor(b->Mul(half, x)))); - auto is_odd = b->Eq(nearest_even_int, one); - return b->Select( - b->Or(b->Gt(fraction, half), b->And(b->Eq(fraction, half), is_odd)), - b->Add(round_val, one), round_val); +xla::XlaOp RoundToEven(xla::XlaOp x) { + auto half = xla::ScalarLike(x, 0.5); + auto one = xla::ScalarLike(x, 1.0); + auto two = xla::ScalarLike(x, 2.0); + + auto round_val = xla::Floor(x); + auto fraction = x - round_val; + auto nearest_even_int = round_val - two * xla::Floor(half * x); + auto is_odd = xla::Eq(nearest_even_int, one); + return xla::Select(xla::Or(xla::Gt(fraction, half), + xla::And(xla::Eq(fraction, half), is_odd)), + round_val + one, round_val); } -XLAJIT_MAKE_UNARY(Rint, Round(b, input_type(0), x)); -XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); +XLAJIT_MAKE_UNARY(Rint, RoundToEven(x)); +XLAJIT_MAKE_UNARY(Round, RoundToEven(x)); -XLAJIT_MAKE_UNARY(Rsqrt, - b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); +XLAJIT_MAKE_UNARY(Rsqrt, xla::Rsqrt(x)); // Expresses sigmoid as a rescaled tanh: sigmoid(x) == (tanh(x/2) + 1) / 2. 
-static xla::XlaOp Sigmoid(xla::XlaBuilder* b, DataType dtype, - const xla::XlaOp& x) { - auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); - return b->Add(half, b->Mul(half, b->Tanh(b->Mul(half, x)))); +xla::XlaOp Sigmoid(xla::XlaOp x) { + auto half = xla::ScalarLike(x, 0.5); + return half + half * xla::Tanh(half * x); } -XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(b, input_type(0), x)); +XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(x)); // Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0. -XLAJIT_MAKE_UNARY(Sign, b->Sign(x)); -XLAJIT_MAKE_UNARY(Sinh, - b->Mul(b->Sub(b->Exp(x), b->Exp(b->Neg(x))), - XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); +XLAJIT_MAKE_UNARY(Sign, xla::Sign(x)); +XLAJIT_MAKE_UNARY(Sinh, xla::Sinh(x)); // softplus(x) = log(1 + exp(x)) // @@ -166,24 +120,94 @@ XLAJIT_MAKE_UNARY(Sinh, // // This is equivalent to: // max(x, 0) + log1p(exp(-abs(x))) -XLAJIT_MAKE_UNARY(Softplus, - b->Add(b->Max(x, XlaHelpers::Zero(b, input_type(0))), - b->Log1p(b->Exp(b->Neg(b->Abs(x)))))); +XLAJIT_MAKE_UNARY(Softplus, xla::Max(x, xla::ScalarLike(x, 0.0)) + + xla::Log1p(xla::Exp(-xla::Abs(x)))); // softsign(x) = x / (abs(x) + 1) -XLAJIT_MAKE_UNARY(Softsign, - b->Div(x, - b->Add(b->Abs(x), XlaHelpers::One(b, input_type(0))))); -XLAJIT_MAKE_UNARY(Sqrt, - b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); -XLAJIT_MAKE_UNARY(Square, b->Mul(x, x)); -XLAJIT_MAKE_UNARY(Tan, b->Div(b->Sin(x), b->Cos(x))); -XLAJIT_MAKE_UNARY(Tanh, b->Tanh(x)); - -XLAJIT_MAKE_UNARY(Real, b->Real(x)); -XLAJIT_MAKE_UNARY(Imag, b->Imag(x)); +XLAJIT_MAKE_UNARY(Softsign, x / (xla::Abs(x) + xla::ScalarLike(x, 1.0))); +XLAJIT_MAKE_UNARY(Sqrt, xla::Sqrt(x)); +XLAJIT_MAKE_UNARY(Square, x* x); +XLAJIT_MAKE_UNARY(Tan, xla::Tan(x)); +XLAJIT_MAKE_UNARY(Tanh, xla::Tanh(x)); + +XLAJIT_MAKE_UNARY(Real, xla::Real(x)); +XLAJIT_MAKE_UNARY(Imag, xla::Imag(x)); #undef XLAJIT_MAKE_UNARY +// Erf/Erfc. For x in (-1, 1), the erf approximation is used; erfc polynomial +// is used outside of this range. 
+class ErfOp : public XlaOpKernel { + public: + explicit ErfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp x = ctx->Input(0); + xla::XlaOp one = xla::ScalarLike(x, 1.0); + auto y = + xla::Select(xla::Gt(xla::Abs(x), one), one - xla::Erfc(x), xla::Erf(x)); + ctx->SetOutput(0, y); + } +}; +REGISTER_XLA_OP(Name("Erf"), ErfOp); + +class ErfcOp : public XlaOpKernel { + public: + explicit ErfcOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp x = ctx->Input(0); + xla::XlaOp one = xla::ScalarLike(x, 1.0); + auto y = + xla::Select(xla::Lt(xla::Abs(x), one), one - xla::Erf(x), xla::Erfc(x)); + ctx->SetOutput(0, y); + } +}; +REGISTER_XLA_OP(Name("Erfc"), ErfcOp); + +class LgammaOp : public XlaOpKernel { + public: + explicit LgammaOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + // Calculate lgamma using the Lanczos approximation + // (https://en.wikipedia.org/wiki/Lanczos_approximation). + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input = ctx->Input(0); + xla::PrimitiveType input_type = ctx->input_xla_type(0); + + if (input_type == xla::F16 || input_type == xla::BF16) { + // The approximation works better with at least 32-bits of accuracy. + xla::XlaOp input_f32 = xla::ConvertElementType(input, xla::F32); + xla::XlaOp result_f32 = xla::Lgamma(input_f32); + xla::XlaOp result_x16 = xla::ConvertElementType(result_f32, input_type); + ctx->SetOutput(0, result_x16); + } else { + xla::XlaOp result = xla::Lgamma(input); + ctx->SetOutput(0, result); + } + } +}; // namespace +REGISTER_XLA_OP(Name("Lgamma"), LgammaOp); + +class DigammaOp : public XlaOpKernel { + public: + explicit DigammaOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + // Calculate lgamma using the Lanczos approximation + // (https://en.wikipedia.org/wiki/Lanczos_approximation). 
+ void Compile(XlaOpKernelContext* ctx) override { + xla::XlaOp input = ctx->Input(0); + xla::PrimitiveType input_type = ctx->input_xla_type(0); + + if (input_type == xla::F16 || input_type == xla::BF16) { + // The approximation works better with at least 32-bits of accuracy. + xla::XlaOp input_f32 = xla::ConvertElementType(input, xla::F32); + xla::XlaOp result_f32 = xla::Digamma(input_f32); + xla::XlaOp result_x16 = xla::ConvertElementType(result_f32, input_type); + ctx->SetOutput(0, result_x16); + } else { + xla::XlaOp result = xla::Digamma(input); + ctx->SetOutput(0, result); + } + } +}; // namespace +REGISTER_XLA_OP(Name("Digamma"), DigammaOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc index f87586ba578a6138e7fb921032e1a71f8c9ac80c..8671632976023fded04c26a9780c1a67638b0916 100644 --- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc @@ -22,7 +22,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -74,10 +75,9 @@ class UnpackOp : public XlaOpKernel { for (int i = 0; i < num; ++i) { start_indices[axis] = i; limit_indices[axis] = i + 1; - auto slice = ctx->builder()->Slice(input, start_indices, limit_indices, - strides); + auto slice = xla::Slice(input, start_indices, limit_indices, strides); // Reshape to drop the 'axis' dimension. 
- auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes()); + auto result = xla::Reshape(slice, output_shape.dim_sizes()); ctx->SetOutput(i, result); } } diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index a163fa0a5b34675e46d0d7c5f4e0ccb1e3fb18eb..2c92a585f5679242d672d0402e617ff199b94f17 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -13,18 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/kernels/shape_util.h" +#include "tensorflow/compiler/tf2xla/lib/scatter.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/no_op.h" namespace tensorflow { namespace { @@ -35,12 +33,33 @@ class VarIsInitializedOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { XlaResource* variable; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &variable)); - ctx->SetOutput(0, - ctx->builder()->ConstantR0(variable->initialized())); + ctx->SetOutput( + 0, xla::ConstantR0(ctx->builder(), variable->initialized())); } }; REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp); +class VariableShapeOp : 
public XlaOpKernel { + public: + explicit VariableShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + DataType variable_dtype; + TensorShape shape; + OP_REQUIRES_OK(ctx, + ctx->GetVariableTypeAndShape(0, &variable_dtype, &shape)); + Tensor shape_constant(out_dtype_, TensorShape({shape.dims()})); + OP_REQUIRES_OK(ctx, TensorShapeToConstant(shape, &shape_constant)); + ctx->SetConstantOutput(0, shape_constant); + } + + private: + DataType out_dtype_; +}; +REGISTER_XLA_OP(Name("VariableShape"), VariableShapeOp); + class ReadVariableOp : public XlaOpKernel { public: explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { @@ -77,7 +96,7 @@ class AssignAddVariableOp : public XlaOpKernel { xla::XlaOp handle; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); - handle = ctx->builder()->Add(handle, ctx->Input(1)); + handle = xla::Add(handle, ctx->Input(1)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; @@ -93,7 +112,7 @@ class AssignSubVariableOp : public XlaOpKernel { xla::XlaOp handle; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); - handle = ctx->builder()->Sub(handle, ctx->Input(1)); + handle = xla::Sub(handle, ctx->Input(1)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; @@ -125,29 +144,152 @@ class ResourceGatherOp : public XlaOpKernel { ctx->SetOutput(0, gather); } }; -REGISTER_XLA_OP(Name("ResourceGather").TypeConstraint("dtype", kNumericTypes), - ResourceGatherOp); +REGISTER_XLA_OP(Name("ResourceGather"), ResourceGatherOp); -class VariableShapeOp : public XlaOpKernel { +class ResourceScatterOp : public XlaOpKernel { public: - explicit VariableShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); + explicit ResourceScatterOp( + OpKernelConstruction* context, bool 
indices_are_vectors, + std::function + combiner) + : XlaOpKernel(context), + indices_are_vectors_(indices_are_vectors), + combiner_(std::move(combiner)) {} + + void Compile(XlaOpKernelContext* context) override { + xla::XlaBuilder* builder = context->builder(); + + DataType dtype = context->input_type(2); + TensorShape var_shape; + xla::XlaOp var_value; + OP_REQUIRES_OK( + context, context->ReadVariableInput(0, dtype, &var_shape, &var_value)); + + const xla::XlaOp indices = context->Input(1); + const xla::XlaOp updates = context->Input(2); + + auto result = XlaScatter(var_value, updates, indices, indices_are_vectors_, + combiner_, builder); + OP_REQUIRES_OK(context, result.status()); + OP_REQUIRES_OK(context, + context->AssignVariable(0, dtype, result.ValueOrDie())); } - void Compile(XlaOpKernelContext* ctx) override { - DataType variable_dtype; - TensorShape shape; - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(0, &variable_dtype, &shape)); - Tensor shape_constant(out_dtype_, TensorShape({shape.dims()})); - OP_REQUIRES_OK(ctx, TensorShapeToConstant(shape, &shape_constant)); - ctx->SetConstantOutput(0, shape_constant); + private: + const bool indices_are_vectors_; + const std::function + combiner_; +}; + +class ResourceScatterAddOp : public ResourceScatterOp { + public: + explicit ResourceScatterAddOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Add(x, y); } +}; +REGISTER_XLA_OP(Name("ResourceScatterAdd"), ResourceScatterAddOp); + +class ResourceScatterSubOp : public ResourceScatterOp { + public: + explicit ResourceScatterSubOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - DataType out_dtype_; + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return 
xla::Sub(x, y); + } }; +REGISTER_XLA_OP(Name("ResourceScatterSub"), ResourceScatterSubOp); + +class ResourceScatterMulOp : public ResourceScatterOp { + public: + explicit ResourceScatterMulOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Mul(x, y); + } +}; +REGISTER_XLA_OP(Name("ResourceScatterMul"), ResourceScatterMulOp); + +class ResourceScatterDivOp : public ResourceScatterOp { + public: + explicit ResourceScatterDivOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Div(x, y); + } +}; +REGISTER_XLA_OP(Name("ResourceScatterDiv"), ResourceScatterDivOp); + +class ResourceScatterMinOp : public ResourceScatterOp { + public: + explicit ResourceScatterMinOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Min(x, y); + } +}; +REGISTER_XLA_OP(Name("ResourceScatterMin"), ResourceScatterMinOp); + +class ResourceScatterMaxOp : public ResourceScatterOp { + public: + explicit ResourceScatterMaxOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Max(x, y); + } +}; +REGISTER_XLA_OP(Name("ResourceScatterMax"), ResourceScatterMaxOp); + +class ResourceScatterUpdateOp : public ResourceScatterOp { + public: + explicit ResourceScatterUpdateOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/false, + /*combiner=*/{}) {} +}; 
+REGISTER_XLA_OP(Name("ResourceScatterUpdate"), ResourceScatterUpdateOp); + +class ResourceScatterNdUpdateOp : public ResourceScatterOp { + public: + explicit ResourceScatterNdUpdateOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/true, + /*combiner=*/{}) {} +}; +REGISTER_XLA_OP(Name("ResourceScatterNdUpdate"), ResourceScatterNdUpdateOp); + +class ResourceScatterNdAddOp : public ResourceScatterOp { + public: + explicit ResourceScatterNdAddOp(OpKernelConstruction* context) + : ResourceScatterOp(context, /*indices_are_vectors=*/true, + /*combiner=*/Combine) {} + + private: + static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + xla::XlaBuilder* builder) { + return xla::Add(x, y); + } +}; +REGISTER_XLA_OP(Name("ResourceScatterNdAdd"), ResourceScatterNdAddOp); -REGISTER_XLA_OP(Name("VariableShape"), VariableShapeOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 5467c5d9946846ff9f14ce9c5aac9e2be4b9d6ab..296518229ebf0ba46717afc4f26d5ae1551c2862 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -21,8 +21,9 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" @@ -246,7 +247,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } } - xla::XlaOp init = builder->Tuple(inputs); + xla::XlaOp init = xla::Tuple(builder, inputs); VLOG(1) << "Building while loop"; @@ -255,22 +256,21 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { { std::unique_ptr cb = builder->CreateSubBuilder("cond_wrapper"); - auto inputs = cb->Parameter(0, cond_input_shape, "inputs"); - auto outputs = cb->Call(*cond.computation, {inputs}); - cb->GetTupleElement(outputs, 0); + auto inputs = xla::Parameter(cb.get(), 0, cond_input_shape, "inputs"); + auto outputs = xla::Call(cb.get(), *cond.computation, {inputs}); + xla::GetTupleElement(outputs, 0); xla::StatusOr result = cb->Build(); OP_REQUIRES_OK(ctx, result.status()); cond_wrapper = std::move(result.ValueOrDie()); } - xla::XlaOp while_result = - builder->While(cond_wrapper, *body.computation, init); + xla::XlaOp while_result = xla::While(cond_wrapper, *body.computation, init); // Sets non-variable outputs. 
for (int i = 0; i < ctx->num_outputs(); ++i) { if (ctx->input_type(i) != DT_RESOURCE) { ctx->SetOutput(body.input_mapping[i], - builder->GetTupleElement(while_result, i)); + xla::GetTupleElement(while_result, i)); } } @@ -284,7 +284,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, resource->SetFromPack( arguments[update.input_index].tensor_array_gradients, - builder->GetTupleElement(while_result, pos), builder)); + xla::GetTupleElement(while_result, pos), builder)); } VLOG(2) << "Loop-carried variable: pos: " << update.input_index << " name: " << resource->name() << " modified: " << update.modified @@ -300,6 +300,8 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { VLOG(1) << "Done building while loop"; } +REGISTER_XLA_OP(Name("While").AllowResourceTypes(), XlaWhileOp); +REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes(), XlaWhileOp); REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..412afeaaad96842521fbd306f5b666e837e675fd --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc @@ -0,0 +1,115 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace { + +class XlaBroadcastHelperOp : public XlaOpKernel { + public: + explicit XlaBroadcastHelperOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + xla::XlaOp lhs = context->Input(0); + xla::XlaOp rhs = context->Input(1); + const TensorShape lhs_shape = context->InputShape(0); + const TensorShape rhs_shape = context->InputShape(1); + + const bool broadcast_lhs = lhs_shape.dims() < rhs_shape.dims(); + const TensorShape* min_rank_shape = broadcast_lhs ? &lhs_shape : &rhs_shape; + const TensorShape* max_rank_shape = broadcast_lhs ? 
&rhs_shape : &lhs_shape; + + std::vector broadcast_dims; + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("broadcast_dims", + &broadcast_dims)); + if (broadcast_dims.empty()) { + OP_REQUIRES( + context, + lhs_shape.dims() == rhs_shape.dims() || lhs_shape.dims() == 0 || + rhs_shape.dims() == 0, + errors::InvalidArgument( + "If broadcast_dims is empty, both " + "arguments must have equal rank; " + "argument shapes, or at least one argument must be a scalar: ", + lhs_shape.DebugString(), " and ", rhs_shape.DebugString())); + context->SetOutput(0, lhs); + context->SetOutput(1, rhs); + return; + } + + OP_REQUIRES( + context, broadcast_dims.size() == min_rank_shape->dims(), + errors::InvalidArgument( + "broadcast_dims must have size equal to the smaller argument rank; " + "broadcast_dims: [", + absl::StrJoin(broadcast_dims, ","), "]; argument shapes: ", + lhs_shape.DebugString(), " and ", rhs_shape.DebugString())); + std::vector sorted_broadcast_dims = broadcast_dims; + absl::c_sort(sorted_broadcast_dims); + std::set dims_set(broadcast_dims.begin(), broadcast_dims.end()); + OP_REQUIRES(context, + dims_set.size() == broadcast_dims.size() && + broadcast_dims == sorted_broadcast_dims, + errors::InvalidArgument( + "Duplicate or nonmonotonic dimension in broadcast_dims; " + "broadcast_dims: [", + absl::StrJoin(broadcast_dims, ","), "]")); + + std::vector broadcast_shape(max_rank_shape->dims(), 1LL); + for (int i = 0; i < broadcast_dims.size(); ++i) { + const int dim = broadcast_dims[i]; + OP_REQUIRES( + context, dim >= 0 && dim < broadcast_shape.size(), + errors::InvalidArgument( + "Invalid broadcast dimension (", dim, "); broadcast_dims: [", + absl::StrJoin(broadcast_dims, ","), "]; argument shapes: ", + lhs_shape.DebugString(), " and ", rhs_shape.DebugString())); + broadcast_shape[dim] = min_rank_shape->dim_size(i); + } + xla::PrimitiveType type = context->input_xla_type(0); + xla::Shape broadcast_xla_shape = + xla::ShapeUtil::MakeShape(type, broadcast_shape); + 
if (broadcast_lhs) { + lhs = xla::BroadcastInDim(lhs, broadcast_xla_shape, broadcast_dims); + } else { + rhs = xla::BroadcastInDim(rhs, broadcast_xla_shape, broadcast_dims); + } + context->SetOutput(0, lhs); + context->SetOutput(1, rhs); + } + + private: + xla::DotDimensionNumbers dnums_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaBroadcastHelperOp); +}; + +REGISTER_XLA_OP( + Name("XlaBroadcastHelper").CompileTimeConstInput("broadcast_dims"), + XlaBroadcastHelperOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8848623868091f8d19b1622f23ba23c68689d90d --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaConvOp : public XlaOpKernel { + public: + explicit XlaConvOp(OpKernelConstruction* context) : XlaOpKernel(context) { + string dnums_attr; + OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); + OP_REQUIRES( + context, dnums_.ParsePartialFromString(dnums_attr), + errors::InvalidArgument("Error parsing convolution dimension numbers")); + string precision_config_attr; + OP_REQUIRES_OK( + context, context->GetAttr("precision_config", &precision_config_attr)); + OP_REQUIRES( + context, + precision_config_.ParsePartialFromString(precision_config_attr), + errors::InvalidArgument("Error parsing convolution dimension numbers")); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape lhs_shape = context->InputShape(0); + const TensorShape rhs_shape = context->InputShape(1); + const TensorShape padding_shape = context->InputShape("padding"); + std::vector window_strides; + std::vector lhs_dilation; + std::vector rhs_dilation; + int64 feature_group_count; + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides", + &window_strides)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("lhs_dilation", + &lhs_dilation)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("rhs_dilation", + &rhs_dilation)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar( + "feature_group_count", &feature_group_count)); + + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(padding_shape) && + 
padding_shape.dim_size(1) == 2, + errors::InvalidArgument( + "padding must be a matrix with minor dimension 2, got ", + padding_shape.DebugString())); + xla::Literal padding_literal; + OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal( + "padding", &padding_literal)); + std::vector> padding(padding_shape.dim_size(0)); + for (int i = 0; i < padding.size(); ++i) { + padding[i] = {padding_literal.Get({i, 0}), + padding_literal.Get({i, 1})}; + } + + // We do only minimal checking, relying on XLA to check the shape + // invariants. + xla::XlaOp output = xla::ConvGeneralDilated( + context->Input(0), context->Input(1), window_strides, padding, + lhs_dilation, rhs_dilation, dnums_, feature_group_count, + &precision_config_); + context->SetOutput(0, output); + } + + private: + xla::ConvolutionDimensionNumbers dnums_; + xla::PrecisionConfigProto precision_config_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaConvOp); +}; + +REGISTER_XLA_OP(Name("XlaConv") + .CompileTimeConstInput("window_strides") + .CompileTimeConstInput("lhs_dilation") + .CompileTimeConstInput("rhs_dilation") + .CompileTimeConstInput("feature_group_count") + .CompileTimeConstInput("padding"), + XlaConvOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fed53e5c072e1a50e0f07f45357ee86c90f986f --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaDotOp : public XlaOpKernel { + public: + explicit XlaDotOp(OpKernelConstruction* context) : XlaOpKernel(context) { + string dnums_attr; + OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr)); + OP_REQUIRES( + context, dnums_.ParsePartialFromString(dnums_attr), + errors::InvalidArgument("Error parsing convolution dimension numbers")); + string precision_config_attr; + OP_REQUIRES_OK( + context, context->GetAttr("precision_config", &precision_config_attr)); + OP_REQUIRES( + context, + precision_config_.ParsePartialFromString(precision_config_attr), + errors::InvalidArgument("Error parsing convolution dimension numbers")); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape lhs_shape = context->InputShape(0); + const TensorShape rhs_shape = context->InputShape(1); + + // We do only minimal checking, relying on XLA to check the shape + // invariants. 
+ xla::XlaOp output = xla::DotGeneral(context->Input(0), context->Input(1), + dnums_, &precision_config_); + context->SetOutput(0, output); + } + + private: + xla::DotDimensionNumbers dnums_; + xla::PrecisionConfigProto precision_config_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaDotOp); +}; + +REGISTER_XLA_OP(Name("XlaDot"), XlaDotOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..59502d83c7338bd1b05b3323a97761fff2da186a --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc @@ -0,0 +1,105 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaPadOp : public XlaOpKernel { + public: + explicit XlaPadOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape("input"); + const TensorShape padding_value_shape = + context->InputShape("padding_value"); + + std::vector padding_low; + std::vector padding_high; + std::vector padding_interior; + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("padding_low", + &padding_low)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("padding_high", + &padding_high)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector( + "padding_interior", &padding_interior)); + + OP_REQUIRES(context, TensorShapeUtils::IsScalar(padding_value_shape), + errors::InvalidArgument("padding_value must be a scalar")); + const int rank = input_shape.dims(); + OP_REQUIRES(context, rank == padding_low.size(), + errors::InvalidArgument( + "The size of padding_low must be equal to the input " + "rank (", + padding_low.size(), " vs. ", rank, ")")); + OP_REQUIRES(context, rank == padding_high.size(), + errors::InvalidArgument( + "The size of padding_high must be equal to the input " + "rank (", + padding_high.size(), " vs. ", rank, ")")); + OP_REQUIRES(context, rank == padding_interior.size(), + errors::InvalidArgument( + "The size of padding_interior must be equal to the input " + "rank (", + padding_interior.size(), " vs. 
", rank, ")")); + + auto non_negative = [](int64 x) { return x >= 0; }; + OP_REQUIRES( + context, absl::c_all_of(padding_low, non_negative), + errors::InvalidArgument("padding_low must be non-negative, got [", + absl::StrJoin(padding_low, ","), "]")); + OP_REQUIRES( + context, absl::c_all_of(padding_high, non_negative), + errors::InvalidArgument("padding_high must be non-negative, got [", + absl::StrJoin(padding_high, ","), "]")); + OP_REQUIRES( + context, absl::c_all_of(padding_interior, non_negative), + errors::InvalidArgument("padding_interior must be non-negative, got [", + absl::StrJoin(padding_interior, ","), "]")); + + xla::PaddingConfig padding_config; + for (int i = 0; i < rank; ++i) { + auto* dim = padding_config.add_dimensions(); + dim->set_edge_padding_low(padding_low[i]); + dim->set_edge_padding_high(padding_high[i]); + dim->set_interior_padding(padding_interior[i]); + } + + xla::XlaOp output = + xla::Pad(context->Input("input"), context->Input("padding_value"), + padding_config); + context->SetOutput(0, output); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(XlaPadOp); +}; + +REGISTER_XLA_OP(Name("XlaPad") + .CompileTimeConstInput("padding_low") + .CompileTimeConstInput("padding_high") + .CompileTimeConstInput("padding_interior"), + XlaPadOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc2425f37bfa793ce3a106b635c9dffd15b975ff --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc @@ -0,0 +1,102 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaReduceOp : public XlaOpKernel { + public: + explicit XlaReduceOp(OpKernelConstruction* context) : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("reducer", &reducer_)); + OP_REQUIRES_OK(context, context->GetAttr("dimensions_to_reduce", + &dimensions_to_reduce_)); + std::set dims_set(dimensions_to_reduce_.begin(), + dimensions_to_reduce_.end()); + OP_REQUIRES( + context, dims_set.size() == dimensions_to_reduce_.size(), + errors::InvalidArgument("Duplicate dimension in dimensions_to_reduce " + "argument to XlaReduce")); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape("input"); + const TensorShape init_value_shape = context->InputShape("init_value"); + const DataType dtype = context->input_type(0); + + const int rank = input_shape.dims(); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(init_value_shape), + errors::InvalidArgument("init_value must be a scalar")); + + auto dim_in_range = 
[rank](int64 dim) { return dim >= 0 && dim < rank; }; + OP_REQUIRES(context, + rank >= dimensions_to_reduce_.size() && + absl::c_all_of(dimensions_to_reduce_, dim_in_range), + errors::InvalidArgument( + "Invalid dimensions_to_reduce argument to XlaReduce")); + + // Build the reducer function. + XlaCompiler::Argument reducer_arg; + reducer_arg.kind = XlaCompiler::Argument::kParameter; + reducer_arg.type = dtype; + reducer_arg.shape = TensorShape(); + + XlaCompiler::CompileOptions compile_options; + compile_options.use_tuple_arg = false; + compile_options.always_return_tuple = false; + compile_options.resolve_compile_time_constants = false; + compile_options.is_entry_computation = false; + XlaCompiler::CompilationResult reducer; + OP_REQUIRES_OK(context, context->compiler()->CompileFunction( + compile_options, *reducer_, + {reducer_arg, reducer_arg}, &reducer)); + + xla::Shape scalar_shape; + OP_REQUIRES_OK(context, + TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape)); + OP_REQUIRES( + context, + xla::ShapeUtil::Compatible(reducer.xla_output_shape, scalar_shape), + errors::InvalidArgument( + "Invalid output shape of XlaReduce reducer. 
Expected ", + xla::ShapeUtil::HumanString(scalar_shape), " got ", + xla::ShapeUtil::HumanString(reducer.xla_output_shape))); + + xla::XlaOp output = + xla::Reduce(context->Input("input"), context->Input("init_value"), + *reducer.computation, dimensions_to_reduce_); + context->SetOutput(0, output); + } + + private: + const NameAttrList* reducer_; + std::vector dimensions_to_reduce_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaReduceOp); +}; + +REGISTER_XLA_OP(Name("XlaReduce"), XlaReduceOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..089776fcf74fcf6b363dfff5de8d86d7449eacd6 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc @@ -0,0 +1,147 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/kernels/while_op.h" + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class XlaSelectAndScatterOp : public XlaOpKernel { + public: + explicit XlaSelectAndScatterOp(OpKernelConstruction* context) + : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("select", &select_computation_)); + OP_REQUIRES_OK(context, context->GetAttr("scatter", &scatter_computation_)); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const DataType dtype = context->input_type(0); + + std::vector window_dimensions; + std::vector window_strides; + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector( + "window_dimensions", &window_dimensions)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides", + &window_strides)); + + const int rank = input_shape.dims(); + OP_REQUIRES(context, rank == window_dimensions.size(), + errors::InvalidArgument( + "The size of window_dimensions must be equal to the input " + "rank (", + window_dimensions.size(), " vs. ", rank, ")")); + OP_REQUIRES(context, rank == window_strides.size(), + errors::InvalidArgument( + "The size of window_strides must be equal to the input " + "rank (", + window_strides.size(), " vs. 
", rank, ")")); + + XlaCompiler::CompileOptions compile_options; + compile_options.use_tuple_arg = false; + compile_options.resolve_compile_time_constants = false; + compile_options.is_entry_computation = false; + compile_options.always_return_tuple = false; + + // Build the select function. + XlaCompiler::Argument select_arg; + select_arg.kind = XlaCompiler::Argument::kParameter; + select_arg.type = dtype; + select_arg.shape = TensorShape(); + + XlaCompiler::CompilationResult select; + OP_REQUIRES_OK(context, context->compiler()->CompileFunction( + compile_options, *select_computation_, + {select_arg, select_arg}, &select)); + + xla::Shape select_output_shape = xla::ShapeUtil::MakeShape(xla::PRED, {}); + OP_REQUIRES( + context, + xla::ShapeUtil::Compatible(select.xla_output_shape, + select_output_shape), + errors::InvalidArgument( + "Invalid output shape of XlaSelectAndScatter select. Expected ", + xla::ShapeUtil::HumanString(select_output_shape), " got ", + xla::ShapeUtil::HumanString(select.xla_output_shape))); + + // Build the scatter function. + XlaCompiler::Argument scatter_arg; + scatter_arg.kind = XlaCompiler::Argument::kParameter; + scatter_arg.type = dtype; + scatter_arg.shape = TensorShape(); + + XlaCompiler::CompilationResult scatter; + OP_REQUIRES_OK(context, context->compiler()->CompileFunction( + compile_options, *scatter_computation_, + {scatter_arg, scatter_arg}, &scatter)); + + xla::Shape scalar_shape; + OP_REQUIRES_OK(context, + TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape)); + OP_REQUIRES( + context, + xla::ShapeUtil::Compatible(scatter.xla_output_shape, scalar_shape), + errors::InvalidArgument( + "Invalid output shape of scatter. 
Expected ", + xla::ShapeUtil::HumanString(scalar_shape), " got ", + xla::ShapeUtil::HumanString(scatter.xla_output_shape))); + + const TensorShape padding_shape = context->InputShape("padding"); + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(padding_shape) && + padding_shape.dim_size(1) == 2, + errors::InvalidArgument( + "padding must be a matrix with minor dimension 2, got ", + padding_shape.DebugString())); + xla::Literal padding_literal; + OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal( + "padding", &padding_literal)); + std::vector> padding(padding_shape.dim_size(0)); + for (int i = 0; i < padding.size(); ++i) { + padding[i] = {padding_literal.Get({i, 0}), + padding_literal.Get({i, 1})}; + } + + xla::XlaOp output = xla::SelectAndScatterWithGeneralPadding( + context->Input("operand"), *select.computation, window_dimensions, + window_strides, padding, context->Input("source"), + context->Input("init_value"), *scatter.computation); + context->SetOutput(0, output); + } + + private: + const NameAttrList* select_computation_; + const NameAttrList* scatter_computation_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaSelectAndScatterOp); +}; + +REGISTER_XLA_OP(Name("XlaSelectAndScatter") + .CompileTimeConstInput("window_dimensions") + .CompileTimeConstInput("window_strides") + .CompileTimeConstInput("padding"), + XlaSelectAndScatterOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index ee7f5d510ab7a3ce7d3bbe843c5fefd362f79b7b..9365d203f06d9f1cad320353f43db010d39697af 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -25,8 +25,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + 
"//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/core:lib", ], ) @@ -40,12 +40,50 @@ cc_library( ":triangular_solve", ":util", ":while_loop", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "random", + srcs = ["random.cc"], + hdrs = ["random.h"], + deps = [ + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_library( + name = "qr", + srcs = ["qr.cc"], + hdrs = ["qr.h"], + deps = [ + ":batch_dot", + ":util", + ":while_loop", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:lib", ], ) @@ -57,15 +95,16 @@ cc_library( deps = [ ":util", ":while_loop", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", 
"//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -76,13 +115,16 @@ cc_library( deps = [ ":batch_dot", ":util", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:lib", ], ) @@ -94,7 +136,7 @@ xla_test( deps = [ ":triangular_solve", "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -102,7 +144,7 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -116,14 +158,16 @@ cc_library( srcs = ["util.cc"], hdrs = ["util.h"], deps = [ + "//tensorflow/compiler/xla:literal", 
"//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -134,7 +178,7 @@ xla_test( ":batch_dot", ":util", "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -159,8 +203,9 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc index 526694d5a0c7124e1696f34b516f3b202462bc19..d8c050d09e871c80e128989c9fbdb57c266b19ed 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -25,91 +26,100 @@ limitations under the License. 
namespace tensorflow { -xla::StatusOr BatchDot(xla::XlaBuilder* builder, xla::XlaOp x, - xla::XlaOp y, bool transpose_x, - bool transpose_y, bool conjugate_x, - bool conjugate_y) { - TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); - TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y)); - - // Check that both tensors have the same number of dimensions. There must be - // at least two (the batch dimensions can be empty). - if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) { - return errors::InvalidArgument( - "Arguments to BatchedDot have different ranks: ", - xla::ShapeUtil::HumanString(x_shape), " vs. ", - xla::ShapeUtil::HumanString(y_shape)); - } - const int ndims = xla::ShapeUtil::Rank(x_shape); - if (ndims < 2) { - return errors::InvalidArgument( - "Arguments to BatchedDot must have rank >= 2: ", ndims); - } - - // The batch dimensions must be equal and the matrix dimensions must be - // valid. - std::vector batch_dimension_numbers; - for (int i = 0; i < ndims - 2; ++i) { - if (x_shape.dimensions(i) != y_shape.dimensions(i)) { +xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x, + bool transpose_y, bool conjugate_x, bool conjugate_y, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); + TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y)); + + // Check that both tensors have the same number of dimensions. There must be + // at least two (the batch dimensions can be empty). + if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) { return errors::InvalidArgument( - "Dimension ", i, " of inputs to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " vs ", + "Arguments to BatchedDot have different ranks: ", + xla::ShapeUtil::HumanString(x_shape), " vs. 
", xla::ShapeUtil::HumanString(y_shape)); } - batch_dimension_numbers.push_back(i); - } - - int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1); - int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2); - if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { - return errors::InvalidArgument( - "Dimensions ", x_inner_dim, " and ", y_inner_dim, - " of arguments to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x, - " vs. ", xla::ShapeUtil::HumanString(y_shape), - " transpose: ", transpose_y); - } - - // Check for zero lhs/rhs dim size. - if (xla::ShapeUtil::HasZeroElements(x_shape) || - xla::ShapeUtil::HasZeroElements(y_shape)) { - std::vector dimensions(batch_dimension_numbers.size()); - for (int i = 0; i < batch_dimension_numbers.size(); ++i) { - dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); + const int ndims = xla::ShapeUtil::Rank(x_shape); + if (ndims < 2) { + return errors::InvalidArgument( + "Arguments to BatchedDot must have rank >= 2: ", ndims); + } + + // The batch dimensions must be equal and the matrix dimensions must be + // valid. + std::vector batch_dimension_numbers; + for (int i = 0; i < ndims - 2; ++i) { + if (x_shape.dimensions(i) != y_shape.dimensions(i)) { + return errors::InvalidArgument( + "Dimension ", i, " of inputs to BatchedDot must be equal: ", + xla::ShapeUtil::HumanString(x_shape), " vs ", + xla::ShapeUtil::HumanString(y_shape)); + } + batch_dimension_numbers.push_back(i); + } + + int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1); + int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2); + if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { + return errors::InvalidArgument( + "Dimensions ", x_inner_dim, " and ", y_inner_dim, + " of arguments to BatchedDot must be equal: ", + xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x, + " vs. 
", xla::ShapeUtil::HumanString(y_shape), + " transpose: ", transpose_y); + } + + // Check for zero lhs/rhs dim size. + if (xla::ShapeUtil::IsZeroElementArray(x_shape) || + xla::ShapeUtil::IsZeroElementArray(y_shape)) { + std::vector dimensions(batch_dimension_numbers.size()); + for (int i = 0; i < batch_dimension_numbers.size(); ++i) { + dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); + } + int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2); + int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1); + dimensions.push_back(x_shape.dimensions(x_outer_dim)); + dimensions.push_back(y_shape.dimensions(y_outer_dim)); + return xla::Broadcast( + xla::ConstantLiteral(builder, + xla::LiteralUtil::Zero(x_shape.element_type())), + dimensions); } - int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2); - int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1); - dimensions.push_back(x_shape.dimensions(x_outer_dim)); - dimensions.push_back(y_shape.dimensions(y_outer_dim)); - return builder->Broadcast( - builder->ConstantLiteral(xla::Literal::Zero(x_shape.element_type())), - dimensions); - } - - if (x_shape.element_type() == xla::C64 && conjugate_x) { - x = builder->Conj(x); - } - if (y_shape.element_type() == xla::C64 && conjugate_y) { - y = builder->Conj(y); - } - - // If there are no batch dimensions, use a regular Dot. - // TODO(b/69062148) Remove this code when Dot emitters can be passed - // dimensions to transpose directly (i.e. without requiring a Transpose HLO). - if (batch_dimension_numbers.empty()) { - auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x; - auto rhs = transpose_y ? 
builder->Transpose(y, {1, 0}) : y; - return builder->Dot(lhs, rhs); - } - - xla::DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); - dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); - for (auto batch_dimension_number : batch_dimension_numbers) { - dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); - dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); - } - return builder->DotGeneral(x, y, dot_dnums); + + if (x_shape.element_type() == xla::C64 && conjugate_x) { + x = xla::Conj(x); + } + if (y_shape.element_type() == xla::C64 && conjugate_y) { + y = xla::Conj(y); + } + + xla::PrecisionConfigProto precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + + // If there are no batch dimensions, use a regular Dot. + // TODO(b/69062148) Remove this code when Dot emitters can be passed + // dimensions to transpose directly (i.e. without requiring a Transpose + // HLO). + if (batch_dimension_numbers.empty()) { + auto lhs = transpose_x ? xla::Transpose(x, {1, 0}) : x; + auto rhs = transpose_y ? xla::Transpose(y, {1, 0}) : y; + return xla::Dot(lhs, rhs, &precision_proto); + } + + xla::DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(x_inner_dim); + dot_dnums.add_rhs_contracting_dimensions(y_inner_dim); + for (auto batch_dimension_number : batch_dimension_numbers) { + dot_dnums.add_lhs_batch_dimensions(batch_dimension_number); + dot_dnums.add_rhs_batch_dimensions(batch_dimension_number); + } + + return xla::DotGeneral(x, y, dot_dnums, &precision_proto); + }); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h index 1acc72033b05e73b0f5f88907df20cde5cfffbf0..6cfccd55530ff40a309673d57d1fe61fc8264316 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h @@ -16,8 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace tensorflow { @@ -43,10 +43,11 @@ namespace tensorflow { // It is computed as: // // output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -xla::StatusOr BatchDot(xla::XlaBuilder* builder, xla::XlaOp x, - xla::XlaOp y, bool transpose_x, - bool transpose_y, bool conjugate_x = false, - bool conjugate_y = false); +xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x = false, + bool transpose_y = false, bool conjugate_x = false, + bool conjugate_y = false, + xla::PrecisionConfigProto::Precision precision = + xla::PrecisionConfigProto::DEFAULT); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index 3f1384bc864abd882ebba2b90acbe0b1e664687a..c50a8de33e93a91b1a414146147de48df603eb85 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -22,7 +22,9 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -47,179 +49,169 @@ namespace { // l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) / // l[..., j, j] // return l -xla::StatusOr CholeskyUnblocked(xla::XlaBuilder* builder, - const xla::XlaOp& a) { - TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); - const int n_dims = xla::ShapeUtil::Rank(a_shape); - const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); - gtl::ArraySlice major_dims(xla::AsInt64Slice(a_shape.dimensions()), - /*pos=*/0, - /*len=*/n_dims - 2); - - xla::XlaOp l = Zeros(builder, a_shape); - - // Construct the for loop body to iterate over rows. 
- auto body_fn = [&](xla::XlaOp i, gtl::ArraySlice loop_vars, - xla::XlaBuilder* body_builder) - -> xla::StatusOr> { - xla::Shape col_shape; - xla::Shape row_shape; - for (int64 d : major_dims) { - row_shape.add_dimensions(d); - col_shape.add_dimensions(d); - } - row_shape.add_dimensions(1); - row_shape.add_dimensions(n); - row_shape.set_element_type(a_shape.element_type()); - auto mask_zeros_row = Zeros(body_builder, row_shape); - - col_shape.add_dimensions(n); - col_shape.add_dimensions(1); - col_shape.set_element_type(a_shape.element_type()); - auto mask_zeros_col = Zeros(body_builder, col_shape); - - std::vector mask_vector(n); - std::iota(mask_vector.begin(), mask_vector.end(), 0); - auto mask_range = body_builder->ConstantR1(mask_vector); - auto mask_range_row = body_builder->Broadcast( - body_builder->Reshape(mask_range, {0}, {1, n}), major_dims); - auto mask_range_col = body_builder->Broadcast( - body_builder->Reshape(mask_range, {0}, {n, 1}), major_dims); - auto body_a = loop_vars[0]; - auto body_l = loop_vars[1]; - - // row = l[..., i, :i] - // select the whole i-th row, then mask out all columns past i-1 - auto zero = body_builder->ConstantR0(0); - TF_ASSIGN_OR_RETURN(auto l_i, DynamicSliceInMinorDims(body_builder, body_l, - {i, zero}, {1, n})); - auto row = body_builder->Select(body_builder->Ge(mask_range_row, i), - mask_zeros_row, l_i); - // a[..., i, i] - TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(body_builder, body_a, - {i, i}, {1, 1})); - // np.dot(row, np.swapaxes(row, -1, -2)) - xla::XlaOp diag_dot; - TF_ASSIGN_OR_RETURN(diag_dot, BatchDot(body_builder, row, row, - /*transpose_x=*/false, - /*transpose_y=*/true)); - // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row, - // np.swapaxes(row, -1, -2))) - auto l_ii = body_builder->Pow( - body_builder->Sub(a_ii, diag_dot), - FloatLiteral(body_builder, a_shape.element_type(), 0.5)); - - // a[..., i+1:, i] - auto ip1 = body_builder->Add(i, body_builder->ConstantR0(1)); - // select the whole 
i-th column, then mask out all rows above i+1 +xla::XlaOp CholeskyUnblocked(xla::XlaOp a, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int n_dims = xla::ShapeUtil::Rank(a_shape); + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + auto major_dims = xla::AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - 2); + + xla::XlaOp l = xla::ZerosLike(a); + + // Construct the for loop body to iterate over rows. + auto body_fn = [&](xla::XlaOp i, absl::Span loop_vars, + xla::XlaBuilder* body_builder) + -> xla::StatusOr> { + xla::Shape col_shape; + xla::Shape row_shape; + for (int64 d : major_dims) { + row_shape.add_dimensions(d); + col_shape.add_dimensions(d); + } + row_shape.add_dimensions(1); + row_shape.add_dimensions(n); + row_shape.set_element_type(a_shape.element_type()); + auto mask_zeros_row = xla::Zeros(body_builder, row_shape); + + col_shape.add_dimensions(n); + col_shape.add_dimensions(1); + col_shape.set_element_type(a_shape.element_type()); + auto mask_zeros_col = xla::Zeros(body_builder, col_shape); + + std::vector mask_vector(n); + std::iota(mask_vector.begin(), mask_vector.end(), 0); + auto mask_range = xla::ConstantR1(body_builder, mask_vector); + auto mask_range_row = + xla::Broadcast(xla::Reshape(mask_range, {0}, {1, n}), major_dims); + auto mask_range_col = + xla::Broadcast(xla::Reshape(mask_range, {0}, {n, 1}), major_dims); + auto body_a = loop_vars[0]; + auto body_l = loop_vars[1]; + + // row = l[..., i, :i] + // select the whole i-th row, then mask out all columns past i-1 + auto zero = xla::ConstantR0(body_builder, 0); + auto l_i = DynamicSliceInMinorDims(body_l, {i, zero}, {1, n}); + auto row = xla::Select(xla::Ge(mask_range_row, i), mask_zeros_row, l_i); + // a[..., i, i] + auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 
1}); + // np.dot(row, np.swapaxes(row, -1, -2)) + auto diag_dot = BatchDot(row, row, + /*transpose_x=*/false, + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); + // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row, + // np.swapaxes(row, -1, -2))) + auto l_ii = + xla::Pow(a_ii - diag_dot, + FloatLiteral(body_builder, a_shape.element_type(), 0.5)); + + // a[..., i+1:, i] + // select the whole i-th column, then mask out all rows above i+1 + auto a_0i = DynamicSliceInMinorDims(body_a, {i}, {1}); + auto a_ip1i = + xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, a_0i); + + // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) / + // l[..., i, i] + // The columns in [i, n] are zeroed out in `row`, so we just have to + // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i], + // r.T) + auto dot = BatchDot(body_l, row, + /*transpose_x=*/false, + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); + // np.dot(l[..., i+1:, :i], r.T) + auto dot_ip1 = + xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot); + + body_l = + DynamicUpdateSliceInMinorDims(body_l, (a_ip1i - dot_ip1) / l_ii, {i}); + // Assign the diagonal after the rest of the column because otherwise the + // column assign will wrap around and overwrite the diagonal assign. + body_l = DynamicUpdateSliceInMinorDims(body_l, l_ii, {i, i}); + + return std::vector{body_a, body_l}; + }; + TF_ASSIGN_OR_RETURN( - auto a_0i, DynamicSliceInMinorDims(body_builder, body_a, {i}, {1})); - auto a_ip1i = body_builder->Select(body_builder->Le(mask_range_col, i), - mask_zeros_col, a_0i); - - // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) / - // l[..., i, i] - // The columns in [i, n] are zeroed out in `row`, so we just have to - // zero out rows above i+1 after the BatchDot. 
np.dot(l[..., :, :i], - // r.T) - TF_ASSIGN_OR_RETURN(auto dot, BatchDot(body_builder, body_l, row, - /*transpose_x=*/false, - /*transpose_y=*/true)); - // np.dot(l[..., i+1:, :i], r.T) - auto dot_ip1 = body_builder->Select(body_builder->Le(mask_range_col, i), - mask_zeros_col, dot); - - auto col_update = - body_builder->Div(body_builder->Sub(a_ip1i, dot_ip1), l_ii); - TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims( - body_builder, body_l, col_update, {i})); - // Assign the diagonal after the rest of the column because otherwise the - // column assign will wrap around and overwrite the diagonal assign. - TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims( - body_builder, body_l, l_ii, {i, i})); - - return std::vector{body_a, body_l}; - }; - - TF_ASSIGN_OR_RETURN( - auto cholesky_while, - XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder)); - - return cholesky_while[1]; + auto cholesky_while, + XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder)); + + return cholesky_while[1]; + }); } } // namespace -xla::StatusOr Cholesky(xla::XlaBuilder* builder, xla::XlaOp a, - int64 block_size) { - TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); - const int ndims = xla::ShapeUtil::Rank(a_shape); - if (ndims < 2) { - return errors::InvalidArgument( - "Arguments to Cholesky must have rank >= 2: ", ndims); - } - - const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); - if (n != xla::ShapeUtil::GetDimension(a_shape, -2)) { - return errors::InvalidArgument( - "Arguments to Cholesky must be square matrices: ", - xla::ShapeUtil::HumanString(a_shape)); - } - - if (block_size < 1) { - return errors::InvalidArgument( - "block_size argument to Cholesky must be >= 1; got ", block_size); - } - - // Blocked left-looking Cholesky factorization. - // Algorithm 1 from - // Haidar, Azzam, et al. "High-performance Cholesky factorization for GPU-only - // execution." Proceedings of General Purpose GPUs. ACM, 2017. 
- xla::XlaOp l = Zeros(builder, a_shape); - for (int64 i = 0; i < n; i += block_size) { - int64 k = std::min(block_size, n - i); - if (i > 0) { - // TODO(phawkins): consider implementing SYRK for the diagonal part of - // the panel. - // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i])) - TF_ASSIGN_OR_RETURN(auto lhs, - SliceInMinorDims(builder, l, {i, 0}, {n, i})); - TF_ASSIGN_OR_RETURN(auto rhs, - SliceInMinorDims(builder, l, {i, 0}, {i + k, i})); - TF_ASSIGN_OR_RETURN(auto delta, - BatchDot(builder, lhs, rhs, /*transpose_x=*/false, - /*transpose_y=*/true, /*conjugate_x=*/false, - /*conjugate_y=*/false)); - TF_ASSIGN_OR_RETURN(auto before, - SliceInMinorDims(builder, a, {i, i}, {n, i + k})); - TF_ASSIGN_OR_RETURN( - a, UpdateSliceInMinorDims(builder, a, builder->Sub(before, delta), - {i, i})); +xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int ndims = xla::ShapeUtil::Rank(a_shape); + if (ndims < 2) { + return errors::InvalidArgument( + "Arguments to Cholesky must have rank >= 2: ", ndims); + } + + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + if (n != xla::ShapeUtil::GetDimension(a_shape, -2)) { + return errors::InvalidArgument( + "Arguments to Cholesky must be square matrices: ", + xla::ShapeUtil::HumanString(a_shape)); + } + + if (block_size < 1) { + return errors::InvalidArgument( + "block_size argument to Cholesky must be >= 1; got ", block_size); } - // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k]) - TF_ASSIGN_OR_RETURN(auto x, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto factorized, CholeskyUnblocked(builder, x)); - TF_ASSIGN_OR_RETURN(l, - UpdateSliceInMinorDims(builder, l, factorized, {i, i})); - - if (i + k < n) { - // l[i+k:, i:i+k] = 
trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k]) - TF_ASSIGN_OR_RETURN(auto panel, - SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); - TF_ASSIGN_OR_RETURN(auto update, - TriangularSolve(builder, factorized, panel, - /*left_side=*/false, - /*lower=*/true, - /*transpose_a=*/true, - /*conjugate_a=*/false, - /*block_size=*/block_size)); - TF_ASSIGN_OR_RETURN( - l, UpdateSliceInMinorDims(builder, l, update, {i + k, i})); + // Blocked left-looking Cholesky factorization. + // Algorithm 1 from + // Haidar, Azzam, et al. "High-performance Cholesky factorization for + // GPU-only execution." Proceedings of General Purpose GPUs. ACM, 2017. + xla::XlaOp l = xla::ZerosLike(a); + for (int64 i = 0; i < n; i += block_size) { + int64 k = std::min(block_size, n - i); + if (i > 0) { + // TODO(phawkins): consider implementing SYRK for the diagonal part of + // the panel. + // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i])) + auto lhs = SliceInMinorDims(l, {i, 0}, {n, i}); + auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i}); + auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false, + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); + auto before = SliceInMinorDims(a, {i, i}, {n, i + k}); + a = UpdateSliceInMinorDims(a, before - delta, {i, i}); + } + + // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k]) + auto x = SliceInMinorDims(a, {i, i}, {i + k, i + k}); + auto factorized = CholeskyUnblocked(x, precision); + l = UpdateSliceInMinorDims(l, factorized, {i, i}); + + if (i + k < n) { + // l[i+k:, i:i+k] = + // trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k]) + auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k}); + auto update = TriangularSolve(factorized, panel, + /*left_side=*/false, + /*lower=*/true, + /*transpose_a=*/true, + /*conjugate_a=*/false, + /*block_size=*/block_size); + l = UpdateSliceInMinorDims(l, update, {i + k, i}); + } } - } - return l; + return l; + }); } } // namespace tensorflow diff --git 
a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h index 20fca7969ece2729a44933fd3ef3f87230ab6cad..60cd7ded53fe862f29ca2bb68b175fcd1c89b70c 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.h +++ b/tensorflow/compiler/tf2xla/lib/cholesky.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace tensorflow { @@ -30,8 +30,9 @@ namespace tensorflow { // TODO(phawkins): check for negative values on the diagonal and return an // error, instead of silently yielding NaNs. // TODO(znado): handle the complex Hermitian case -xla::StatusOr Cholesky(xla::XlaBuilder* builder, xla::XlaOp a, - int64 block_size = 256); +xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size = 256, + xla::PrecisionConfigProto::Precision precision = + xla::PrecisionConfigProto::HIGHEST); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a140fa93caec28ebbbd666fd4fa518222ea23a4 --- /dev/null +++ b/tensorflow/compiler/tf2xla/lib/qr.cc @@ -0,0 +1,411 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/qr.h" + +#include +#include + +#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" +#include "tensorflow/compiler/tf2xla/lib/util.h" +#include "tensorflow/compiler/tf2xla/lib/while_loop.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +namespace { + +// Computes a Householder reflection of the form: +// H = I - tau v v.T. +// such that +// H . ( x1 ) = ( x1 ) +// ( x2 ) = ( x2 ) +// ( ... ) = ( ... ) +// ( xk ) = ( beta ) +// ( ... ) ( 0 ) +// ( ... ) ( 0 ) +// Unlike the usual formulation, we allow the caller to supply 'k' rather than +// only providing the relevant part of 'x' to maintain XLA's static shape +// invariant. In addition, the implementation supports batching. +// Pseudo-code, without batching: +// alpha = x[k] +// x_copy = np.copy(x) +// x_copy[:k+1] = 0 +// xnorm = norm2(x_copy) +// if xnorm == 0: +// beta = alpha +// tau = 0 +// v = np.zeros_like(x) +// else: +// beta = - np.sign(alpha) * dlapy2(alpha, xnorm) +// tau = (beta - alpha) / beta +// v = x / (alpha - beta) +// v[k] = 1 +// return (v, tau, beta) +// TODO(phawkins): LAPACK's xLARFG implementation has code for handling +// overflows in the norm/beta calculations. Perhaps do the same here. 
+xla::Status House(xla::XlaOp x, xla::XlaOp k, + absl::Span batch_dims, const int64 m, + xla::XlaOp* v, xla::XlaOp* tau, xla::XlaOp* beta) { + xla::XlaBuilder* const builder = x.builder(); + TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); + const xla::PrimitiveType type = x_shape.element_type(); + + std::vector batch_dim_ids(batch_dims.size()); + std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0); + const int64 minor_dim = batch_dims.size(); + + xla::XlaOp zero = xla::ScalarLike(x, 0.0); + xla::XlaOp one = xla::ScalarLike(x, 1.0); + + // alpha = x[k] + xla::XlaOp alpha = + xla::Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims); + + // Compute x[k+1:] (padded with zeros in elements 0..k) + xla::XlaOp iota = xla::Iota(builder, xla::S32, m); + xla::XlaOp x_after_k = + xla::Mul(x, xla::ConvertElementType(xla::Gt(iota, k), type), + /*broadcast_dimensions=*/{minor_dim}); + + // sigma = np.dot(x[k+1:], x[k+1:]) + auto sigma = + xla::Reduce(x_after_k * x_after_k, zero, + xla::CreateScalarAddComputation(type, builder), {minor_dim}); + // mu = np.sqrt(x[k]*x[k] + sigma) + auto mu = xla::Sqrt(xla::Square(alpha) + sigma); + + auto sigma_is_zero = xla::Eq(sigma, zero); + + *beta = xla::Select(sigma_is_zero, alpha, -xla::Sign(alpha) * mu); + *tau = xla::Select(sigma_is_zero, xla::Broadcast(zero, batch_dims), + (*beta - alpha) / *beta); + auto divisor = xla::Select(sigma_is_zero, xla::Broadcast(one, batch_dims), + alpha - *beta); + + auto e_k = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, k), type), + std::vector(batch_dims.size(), 1)); + + // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor + // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor. + *v = e_k + + xla::Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids); + return Status::OK(); +} + +// Householder QR decomposition. Algorithm 5.2.1 from Golub and Van +// Loan "Matrix Computations", 4th Edition. 
This is an unblocked implementation +// used as an inner routine of the blocked implementation. +// Algorithm is adapted slightly so the shapes inside the loop are static, at +// the cost of some redundant computation. Since this is used as an inner block +// kernel, accumulates the Householder transformations (vs, taus) rather than +// the matrix q. +// Equivalent Python code, without batching: +// def qr(a): +// m = a.shape[0] +// n = a.shape[1] +// vs = np.zeros([m, n]) +// taus = np.zeros([n]) +// for j in xrange(min(m, n)): +// v, tau, beta = house(a[:, j], j) +// # Unusually, we apply the Householder transformation to the entirety of +// # a, wasting FLOPs to maintain the static shape invariant that XLA +// # requires. For columns that precede j this has no effect. +// a[:, :] -= tau * np.dot(v[:, np.newaxis], +// np.dot(v[np.newaxis, :], a[:, :])) +// # Form column j explicitly rather than relying on the precision of the +// # Householder update. +// a[j, j] = beta +// a[j+1:, j] = np.zeros([m - j - 1], dtype=a.dtype) +// vs[:, j] = v +// taus[j] = tau +// return (a, vs, taus) +struct QRBlockResult { + // The factored R value + xla::XlaOp r; + + // Representation of the Householder matrices I - tau v v.T + xla::XlaOp taus; // Shape: [..., n] + xla::XlaOp vs; // Shape: [..., m, n] +}; +xla::StatusOr QRBlock( + xla::XlaOp a, xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int num_dims = xla::ShapeUtil::Rank(a_shape); + if (num_dims < 2) { + return errors::InvalidArgument("Arguments to QR must have rank >= 2: ", + num_dims); + } + xla::PrimitiveType type = a_shape.element_type(); + + const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + + const int64 num_batch_dims = num_dims - 2; + std::vector batch_dims(num_batch_dims); + for (int i = 0; i < num_batch_dims; ++i) { +
batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i); + } + + std::vector batch_dim_indices(num_batch_dims); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + + auto qr_body_fn = + [&](xla::XlaOp j, absl::Span values, + xla::XlaBuilder* builder) -> xla::StatusOr> { + auto a = values[0]; + auto vs = values[1]; + auto taus = values[2]; + + // v, beta = house(a[:, j], j) + auto x = DynamicSliceInMinorDims(a, {j}, {1}); + xla::XlaOp v, tau, beta; + TF_RETURN_IF_ERROR(House(xla::Collapse(x, {num_dims - 2, num_dims - 1}), j, + batch_dims, m, &v, &tau, &beta)); + + std::vector shape = batch_dims; + shape.push_back(1); + shape.push_back(m); + auto v_broadcast = xla::Reshape(v, shape); + // a[:, :] -= tau * np.dot(v[:, np.newaxis], + // np.dot(v[np.newaxis, :], a[:, :])) + auto vva = + BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + vva = + BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + a = a - xla::Mul(tau, vva, + /*broadcast_dimensions=*/batch_dim_indices); + + // It is more precise to populate column 'k' explicitly, rather than + // computing it implicitly by applying the Householder transformation. 
+ // a[k,k] = beta + // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype) + auto iota = xla::Reshape(xla::Iota(a.builder(), xla::S32, m), {m, 1}); + auto predecessor_mask = xla::ConvertElementType(xla::Lt(iota, j), type); + auto mask = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, j), type), + std::vector(batch_dims.size(), 1)); + auto new_x = + xla::Mul(x, predecessor_mask, + /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) + + xla::Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices); + a = DynamicUpdateSliceInMinorDims(a, new_x, {j}); + + // vs[:, j] = v + vs = DynamicUpdateSliceInMinorDims( + vs, xla::Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j}); + // taus[j] = tau + taus = DynamicUpdateSliceInMinorDims( + taus, xla::Reshape(tau, ConcatVectors(batch_dims, {1})), {j}); + return std::vector{a, vs, taus}; + }; + + auto vs = xla::Zeros(builder, xla::ShapeUtil::MakeShape( + type, ConcatVectors(batch_dims, {m, n}))); + auto taus = xla::Zeros( + builder, xla::ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n}))); + + TF_ASSIGN_OR_RETURN(auto values, + XlaForEachIndex(std::min(m, n), xla::S32, qr_body_fn, + {a, vs, taus}, "qr", builder)); + + QRBlockResult result; + result.r = values[0]; + result.vs = values[1]; + result.taus = values[2]; + return result; +} + +// Computes W and Y such that I-WY is equivalent to the sequence of Householder +// transformations given by vs and taus. +// Golub and van Loan, "Matrix Computations", algorithm 5.1.2. +// Y = np.zeros([m, n]) +// W = np.zeros([m, n]) +// Y[:, 0] = vs[:, 0] +// W[:, 0] = -taus[0] * vs[:, 0] +// for j in xrange(1, n): +// v = vs[:, j] +// z = -taus[j] * v - taus[j] * np.dot(W, np.dot(Y.T, v)) +// W[:, j] = z +// Y[:, j] = v +// return W +// There is no need to return Y since at termination of the loop it is equal to +// vs. 
+xla::StatusOr ComputeWYRepresentation( + xla::PrimitiveType type, absl::Span batch_dims, xla::XlaOp vs, + xla::XlaOp taus, int64 m, int64 n, + xla::PrecisionConfigProto::Precision precision) { + std::vector batch_dim_indices(batch_dims.size()); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + int64 n_index = batch_dims.size() + 1; + + auto body_fn = + [&](xla::XlaOp j, absl::Span values, + xla::XlaBuilder* builder) -> xla::StatusOr> { + auto w = values[0]; + auto y = values[1]; + const auto vs = values[2]; + const auto taus = values[3]; + + // Want j values in range [1, ... n). + j = j + xla::ConstantR0(builder, 1); + // vs has shape [..., m, 1] + auto v = DynamicSliceInMinorDims(vs, {j}, {1}); + // beta has shape [..., 1] + auto beta = DynamicSliceInMinorDims(taus, {j}, {1}); + + // yv has shape [..., n, 1] + auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + // wyv has shape [..., m, 1] + auto wyv = + BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + + auto z = xla::Mul( + -beta, v + wyv, + /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); + + w = DynamicUpdateSliceInMinorDims(w, z, {j}); + y = DynamicUpdateSliceInMinorDims(y, v, {j}); + + return std::vector{w, y, vs, taus}; + }; + + xla::XlaBuilder* builder = vs.builder(); + auto w = xla::Zeros(builder, xla::ShapeUtil::MakeShape( + type, ConcatVectors(batch_dims, {m, n}))); + auto y = w; + auto v = SliceInMinorDims(vs, {0}, {1}); + auto beta = SliceInMinorDims(taus, {0}, {1}); + y = UpdateSliceInMinorDims(y, v, {0}); + auto bv = xla::Mul( + -beta, v, + /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); + w = UpdateSliceInMinorDims(w, bv, {0}); + + TF_ASSIGN_OR_RETURN( + auto values, XlaForEachIndex(n - 1, xla::S32, body_fn, {w, y, vs, taus}, + "wy", builder)); + return values[0]; +} + +} // 
namespace + +// Block Householder QR Factorization. Algorithm 5.2.2 of Golub and van Loan. +// def qr_blocked(a, block_size): +// m = a.shape[0] +// n = a.shape[1] +// q = np.eye(m) +// for i in xrange(0, min(m, n), block_size): +// k = min(block_size, min(m, n) - i) +// (a, vs, taus) = qr(a[i:, i:i+k]) +// y = vs +// w = ComputeWYRepresentation(vs, taus, m-i, k) +// a[i:, i+k:] += np.dot(y, np.dot(w.T, a[i:, i+k:])) +// q[:, i:] += np.dot(q[:, i:], np.dot(w, y.T)) +// return (q, a) +// TODO(phawkins): consider using UT transformations (in the form I - V U V') +// rather than WY transformations. +xla::StatusOr QRDecomposition( + xla::XlaOp a, bool full_matrices, int64 block_size, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int num_dims = xla::ShapeUtil::Rank(a_shape); + if (num_dims < 2) { + return errors::InvalidArgument("Arguments to QR must have rank >= 2: ", + num_dims); + } + xla::PrimitiveType type = a_shape.element_type(); + + const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + const int64 p = std::min(m, n); + + if (block_size < 1) { + return errors::InvalidArgument( + "block_size argument to QR must be >= 1; got ", block_size); + } + + const int64 num_batch_dims = num_dims - 2; + std::vector batch_dims(num_batch_dims); + for (int i = 0; i < num_batch_dims; ++i) { + batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i); + } + + auto q = xla::Broadcast(xla::IdentityMatrix(builder, type, m, m), batch_dims); + for (int64 i = 0; i < p; i += block_size) { + int64 k = std::min(block_size, p - i); + + auto a_block = SliceInMinorDims(a, {i, i}, {m, i + k}); + TF_ASSIGN_OR_RETURN(auto qr_block, QRBlock(a_block, precision)); + + a = UpdateSliceInMinorDims(a, qr_block.r, {i, i}); + + // Compute the I-WY block representation of a product of Householder + // matrices.
+ TF_ASSIGN_OR_RETURN( + auto w, ComputeWYRepresentation(type, batch_dims, qr_block.vs, + qr_block.taus, m - i, k, precision)); + auto y = qr_block.vs; + + // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) + auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); + auto a_update = + BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + a_update = + BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + a_panel = a_panel + a_update; + a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); + + // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T) + auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); + auto q_update = + BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + q_update = BatchDot(q_update, y, /*transpose_x=*/false, + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); + q_panel = q_panel + q_update; + q = UpdateSliceInMinorDims(q, q_panel, {0, i}); + } + QRDecompositionResult result; + + // full_matrices is false when only a partial result is needed. Slice to the + // needed dimensions here. + if (!full_matrices) { + q = SliceInMinorDims(q, {0, 0}, {m, p}); + a = SliceInMinorDims(a, {0, 0}, {p, n}); + } + result.q = q; + result.r = a; + return result; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/tf2xla/lib/qr.h new file mode 100644 index 0000000000000000000000000000000000000000..8a389fb7b053257adcd2a338dca52445c78381d1 --- /dev/null +++ b/tensorflow/compiler/tf2xla/lib/qr.h @@ -0,0 +1,43 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace tensorflow { + +// Computes the QR decompositions of a batch of matrices. That is, +// given a (batched) matrix a, computes an orthonormal matrix Q and an +// upper-triangular matrix R such that a = QR. +// `a` must be a (batched) matrix of size [..., m, n]. +// The algorithm implements a blocked QR decomposition; `block_size` is +// the block size to use. +// TODO(phawkins): handle the complex case. +struct QRDecompositionResult { + xla::XlaOp q; + xla::XlaOp r; +}; + +xla::StatusOr QRDecomposition( + xla::XlaOp a, bool full_matrices, int64 block_size = 128, + xla::PrecisionConfigProto::Precision precision = + xla::PrecisionConfigProto::HIGHEST); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_ diff --git a/tensorflow/compiler/tf2xla/lib/random.cc b/tensorflow/compiler/tf2xla/lib/random.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e7cf00ee5e063aef36a9531ff87d8fe6928ca1f --- /dev/null +++ b/tensorflow/compiler/tf2xla/lib/random.cc @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/random.h" + +#include +#include + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace tensorflow { + +xla::XlaOp TruncatedNormal(xla::XlaOp uniform) { + auto normal_cdf = [](double x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + + const double kA = -2.0; + const double kB = 2.0; + const double kMu = 0.0; + const double kSigma = 1.0; + const double kAlpha = (kA - kMu) / kSigma; + const double kBeta = (kB - kMu) / kSigma; + const double kAlphaNormalCdf = normal_cdf(kAlpha); + const double kBetaNormalCdf = normal_cdf(kBeta); + const double kZ = kBetaNormalCdf - kAlphaNormalCdf; + + xla::XlaOp one = xla::ScalarLike(uniform, 1.0); + xla::XlaOp two = xla::ScalarLike(uniform, 2.0); + xla::XlaOp sqrt_2 = xla::ScalarLike(uniform, std::sqrt(2.0)); + xla::XlaOp z = xla::ScalarLike(uniform, kZ); + xla::XlaOp alpha_normal_cdf = xla::ScalarLike(uniform, kAlphaNormalCdf); + + auto p = alpha_normal_cdf + z * uniform; + // probit(p) = sqrt(2) * erfinv(2*p-1) + return sqrt_2 * xla::ErfInv(two * p - one); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/random.h b/tensorflow/compiler/tf2xla/lib/random.h new file mode 100644 index 
0000000000000000000000000000000000000000..59fc5d0433a51328bc78006ab1c3495d908b44ac --- /dev/null +++ b/tensorflow/compiler/tf2xla/lib/random.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ +#define TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +// Builds an array filled with values sampled from a truncated normal +// distribution such that no values are greater than two or less than negative +// two. +// +// The "uniform" parameter must be an array of random numbers distributed in +// (0,1). +xla::XlaOp TruncatedNormal(xla::XlaOp uniform); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_ diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc index d5a27abb2585f699ae2719cb8a6b9a829263389e..38dfde165df47ca78a25a068a901cd1071aa55e2 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.cc +++ b/tensorflow/compiler/tf2xla/lib/scatter.cc @@ -18,15 +18,16 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { @@ -39,9 +40,9 @@ xla::StatusOr XlaScatter( TF_ASSIGN_OR_RETURN(xla::Shape buffer_shape, builder->GetShape(buffer)); TF_RETURN_IF_ERROR(builder->GetShape(updates).status()); TF_ASSIGN_OR_RETURN(xla::Shape indices_shape, builder->GetShape(indices)); - gtl::ArraySlice indices_dims = + absl::Span indices_dims = xla::AsInt64Slice(indices_shape.dimensions()); - gtl::ArraySlice buffer_dims = + absl::Span buffer_dims = xla::AsInt64Slice(buffer_shape.dimensions()); // If the indices are N-dimensional, the minor dimension of indices contains @@ -57,7 +58,7 @@ xla::StatusOr XlaScatter( ") must be <= the rank of the buffer (shape: ", xla::ShapeUtil::HumanString(buffer_shape), ")"); } - indices_dims.pop_back(); + indices_dims.remove_suffix(1); } int64 num_indices = 1; @@ -97,8 +98,8 @@ xla::StatusOr XlaScatter( buffer_shape_post_axes.end()); // Construct the initial values of the loop-carried Tensors. - auto flat_indices = builder->Reshape(indices, flat_indices_shape); - auto flat_updates = builder->Reshape(updates, flat_updates_shape); + auto flat_indices = xla::Reshape(indices, flat_indices_shape); + auto flat_updates = xla::Reshape(updates, flat_updates_shape); auto init = {flat_indices, flat_updates, buffer}; // Constructs the loop body. 
The implementation of scatter is essentially: @@ -106,52 +107,50 @@ xla::StatusOr XlaScatter( // index = dynamic-slice(indices, i) // update = dynamic-slice(updates, i) // buffer = dynamic-update-slice(buffer, update, index) - auto body_fn = [&](xla::XlaOp i, gtl::ArraySlice loop_vars, + auto body_fn = [&](xla::XlaOp i, absl::Span loop_vars, xla::XlaBuilder* body_builder) { auto indices = loop_vars[0]; auto updates = loop_vars[1]; auto buffer = loop_vars[2]; - auto zero_index = body_builder->ConstantLiteral( - xla::Literal::Zero(indices_shape.element_type())); + auto zero_index = xla::ConstantLiteral( + body_builder, xla::LiteralUtil::Zero(indices_shape.element_type())); // Slice the i-th index from the indices array. xla::XlaOp index; - auto indices_offset = body_builder->Reshape(i, {1}); + auto indices_offset = xla::Reshape(i, {1}); if (indices_are_vectors) { - indices_offset = body_builder->Pad(indices_offset, zero_index, - xla::MakeEdgePaddingConfig({{0, 1}})); + indices_offset = xla::Pad(indices_offset, zero_index, + xla::MakeEdgePaddingConfig({{0, 1}})); - index = body_builder->DynamicSlice(indices, indices_offset, - {1, num_index_dims}); - index = body_builder->Collapse(index, {0, 1}); + index = xla::DynamicSlice(indices, indices_offset, {1, num_index_dims}); + index = xla::Collapse(index, {0, 1}); } else { - index = body_builder->DynamicSlice(indices, indices_offset, {1}); + index = xla::DynamicSlice(indices, indices_offset, {1}); } // Discard updates with negative indices, since some users expect this. - auto index_in_range = - body_builder->ReduceAll(body_builder->Le(zero_index, index), - body_builder->ConstantR0(true), - xla::CreateScalarAndComputation(body_builder)); + auto index_in_range = xla::ReduceAll( + xla::Le(zero_index, index), xla::ConstantR0(body_builder, true), + xla::CreateScalarAndComputation(xla::PRED, body_builder)); // Make the index in bounds to prevent implementation defined behavior. 
- index = body_builder->Max(index, zero_index); - index = body_builder->Pad( + index = xla::Max(index, zero_index); + index = xla::Pad( index, zero_index, xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}})); // Slice the i-th index from the updates array. - auto updates_offset = body_builder->Reshape(i, {1}); - updates_offset = body_builder->Pad( + auto updates_offset = xla::Reshape(i, {1}); + updates_offset = xla::Pad( updates_offset, zero_index, xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}})); std::vector flat_updates_slice_shape({1}); flat_updates_slice_shape.insert(flat_updates_slice_shape.end(), buffer_shape_post_axes.begin(), buffer_shape_post_axes.end()); - auto update = body_builder->DynamicSlice(updates, updates_offset, - flat_updates_slice_shape); + auto update = + xla::DynamicSlice(updates, updates_offset, flat_updates_slice_shape); // Unflatten the major (iteration) dimensions of the slice to their // original shape. @@ -159,20 +158,19 @@ xla::StatusOr XlaScatter( updates_slice_shape.insert(updates_slice_shape.end(), buffer_shape_post_axes.begin(), buffer_shape_post_axes.end()); - update = body_builder->Reshape(update, updates_slice_shape); + update = xla::Reshape(update, updates_slice_shape); // Apply the update to the buffer. If there is a combiner, use it to merge // the current values with the update. - auto current_value = - body_builder->DynamicSlice(buffer, index, updates_slice_shape); + auto current_value = xla::DynamicSlice(buffer, index, updates_slice_shape); if (combiner) { update = combiner(current_value, update, body_builder); } // Use the current value instead of the update if the index is out of // bounds. - update = body_builder->Select(index_in_range, update, current_value); + update = xla::Select(index_in_range, update, current_value); // Apply the update. 
- buffer = body_builder->DynamicUpdateSlice(buffer, update, index); + buffer = xla::DynamicUpdateSlice(buffer, update, index); return std::vector{indices, updates, buffer}; }; diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h index 87309e10ede320a81d173cd0a64492f88a2c7376..13a5f1b850a612bddeeac39bef431c19925351ca 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.h +++ b/tensorflow/compiler/tf2xla/lib/scatter.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index b4503601f94baa5a595a64c9fc81bc92d9980ac6..37b2240b45b4ae6a587c827cfdfa1096b4e1737e 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -20,628 +20,397 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/tf2xla/lib/util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/math/math_util.h" namespace tensorflow { -xla::StatusOr TriangularSolve(xla::XlaBuilder* builder, - const xla::XlaOp& a, xla::XlaOp b, - bool left_side, bool lower, - bool transpose_a, bool conjugate_a, - int64 block_size) { - TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); - TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); - if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) { - return errors::InvalidArgument( - "Arguments to TriangularSolve have different ranks: ", - xla::ShapeUtil::HumanString(a_shape), " vs. ", - xla::ShapeUtil::HumanString(b_shape)); - } - const int ndims = xla::ShapeUtil::Rank(a_shape); - if (ndims < 2) { - return errors::InvalidArgument( - "Arguments to TriangularSolve must have rank >= 2: ", ndims); - } - // The batch dimensions must be equal. 
- std::vector batch_dimensions; - for (int i = 0; i < ndims - 2; ++i) { - int64 a_size = a_shape.dimensions(i); - int64 b_size = b_shape.dimensions(i); - if (a_size != b_size) { - return errors::InvalidArgument( - "Batch dimensions of arguments to TriangularSolve must be equal: ", - xla::ShapeUtil::HumanString(a_shape), " vs ", - xla::ShapeUtil::HumanString(b_shape)); +// Get the diagonal blocks of the coefficient matrix +xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) { + xla::XlaBuilder* builder = a.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a)); + int ndims = xla::ShapeUtil::Rank(shape); + int64 n = xla::ShapeUtil::GetDimension(shape, -1); + int64 num_blocks = n / block_size; + + xla::XlaOp diag_blocks; + + // If the coefficient matrix is exactly the block size, we just add a + // singleton dimension i.e. [..., n, n] -> [..., 1, n, n] + if (n == block_size) { + std::vector permutation(ndims); + std::iota(permutation.begin(), permutation.end(), 1); + permutation.insert(permutation.end() - 2, 0); + return Transpose(Broadcast(a, /*broadcast_sizes=*/{1}), permutation); } - batch_dimensions.push_back(a_size); - } - - if (xla::ShapeUtil::GetDimension(a_shape, -1) != - xla::ShapeUtil::GetDimension(a_shape, -2)) { - return errors::InvalidArgument( - "The 'a' arguments to TriangularSolve must be square matrices: ", - xla::ShapeUtil::HumanString(a_shape)); - } - const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); - const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); - if ((left_side ? 
m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) { - return errors::InvalidArgument( - "Arguments to TriangularSolve have incompatible matrix shapes: ", - xla::ShapeUtil::HumanString(a_shape), " vs ", - xla::ShapeUtil::HumanString(b_shape)); - } - - if (block_size < 1) { - return errors::InvalidArgument( - "block_size argument to TriangularSolve must be >= 1; got ", - block_size); - } - - std::map base_computations; - auto get_base_triangular_solve = - [&](int k) -> xla::StatusOr { - xla::XlaComputation& computation = base_computations[k]; - if (computation.IsNull()) { - std::unique_ptr sub = builder->CreateSubBuilder( - tensorflow::strings::StrCat("trsm_base_", k)); - - auto a_param = sub->Parameter( - 0, - xla::ShapeUtil::MakeShape( - b_shape.element_type(), - PrependMajorDims(sub.get(), batch_dimensions, {k, k})), - "a"); - - std::array b_lastd; - if (left_side) { - b_lastd = {k, n}; - } else { - b_lastd = {m, k}; - } - auto b_param = sub->Parameter( - 1, - xla::ShapeUtil::MakeShape( - b_shape.element_type(), - PrependMajorDims(sub.get(), batch_dimensions, b_lastd)), - "b"); - - // We use a left-looking or right-looking subroutine on the block diagonal - // in the lower=true cases, while falling back to a recursive call in - // others. The left-looking and right-looking subroutines are written with - // a While loop and so yields much faster compile times. Moreover, they - // can give higher performance on smaller (sub)problems. 
- if (left_side && lower) { - TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param, - b_param, transpose_a, - conjugate_a) - .status()); - } else if (!left_side && lower) { - TF_RETURN_IF_ERROR(TriangularSolveRightLooking(sub.get(), a_param, - b_param, transpose_a, - conjugate_a) - .status()); - } else { - TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, - left_side, lower, transpose_a, - conjugate_a, - /*block_size=*/1) - .status()); - } - TF_ASSIGN_OR_RETURN(computation, sub->Build()); + // We can grab entire blocks using gather + if (n > block_size) { + // Construct the starting indices of the diagonal blocks + auto start_indices = + Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks), + xla::ConstantR0(builder, block_size)), + /*broadcast_sizes=*/{2}), + /*permutation=*/{1, 0}); + + // Gather the diagonal blocks + xla::GatherDimensionNumbers dim_numbers; + dim_numbers.add_offset_dims(ndims - 1); + dim_numbers.add_offset_dims(ndims); + dim_numbers.add_start_index_map(ndims - 2); + dim_numbers.add_start_index_map(ndims - 1); + dim_numbers.set_index_vector_dim(1); + diag_blocks = Gather(a, start_indices, dim_numbers, + /*slice_sizes=*/{block_size, block_size}); } - return &computation; - }; - - xla::XlaOp output = Zeros(builder, b_shape); - - // Right-looking blocked triangular solve. - // For an explanation of the algorithm, see the TRSM discussion in: - // Goto, Kazushige, and Robert Van De Geijn. "High-performance implementation - // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 - // (2008): 4. - - // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if - // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if - // conjugate_a is True. 
- - if (!left_side && lower == transpose_a) { - // for i in range(0, a.shape[-1], block_size): - for (int64 i = 0; i < n; i += block_size) { - int64 k = std::min(block_size, n - i); - - // output[..., :, i:i+k] = triangular_solve( - // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::XlaOp update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); - } else { - TF_ASSIGN_OR_RETURN(auto a_slice_conj, - MaybeConjugate(builder, a_slice, conjugate_a)); - update = builder->Div(b_slice, a_slice_conj); - } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {0, i})); - - // if i + k < a.shape[-1]: - // a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] - // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 - // b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2) - if (i + k < n) { - xla::XlaOp a_slice_2; - if (lower) { - TF_ASSIGN_OR_RETURN( - a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); - } else { - TF_ASSIGN_OR_RETURN( - a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n})); - } - TF_ASSIGN_OR_RETURN(auto b_update, - BatchDot(builder, update, a_slice_2, - /*transpose_x=*/false, - /*transpose_y=*/transpose_a, - /*conjugate_x=*/false, - /*conjugate_y=*/conjugate_a)); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - SliceInMinorDims(builder, b, {0, i + k}, {m, n})); - b_update = builder->Sub(b_slice_2, b_update); - TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k})); + // The last block might be smaller than the block size, + // so we will need to pad it + if (n % block_size != 0) { + // Pad with zeros + auto last_blocks = + SliceInMinorDims(a, {n - n % block_size, 
n - n % block_size}, {n, n}); + xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims); + int64 padding = block_size - n % block_size; + config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding); + config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding); + last_blocks = + Pad(last_blocks, Zero(builder, shape.element_type()), config); + + // Add a singleton dimension + // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size] + TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape, + builder->GetShape(last_blocks)); + auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions()); + auto last_blocks_dims = std::vector(ndims); + std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin()); + last_blocks_dims.insert(last_blocks_dims.end() - 2, 1); + last_blocks = Reshape(last_blocks, last_blocks_dims); + + // Concatenate with the other blocks if necessary + if (n > block_size) { + diag_blocks = + xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2); + } else { + diag_blocks = last_blocks; } } - } else if (left_side && lower != transpose_a) { - // for i in range(0, a.shape[-1], block_size): - for (int64 i = 0; i < m; i += block_size) { - int64 k = std::min(block_size, m - i); - - // output[..., i:i+k, :] = triangular_solve( - // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); - xla::XlaOp update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); - } else { - TF_ASSIGN_OR_RETURN(auto a_slice_conj, - MaybeConjugate(builder, a_slice, conjugate_a)); - update = builder->Div(b_slice, a_slice_conj); - } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); - - // if i + k < a.shape[-1]: - 
// a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] - // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 - // b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :]) - if (i + k < m) { - xla::XlaOp a_slice_2; - if (lower) { - TF_ASSIGN_OR_RETURN( - a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k})); - } else { - TF_ASSIGN_OR_RETURN( - a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m})); - } + return diag_blocks; + }); +} - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, - /*transpose_x=*/transpose_a, - /*transpose_y=*/false, - /*conjugate_x=*/conjugate_a, - /*conjugate_y=*/false)); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - SliceInMinorDims(builder, b, {i + k, 0}, {m, n})); - b_update = builder->Sub(b_slice_2, b_update); - TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0})); - } +xla::XlaOp InvertDiagonalBlocks( + xla::XlaOp diag_blocks, bool lower, bool transpose_a, bool conjugate_a, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = diag_blocks.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + // Input is a batch of square lower triangular square matrices. Its shape is + // (..., size, size). We resize this to (num_blocks, size, size). 
+ TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks)); + int64 block_size = xla::ShapeUtil::GetDimension(shape, -1); + int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) / + tensorflow::MathUtil::IPow(block_size, 2); + diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size}); + + // The input must be triangular because we rely on that when doing + // multiplications later on + diag_blocks = Triangle(diag_blocks, /*lower=*/lower); + + // Rescale blocks to be unit triangular, but avoid dividing by + // zero (which can happen if the last block was padded) otherwise it will + // introduce nans which will propagate + auto diags = GetMatrixDiagonal(diag_blocks); + TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags)); + auto one = ScalarLike(diags, 1); + auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions())); + diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags); + auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2}); + + // We can now use the fact that for an upper triangular matrix + // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have + // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks + // have been rescaled to be unit triangular, so L22 = L22' = 1. + + // Initialize the output matrix with -1s on the diagonal. We use -1 instead + // of 1 because we cannot do matrix-vector multiplies with variable shapes + // inside of a loop, or do irregularly shaped in-place updates. Hence, + // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the + // entire row i.e. we calculate + // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I]) + // which means [L21 L22 0] <- [-L21 * L11', L22, 0]. 
+ auto identity = + IdentityMatrix(builder, shape.element_type(), block_size, block_size); + auto neg_identity = -identity; + + // The first or last diagonal element should be set to 1 instead of -1 + // though, since we never update it + auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1}); + auto start_index = (lower) ? 0 : block_size - 1; + auto output_block = DynamicUpdateSlice( + neg_identity, pos_one, + /*start_indices=*/xla::ConstantR1(builder, 2, start_index)); + + // Broadcast diag([1, -1, -1, ...]) to every block + xla::XlaOp output = Broadcast(output_block, + /*broadcast_sizes=*/{num_blocks}); + + // Now we construct a loop that performs matrix-vector multiplications + // inverting the blocks one row at a time + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + xla::ShapeUtil::MakeShape(xla::S32, {}), + // The output has the shape of A, with one row updated each iteration. + xla::ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size}), + // The input is a loop invariant. + xla::ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size})}; + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); + + auto init_i = One(builder, xla::S32); + auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks}); + + // Construct the loop condition function. 
+ std::unique_ptr condb = + builder->CreateSubBuilder("InvertDiagCond"); + { + auto i = GetTupleElement( + Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0); + Lt(i, xla::ConstantR0(condb.get(), block_size)); } - } else if (!left_side && lower != transpose_a) { - // for i in reversed(range(0, a.shape[-1], block_size)): - const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size; - for (int64 i = last_blk_ix; i >= 0; i -= block_size) { - int64 k = std::min(block_size, n - i); - - // output[..., :, i:i+k] triangular_solve( - // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::XlaOp update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); - } else { - TF_ASSIGN_OR_RETURN(auto a_slice_conj, - MaybeConjugate(builder, a_slice, conjugate_a)); - update = builder->Div(b_slice, a_slice_conj); - } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {0, i})); - - // if i - k >= 0: - // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] - // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 - // b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2) - if (i - k >= 0) { - xla::XlaOp a_slice_2; - if (lower) { - TF_ASSIGN_OR_RETURN(a_slice_2, - SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); - } else { - TF_ASSIGN_OR_RETURN(a_slice_2, - SliceInMinorDims(builder, a, {0, i}, {i, i + k})); - } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function. 
+ std::unique_ptr bodyb = + builder->CreateSubBuilder("InvertDiagBody"); + { + auto input_tuple = + Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple"); + + auto i = GetTupleElement(input_tuple, 0); + auto body_out = GetTupleElement(input_tuple, 1); + auto body_input = GetTupleElement(input_tuple, 2); + + auto zero = xla::ConstantR1(bodyb.get(), 1, 0); + auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i; + auto start_indices = + xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0); + auto input_row = + DynamicSlice(body_input, start_indices, + /*slice_sizes=*/{num_blocks, 1, block_size}); + + // We want -L21 L11^{-1} + xla::DotDimensionNumbers dnums; + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + dnums.add_lhs_contracting_dimensions(2); + dnums.add_rhs_contracting_dimensions(1); + xla::PrecisionConfigProto precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto); + + body_out = DynamicUpdateSlice(body_out, update, start_indices); + + auto next_i = i + ScalarLike(i, 1); + xla::Tuple(bodyb.get(), {next_i, body_out, body_input}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto invert_while = While(cond, body, init); + auto inv_diag_blocks = GetTupleElement(invert_while, 1); + + // Undo the scaling + inv_diag_blocks = Div(inv_diag_blocks, diags, + /*broadcast_dimensions=*/{0, 1}); + + // Reshape back to original batch major dimensions + return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions())); + }); +} - TF_ASSIGN_OR_RETURN(auto b_update, - BatchDot(builder, update, a_slice_2, - /*transpose_x=*/false, - /*transpose_y=*/transpose_a, - /*conjugate_x=*/false, - /*conjugate_y=*/conjugate_a)); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - 
SliceInMinorDims(builder, b, {0, 0}, {m, i})); - b_update = builder->Sub(b_slice_2, b_update); - TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); +xla::XlaOp SolveWithInvertedDiagonalBlocks( + xla::XlaOp a, xla::XlaOp b, xla::XlaOp inv_diag_blocks, bool left_side, + bool lower, bool transpose_a, bool conjugate_a, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape, + builder->GetShape(inv_diag_blocks)); + TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); + int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1); + + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + int64 ndims = xla::ShapeUtil::Rank(a_shape); + int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + int64 num_blocks = n / block_size + (n % block_size != 0); + int64 m_dim = (left_side) ? -1 : -2; + int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim); + + // Initialize the solution + auto x = ZerosLike(b); + + // This loop is unrolled for performance reasons, but it could be expressed + // rolled as well since the matrices are of the same size each iteration + for (int i = 0; i < num_blocks; i++) { + // High-level intuition: We have B[i] = L[i] @ X. Since L is upper + // triangular this means B[i] = L[i, :i + 1] @ X[:i + 1]. We can split + // this into two parts: B[i] = L[i, :i] @ X[:i] + L[i, i] @ X[i] which + // can be solved for X[i] as X[i] = inv(L[i, i]) @ B[i] - L[i, :i] @ X[:i] + + // Decide whether we go from first block to last or vice versa + auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i; + + // Get the size of the inverse blocks (the last one might be smaller) + int64 block = (n % block_size != 0 && j + 1 == num_blocks) + ? 
n % block_size + : block_size; + auto inv_block = + MaybeConjugate(Collapse(SliceInMinorDims(inv_diag_blocks, {j, 0, 0}, + {j + 1, block, block}), + /*dimensions=*/{ndims - 2, ndims - 1}), + conjugate_a); + + // Get the corresponding row of B + int64 k = std::min((j + 1) * block_size, n); + std::vector start = {j * block_size, 0}; + std::vector end = {k, m}; + if (!left_side) { + std::swap(start[0], start[1]); + std::swap(end[0], end[1]); } - } - } else { // left_side && lower == transpose_a - // for i in reversed(range(0, a.shape[-1], block_size)): - const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size; - for (int64 i = last_blk_ix; i >= 0; i -= block_size) { - int64 k = std::min(block_size, m - i); - - // output[..., i:i+k, :] triangular_solve( - // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); - xla::XlaOp update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); + auto b_row = SliceInMinorDims(b, start, end); + + xla::XlaOp remainder; + if (i == 0) { + remainder = b_row; } else { - TF_ASSIGN_OR_RETURN(auto a_slice_conj, - MaybeConjugate(builder, a_slice, conjugate_a)); - update = builder->Div(b_slice, a_slice_conj); - } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); - - // if i - k >= 0: - // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] - // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 - // b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :]) - if (i - k >= 0) { - xla::XlaOp a_slice_2; - if (lower) { - TF_ASSIGN_OR_RETURN(a_slice_2, - SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); + // This matrix multiply involves a lot of multiplying with zero (namely, + // X[i * 
block_size:] = 0), but this is faster than slicing... + end = {k, n}; + if (!left_side) { + std::swap(end[0], end[1]); + } + if (transpose_a) { + std::swap(start[0], start[1]); + std::swap(end[0], end[1]); + } + auto a_row = + MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a); + if (left_side) { + remainder = b_row - BatchDot(a_row, x, transpose_a, false, + /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); } else { - TF_ASSIGN_OR_RETURN(a_slice_2, - SliceInMinorDims(builder, a, {0, i}, {i, i + k})); + remainder = b_row - BatchDot(x, a_row, false, transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/false, precision); } + } - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, - /*transpose_x=*/transpose_a, - /*transpose_y=*/false, - /*conjugate_x=*/conjugate_a, - /*conjugate_y=*/false)); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - SliceInMinorDims(builder, b, {0, 0}, {i, n})); - b_update = builder->Sub(b_slice_2, b_update); - TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); + xla::XlaOp x_update; + auto zero = Zero(builder, xla::S32); + auto start_index = + xla::ConstantR0WithType(builder, xla::S32, j * block_size); + std::vector update_starts = {start_index, zero}; + if (left_side) { + x_update = + BatchDot(inv_block, remainder, transpose_a, false, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + } else { + x_update = + BatchDot(remainder, inv_block, false, transpose_a, + /*conjugate_x=*/false, /*conjugate_y=*/false, precision); + std::swap(update_starts[0], update_starts[1]); } + x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts); } - } - return output; + return x; + }); } -xla::StatusOr TriangularSolveLeftLooking(xla::XlaBuilder* builder, - const xla::XlaOp& a, - const xla::XlaOp& b, - bool transpose_a, - bool conjugate_a) { - TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); - TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); - 
const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); - const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); - const int64 ndims = xla::ShapeUtil::Rank(a_shape); - - std::vector batch_dimensions; - for (int i = 0; i < ndims - 2; ++i) { - int64 a_size = a_shape.dimensions(i); - batch_dimensions.push_back(a_size); - } - - // The main computation is performed in a While loop. - - // Allocate the output and set its first or last row, - // output = np.zeros_like(b) - // if transpose_a: - // output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:] - // else: - // output[..., :1, :] = b[..., :1, :] / a[..., :1, :1] - xla::XlaOp output = Zeros(builder, b_shape); - { - auto i = transpose_a ? m - 1 : 0; - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {i, 0}, {i + 1, n})); - TF_ASSIGN_OR_RETURN(auto a_slice_conj, - MaybeConjugate(builder, a_slice, conjugate_a)); - auto update = builder->Div(b_slice, a_slice_conj); - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); - } - - // Construct the initial loop carry tuple, - // if transpose_a: - // init = (m-2, output, a, b) - // else: - // init = (1, output, a, b) - std::vector tuple_shapes = { - // The loop iteration counter is a scalar, incremented each iteration. - xla::ShapeUtil::MakeShape(xla::S32, {}), - // The output has the shape of b, with one row updated each iteration. - b_shape, - // The coefficient matrix a is a loop invariant. - a_shape, - // The right-hand-side matrix b is a loop invariant. - b_shape}; - xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); - auto init_i = builder->ConstantR0(transpose_a ? 
m - 2 : 1); - auto init = builder->Tuple({init_i, output, a, b}); - - // Construct the loop condition function, - // def cond_fun(loop_carry): - // i, output, a, b = loop_carry - // return i >= 0 if transpose_a else i < m - std::unique_ptr condb = - builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond"); - { - auto i = condb->GetTupleElement( - condb->Parameter(0, tuple_shape, - "TriangularSolveLeftLookingWhileTuple"), - 0); - if (transpose_a) { - condb->Ge(i, condb->ConstantR0(0)); - } else { - condb->Lt(i, condb->ConstantR0(m)); +xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side, + bool lower, bool transpose_a, bool conjugate_a, + int64 block_size, + xla::PrecisionConfigProto::Precision precision) { + xla::XlaBuilder* builder = a.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); + if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) { + return errors::InvalidArgument( + "Arguments to TriangularSolve have different ranks: ", + xla::ShapeUtil::HumanString(a_shape), " vs. ", + xla::ShapeUtil::HumanString(b_shape)); } - } - TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); - - // Construct the loop body function, - // def body_fun(loop_carry): - // i, output, a, b = loop_carry - // if transpose_a: - // a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2) - // else: - // a_row = a[..., i:i+1, :i] - // result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :]) - // output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] - // if transpose_a: - // return (i - 1, output, a, b) - // else: - // return (i + 1, output, a, b) - // We have to do some extra FLOPs propagating zeros in the matrix multiply - // because we can't have the size of its arguments depend on the loop counter. 
- std::unique_ptr bodyb = - builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody"); - { - auto input_tuple = bodyb->Parameter(0, tuple_shape, - "TriangularSolveLeftLookingWhileTuple"); - - // i, output, a, b = loop_carry - auto i = bodyb->GetTupleElement(input_tuple, 0); - auto body_out = bodyb->GetTupleElement(input_tuple, 1); - auto body_a = bodyb->GetTupleElement(input_tuple, 2); - auto body_b = bodyb->GetTupleElement(input_tuple, 3); - auto zero = bodyb->ConstantR0(0); - - // We'd like to implement this: - // if transpose_a: - // a_row = T(a[..., i+1:, i:i+1]) - // result_row = (b[..., i:i+1, :] - // - np.matmul(a_row, body_out[..., i+1:, :])) - // else: - // result_row = (b[..., i:i+1, :] - // - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :])) - // But since we can't have intermediate array sizes depend on the loop - // counter, we instead exploit the fact that we initialized the output to - // all zeros and use that as zero-padding (doing unnecessary FLOPs). - xla::XlaOp a_row; - if (transpose_a) { - TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, - {zero, i}, {m, 1})); - } else { - TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, - {i, zero}, {1, m})); + const int64 ndims = xla::ShapeUtil::Rank(a_shape); + if (ndims < 2) { + return errors::InvalidArgument( + "Arguments to TriangularSolve must have rank >= 2: ", ndims); + } + // The batch dimensions must be equal. 
+ std::vector batch_dimensions; + for (int i = 0; i < ndims - 2; ++i) { + int64 a_size = a_shape.dimensions(i); + int64 b_size = b_shape.dimensions(i); + if (a_size != b_size) { + return errors::InvalidArgument( + "Batch dimensions of arguments to TriangularSolve must be equal: ", + xla::ShapeUtil::HumanString(a_shape), " vs ", + xla::ShapeUtil::HumanString(b_shape)); + } + batch_dimensions.push_back(a_size); } - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out, - /*transpose_x=*/transpose_a, - /*transpose_y=*/false, - /*conjugate_x=*/conjugate_a, - /*conjugate_y=*/false)); - TF_ASSIGN_OR_RETURN( - auto result_row_slice, - DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n})); - auto result_row = bodyb->Sub(result_row_slice, b_update); - - // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] - TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a, - {i, i}, {1, 1})); - TF_ASSIGN_OR_RETURN(auto a_elt_conj, - MaybeConjugate(bodyb.get(), a_elt, conjugate_a)); - auto div_result = bodyb->Div(result_row, a_elt_conj); - TF_ASSIGN_OR_RETURN(body_out, - DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, - div_result, {i, zero})); - - // if transpose_a: - // return (i - 1, body_out, a, b) - // else: - // return (i + 1, body_out, a, b) - auto next_i = bodyb->Add(i, bodyb->ConstantR0(transpose_a ? 
-1 : 1)); - bodyb->Tuple({next_i, body_out, body_a, body_b}); - } - TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); - - // Construct the While loop and return the result, - // return while_loop(cond_fun, body_fun, init)[1] - auto triangular_solve_left_looking_while = builder->While(cond, body, init); - return builder->GetTupleElement(triangular_solve_left_looking_while, 1); -} -xla::StatusOr TriangularSolveRightLooking(xla::XlaBuilder* builder, - const xla::XlaOp& a, - const xla::XlaOp& b, - bool transpose_a, - bool conjugate_a) { - TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); - TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); - const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); - const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); - const int64 ndims = xla::ShapeUtil::Rank(a_shape); - - std::vector batch_dimensions; - for (int i = 0; i < ndims - 2; ++i) { - int64 a_size = a_shape.dimensions(i); - batch_dimensions.push_back(a_size); - } - - // The main computation is performed in a While loop. - xla::XlaOp output = Zeros(builder, b_shape); - - // Construct the initial loop carry tuple, - // if transpose_a: - // init = (0, output, a, b) - // else: - // init = (n-1, output, a, b) - std::vector tuple_shapes = { - // The loop iteration counter is a scalar, incremented each iteration. - xla::ShapeUtil::MakeShape(xla::S32, {}), - // The output has the shape of b, with one row updated each iteration. - b_shape, - // The coefficient matrix a is a loop invariant. - a_shape, - // The right-hand-side matrix b is a loop invariant. - b_shape}; - xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); - auto init_i = builder->ConstantR0(transpose_a ? 
0 : n - 1); - auto init = builder->Tuple({init_i, output, a, b}); - - // Construct the loop condition function, - // def cond_fun(loop_carry): - // i, output, a, b = loop_carry - // return i < n if transpose_a else i >= 0 - std::unique_ptr condb = - builder->CreateSubBuilder("TriangularSolveRightLookingWhileCond"); - { - auto i = condb->GetTupleElement( - condb->Parameter(0, tuple_shape, - "TriangularSolveRightLookingWhileTuple"), - 0); - if (transpose_a) { - condb->Lt(i, condb->ConstantR0(n)); - } else { - condb->Ge(i, condb->ConstantR0(0)); + if (xla::ShapeUtil::GetDimension(a_shape, -1) != + xla::ShapeUtil::GetDimension(a_shape, -2)) { + return errors::InvalidArgument( + "The 'a' arguments to TriangularSolve must be square matrices: ", + xla::ShapeUtil::HumanString(a_shape)); } - } - TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); - - // Construct the loop body function, - // def body_fun(loop_carry): - // i, output, a, b = loop_carry - // if transpose_a: - // a_row = np.swapaxes(a[..., :, i:i+1], -1 -2) - // else: - // a_row = a[..., :, i:i+1] - // result_row = b[..., :, i:i+1] - np.matmul(output, a_row) - // output[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1] - // if transpose_a: - // return (i - 1, output, a, b) - // else: - // return (i + 1, output, a, b) - // We have to do some extra FLOPs propagating zeros in the matrix multiply - // because we can't have the size of its arguments depend on the loop counter. 
- std::unique_ptr bodyb = - builder->CreateSubBuilder("TriangularSolveRightLookingWhileBody"); - { - auto input_tuple = bodyb->Parameter( - 0, tuple_shape, "TriangularSolveRightLookingWhileTuple"); - - // i, output, a, b = loop_carry - auto i = bodyb->GetTupleElement(input_tuple, 0); - auto body_out = bodyb->GetTupleElement(input_tuple, 1); - auto body_a = bodyb->GetTupleElement(input_tuple, 2); - auto body_b = bodyb->GetTupleElement(input_tuple, 3); - auto zero = bodyb->ConstantR0(0); - - // We'd like to implement b[..., :, i:i+1] - np.matmul(output, a[..., :, - // i:i+1]) But since we can't have intermediate array sizes depend on the - // loop counter, we instead exploit the fact that we initialized the output - // to all zeros and use that as zero-padding (doing unnecessary FLOPs). - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), body_out, body_a, - /*transpose_x=*/false, - /*transpose_y=*/transpose_a, - /*conjugate_x=*/false, - /*conjugate_y=*/conjugate_a)); - // result = b - np.matmul(output, a) - auto result = bodyb->Sub(body_b, b_update); - // result_row = result[..., :, i:i+1] - TF_ASSIGN_OR_RETURN( - auto result_row, - DynamicSliceInMinorDims(bodyb.get(), result, {zero, i}, {m, 1})); - - // body_out[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1] - TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(bodyb.get(), body_a, - {i, i}, {1, 1})); - TF_ASSIGN_OR_RETURN(auto a_ii_conj, - MaybeConjugate(bodyb.get(), a_ii, conjugate_a)); - auto div_result = bodyb->Div(result_row, a_ii_conj); - TF_ASSIGN_OR_RETURN(body_out, - DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, - div_result, {zero, i})); - - // if transpose_a: - // return (i + 1, body_out, a, b) - // else: - // return (i - 1, body_out, a, b) - auto next_i = bodyb->Add(i, bodyb->ConstantR0(transpose_a ? 
1 : -1)); - bodyb->Tuple({next_i, body_out, body_a, body_b}); - } - TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); - - // Construct the While loop and return the result, - // return while_loop(cond_fun, body_fun, init)[1] - auto triangular_solve_left_looking_while = builder->While(cond, body, init); - return builder->GetTupleElement(triangular_solve_left_looking_while, 1); + const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); + if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) { + return errors::InvalidArgument( + "Arguments to TriangularSolve have incompatible matrix shapes: ", + xla::ShapeUtil::HumanString(a_shape), " vs ", + xla::ShapeUtil::HumanString(b_shape)); + } + + if (block_size < 1) { + return errors::InvalidArgument( + "block_size argument to TriangularSolve must be >= 1; got ", + block_size); + } + + // We find the diagonal blocks of the coefficient matrix + auto diag_blocks = DiagonalBlocks(a, block_size); + + // We invert these blocks in parallel using batched matrix-vector products + auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a, + conjugate_a, precision); + + // We now find the solution using GEMMs + auto x = + SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower, + transpose_a, conjugate_a, precision); + + return x; + }); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h index 540c26b2473df9e7885f4e549b3e516a3d8a0d43..ac42a4835295b7cb52697710d738f4728d3983d1 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h @@ -16,8 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace tensorflow { @@ -57,23 +57,11 @@ namespace tensorflow { // // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no // blocking is used. -xla::StatusOr TriangularSolve(xla::XlaBuilder* builder, - const xla::XlaOp& a, xla::XlaOp b, - bool left_side, bool lower, - bool transpose_a, bool conjugate_a, - int64 block_size = 256); - -xla::StatusOr TriangularSolveLeftLooking(xla::XlaBuilder* builder, - const xla::XlaOp& a, - const xla::XlaOp& b, - bool transpose_a, - bool conjugate_a); - -xla::StatusOr TriangularSolveRightLooking(xla::XlaBuilder* builder, - const xla::XlaOp& a, - const xla::XlaOp& b, - bool transpose_a, - bool conjugate_a); +xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side, + bool lower, bool transpose_a, bool conjugate_a, + int64 block_size = 128, + xla::PrecisionConfigProto::Precision precision = + xla::PrecisionConfigProto::HIGHEST); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc index 87ea4763f7c2357ae179b68ade3715b24c46432f..aeebf16028d40189203cdfd815f06a339ee72902 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc @@ -20,8 +20,8 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -85,11 +85,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/false, /*lower=*/true, - /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {0.5, 0.08333334, 0.04629629, 0.03367003}, @@ -107,11 +106,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/false, /*lower=*/true, - /*transpose_a=*/false, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, @@ -129,11 +127,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - 
/*left_side=*/false, /*lower=*/false, - /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, @@ -151,11 +148,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/false, /*lower=*/false, - /*transpose_a=*/false, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {0.5, 0.08333334, 0.04629629, 0.03367003}, @@ -173,11 +169,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/true, /*lower=*/true, - /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {-0.89646465, -0.69444444, -0.49242424}, @@ -196,11 +191,32 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/true, /*lower=*/true, - /*transpose_a=*/false, /*conjugate_a=*/false, - /*block_size=*/2); - 
TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) { + xla::XlaBuilder builder(TestName()); + + xla::XlaOp a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/3); xla::Array2D expected({ {0.5, 1.0, 1.5}, @@ -219,11 +235,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/true, /*lower=*/false, - /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {0.5, 1.0, 1.5}, @@ -242,11 +257,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) { xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/true, /*lower=*/false, - /*transpose_a=*/false, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/false, 
/*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {-0.89646465, -0.69444444, -0.49242424}, @@ -267,11 +281,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) { CreateR2Parameter(AValsLowerComplex(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRightComplex(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/false, /*lower=*/true, - /*transpose_a=*/true, /*conjugate_a=*/true, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/true, + /*block_size=*/2); xla::Array2D expected({ {0.5, complex64(0.08333333, 0.08333333), @@ -295,11 +308,10 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { CreateR2Parameter(AValsUpperComplex(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeftComplex(), 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, - /*left_side=*/true, /*lower=*/false, - /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/2); - TF_ASSERT_OK(result.status()); + TriangularSolve(a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); xla::Array2D expected({ {0.5, 1., 1.5}, @@ -317,49 +329,5 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { xla::ErrorSpec(1e-2, 1e-2)); } -XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) { - xla::XlaBuilder builder(TestName()); - - xla::XlaOp a, b; - auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); - auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolveLeftLooking(&builder, a, b, - /*transpose_a=*/false, - /*conjugate_a=*/false); - TF_ASSERT_OK(result.status()); - - xla::Array2D expected({ - {0.5, 1.0, 1.5}, - {0.41666667, 0.33333333, 0.25}, - {0.23148148, 0.18518519, 0.13888889}, - {0.16835017, 0.13468013, 0.1010101}, - }); - 
- ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, - xla::ErrorSpec(1e-2, 1e-2)); -} - -XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) { - xla::XlaBuilder builder(TestName()); - - xla::XlaOp a, b; - auto a_data = CreateR2Parameter(AValsFull(), 0, "a", &builder, &a); - auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); - auto result = TriangularSolveLeftLooking(&builder, a, b, - /*transpose_a=*/false, - /*conjugate_a=*/false); - TF_ASSERT_OK(result.status()); - - xla::Array2D expected({ - {0.5, 1.0, 1.5}, - {0.41666667, 0.33333333, 0.25}, - {0.23148148, 0.18518519, 0.13888889}, - {0.16835017, 0.13468013, 0.1010101}, - }); - - ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, - xla::ErrorSpec(1e-2, 1e-2)); -} - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index d9ff7e6259f3fbab8957394bff5c5670a67dd0eb..c26784852472061ffead03cfe7431f8b8ba0e555 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -28,8 +30,9 @@ limitations under the License. 
namespace tensorflow { xla::XlaOp Zeros(xla::XlaBuilder* builder, const xla::Shape& shape) { - return builder->Broadcast( - builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())), + return xla::Broadcast( + xla::ConstantLiteral(builder, + xla::LiteralUtil::Zero(shape.element_type())), xla::AsInt64Slice(shape.dimensions())); } @@ -37,19 +40,19 @@ xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, double value) { switch (type) { case xla::F16: - return builder->ConstantR0(static_cast(value)); + return xla::ConstantR0(builder, static_cast(value)); break; case xla::BF16: - return builder->ConstantR0(static_cast(value)); + return xla::ConstantR0(builder, static_cast(value)); break; case xla::F32: - return builder->ConstantR0(static_cast(value)); + return xla::ConstantR0(builder, static_cast(value)); break; case xla::F64: - return builder->ConstantR0(value); + return xla::ConstantR0(builder, value); break; case xla::C64: - return builder->ConstantR0(value); + return xla::ConstantR0(builder, value); break; default: LOG(FATAL) << "unhandled element type " << type; @@ -61,31 +64,31 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, xla::Literal literal; switch (type) { case xla::U8: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::U32: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::U64: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::S8: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::S32: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::S64: - literal = std::move(*xla::Literal::CreateR0(value)); + 
literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::F32: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::F64: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::C64: - literal = std::move(*xla::Literal::CreateR0(value)); + literal = std::move(*xla::LiteralUtil::CreateR0(value)); break; case xla::PRED: LOG(FATAL) << "pred element type is not integral"; @@ -94,11 +97,11 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, LOG(FATAL) << "u16/s16 literals not yet implemented"; case xla::BF16: literal = std::move( - *xla::Literal::CreateR0(static_cast(value))); + *xla::LiteralUtil::CreateR0(static_cast(value))); break; case xla::F16: - literal = std::move( - *xla::Literal::CreateR0(static_cast(value))); + literal = std::move(*xla::LiteralUtil::CreateR0( + static_cast(value))); break; case xla::TUPLE: LOG(FATAL) << "tuple element type is not integral"; @@ -107,134 +110,142 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, default: LOG(FATAL) << "unhandled element type " << type; } - return builder->ConstantLiteral(literal); + return xla::ConstantLiteral(builder, literal); } -xla::StatusOr SliceInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x, - gtl::ArraySlice start, - gtl::ArraySlice end) { - TF_RET_CHECK(start.size() == end.size()); - int64 n_minor_dims = start.size(); - - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_RET_CHECK(n_minor_dims <= n_dims); - gtl::ArraySlice major_dims(xla::AsInt64Slice(shape.dimensions()), - /*pos=*/0, - /*len=*/n_dims - n_minor_dims); - - // Prepends 0s in the major dim - std::vector padded_start(n_dims, 0); - std::copy(start.begin(), start.end(), - padded_start.begin() + major_dims.size()); - - // Prepends the shape 
of the major dims. - std::vector padded_end(n_dims); - std::copy(major_dims.begin(), major_dims.end(), padded_end.begin()); - std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size()); - - std::vector strides(n_dims, 1); - return builder->Slice(x, padded_start, padded_end, strides); +xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span start, + absl::Span end) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_RET_CHECK(start.size() == end.size()); + int64 n_minor_dims = start.size(); + + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + + const int64 n_dims = xla::ShapeUtil::Rank(shape); + TF_RET_CHECK(n_minor_dims <= n_dims); + auto major_dims = xla::AsInt64Slice(shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - n_minor_dims); + + // Prepends 0s in the major dim + std::vector padded_start(n_dims, 0); + std::copy(start.begin(), start.end(), + padded_start.begin() + major_dims.size()); + + // Prepends the shape of the major dims. 
+ std::vector padded_end(n_dims); + std::copy(major_dims.begin(), major_dims.end(), padded_end.begin()); + std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size()); + + std::vector strides(n_dims, 1); + return xla::Slice(x, padded_start, padded_end, strides); + }); } -std::vector PrependMajorDims(xla::XlaBuilder* builder, - const gtl::ArraySlice& major_dims, - const gtl::ArraySlice& indices) { - std::vector output(indices.size() + major_dims.size()); - std::copy(major_dims.begin(), major_dims.end(), output.begin()); - std::copy(indices.begin(), indices.end(), output.begin() + major_dims.size()); +std::vector ConcatVectors(absl::Span xs, + absl::Span ys) { + std::vector output(xs.size() + ys.size()); + std::copy(xs.begin(), xs.end(), output.begin()); + std::copy(ys.begin(), ys.end(), output.begin() + xs.size()); return output; } -xla::StatusOr DynamicSliceInMinorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, - const std::vector& starts, - const gtl::ArraySlice& sizes) { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - int64 n_minor_dims = starts.size(); - TF_RET_CHECK(n_minor_dims == sizes.size()); - TF_RET_CHECK(n_minor_dims <= n_dims); - gtl::ArraySlice major_dims(xla::AsInt64Slice(shape.dimensions()), - /*pos=*/0, - /*len=*/n_dims - sizes.size()); - TF_ASSIGN_OR_RETURN(auto padded_starts, - PrependZerosInMajorDims(builder, x, starts)); - auto padded_sizes = PrependMajorDims(builder, major_dims, sizes); - return builder->DynamicSlice(x, padded_starts, padded_sizes); +xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x, + absl::Span starts, + absl::Span sizes) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + int64 n_minor_dims = starts.size(); + TF_RET_CHECK(n_minor_dims == sizes.size()); + 
TF_RET_CHECK(n_minor_dims <= n_dims); + auto major_dims = xla::AsInt64Slice(shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/n_dims - sizes.size()); + auto padded_starts = PrependZerosInMajorDims(x, starts); + auto padded_sizes = ConcatVectors(major_dims, sizes); + return xla::DynamicSlice(x, padded_starts, padded_sizes); + }); } -xla::StatusOr UpdateSlice(xla::XlaBuilder* builder, - const xla::XlaOp& x, - const xla::XlaOp& update, - gtl::ArraySlice start) { - // TODO(phawkins): make int64 work on all backends, remove the int32 cast. - std::vector start_as_int32(start.begin(), start.end()); - auto start_constant = builder->ConstantR1(start_as_int32); - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape, - builder->GetShape(start_constant)); - const int64 start_length = - xla::ShapeUtil::GetDimension(start_constant_shape, -1); - TF_RET_CHECK(start_length == n_dims); - return builder->DynamicUpdateSlice(x, update, start_constant); +xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update, + absl::Span start) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + // TODO(phawkins): make int64 work on all backends, remove the int32 cast. 
+ std::vector start_as_int32(start.begin(), start.end()); + auto start_constant = xla::ConstantR1(builder, start_as_int32); + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape, + builder->GetShape(start_constant)); + const int64 start_length = + xla::ShapeUtil::GetDimension(start_constant_shape, -1); + TF_RET_CHECK(start_length == n_dims); + return xla::DynamicUpdateSlice(x, update, start_constant); + }); } -xla::StatusOr UpdateSliceInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x, - const xla::XlaOp& update, - gtl::ArraySlice start) { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - const int64 n_minor_dims = start.size(); - TF_RET_CHECK(n_minor_dims <= n_dims); - std::vector padded_start(n_dims, 0); - std::copy(start.begin(), start.end(), - padded_start.begin() + (n_dims - n_minor_dims)); - return UpdateSlice(builder, x, update, padded_start); +xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, + absl::Span start) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + const int64 n_minor_dims = start.size(); + TF_RET_CHECK(n_minor_dims <= n_dims); + std::vector padded_start(n_dims, 0); + std::copy(start.begin(), start.end(), + padded_start.begin() + (n_dims - n_minor_dims)); + return UpdateSlice(x, update, padded_start); + }); } -xla::StatusOr DynamicUpdateSliceInMinorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, const xla::XlaOp& update, - const std::vector& starts) { - TF_ASSIGN_OR_RETURN(auto padded_starts, - PrependZerosInMajorDims(builder, x, starts)); - return builder->DynamicUpdateSlice(x, update, padded_starts); +xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, 
xla::XlaOp update, + absl::Span starts) { + auto padded_starts = PrependZerosInMajorDims(x, starts); + return xla::DynamicUpdateSlice(x, update, padded_starts); } -xla::StatusOr PrependZerosInMajorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, - const std::vector& starts) { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - auto zero = builder->Reshape(builder->ConstantR0(0), {1}); - std::vector padded_starts(n_dims, zero); - for (int i = 0; i < starts.size(); ++i) { - padded_starts[n_dims - starts.size() + i] = - builder->Reshape(starts[i], {1}); - } - return builder->ConcatInDim(padded_starts, 0); +xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, + absl::Span starts) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + auto zero = xla::Reshape(xla::ConstantR0(builder, 0), {1}); + std::vector padded_starts(n_dims, zero); + for (int i = 0; i < starts.size(); ++i) { + padded_starts[n_dims - starts.size() + i] = xla::Reshape(starts[i], {1}); + } + return xla::ConcatInDim(builder, padded_starts, 0); + }); } -xla::StatusOr TransposeInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x) { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(shape); - TF_RET_CHECK(n_dims >= 2); - std::vector permutation(n_dims); - std::iota(permutation.begin(), permutation.end(), 0); - std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); - return builder->Transpose(x, permutation); +xla::XlaOp TransposeInMinorDims(xla::XlaOp x) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + TF_RET_CHECK(n_dims >= 2); + 
std::vector permutation(n_dims); + std::iota(permutation.begin(), permutation.end(), 0); + std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); + return xla::Transpose(x, permutation); + }); } -xla::StatusOr MaybeConjugate(xla::XlaBuilder* builder, - const xla::XlaOp& x, bool conjugate) { - TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - auto perform_conj = shape.element_type() == xla::C64 && conjugate; - return perform_conj ? builder->Conj(x) : x; +xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) { + xla::XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + auto perform_conj = shape.element_type() == xla::C64 && conjugate; + return perform_conj ? xla::Conj(x) : x; + }); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index 3c120a2548576d6ad46870583ca65beea63507a3..80e9e5b002d49581209e608b98606e02709c5876 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -16,16 +16,13 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { -// Returns a zero-filled tensor with shape `shape`. -xla::XlaOp Zeros(xla::XlaBuilder* builder, const xla::Shape& shape); - // Returns a floating point scalar constant of 'type' with 'value'. // If 'type' is complex, returns a real value with zero imaginary component. 
xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, @@ -33,62 +30,51 @@ xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, // Makes a 1D tensor [0, ..., x, y] from two tensors x and y with zeros // prepended until the array is length n_dims. -xla::XlaOp PrependZerosInMajorDims(xla::XlaBuilder* builder, - gtl::ArraySlice starts); +xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, + absl::Span starts); // Returns a integer scalar constant of 'type' with 'value'. // If 'type' is complex, returns a real value with zero imaginary component. xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, int64 value); -// Builds a vector of zeros of length rank(x) with the last two values being +// Builds a vector of zeros of length rank(x) with the last values being // those in `starts`. -xla::StatusOr PrependZerosInMajorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, - const std::vector& starts); +xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x, + absl::Span starts); // Performs a slice in the minor dimensions of a Tensor. -xla::StatusOr SliceInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x, - gtl::ArraySlice start, - gtl::ArraySlice end); +xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span start, + absl::Span end); -// Builds a 1-d vector out of a concatenation of `major_dims` and `starts`. -std::vector PrependMajorDims(xla::XlaBuilder* builder, - const gtl::ArraySlice& major_dims, - const gtl::ArraySlice& indices); +// Returns the concatenation of `xs` and `ys`. +std::vector ConcatVectors(absl::Span xs, + absl::Span ys); // Performs a dynamic slice in the minor dimensions of a Tensor. 
-xla::StatusOr DynamicSliceInMinorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, - const std::vector& starts, const gtl::ArraySlice& sizes); +xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x, + absl::Span starts, + absl::Span sizes); // Updates a slice of 'x', i.e., // x[start[0], ..., start[n]] = update -xla::StatusOr UpdateSlice(xla::XlaBuilder* builder, - const xla::XlaOp& x, - const xla::XlaOp& update, - gtl::ArraySlice start); +xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update, + absl::Span start); // Updates a slice of 'x', where 'start' contains a list of minor dimensions: // x[..., start[0], ..., start[n]] = update -xla::StatusOr UpdateSliceInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x, - const xla::XlaOp& update, - gtl::ArraySlice start); +xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, + absl::Span start); -xla::StatusOr DynamicUpdateSliceInMinorDims( - xla::XlaBuilder* builder, const xla::XlaOp& x, const xla::XlaOp& update, - const std::vector& starts); +xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update, + absl::Span starts); // Transposes a stack of matrices `x` by swapping the last two dimensions. -xla::StatusOr TransposeInMinorDims(xla::XlaBuilder* builder, - const xla::XlaOp& x); +xla::XlaOp TransposeInMinorDims(xla::XlaOp x); // Applies a complex conjugation operation if `a` is complex and `conjugate_a` // is true, otherwise returns its argument. -xla::StatusOr MaybeConjugate(xla::XlaBuilder* builder, - const xla::XlaOp& x, bool conjugate); +xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc index 265b39402c832f8c810a74f281563b05afdf2b1b..442fe92c34ca26cb1a854cc90da8dc034bca79bb 100644 --- a/tensorflow/compiler/tf2xla/lib/util_test.cc +++ b/tensorflow/compiler/tf2xla/lib/util_test.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -70,8 +70,7 @@ XLA_TEST_F(UtilTest, Simple2dLookup) { auto a_data = CreateR2Parameter(BValsRight(), 0, "a", &builder, &a); auto x_data = CreateR0Parameter(2, 1, "x", &builder, &x); auto y_data = CreateR0Parameter(1, 2, "y", &builder, &y); - auto result = DynamicSliceInMinorDims(&builder, a, {x, y}, {1, 1}); - TF_ASSERT_OK(result.status()); + DynamicSliceInMinorDims(a, {x, y}, {1, 1}); ComputeAndCompareR2(&builder, {{10}}, {a_data.get(), x_data.get(), y_data.get()}, @@ -86,10 +85,8 @@ XLA_TEST_F(UtilTest, Simple3dLookup) { CreateR3Parameter(BatchedAValsFull(), 0, "a", &builder, &a); auto index_data = CreateR0Parameter(1, 1, "index", &builder, &index); - TF_ASSERT_OK_AND_ASSIGN( - auto l_index, - DynamicSliceInMinorDims(&builder, a, - {index, builder.ConstantR0(0)}, {1, 4})); + DynamicSliceInMinorDims(a, {index, xla::ConstantR0(&builder, 0)}, + {1, 4}); ComputeAndCompareR3(&builder, {{{3, 6, 0, 1}}, {{24, 61, 82, 48}}}, {a_data.get(), index_data.get()}); @@ -104,8 +101,7 @@ XLA_TEST_F(UtilTest, SimpleSliceUpdate) { auto x_data = CreateR0Parameter(2, 2, "x", &builder, &x); auto y_data = CreateR0Parameter(1, 3, "y", &builder, &y); - auto result = DynamicUpdateSliceInMinorDims(&builder, a, b, {x, y}); - TF_ASSERT_OK(result.status()); + DynamicUpdateSliceInMinorDims(a, b, {x, y}); xla::Array2D expected( {{{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 9, 1, -10}, {5, 8, 10, 11}}}); @@ -128,13 +124,9 @@ XLA_TEST_F(UtilTest, RowBatchDot) { // Select {{3, 6, 0, 1}, {24, 61, 82, 48}} out of BatchedAValsFull(). 
auto index_data = CreateR0Parameter(1, 2, "index", &builder, &index); - TF_ASSERT_OK_AND_ASSIGN( - auto l_index, - DynamicSliceInMinorDims(&builder, a, - {index, builder.ConstantR0(0)}, {1, n})); - TF_ASSERT_OK_AND_ASSIGN( - auto dot, BatchDot(&builder, l_index, row, - /*transpose_x=*/false, /*transpose_y=*/true)); + auto l_index = DynamicSliceInMinorDims( + a, {index, xla::ConstantR0(&builder, 0)}, {1, n}); + BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true); ComputeAndCompareR3(&builder, {{{33}}, {{292}}}, {a_data.get(), row_data.get(), index_data.get()}); diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc index 09ce594930efc0af47306590d76b322ac730f80f..5300e2c878bf725b65544701eb3fdc6032553491 100644 --- a/tensorflow/compiler/tf2xla/lib/while_loop.cc +++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/tf2xla/lib/util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -23,7 +24,7 @@ namespace tensorflow { xla::StatusOr> XlaWhileLoop( const LoopConditionFunction& condition_function, const LoopBodyFunction& body_function, - gtl::ArraySlice initial_values, StringPiece name, + absl::Span initial_values, StringPiece name, xla::XlaBuilder* builder) { int arity = initial_values.size(); std::vector var_shapes; @@ -39,7 +40,7 @@ xla::StatusOr> XlaWhileLoop( xla::XlaBuilder* builder) { std::vector elements(arity); for (int i = 0; i < arity; ++i) { - elements[i] = builder->GetTupleElement(tuple, i); + elements[i] = xla::GetTupleElement(tuple, i); } return elements; }; @@ -48,7 +49,8 @@ xla::StatusOr> XlaWhileLoop( std::unique_ptr cond_builder = builder->CreateSubBuilder(strings::StrCat(name, "_condition")); { - auto parameter = cond_builder->Parameter(0, 
tuple_shape, "parameter"); + auto parameter = + xla::Parameter(cond_builder.get(), 0, tuple_shape, "parameter"); TF_RETURN_IF_ERROR( condition_function(unpack_tuple(parameter, arity, cond_builder.get()), @@ -61,7 +63,8 @@ xla::StatusOr> XlaWhileLoop( std::unique_ptr body_builder = builder->CreateSubBuilder(strings::StrCat(name, "_body")); { - auto parameter = body_builder->Parameter(0, tuple_shape, "parameter"); + auto parameter = + xla::Parameter(body_builder.get(), 0, tuple_shape, "parameter"); TF_ASSIGN_OR_RETURN( auto result, @@ -69,11 +72,11 @@ xla::StatusOr> XlaWhileLoop( body_builder.get())); TF_RET_CHECK(result.size() == initial_values.size()); - body_builder->Tuple(result); + xla::Tuple(body_builder.get(), result); } TF_ASSIGN_OR_RETURN(auto body, body_builder->Build()); - auto outputs = builder->While(cond, body, builder->Tuple(initial_values)); + auto outputs = xla::While(cond, body, xla::Tuple(builder, initial_values)); return unpack_tuple(outputs, arity, builder); } @@ -81,25 +84,25 @@ xla::StatusOr> XlaWhileLoop( xla::StatusOr> XlaForEachIndex( int64 num_iterations, xla::PrimitiveType num_iterations_type, const ForEachIndexBodyFunction& body_function, - gtl::ArraySlice initial_values, StringPiece name, + absl::Span initial_values, StringPiece name, xla::XlaBuilder* builder) { auto while_cond_fn = - [&](gtl::ArraySlice values, + [&](absl::Span values, xla::XlaBuilder* cond_builder) -> xla::StatusOr { - return cond_builder->Lt( - values[0], - IntegerLiteral(cond_builder, num_iterations_type, num_iterations)); + return xla::Lt(values[0], IntegerLiteral(cond_builder, num_iterations_type, + num_iterations)); }; - auto while_body_fn = [&](gtl::ArraySlice values, + auto while_body_fn = [&](absl::Span values, xla::XlaBuilder* body_builder) -> xla::StatusOr> { xla::XlaOp iteration = values[0]; std::vector updated_values; updated_values.reserve(values.size()); - updated_values.push_back(body_builder->Add( + updated_values.push_back(xla::Add( iteration, - 
body_builder->ConstantLiteral(xla::Literal::One(num_iterations_type)))); + xla::ConstantLiteral(body_builder, + xla::LiteralUtil::One(num_iterations_type)))); values.remove_prefix(1); TF_ASSIGN_OR_RETURN(std::vector body_outputs, @@ -111,8 +114,8 @@ xla::StatusOr> XlaForEachIndex( std::vector values; values.reserve(initial_values.size() + 1); - values.push_back( - builder->ConstantLiteral(xla::Literal::Zero(num_iterations_type))); + values.push_back(xla::ConstantLiteral( + builder, xla::LiteralUtil::Zero(num_iterations_type))); values.insert(values.end(), initial_values.begin(), initial_values.end()); TF_ASSIGN_OR_RETURN(values, XlaWhileLoop(while_cond_fn, while_body_fn, values, diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/tf2xla/lib/while_loop.h index 5b6684c995889efbb1378c7ac4903548891d090a..115ebf390df6c215680e5982a6ceba546f384af8 100644 --- a/tensorflow/compiler/tf2xla/lib/while_loop.h +++ b/tensorflow/compiler/tf2xla/lib/while_loop.h @@ -19,24 +19,24 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { // Function that builds a loop condition. Takes as input a sequence of input // values, and returns a boolean value representing if the condition succeeds. -typedef std::function(gtl::ArraySlice, +typedef std::function(absl::Span, xla::XlaBuilder*)> LoopConditionFunction; // Function that builds a loop body. Takes as input a sequence of input values // and returns a sequence of output values. 
typedef std::function>( - gtl::ArraySlice, xla::XlaBuilder*)> + absl::Span, xla::XlaBuilder*)> LoopBodyFunction; // Helper function for building an XLA while loop, where the values carried by @@ -50,7 +50,7 @@ typedef std::function>( xla::StatusOr> XlaWhileLoop( const LoopConditionFunction& condition_function, const LoopBodyFunction& body_function, - gtl::ArraySlice initial_values, StringPiece name, + absl::Span initial_values, StringPiece name, xla::XlaBuilder* builder); // Builds an XLA loop that repeats a computation `num_iterations` times. @@ -59,13 +59,13 @@ xla::StatusOr> XlaWhileLoop( // (current iteration number, loop-carried values), and returns an updated // vector of the loop-carried values. typedef std::function>( - xla::XlaOp, gtl::ArraySlice, xla::XlaBuilder*)> + xla::XlaOp, absl::Span, xla::XlaBuilder*)> ForEachIndexBodyFunction; xla::StatusOr> XlaForEachIndex( int64 num_iterations, xla::PrimitiveType num_iterations_type, const ForEachIndexBodyFunction& body_function, - gtl::ArraySlice initial_values, StringPiece name, + absl::Span initial_values, StringPiece name, xla::XlaBuilder* builder); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 43e1c1e9fecec1c71db1509757251cb5d903ca49..20103ec3ae00b57723e05326dbbb1b0f6e1a671a 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -17,26 +17,55 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/common_runtime/dma_helper.h" namespace tensorflow { -Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) { - xla::Shape literal_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape( - host_tensor.dtype(), host_tensor.shape(), &literal_shape)); +Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, + xla::BorrowingLiteral* literal) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(), + host_tensor.shape(), &xla_shape)); + *literal = xla::BorrowingLiteral( + static_cast(DMAHelper::base(&host_tensor)), xla_shape); + return Status::OK(); +} + +Status HostTensorToMutableBorrowingLiteral( + Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor->dtype(), + host_tensor->shape(), &xla_shape)); + return HostTensorToMutableBorrowingLiteral(xla_shape, host_tensor, literal); +} - *literal = xla::Literal(literal_shape); +Status HostTensorToMutableBorrowingLiteral( + const xla::Shape& xla_shape, Tensor* host_tensor, + xla::MutableBorrowingLiteral* literal) { + *literal = xla::MutableBorrowingLiteral( + static_cast(DMAHelper::base(host_tensor)), xla_shape); - // memcpy over the payload ... - // TODO(phawkins): handle string types. 
- size_t total_bytes = host_tensor.TotalBytes(); - if (total_bytes > 0) { - void* dst_ptr = literal->untyped_data(); - const void* src_ptr = DMAHelper::base(&host_tensor); - memcpy(dst_ptr, src_ptr, total_bytes); + return Status::OK(); +} + +Status HostTensorsToBorrowingLiteralTuple(absl::Span host_tensors, + xla::BorrowingLiteral* literal) { + std::vector buf_ptrs; + buf_ptrs.reserve(host_tensors.size()); + std::vector tensor_shapes(host_tensors.size()); + + for (int i = 0; i < host_tensors.size(); i++) { + // Validate runtime shapes and fail if it doesn't match the contract. + const Tensor* tensor = &host_tensors[i]; + buf_ptrs.emplace_back(static_cast(DMAHelper::base(tensor))); + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(tensor->dtype(), tensor->shape(), + &tensor_shapes[i])); } + + *literal = xla::BorrowingLiteral( + buf_ptrs, xla::ShapeUtil::MakeTupleShape(tensor_shapes)); + return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index 220bec15538c36fa30abef9e729b64dbbb9f72b3..1db7470ee2a839099454b772d4833492e033bc92 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -18,16 +18,33 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ #define TENSORFLOW_COMPILER_TF2XLA_LITERAL_UTIL_H_ -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -// Copies 'host_tensor' to an XLA Literal. Fails if host_tensor is of an -// unsupported type. -Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal); +// Returns a BorrowingLiteral that utilizes the same underlying buffer owned by +// 'host_tensor'. 
+Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, + xla::BorrowingLiteral* literal); +// Returns a MutableBorrowingLiteral that utilizes the same underlying buffer +// owned by 'host_tensor', but is mutable via the xla::Literal methods. +Status HostTensorToMutableBorrowingLiteral( + Tensor* host_tensor, xla::MutableBorrowingLiteral* literal); +// Similar as above, except the literal shape is explicitly provided and used +// instead of obtaining it from the 'host_tensor'. The provided literal shape +// 'xla_shape' must be compatible with the shape of 'host_tensor'. +Status HostTensorToMutableBorrowingLiteral( + const xla::Shape& xla_shape, Tensor* host_tensor, + xla::MutableBorrowingLiteral* literal); + +// Returns a BorrowingLiteral tuple that utilizes the same underlying buffers +// owned by 'host_tensors'. +Status HostTensorsToBorrowingLiteralTuple(absl::Span host_tensors, + xla::BorrowingLiteral* literal); // Copies 'literal' to freshly allocated 'host_tensor', which is allocated of // type . diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc index f3d6787daaa1165b28ce63dfd501533fa0963edd..7dc16b5a46791b81eef2c572736e1a1c7969b203 100644 --- a/tensorflow/compiler/tf2xla/literal_util_test.cc +++ b/tensorflow/compiler/tf2xla/literal_util_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/framework/tensor_testutil.h" @@ -27,7 +28,7 @@ TEST(LiteralUtil, LiteralToHostTensor) { { std::vector int64_values = {1, 2, 3}; std::unique_ptr int64_values_literal = - xla::Literal::CreateR1(gtl::ArraySlice(int64_values)); + xla::LiteralUtil::CreateR1(absl::Span(int64_values)); Tensor host_tensor; EXPECT_EQ("Cannot convert literal of type S64 to tensor of type int32", LiteralToHostTensor(*int64_values_literal, DT_INT32, &host_tensor) @@ -48,7 +49,7 @@ TEST(LiteralUtil, LiteralToHostTensor) { Tensor host_tensor; std::vector int32_values = {10, 11}; std::unique_ptr int32_values_literal = - xla::Literal::CreateR1(gtl::ArraySlice(int32_values)); + xla::LiteralUtil::CreateR1(absl::Span(int32_values)); EXPECT_TRUE( LiteralToHostTensor(*int32_values_literal, DT_INT32, &host_tensor) .ok()); diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD index bb9168fa358154f3db9dab87bacc9bf28dd16406..4dce0a2102cf9c782850ccc7af4f14b59bd51e53 100644 --- a/tensorflow/compiler/tf2xla/ops/BUILD +++ b/tensorflow/compiler/tf2xla/ops/BUILD @@ -8,14 +8,11 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") cc_library( name = "xla_ops", - srcs = [ - "dynamic_slice_ops.cc", - "functional_ops.cc", - "reduce_window_op.cc", - "sendrecv_ops.cc", - ], + srcs = ["xla_ops.cc"], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc deleted file mode 100644 index d6c0edbb889b1751ac9d9d47d0c9534b543196ff..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/ops/dynamic_slice_ops.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* 
Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" - -namespace tensorflow { - -REGISTER_OP("XlaDynamicUpdateSlice") - .Input("input: T") - .Input("update: T") - .Input("indices: Tindices") - .Output("output: T") - .Attr("T: type") - .Attr("Tindices: {int32, int64}") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Wraps the XLA DynamicUpdateSlice operator, documented at - https://www.tensorflow.org/performance/xla/operation_semantics#dynamicupdateslice -. - -XlaDynamicUpdateSlice generates a result which is the value of the `input` -operand, with a slice update overwritten at `indices`. The shape of `update` -determines the shape of the sub-array of the result which is updated. The shape -of indices must be rank == 1, with dimension size equal to the rank of `input`. - -Handling of out-of-bounds slice indices is implementation-defined. - -input: A `Tensor` of type T. -indices: A vector of indices into `input`. Must have length equal to the rank of - `input`. -update: A `Tensor` of type T. Same rank as `input`. -output: A `Tensor` of type T. 
-)doc"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/functional_ops.cc b/tensorflow/compiler/tf2xla/ops/functional_ops.cc deleted file mode 100644 index 4a669f8e6eaf644f119f3c0a66f29d9f2c9a9d16..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/ops/functional_ops.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" - -namespace tensorflow { - -// TODO(b/37549631) setting the While Op to always be stateful is too -// conservative. -REGISTER_OP("XlaWhile") - .Input("input: T") - .Output("output: T") - .Attr("T: list(type) >= 0") - .Attr("cond: func") - .Attr("body: func") - .SetIsStateful() - .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"doc( -output = input; While (Cond(output)) { output = Body(output) } - -input: A list of input tensors whose types are T. -output: A list of output tensors whose types are T. -cond: A function takes 'input' and returns a tensor. If the tensor is - a scalar of non-boolean, the scalar is converted to a boolean - according to the following rule: if the scalar is a numerical - value, non-zero means True and zero means False; if the scalar is - a string, non-empty means True and empty means False. 
If the - tensor is not a scalar, non-emptiness means True and False - otherwise. -body: A function that takes a list of tensors and returns another - list of tensors. Both lists have the same types as specified by T. -)doc"); - -// TODO(b/37549631) setting the If Op to always be stateful is too -// conservative. -REGISTER_OP("XlaIf") - .Input("cond: Tcond") - .Input("inputs: Tin") - .Output("output: Tout") - .Attr("Tcond: type") - .Attr("then_branch: func") - .Attr("else_branch: func") - .Attr("Tin: list(type) >= 0") - .Attr("Tout: list(type) >= 0") - .SetIsStateful() - .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"doc( -output = cond ? then_branch(inputs) : else_branch(inputs). - -cond: A boolean scalar. -inputs: A list of input tensors. -output: A list of tensors returned by either then_branch(inputs) or - else_branch(inputs). The input shapes of the then_branch and - else_branch must match. -then_branch: A function takes 'inputs' and returns a list of tensors, - whose types are the same as what else_branch returns. -else_branch: A function takes 'inputs' and returns a list of tensors. - whose types are the same as what then_branch returns. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc b/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc deleted file mode 100644 index d9af982adc090ea78c711fd4656ba429c53b18c9..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/ops/reduce_window_op.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" - -namespace tensorflow { - -REGISTER_OP("XlaReduceWindow") - .Input("input: T") - .Input("init_value: T") - .Attr("T: numbertype") - .Attr("computation: func") - .Attr("window_dimensions: list(int)") - .Attr("window_strides: list(int)") - .Attr("padding_low: list(int)") - .Attr("padding_high: list(int)") - .Output("output: T") - .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"doc( -Wraps the XLA ReduceWindow operator, documented at - https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow . - -input: the input tensor -init_value: a scalar representing the initial value for the reduction -computation: a reducer function to apply -window_dimensions: the shape of the window -window_strides: the inter-window strides -padding_low: the padding to apply at the start of each input dimensions -padding_high: the padding to apply at the end of each input dimension. -)doc"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc deleted file mode 100644 index 7ec7b50e905a6cbdecea4543dcb87322b5a7e844..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/ops/sendrecv_ops.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" - -namespace tensorflow { - -REGISTER_OP("XlaSend") - .Input("tensor: T") - .Attr("T: type") - .Attr("tensor_name: string") - .SetIsStateful() - .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"doc( -Sends the named tensor to another XLA computation. Wraps the XLA Send operator -documented at - https://www.tensorflow.org/performance/xla/operation_semantics#send . - -tensor: The tensor to send. -tensor_name: A string key that identifies the channel. -)doc"); - -REGISTER_OP("XlaRecv") - .Output("tensor: dtype") - .Attr("dtype: type") - .Attr("tensor_name: string") - .Attr("shape: shape") - .SetIsStateful() - .SetShapeFn([](shape_inference::InferenceContext* c) { - TensorShape shape_attr; - TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_attr)); - shape_inference::ShapeHandle s; - TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); - c->set_output(0, s); - return Status::OK(); - }) - .Doc(R"doc( -Receives the named tensor from another XLA computation. Wraps the XLA Recv -operator documented at - https://www.tensorflow.org/performance/xla/operation_semantics#recv . - -tensor: The tensor to receive. -dtype: The type of the tensor. -tensor_name: A string key that identifies the channel. -shape: The shape of the tensor. 
-)doc"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cd9ae799f06afdcbae5429ef8caffd3b4d29c29 --- /dev/null +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -0,0 +1,360 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/algorithm/container.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace { + +// Helper shape function for operators that return an output with the same rank +// as their first input. 
+Status UnchangedRank(shape_inference::InferenceContext* c) { + if (c->RankKnown(c->input(0))) { + c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(0)))); + } else { + c->set_output(0, c->input(0)); + } + return Status::OK(); +} + +REGISTER_OP("XlaBroadcastHelper") + .Input("lhs: T") + .Input("rhs: T") + .Input("broadcast_dims: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Output("lhs_output: T") + .Output("rhs_output: T") + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Helper operator for performing XLA-style broadcasts + +Broadcasts `lhs` and `rhs` to the same rank, by adding size 1 dimensions to +whichever of `lhs` and `rhs` has the lower rank, using XLA's broadcasting rules +for binary operators. + +lhs: the LHS input tensor +rhs: the RHS input tensor +broadcast_dims: an XLA-style broadcast dimension specification +lhs_output: the broadcasted LHS tensor +rhs_output: the broadcasted RHS tensor +)doc"); + +REGISTER_OP("XlaConv") + .Input("lhs: T") + .Input("rhs: T") + .Input("window_strides: Tindices") + .Input("padding: Tindices") + .Input("lhs_dilation: Tindices") + .Input("rhs_dilation: Tindices") + .Input("feature_group_count: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("dimension_numbers: string") + .Attr("precision_config: string") + .Output("output: T") + .SetShapeFn(UnchangedRank) + .Doc(R"doc( +Wraps the XLA ConvGeneralDilated operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution +. + +lhs: the input tensor +rhs: the kernel tensor +window_strides: the inter-window strides +padding: the padding to apply at the start and end of each input dimensions +lhs_dilation: dilation to apply between input elements +rhs_dilation: dilation to apply between kernel elements +feature_group_count: number of feature groups for grouped convolution. +dimension_numbers: a serialized xla::ConvolutionDimensionNumbers proto. 
+precision_config: a serialized xla::PrecisionConfigProto proto. +)doc"); + +REGISTER_OP("XlaDot") + .Input("lhs: T") + .Input("rhs: T") + .Attr("T: numbertype") + .Attr("dimension_numbers: string") + .Attr("precision_config: string") + .Output("output: T") + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Wraps the XLA DotGeneral operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#dotgeneral +. + +lhs: the LHS tensor +rhs: the RHS tensor +dimension_numbers: a serialized xla::DotDimensionNumbers proto. +precision_config: a serialized xla::PrecisionConfigProto proto. +)doc"); + +REGISTER_OP("XlaDynamicUpdateSlice") + .Input("input: T") + .Input("update: T") + .Input("indices: Tindices") + .Output("output: T") + .Attr("T: type") + .Attr("Tindices: {int32, int64}") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"doc( +Wraps the XLA DynamicUpdateSlice operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#dynamicupdateslice +. + +XlaDynamicUpdateSlice generates a result which is the value of the `input` +operand, with a slice update overwritten at `indices`. The shape of `update` +determines the shape of the sub-array of the result which is updated. The shape +of indices must be rank == 1, with dimension size equal to the rank of `input`. + +Handling of out-of-bounds slice indices is implementation-defined. + +input: A `Tensor` of type T. +indices: A vector of indices into `input`. Must have length equal to the rank of + `input`. +update: A `Tensor` of type T. Same rank as `input`. +output: A `Tensor` of type T. +)doc"); + +// TODO(b/37549631) setting the If Op to always be stateful is too +// conservative.
+REGISTER_OP("XlaIf") + .Input("cond: Tcond") + .Input("inputs: Tin") + .Output("output: Tout") + .Attr("Tcond: type") + .Attr("then_branch: func") + .Attr("else_branch: func") + .Attr("Tin: list(type) >= 0") + .Attr("Tout: list(type) >= 0") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +output = cond ? then_branch(inputs) : else_branch(inputs). + +cond: A boolean scalar. +inputs: A list of input tensors. +output: A list of tensors returned by either then_branch(inputs) or + else_branch(inputs). The input shapes of the then_branch and + else_branch must match. +then_branch: A function that takes 'inputs' and returns a list of tensors, + whose types are the same as what else_branch returns. +else_branch: A function that takes 'inputs' and returns a list of tensors, + whose types are the same as what then_branch returns. +)doc"); + +REGISTER_OP("XlaPad") + .Input("input: T") + .Input("padding_value: T") + .Input("padding_low: Tindices") + .Input("padding_high: Tindices") + .Input("padding_interior: Tindices") + .Output("output: T") + .Attr("T: type") + .Attr("Tindices: {int32, int64}") + .SetShapeFn(UnchangedRank) + .Doc(R"doc( +Wraps the XLA Pad operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#pad +. + +input: A `Tensor` of type T. +padding_value: A scalar `Tensor` of type T. +padding_low: the padding to apply at the start of each input dimension. +padding_high: the padding to apply at the end of each input dimension. +padding_interior: the padding to apply between each input element. +output: A `Tensor` of type T.
+)doc"); + +REGISTER_OP("XlaRecv") + .Output("tensor: dtype") + .Attr("dtype: type") + .Attr("tensor_name: string") + .Attr("shape: shape") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) { + TensorShape shape_attr; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_attr)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); + c->set_output(0, s); + return Status::OK(); + }) + .Doc(R"doc( +Receives the named tensor from another XLA computation. Wraps the XLA Recv +operator documented at + https://www.tensorflow.org/performance/xla/operation_semantics#recv . + +tensor: The tensor to receive. +dtype: The type of the tensor. +tensor_name: A string key that identifies the channel. +shape: The shape of the tensor. +)doc"); + +REGISTER_OP("XlaReduce") + .Input("input: T") + .Input("init_value: T") + .Attr("T: numbertype") + .Attr("dimensions_to_reduce: list(int)") + .Attr("reducer: func") + .Output("output: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + if (c->RankKnown(c->input(0))) { + int rank = c->Rank(c->input(0)); + std::vector dimensions_to_reduce; + TF_RETURN_IF_ERROR( + c->GetAttr("dimensions_to_reduce", &dimensions_to_reduce)); + std::set dims_set(dimensions_to_reduce.begin(), + dimensions_to_reduce.end()); + auto dim_in_range = [rank](int64 dim) { + return dim >= 0 && dim < rank; + }; + if (rank < dimensions_to_reduce.size() || + dims_set.size() != dimensions_to_reduce.size() || + !absl::c_all_of(dimensions_to_reduce, dim_in_range)) { + return errors::InvalidArgument( + "Invalid dimensions_to_reduce argument to XlaReduce"); + } + c->set_output( + 0, c->UnknownShapeOfRank(rank - dimensions_to_reduce.size())); + } else { + c->set_output(0, c->input(0)); + } + return Status::OK(); + }) + .Doc(R"doc( +Wraps the XLA Reduce operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#reduce . 
+ +input: the input tensor +init_value: a scalar representing the initial value for the reduction +reducer: a reducer function to apply +dimensions_to_reduce: dimension numbers over which to reduce +)doc"); + +REGISTER_OP("XlaReduceWindow") + .Input("input: T") + .Input("init_value: T") + .Input("window_dimensions: Tindices") + .Input("window_strides: Tindices") + .Input("padding: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("computation: func") + .Output("output: T") + .SetShapeFn(UnchangedRank) + .Doc(R"doc( +Wraps the XLA ReduceWindow operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow . + +input: the input tensor +init_value: a scalar representing the initial value for the reduction +computation: a reducer function to apply +window_dimensions: the shape of the window +window_strides: the inter-window strides +padding: the padding to apply at the start and end of each input dimensions +)doc"); + +REGISTER_OP("XlaSelectAndScatter") + .Input("operand: T") + .Input("window_dimensions: Tindices") + .Input("window_strides: Tindices") + .Input("padding: Tindices") + .Input("source: T") + .Input("init_value: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("select: func") + .Attr("scatter: func") + .Output("output: T") + .SetShapeFn(UnchangedRank) + .Doc(R"doc( +Wraps the XLA SelectAndScatter operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter +. 
+ +operand: the input tensor +window_dimensions: the shape of the window +window_strides: the inter-window strides +padding: the padding to apply at the start and end of each input dimensions +source: a tensor of values to scatter +init_value: a scalar representing the initial value for the output tensor +select: a selection function to apply +scatter: a scatter function to apply +)doc"); + +REGISTER_OP("XlaSend") + .Input("tensor: T") + .Attr("T: type") + .Attr("tensor_name: string") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Sends the named tensor to another XLA computation. Wraps the XLA Send operator +documented at + https://www.tensorflow.org/performance/xla/operation_semantics#send . + +tensor: The tensor to send. +tensor_name: A string key that identifies the channel. +)doc"); + +REGISTER_OP("XlaSort") + .Input("input: T") + .Output("output: T") + .Attr("T: type") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"doc( +Wraps the XLA Sort operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#sort +. + +Sorts a tensor. Currently only rank 1 sorts in ascending order are supported. + +input: A `Tensor` of type T. +output: A `Tensor` of type T. +)doc"); + +// TODO(b/37549631) setting the While Op to always be stateful is too +// conservative. +REGISTER_OP("XlaWhile") + .Input("input: T") + .Output("output: T") + .Attr("T: list(type) >= 0") + .Attr("cond: func") + .Attr("body: func") + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +output = input; While (Cond(output)) { output = Body(output) } + +input: A list of input tensors whose types are T. +output: A list of output tensors whose types are T. +cond: A function takes 'input' and returns a tensor. 
If the tensor is + a scalar of non-boolean, the scalar is converted to a boolean + according to the following rule: if the scalar is a numerical + value, non-zero means True and zero means False; if the scalar is + a string, non-empty means True and empty means False. If the + tensor is not a scalar, non-emptiness means True and False + otherwise. +body: A function that takes a list of tensors and returns another + list of tensors. Both lists have the same types as specified by T. +)doc"); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index 42b6292f79ffddd155c05758a1420a2a583eb0c6..69ca39436013ec5cf09ba502a1540d5df322e213 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -28,5 +28,6 @@ py_library( srcs = ["xla.py"], deps = [ "//tensorflow/compiler/tf2xla/ops:gen_xla_ops", + "//tensorflow/compiler/xla:xla_data_proto_py", ], ) diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index e5ce65bec950fdfd38c3ca5bc62ac745ef8ca4a7..3626de375ea9ac12e40ea5b5b591bb6d5262adbc 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -15,11 +15,12 @@ """Experimental library that exposes XLA operations directly in TensorFlow. It is sometimes useful to be able to build HLO programs directly from -TensorFlow. This file provides Tensorflow operators that map as closely as -possible to HLO operators. +TensorFlow. This file provides Tensorflow operators that mirror the semantics of +HLO operators as closely as possible. -There is no promise of backward or forward compatibility for operators defined -in this module. +Note: There is no promise of backward or forward compatibility for operators +defined in this module. This is primarily because the underlying HLO operators +do not promise backward or forward compatibility. 
""" from __future__ import absolute_import @@ -27,11 +28,298 @@ from __future__ import division from __future__ import print_function from tensorflow.compiler.tf2xla.ops import gen_xla_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import bitwise_ops +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops + +# TODO(phawkins): provide wrappers for all XLA operators. Currently the missing +# ops include: +# infeed/outfeed (available via tf.contrib.tpu) +# collectives, e.g., cross-replica-sum (available via tf.contrib.tpu) +# conditional +# gather/scatter +# collapse + +# This file reuses builtin names (following XLA's names, so we can call things +# like xla.max), so we capture the builtin versions here. +# pylint: disable=redefined-builtin +_max = max +_min = min +_slice = slice # pylint: disable=invalid-name + +constant = constant_op.constant + +# Unary operators. + +# For most arithmetic operators there is a TensorFlow operator +# that exactly corresponds to each XLA operator. Rather than defining +# XLA-specific variants, we reuse the corresponding TensorFlow operator. +# TODO(phawkins): It would be even better to have TensorFlow operators that 1:1 +# wrap every HLO operator, because that would allow us to be confident that the +# semantics match. + + +def _unary_op(fn): + """Wrapper that restricts `fn` to have the correct signature.""" + + def unary_op_wrapper(x, name=None): + return fn(x, name=name) + + return unary_op_wrapper + + +abs = _unary_op(math_ops.abs) +# TODO(phawkins): implement clz. 
+conj = _unary_op(math_ops.conj) +cos = _unary_op(math_ops.cos) +ceil = _unary_op(math_ops.ceil) +digamma = _unary_op(math_ops.digamma) +erf = _unary_op(math_ops.erf) +erfc = _unary_op(math_ops.erfc) +# TODO(phawkins): implement erfinv +exp = _unary_op(math_ops.exp) +expm1 = _unary_op(math_ops.expm1) +floor = _unary_op(math_ops.floor) +imag = _unary_op(math_ops.imag) +is_finite = _unary_op(math_ops.is_finite) +lgamma = _unary_op(math_ops.lgamma) +log = _unary_op(math_ops.log) +log1p = _unary_op(math_ops.log1p) +logical_not = _unary_op(math_ops.logical_not) +neg = _unary_op(math_ops.neg) +real = _unary_op(math_ops.real) +# TODO(phawkins): unlike xla::Round, this rounds to even instead of zero for +# numbers halfway between two integers. +round = _unary_op(math_ops.round) +sin = _unary_op(math_ops.sin) +sign = _unary_op(math_ops.sign) +tanh = _unary_op(math_ops.tanh) + +# Binary operators + +# The main difference between TensorFlow and XLA binary ops is the broadcasting +# semantics. TensorFlow uses Numpy-style broadcasting semantics, whereas XLA +# requires an explicit specification of which dimensions to broadcast if the +# arguments have different ranks. + + +def _broadcasting_binary_op(fn): + """Wraps a binary Tensorflow operator and performs XLA-style broadcasting.""" + + def broadcasting_binary_op_wrapper(x, y, broadcast_dims=None, name=None): + """Inner wrapper function.""" + broadcast_dims = broadcast_dims or [] + broadcast_dims = ops.convert_to_tensor(broadcast_dims, dtypes.int64) + # Rather than relying on having static shape information in the TensorFlow + # graph, we use an XlaBroadcastHelper op that can compute the correct shapes + # at JIT compilation time. + x, y = gen_xla_ops.xla_broadcast_helper(x, y, broadcast_dims) + return fn(x, y, name=name) + + return broadcasting_binary_op_wrapper + + +# Map from TF signed types to TF unsigned types. 
+_SIGNED_TO_UNSIGNED_TABLE = { + dtypes.int8: dtypes.uint8, + dtypes.int16: dtypes.uint16, + dtypes.int32: dtypes.uint32, + dtypes.int64: dtypes.uint64, +} + +# Map from TF unsigned types to TF signed types. +_UNSIGNED_TO_SIGNED_TABLE = { + dtypes.uint8: dtypes.int8, + dtypes.uint16: dtypes.int16, + dtypes.uint32: dtypes.int32, + dtypes.uint64: dtypes.int64, +} + + +def _shift_right_logical_helper(x, y, name=None): + """Performs an integer right logical shift irrespective of input type.""" + assert y.dtype == x.dtype + dtype = x.dtype + signed = dtype in _SIGNED_TO_UNSIGNED_TABLE + if signed: + unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[dtype] + x = math_ops.cast(x, unsigned_dtype) + y = math_ops.cast(y, unsigned_dtype) + output = bitwise_ops.right_shift(x, y, name=name) + if signed: + output = math_ops.cast(output, dtype) + return output + + +def _shift_right_arithmetic_helper(x, y, name=None): + """Performs an integer right arithmetic shift irrespective of input type.""" + assert y.dtype == x.dtype + dtype = x.dtype + unsigned = dtype in _UNSIGNED_TO_SIGNED_TABLE + if unsigned: + signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[dtype] + x = math_ops.cast(x, signed_dtype) + y = math_ops.cast(y, signed_dtype) + output = bitwise_ops.right_shift(x, y, name=name) + if unsigned: + output = math_ops.cast(output, dtype) + return output + + +add = _broadcasting_binary_op(math_ops.add) +sub = _broadcasting_binary_op(math_ops.sub) +mul = _broadcasting_binary_op(math_ops.mul) +div = _broadcasting_binary_op(math_ops.div) +rem = _broadcasting_binary_op(gen_math_ops.mod) +max = _broadcasting_binary_op(math_ops.maximum) +min = _broadcasting_binary_op(math_ops.minimum) +atan2 = _broadcasting_binary_op(math_ops.atan2) +complex = _broadcasting_binary_op(math_ops.complex) +logical_and = _broadcasting_binary_op(math_ops.logical_and) +logical_or = _broadcasting_binary_op(math_ops.logical_or) +logical_xor = _broadcasting_binary_op(math_ops.logical_xor) +eq = 
_broadcasting_binary_op(math_ops.equal) +ne = _broadcasting_binary_op(math_ops.not_equal) +ge = _broadcasting_binary_op(math_ops.greater_equal) +gt = _broadcasting_binary_op(math_ops.greater) +le = _broadcasting_binary_op(math_ops.less_equal) +lt = _broadcasting_binary_op(math_ops.less) +pow = _broadcasting_binary_op(math_ops.pow) +shift_left = _broadcasting_binary_op(bitwise_ops.left_shift) +shift_right_logical = _broadcasting_binary_op(_shift_right_logical_helper) +shift_right_arithmetic = _broadcasting_binary_op(_shift_right_arithmetic_helper) + + +def _binary_op(fn): + """Wrapper that restricts `fn` to have the correct signature.""" + + def binary_op_wrapper(x, y, name=None): + return fn(x, y, name=name) + + return binary_op_wrapper + + +transpose = _binary_op(array_ops.transpose) +rev = _binary_op(array_ops.reverse) + +bitcast_convert_type = array_ops.bitcast + + +def broadcast(x, dims, name=None): + x = ops.convert_to_tensor(x) + shape = array_ops.concat( + [constant_op.constant(dims), + array_ops.shape(x)], axis=0) + return array_ops.broadcast_to(x, shape, name=name) + + +def clamp(a, x, b, name=None): + return min(max(a, x, name=name), b, name=name) + + +concatenate = array_ops.concat + + +def conv(lhs, + rhs, + window_strides, + padding, + lhs_dilation, + rhs_dilation, + dimension_numbers, + feature_group_count=1, + precision_config=None, + name=None): + """Wraps the XLA ConvGeneralDilated operator. + + ConvGeneralDilated is the most general form of XLA convolution and is + documented at + https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution + + Args: + lhs: the input tensor + rhs: the kernel tensor + window_strides: the inter-window strides + padding: the padding to apply at the start and end of each input dimensions + lhs_dilation: dilation to apply between input elements + rhs_dilation: dilation to apply between kernel elements + dimension_numbers: a `ConvolutionDimensionNumbers` proto. 
+ feature_group_count: number of feature groups for grouped convolution. + precision_config: a `PrecisionConfigProto` proto. + name: an optional name for the operator + + Returns: + A tensor representing the output of the convolution. + """ + precision_config_proto = "" + if precision_config: + precision_config_proto = precision_config.SerializeToString() + return gen_xla_ops.xla_conv( + lhs, + rhs, + window_strides=window_strides, + padding=padding, + lhs_dilation=lhs_dilation, + rhs_dilation=rhs_dilation, + feature_group_count=feature_group_count, + dimension_numbers=dimension_numbers.SerializeToString(), + precision_config=precision_config_proto, + name=name) + + +convert_element_type = math_ops.cast + + +def dot(lhs, rhs, name=None): + return math_ops.tensordot(lhs, rhs, axes=1, name=name) + + +def dot_general(lhs, rhs, dimension_numbers, precision_config=None, name=None): + precision_config_proto = "" + if precision_config: + precision_config_proto = precision_config.SerializeToString() + return gen_xla_ops.xla_dot( + lhs, + rhs, + dimension_numbers=dimension_numbers.SerializeToString(), + precision_config=precision_config_proto, + name=name) + + +def dynamic_slice(x, starts, sizes, name=None): + # TODO(phawkins): the Slice operator lowers to DynamicSlice if `starts` is not + # a compile-time constant. This doesn't exactly mimic the semantics of dynamic + # slice if the slice is out of bounds. + return array_ops.slice(x, starts, sizes, name=name) -# TODO(phawkins): provide wrappers for all XLA operators. dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice +# TODO(phawkins): generalize tf.pad to support interior padding, and then remove +# the XLA-specific pad operator. 
+pad = gen_xla_ops.xla_pad + + +def random_normal(mu, sigma, dims, name=None): + mu = ops.convert_to_tensor(mu) + return random_ops.random_normal( + dims, mean=mu, stddev=sigma, dtype=mu.dtype, name=name) + + +def random_uniform(minval, maxval, dims, name=None): + minval = ops.convert_to_tensor(minval) + return random_ops.random_uniform( + dims, minval, maxval, dtype=minval.dtype, name=name) + + +recv = gen_xla_ops.xla_recv +reduce = gen_xla_ops.xla_reduce + def reduce_window(operand, init, @@ -61,20 +349,38 @@ def reduce_window(operand, """ window_strides = window_strides or [1] * len(window_dimensions) padding = padding or [(0, 0)] * len(window_dimensions) - padding_low = [x for (x, _) in padding] - padding_high = [y for (_, y) in padding] return gen_xla_ops.xla_reduce_window( - operand, - init, - reducer, - window_dimensions, - window_strides, - padding_low, - padding_high, + input=operand, + init_value=init, + window_dimensions=window_dimensions, + window_strides=window_strides, + padding=padding, + computation=reducer, name=name) -recv = gen_xla_ops.xla_recv +def reshape(x, new_sizes, dimensions=None, name=None): + if dimensions is not None: + x = array_ops.transpose(x, dimensions) + x = array_ops.reshape(x, new_sizes, name=name) + return x + + +def select(condition, x, y, name=None): + return array_ops.where(condition, x, y, name) + + +select_and_scatter = gen_xla_ops.xla_select_and_scatter send = gen_xla_ops.xla_send + +def slice(x, start_dims, limit_dims, strides): + spec = [ + _slice(start, limit, stride) + for (start, limit, stride) in zip(start_dims, limit_dims, strides) + ] + return x[tuple(spec)] + + +sort = gen_xla_ops.xla_sort while_loop = gen_xla_ops.xla_while diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..32ba6df2e6daa2add468a1bc0559d42606d1a9a6 --- /dev/null +++ 
b/tensorflow/compiler/tf2xla/resource_operation_table.cc @@ -0,0 +1,130 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" +#include "absl/algorithm/container.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { +/*static*/ StringPiece XlaResourceOpInfo::XlaResourceOpKindToString( + XlaResourceOpKind op_kind) { + switch (op_kind) { + case XlaResourceOpKind::kRead: + return "Read"; + case XlaResourceOpKind::kWrite: + return "Write"; + case XlaResourceOpKind::kReadWrite: + return "Modify"; + } +} + +static gtl::FlatMap* CreateResourceOpInfoMap() { + gtl::FlatMap* result = + new gtl::FlatMap; + + auto add = [&](StringPiece op, XlaResourceOpKind op_kind, + XlaResourceKind resource_kind) { + auto insert_result = + result->insert({op, XlaResourceOpInfo(op_kind, resource_kind)}); + CHECK(insert_result.second); + }; + + auto kRead = XlaResourceOpKind::kRead; + auto kWrite = XlaResourceOpKind::kWrite; + auto kReadWrite = XlaResourceOpKind::kReadWrite; + + auto kVariable = XlaResourceKind::kVariable; + auto kStack = XlaResourceKind::kStack; + auto kTensorArray = XlaResourceKind::kTensorArray; + + // clang-format off + add("AssignAddVariableOp" , kReadWrite, kVariable); + add("AssignSubVariableOp" , kReadWrite, kVariable); + add("AssignVariableOp" , kWrite, 
kVariable); + add("ReadVariableOp" , kRead, kVariable); + add("ResourceApplyAdaMax" , kReadWrite, kVariable); + add("ResourceApplyAdadelta" , kReadWrite, kVariable); + add("ResourceApplyAdagrad" , kReadWrite, kVariable); + add("ResourceApplyAdagradDA" , kReadWrite, kVariable); + add("ResourceApplyAdam" , kReadWrite, kVariable); + add("ResourceApplyAddSign" , kReadWrite, kVariable); + add("ResourceApplyCenteredRMSProp" , kReadWrite, kVariable); + add("ResourceApplyFtrl" , kReadWrite, kVariable); + add("ResourceApplyFtrlV2" , kReadWrite, kVariable); + add("ResourceApplyGradientDescent" , kReadWrite, kVariable); + add("ResourceApplyMomentum" , kReadWrite, kVariable); + add("ResourceApplyPowerSign" , kReadWrite, kVariable); + add("ResourceApplyProximalAdagrad" , kReadWrite, kVariable); + add("ResourceApplyProximalGradientDescent" , kReadWrite, kVariable); + add("ResourceApplyRMSProp" , kReadWrite, kVariable); + add("ResourceGather" , kRead, kVariable); + add("ResourceScatterAdd" , kReadWrite, kVariable); + add("ResourceScatterDiv" , kReadWrite, kVariable); + add("ResourceScatterMax" , kReadWrite, kVariable); + add("ResourceScatterMin" , kReadWrite, kVariable); + add("ResourceScatterMul" , kReadWrite, kVariable); + add("ResourceScatterNdAdd" , kReadWrite, kVariable); + add("ResourceScatterNdUpdate" , kReadWrite, kVariable); + add("ResourceScatterSub" , kReadWrite, kVariable); + add("ResourceScatterUpdate" , kReadWrite, kVariable); + add("ResourceStridedSliceAssign" , kReadWrite, kVariable); + add("VarIsInitializedOp" , kRead, kVariable); + add("VariableShape" , kRead, kVariable); + + add("StackV2" , kWrite, kStack); + add("StackCloseV2" , kRead, kStack); + add("StackPopV2" , kReadWrite, kStack); + add("StackPushV2" , kReadWrite, kStack); + + add("TensorArrayV3" , kWrite, kTensorArray); + add("TensorArrayConcatV3" , kRead, kTensorArray); + add("TensorArrayGatherV3" , kRead, kTensorArray); + add("TensorArrayScatterV3" , kWrite, kTensorArray); + add("TensorArrayGradV3" , 
kRead, kTensorArray); + add("TensorArrayCloseV3" , kRead, kTensorArray); + add("TensorArrayReadV3" , kRead, kTensorArray); + add("TensorArraySizeV3" , kRead, kTensorArray); + add("TensorArraySplitV3" , kWrite, kTensorArray); + add("TensorArrayWriteV3" , kWrite, kTensorArray); + // clang-format on + + return result; +} + +static const gtl::FlatMap& +GetStaticResourceOpInfoMap() { + static gtl::FlatMap* op_info_map = + CreateResourceOpInfoMap(); + return *op_info_map; +} + +const XlaResourceOpInfo* GetResourceOpInfoForOp(StringPiece op) { + const gtl::FlatMap& op_infos = + GetStaticResourceOpInfoMap(); + auto it = op_infos.find(op); + return it == op_infos.end() ? nullptr : &it->second; +} + +namespace resource_op_table_internal { +std::vector GetKnownResourceOps() { + std::vector result; + for (const auto& p : GetStaticResourceOpInfoMap()) { + result.push_back(p.first); + } + absl::c_sort(result); + return result; +} +} // namespace resource_op_table_internal +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.h b/tensorflow/compiler/tf2xla/resource_operation_table.h new file mode 100644 index 0000000000000000000000000000000000000000..7f627a64c6e8298a427cd87d25d4ba24835bf542 --- /dev/null +++ b/tensorflow/compiler/tf2xla/resource_operation_table.h @@ -0,0 +1,71 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ +#define TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ + +#include +#include + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/logging.h" + +// Exposes information about the resource operations supported by tf2xla in a +// structured form. + +namespace tensorflow { +enum class XlaResourceOpKind { + kRead, // Only reads from resources. + kWrite, // Only writes to resources. + kReadWrite // Reads from and writes to resources. +}; + +enum class XlaResourceKind { + kVariable, // Operates on resource variables. + kStack, // Operates on stacks. + kTensorArray // Operates on tensor arrays. +}; + +class XlaResourceOpInfo { + public: + explicit XlaResourceOpInfo(XlaResourceOpKind op_kind, + XlaResourceKind resource_kind) + : op_kind_(op_kind), resource_kind_(resource_kind) {} + + XlaResourceOpKind kind() const { return op_kind_; } + XlaResourceKind resource_kind() const { return resource_kind_; } + + static StringPiece XlaResourceOpKindToString(XlaResourceOpKind op_kind); + + private: + XlaResourceOpKind op_kind_; + XlaResourceKind resource_kind_; +}; + +// Returns a XlaResourceOpInfo describing `op` if it is a resource operation +// supported by tf2xla, otherwise returns null (i.e. if this returns null then +// `op` is either not a resource operation or is unsupported by XLA). +const XlaResourceOpInfo* GetResourceOpInfoForOp(StringPiece op); + +namespace resource_op_table_internal { +// NB! Implementation detail exposed for unit testing, do not use. +// +// Returns the set of resource operations known by this module. 
+std::vector GetKnownResourceOps(); +} // namespace resource_op_table_internal + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_ diff --git a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0343f80de9fed114a0097b981233277c3e12b378 --- /dev/null +++ b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/resource_operation_table.h" + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { +bool IsResourceArgDef(const OpDef::ArgDef& arg_def) { + return arg_def.type() == DT_RESOURCE; +} + +bool HasResourceInputOrOutput(const OpDef& op_def) { + return absl::c_any_of(op_def.input_arg(), IsResourceArgDef) || + absl::c_any_of(op_def.output_arg(), IsResourceArgDef); +} + +TEST(ResourceOperationTableTest, HaveAllResourceOps) { + gtl::FlatMap known_resource_ops; + for (StringPiece known_resource_op : + resource_op_table_internal::GetKnownResourceOps()) { + ASSERT_TRUE( + known_resource_ops.insert({string(known_resource_op), false}).second); + } + + std::vector xla_op_names = XlaOpRegistry::GetAllRegisteredOps(); + for (const string& xla_op_name : xla_op_names) { + const OpDef* op_def; + TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef(xla_op_name, &op_def)); + if (HasResourceInputOrOutput(*op_def)) { + EXPECT_EQ(known_resource_ops.count(xla_op_name), 1) + << "Unknown resource op " << xla_op_name; + known_resource_ops[xla_op_name] = true; + } + } + + std::vector unnecessary_resource_ops; + for (const auto& pair : known_resource_ops) { + if (!pair.second) { + unnecessary_resource_ops.push_back(pair.first); + } + } + + EXPECT_TRUE(unnecessary_resource_ops.empty()) + << "Stale resource ops:\n" + << absl::StrJoin(unnecessary_resource_ops, "\n"); +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 5759c72af301785f3ca1110b58eeb2fe7dead713..2d7eb8b915b8245ba6573c30b2eb15b12fc3a1b4 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ 
b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -14,9 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/sharding_util.h" +#include "absl/strings/match.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/util/device_name_utils.h" @@ -27,10 +27,10 @@ const char kShardingAttribute[] = "_XlaSharding"; } // namespace namespace { -xla::StatusOr> -GetShardingFromNodeDef(const NodeDef& node_def) { +xla::StatusOr> GetShardingFromNodeDef( + const NodeDef& node_def) { if (!HasNodeAttr(node_def, kShardingAttribute)) { - return tensorflow::gtl::optional(); + return absl::optional(); } string value; xla::OpSharding sharding; @@ -40,7 +40,7 @@ GetShardingFromNodeDef(const NodeDef& node_def) { "Experimental _XlaSharding attribute was not a valid encoded " "xla::OpSharding proto."); } - return tensorflow::gtl::optional(sharding); + return absl::optional(sharding); } Status CoreOutOfRangeError(int core, int num_cores_per_replica) { @@ -50,12 +50,11 @@ Status CoreOutOfRangeError(int core, int num_cores_per_replica) { } } // namespace -xla::StatusOr> -ParseShardingFromDevice( +xla::StatusOr> ParseShardingFromDevice( const string& device_name, int num_cores_per_replica, - tensorflow::gtl::optional explicit_sharding) { + absl::optional explicit_sharding) { if (device_name.empty()) { - return tensorflow::gtl::optional(); + return absl::optional(); } DeviceNameUtils::ParsedName parsed_device; if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) { @@ -66,34 +65,34 @@ ParseShardingFromDevice( if (explicit_sharding.has_value()) { return explicit_sharding; } else if (!parsed_device.has_type || !parsed_device.has_id || - !str_util::StrContains(parsed_device.type, - kDeviceSuffixReplicatedCore)) { - return 
tensorflow::gtl::optional(); + !absl::StrContains(parsed_device.type, + kDeviceSuffixReplicatedCore)) { + return absl::optional(); } else { const int core = parsed_device.id; if (core < 0 || core >= num_cores_per_replica) { return CoreOutOfRangeError(core, num_cores_per_replica); } - return tensorflow::gtl::optional( + return absl::optional( xla::sharding_builder::AssignDevice(core)); } } -xla::StatusOr> -ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) { +xla::StatusOr> ParseShardingFromDevice( + const NodeDef& node_def, int num_cores_per_replica) { const string& device_name = node_def.device(); - TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional sharding, + TF_ASSIGN_OR_RETURN(absl::optional sharding, GetShardingFromNodeDef(node_def)); return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding); } -xla::StatusOr> -ParseShardingFromDevice(const Node& node, int num_cores_per_replica) { +xla::StatusOr> ParseShardingFromDevice( + const Node& node, int num_cores_per_replica) { string device_name = node.assigned_device_name(); if (device_name.empty()) { device_name = node.requested_device(); } - TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional sharding, + TF_ASSIGN_OR_RETURN(absl::optional sharding, GetShardingFromNodeDef(node.def())); return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding); } diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h index b1c817bdcc211648b16e395313ca171d1acb9ea9..ab67d4f154282e3fc37b68339045deb5da91b9db 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.h +++ b/tensorflow/compiler/tf2xla/sharding_util.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_ -#define TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_ +#ifndef TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ #include @@ -33,19 +33,18 @@ namespace tensorflow { // - explicit_sharding if explicit_sharding.has_value() // - a non-value if there is no assigned core or // - a sharding set as per xla::sharding_builder::AssignDevice. -xla::StatusOr> -ParseShardingFromDevice(const string& device_name, int num_cores_per_replica, - tensorflow::gtl::optional - explicit_sharding = tensorflow::gtl::nullopt); +xla::StatusOr> ParseShardingFromDevice( + const string& device_name, int num_cores_per_replica, + absl::optional explicit_sharding = absl::nullopt); -xla::StatusOr> -ParseShardingFromDevice(const Node& node, int num_cores_per_replica); +xla::StatusOr> ParseShardingFromDevice( + const Node& node, int num_cores_per_replica); -xla::StatusOr> -ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica); +xla::StatusOr> ParseShardingFromDevice( + const NodeDef& node_def, int num_cores_per_replica); void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst); } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_ +#endif // TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/sharding_util_test.cc b/tensorflow/compiler/tf2xla/sharding_util_test.cc index bff5978237a827cb9650541f2cf6984d9e846796..dcb7e212b74d2e261de7e125bb66b3ec78e0cfe9 100644 --- a/tensorflow/compiler/tf2xla/sharding_util_test.cc +++ b/tensorflow/compiler/tf2xla/sharding_util_test.cc @@ -23,7 +23,7 @@ TEST(CoreUtilTest, ParseShardingFromDevice) { Graph graph(OpRegistry::Global()); auto core_from_sharding = - [](tensorflow::gtl::optional sharding) -> int64 { + [](absl::optional sharding) -> int64 { if (sharding.has_value() && sharding.value().type() == 
xla::OpSharding::Type::OpSharding_Type_MAXIMAL) { diff --git a/tensorflow/compiler/tf2xla/str_util.cc b/tensorflow/compiler/tf2xla/str_util.cc deleted file mode 100644 index 2b0834fe7b6c4d2199267dbe0ec1f7c2785aa9c7..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/str_util.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/str_util.h" - -#include -#include -#include - -namespace tensorflow { -namespace str_util { - -static void ReplaceAll(string* text, StringPiece from, StringPiece to) { - size_t pos = 0; - while ((pos = text->find(from.data(), pos, from.size())) != string::npos) { - text->replace(pos, from.size(), to.data(), to.size()); - pos += to.size(); - if (from.empty()) { - pos++; // Match at the beginning of the text and after every byte - } - } -} - -void ReplaceAllPairs(string* text, - const std::vector>& replace) { - for (const std::pair& from_to : replace) { - ReplaceAll(text, from_to.first, from_to.second); - } -} - -} // namespace str_util -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/str_util.h b/tensorflow/compiler/tf2xla/str_util.h deleted file mode 100644 index 51f25009d7003db0d72296619a469ecbbbb1808d..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/str_util.h +++ /dev/null @@ -1,42 +0,0 
@@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// String utilities that are esoteric enough that they don't belong in -// third_party/tensorflow/core/lib/strings/str_util.h, but are still generally -// useful under xla. - -#ifndef TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_ -#define TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_ - -#include -#include -#include - -#include "tensorflow/core/lib/core/stringpiece.h" - -namespace tensorflow { -namespace str_util { - -// Replace all non-overlapping occurrences of the given (from,to) pairs in-place -// in text. If from is empty, it matches at the beginning of the text and after -// every byte. Each (from,to) replacement pair is processed in the order it is -// given. -void ReplaceAllPairs(string* text, - const std::vector>& replace); - -} // namespace str_util -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/str_util_test.cc b/tensorflow/compiler/tf2xla/str_util_test.cc deleted file mode 100644 index 8817f6902a8e58e796ca5240a9a24d7506d38793..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/tf2xla/str_util_test.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/str_util.h" - -#include -#include -#include - -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace str_util { - -class ReplaceAllPairsTest : public ::testing::Test { - protected: - void ExpectReplaceAllPairs( - string text, const std::vector>& replace, - StringPiece want) { - ReplaceAllPairs(&text, replace); - EXPECT_EQ(text, want); - } -}; - -TEST_F(ReplaceAllPairsTest, Simple) { - ExpectReplaceAllPairs("", {}, ""); - ExpectReplaceAllPairs("", {{"", ""}}, ""); - ExpectReplaceAllPairs("", {{"", "X"}}, "X"); - ExpectReplaceAllPairs("", {{"", "XYZ"}}, "XYZ"); - ExpectReplaceAllPairs("", {{"", "XYZ"}, {"", "_"}}, "_X_Y_Z_"); - ExpectReplaceAllPairs("", {{"", "XYZ"}, {"", "_"}, {"_Y_", "a"}}, "_XaZ_"); - ExpectReplaceAllPairs("banana", {}, "banana"); - ExpectReplaceAllPairs("banana", {{"", ""}}, "banana"); - ExpectReplaceAllPairs("banana", {{"", "_"}}, "_b_a_n_a_n_a_"); - ExpectReplaceAllPairs("banana", {{"", "__"}}, "__b__a__n__a__n__a__"); - ExpectReplaceAllPairs("banana", {{"a", "a"}}, "banana"); - ExpectReplaceAllPairs("banana", {{"a", ""}}, "bnn"); - ExpectReplaceAllPairs("banana", {{"a", "X"}}, "bXnXnX"); - ExpectReplaceAllPairs("banana", {{"a", "XX"}}, "bXXnXXnXX"); - ExpectReplaceAllPairs("banana", {{"a", "XX"}, {"XnX", "z"}}, "bXzzX"); - 
ExpectReplaceAllPairs("a{{foo}}b{{bar}}c{{foo}}", - {{"{{foo}}", "0"}, {"{{bar}}", "123456789"}}, - "a0b123456789c0"); -} - -} // namespace str_util -} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 3a08aa8cf4f5cea6210cc9470d57c3387445ea6e..f34af2d67debe8bfa4abcad19e42c55ea40c4e82 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -22,11 +22,13 @@ limitations under the License. #include #include +#include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -39,7 +41,6 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -196,8 +197,8 @@ Status RewriteAndPruneGraph( if (!missing_feeds.empty() || !missing_fetches.empty()) { return errors::Aborted( "Post graph-pruning", - ", missing feeds: ", str_util::Join(missing_feeds, ", "), - ", missing fetches: ", str_util::Join(missing_fetches, ", ")); + ", missing feeds: ", absl::StrJoin(missing_feeds, ", "), + ", missing fetches: ", absl::StrJoin(missing_fetches, ", ")); } return Status::OK(); } @@ -263,8 +264,7 @@ Status ConvertGraphToXla(std::unique_ptr graph, xla::Client* client, // Compile the graph into an XLA computation. 
XlaCompiler::Options compiler_options; compiler_options.client = client; - DeviceType device_type(DEVICE_CPU_XLA_JIT); - compiler_options.device_type = &device_type; + compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); compiler_options.flib_def = &graph->flib_def(); compiler_options.graph_def_version = graph->versions().producer(); compiler_options.allow_cpu_custom_calls = true; diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h index d02fc56c5b8f58f0e4cfe1779ad34fe3b79324c7..432a12a51622b56ae74a677420da321c58960ee6 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.h +++ b/tensorflow/compiler/tf2xla/tf2xla.h @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/framework/graph.pb.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc index 7aca889a266439538c4cd1c153460e6cc871b246..567d212b5eee493d29a1817987cbd7759575386e 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc @@ -20,11 +20,11 @@ limitations under the License. 
#include #include +#include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/framework/kernel_def.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/util/command_line_flags.h" @@ -54,10 +54,10 @@ void PrintSupportedOps(const string& device, const string& regen_run) { } std::sort(types.begin(), types.end()); constraints.push_back("`" + constraint.name() + "={" + - str_util::Join(types, ",") + "}`"); + absl::StrJoin(types, ",") + "}`"); } std::cout << "`" << kdef->op() << "` | " - << str_util::Join(constraints, "
") << std::endl; + << absl::StrJoin(constraints, "
") << std::endl; } std::cout << "\nTo regenerate this table, run:\n\n```shell\n" @@ -76,7 +76,7 @@ void SupportedOpsMain(int argc, char** argv, const char* regen_run) { {"device", &device, "Name of the compilation device for which to print supported ops, " "one of: " + - str_util::Join(device_names, ",")}, + absl::StrJoin(device_names, ",")}, }; string usage = Flags::Usage(argv[0], flag_list); bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 84c133ffabe20dbdaa4d5a64e035efb5e4c4c44b..56f7045a98201ed398244f9e3f5ff23788135b75 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -18,6 +18,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/framework/attr_value.pb.h" @@ -73,8 +75,8 @@ TEST(ConvertGraphDefToXla, Sum) { TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); // Set up arguments. 
- auto x_literal = xla::Literal::CreateR0(10); - auto y_literal = xla::Literal::CreateR0(32); + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); auto x_global_or = client->TransferToServer(*x_literal); auto y_global_or = client->TransferToServer(*y_literal); TF_EXPECT_OK(x_global_or.status()); diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 9203e8d9e607e99ad738350a1c3f2b9e900df179..e284e0b191ac09f9491973166c80b731c8ea51a5 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -16,9 +16,11 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include +#include #include #include +#include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -31,7 +33,6 @@ limitations under the License. #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { @@ -232,7 +233,7 @@ Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, // Push input nodes of the currently visited node to name_queue. for (const string& in_edge : map_entry.second->input()) { auto id = ParseTensorName(in_edge); - const string node_name = std::string(id.first); + const string node_name = string(id.first); if (feed_tensors.find(std::make_pair(node_name, id.second)) == feed_tensors.end()) { name_queue.push(node_name); @@ -267,7 +268,7 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) { if (edge->IsControlEdge()) continue; const Node* possible_match = out_edges ? 
edge->dst() : edge->src(); TF_ASSIGN_OR_RETURN( - tensorflow::gtl::optional sharding, + absl::optional sharding, ParseShardingFromDevice( *possible_match, /*num_cores_per_replica=*/std::numeric_limits::max())); @@ -297,4 +298,29 @@ void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype, } } +namespace { +uint32 InitialRandomSeed() { + // Support for plumbing the TF seed through to XLA is being worked on. + // If a user wants deterministic behavior, their best option + // is to start with a known checkpoint. This also handles issues when + // multiple random calls can be invoked in any order by TF executor. + // Another option is to use stateless random ops. They have much cleaner + // semantics. + // If a user really wants to set a deterministic seed for XLA-based + // devices, this is the place to do it. + std::random_device rd; + // Make the starting value odd. + return rd() | 1; +} +} // namespace + +uint32 GetXLARandomSeed() { + // We initialize counter with an odd number and increment it by two + // every time. This ensures that it will never be zero, even + // after an overflow. When seeded with zero, some XLA backends + // can return all zeros instead of random numbers. + static std::atomic counter(InitialRandomSeed()); + return counter.fetch_add(2); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h index 745beb39c1d917cd0d1cd219536ee26a96253ec9..33620ef810bd4fe897f384474e661e341a448b93 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.h +++ b/tensorflow/compiler/tf2xla/tf2xla_util.h @@ -56,6 +56,9 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges); void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype, KernelDef* kdef); +// Returns the next random seed to use for seeding xla rng. 
+uint32 GetXLARandomSeed(); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc index ae51446204baf14dc03fc6305641048dbf3872b0..2b1f724dc7b2e2bb6d06115827f92bf0670955b3 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "absl/strings/match.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/data_flow_ops.h" #include "tensorflow/cc/ops/function_ops.h" @@ -25,16 +26,15 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -void ExpectErrorContains(const Status& status, StringPiece str) { +void ExpectErrorContains(const Status& status, absl::string_view str) { EXPECT_NE(Status::OK(), status); - EXPECT_TRUE(str_util::StrContains(status.error_message(), str)) + EXPECT_TRUE(absl::StrContains(status.error_message(), str)) << "expected error: " << status.error_message() << " to contain: " << str; } diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index fe7ec633eca2504faf6cbb2f5fd7f59780ab7976..d98237bd5c9288e6337e10c19c2d7574ad2e4c97 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/platform/mem.h" @@ -103,7 +103,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel, auto sharding_parse_result = ParseShardingFromDevice( op_kernel->def(), std::numeric_limits::max()); OP_REQUIRES_OK(context, sharding_parse_result.status()); - tensorflow::gtl::optional op_sharding = + absl::optional op_sharding = sharding_parse_result.ValueOrDie(); // If no sharding metadata is found, XLA is free to use whatever device it diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h index d0b9e34e162f3412cd6662a2e2bbfe3df213c4c2..a6e78825334fec748be5fee80669649df699d2fb 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.h +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h @@ -19,7 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/xla_resource.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 672e19bd93449ccc31f4af5ded23257b197a3c39..1f0f240135dfcd0c540cc39a42514c67ce979ee0 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -16,45 +16,47 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include -#include "tensorflow/compiler/aot/runtime.h" namespace tensorflow { XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, AllocMode alloc_mode) - : raw_function_(static_data.raw_function), - result_index_(static_data.result_index), - args_(new void*[static_data.num_args]), - temps_(new void*[static_data.num_temps]), - arg_names_(static_data.arg_names), - result_names_(static_data.result_names), - program_shape_(static_data.program_shape), - hlo_profile_printer_data_(static_data.hlo_profile_printer_data) { + : raw_function_(static_data.raw_function_), + result_index_(static_data.result_index_), + buffer_table_(new void*[static_data.num_buffers_]), + buffer_infos_(static_data.buffer_infos_), + arg_index_table_(static_data.arg_index_table_), + num_args_(static_data.num_args_), + arg_names_(static_data.arg_names_), + result_names_(static_data.result_names_), + program_shape_(static_data.program_shape_), + hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) { + bool allocate_entry_params = + alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS; // Allocate arg and temp buffers. - if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) { - alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers( - static_data.arg_sizes, static_data.num_args, args_, - /*annotate_initialized=*/false); - } - alloc_temps_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers( - static_data.temp_sizes, static_data.num_temps, temps_, + alloc_buffer_table_ = cpu_function_runtime::MallocContiguousBuffers( + static_data.buffer_infos_, static_data.num_buffers_, + /*allocate_entry_params=*/allocate_entry_params, buffer_table_, /*annotate_initialized=*/true); - // If Hlo profiling is enabled the generated code expects an appropriately // sized buffer to be passed in as the last argument. 
If Hlo profiling is // disabled the last function argument is still present in the function // signature, but it is ignored by the generated code and we pass in null for // it. if (hlo_profiling_enabled()) { - profile_counters_ = new int64[static_data.profile_counters_size](); + profile_counters_ = new int64[static_data.profile_counters_size_](); } } +bool XlaCompiledCpuFunction::Run() { + raw_function_(buffer_table_[result_index_], &run_options_, nullptr, + buffer_table_, profile_counters_); + return true; +} + XlaCompiledCpuFunction::~XlaCompiledCpuFunction() { - tensorflow::tfcompile::runtime::FreeContiguous(alloc_args_); - tensorflow::tfcompile::runtime::FreeContiguous(alloc_temps_); - delete[] args_; - delete[] temps_; + cpu_function_runtime::FreeContiguous(alloc_buffer_table_); + delete[] buffer_table_; delete[] profile_counters_; } diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 48a8c083cacf2f6ecf9dc1817b6174c01385d035..425e769346ffcbc548495d93cb7adc779f860110 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/core/platform/types.h" @@ -56,36 +57,85 @@ class XlaCompiledCpuFunction { // StaticData represents the state necessary to run an XLA-compiled // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for // AOT this is backed by data compiled into the object file. - struct StaticData { + // + // The contents of StaticData are XLA-internal implementation details and + // should not be relied on by clients. 
+ // + // TODO(sanjoy): Come up with a cleaner way to express the constraint we want + // here: generated XlaCompiledCpuFunction subclasses should be able to create + // instances of StaticData but only XlaCompiledCpuFunction should be able to + // read from StaticData instances. + class StaticData { + public: + void set_raw_function(RawFunction raw_function) { + raw_function_ = raw_function; + } + void set_buffer_infos( + const cpu_function_runtime::BufferInfo* buffer_infos) { + buffer_infos_ = buffer_infos; + } + void set_num_buffers(size_t num_buffers) { num_buffers_ = num_buffers; } + void set_arg_index_table(const int32* arg_index_table) { + arg_index_table_ = arg_index_table; + } + void set_num_args(int64 num_args) { num_args_ = num_args; } + void set_result_index(size_t result_index) { result_index_ = result_index; } + void set_arg_names(const char** arg_names) { arg_names_ = arg_names; } + void set_result_names(const char** result_names) { + result_names_ = result_names; + } + void set_program_shape(const xla::ProgramShape* program_shape) { + program_shape_ = program_shape; + } + const xla::HloProfilePrinterData* hlo_profile_printer_data() const { + return hlo_profile_printer_data_; + } + void set_hlo_profile_printer_data( + const xla::HloProfilePrinterData* hlo_profile_printer_data) { + hlo_profile_printer_data_ = hlo_profile_printer_data; + } + void set_profile_counters_size(int64 profile_counters_size) { + profile_counters_size_ = profile_counters_size; + } + + private: // The raw function to call. - RawFunction raw_function; + RawFunction raw_function_; + + // Contains information about the buffers used by the XLA computation. + const cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; + size_t num_buffers_ = 0; + + // Entry parameter i is described by + // buffer_infos[arg_index_table[i]]. + const int32* arg_index_table_ = nullptr; - // Cardinality and sizes of arg and temp buffers.
- const intptr_t* arg_sizes = nullptr; - size_t num_args = 0; - const intptr_t* temp_sizes = nullptr; - size_t num_temps = 0; + // There are num_args entry parameters. + int64 num_args_ = 0; // The 0-based index of the result tuple, in the temp buffers. - size_t result_index = 0; + size_t result_index_ = 0; // [Optional] Arrays of arg and result names. These are arrays of C-style // strings, where the array is terminated by nullptr. - const char** arg_names = nullptr; - const char** result_names = nullptr; + const char** arg_names_ = nullptr; + const char** result_names_ = nullptr; // [Optional] Arg and result shapes. - const xla::ProgramShape* program_shape = nullptr; + const xla::ProgramShape* program_shape_ = nullptr; // [Optional] Profile printer data. Null if profiling is disabled. - const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr; + const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; // [Optional] The number of profile counters expected in the profile counter // buffer by the generated code and hlo_profile_printer. 0 if profiling is // disabled. This information is already present in // hlo_profile_printer_data but xla::HloProfilePrinterData is forward // declared so we don't have access to that information here. - int64 profile_counters_size = 0; + int64 profile_counters_size_ = 0; + + // Only XlaCompiledCpuFunction is allowed to read the above fields. + friend class XlaCompiledCpuFunction; }; // AllocMode controls the buffer allocation mode. @@ -113,11 +163,7 @@ class XlaCompiledCpuFunction { // Runs the computation, with inputs read from arg buffers, and outputs // written to result buffers. Returns true on success and false on failure. - bool Run() { - raw_function_(temps_[result_index_], &run_options_, - const_cast(args_), temps_, profile_counters_); - return true; - } + bool Run(); // Returns the error message from the previous failed Run call. 
// @@ -129,14 +175,25 @@ class XlaCompiledCpuFunction { // ------------------------------ // Arg methods for managing input buffers. Buffers are in row-major order. - // Returns the underlying array of argument buffers, where args()[I] is the - // buffer for the positional argument at index I. - void** args() { return args_; } - const void* const* args() const { return args_; } - // Returns the buffer for the positional argument at the given `index`. - void* arg_data(size_t index) { return args_[index]; } - const void* arg_data(size_t index) const { return args_[index]; } + void* arg_data(size_t index) { + return buffer_table_[arg_index_table_[index]]; + } + const void* arg_data(size_t index) const { + return buffer_table_[arg_index_table_[index]]; + } + + int num_args() const { return num_args_; } + + // Returns the size of entry parameter `idx`. + // + // There is a static version of this method on tfcompile generated subclasses + // of XlaCompiledCpuFunction, but try to prefer this when possible since it + // works both for XlaJitCompiledCpuFunction and AOT compiled subclasses. + int arg_size(int idx) const { + assert(idx < num_args()); + return buffer_infos_[arg_index_table_[idx]].size(); + } // Sets the buffer for the positional argument at the given `index` to `data`. // Must be called before Run to have an effect. May be called under any @@ -149,7 +206,9 @@ class XlaCompiledCpuFunction { // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. - void set_arg_data(size_t index, void* data) { args_[index] = data; } + void set_arg_data(size_t index, void* data) { + buffer_table_[arg_index_table_[index]] = data; + } // ------------------------------ // Result methods for managing output buffers. Buffers are in row-major order. @@ -159,9 +218,9 @@ class XlaCompiledCpuFunction { // Returns the underlying array of result buffers, where results()[I] is the // buffer for the positional result at index I. 
- void** results() { return static_cast(temps_[result_index_]); } + void** results() { return static_cast(buffer_table_[result_index_]); } const void* const* results() const { - return static_cast(temps_[result_index_]); + return static_cast(buffer_table_[result_index_]); } // Profile counters for this XLA computation. @@ -219,14 +278,28 @@ class XlaCompiledCpuFunction { const RawFunction raw_function_; const size_t result_index_; - // Arrays of argument and temp buffers; entries in args_ may be overwritten by - // the user. - void** args_ = nullptr; - void** temps_ = nullptr; + // Array containing pointers to argument and temp buffers (slots corresponding + // to constant and on-stack buffers are null). + void** const buffer_table_; - // Backing memory for individual arg and temp buffers. - void* alloc_args_ = nullptr; - void* alloc_temps_ = nullptr; + // Describes the buffers used by the XLA computation. + const cpu_function_runtime::BufferInfo* const buffer_infos_; + + // Argument i needs to be placed in buffer_table_[arg_index_table_[i]] + // for XLA generated code to be able to find it. + // + // For now we need to keep around the args_ array because there is code that + // depends on args() returning a void**. However, in the future we may remove + // args_ in favor of using buffer_table_ as the sole storage for the + // arguments. + const int32* const arg_index_table_; + + // The number of incoming arguments. + const int32 num_args_; + + // Backing memory for buffer_table_ and args_, the latter depending on + // AllocMode. + void* alloc_buffer_table_ = nullptr; // Backing memory for profiling counters.
int64* profile_counters_ = nullptr; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index f7098917b191058c53a1d6a5923e80e5e8319d72..0c300c282e9698534af6372b2f2ddae06f88db24 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -28,11 +29,14 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" @@ -83,12 +87,9 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) : options_(options), initialization_status_(Status::OK()), next_step_id_(1), - device_( - new XlaCompilationDevice(SessionOptions(), *options_.device_type)), + device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)), device_mgr_({device_}) { - // We no longer need the device_type. 
- options_.device_type = nullptr; - + CHECK(!options_.device_type.type_string().empty()); if (options_.populate_resource_manager) { initialization_status_ = (*options_.populate_resource_manager)(device_->resource_manager()); @@ -228,15 +229,18 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, // Computes the XLA shape for argument 'arg'. Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, bool is_entry_computation, - xla::Shape* xla_shape) { + xla::Shape* xla_shape) const { switch (arg.kind) { case XlaCompiler::Argument::kConstant: LOG(FATAL) << "Unreachable case"; case XlaCompiler::Argument::kParameter: { - TensorShape shape = - is_entry_computation - ? options_.shape_representation_fn(arg.shape, arg.type) - : arg.shape; + TensorShape shape; + if (is_entry_computation) { + TF_ASSIGN_OR_RETURN( + shape, options_.shape_representation_fn(arg.shape, arg.type)); + } else { + shape = arg.shape; + } return TensorShapeToXLAShape(arg.type, shape, xla_shape); } case XlaCompiler::Argument::kResource: { @@ -244,8 +248,9 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, switch (arg.resource_kind) { case XlaResource::kVariable: { - TensorShape representation_shape = - options_.shape_representation_fn(arg.shape, arg.type); + TF_ASSIGN_OR_RETURN( + TensorShape representation_shape, + options_.shape_representation_fn(arg.shape, arg.type)); return TensorShapeToXLAShape(arg.type, representation_shape, xla_shape); } @@ -306,7 +311,7 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, // unique_ptr so we can capture the cleanup status in the end. 
xla_context->Ref(); Status status; - auto step_container = xla::MakeUnique( + auto step_container = absl::make_unique( step_id, [&status, device](const string& name) { status = device->resource_manager()->Cleanup(name); }); @@ -341,9 +346,9 @@ Status BuildComputation( const std::vector& arg_cores, const std::vector& retvals, const std::vector>& resources, - bool return_updated_values_for_all_resources, xla::XlaBuilder* builder, - xla::XlaComputation* computation, int* num_computation_outputs, - int* num_nonconst_outputs, + bool return_updated_values_for_all_resources, bool always_return_tuple, + xla::XlaBuilder* builder, xla::XlaComputation* computation, + int* num_computation_outputs, int* num_nonconst_outputs, std::vector* outputs, std::vector* resource_updates) { std::vector elems; @@ -356,6 +361,9 @@ Status BuildComputation( if (retval.has_constant_value()) { output.is_constant = true; output.constant_value = retval.constant_value(); + } else if (retval.resource() != nullptr) { + output.is_constant = false; + output.input_index = retval.resource()->arg_num(); } else { output.is_constant = false; elems.push_back(retval.handle()); @@ -387,13 +395,14 @@ Status BuildComputation( DCHECK_LT(resource->arg_num(), arg_cores.size()); const XlaCompiler::Argument& arg = args[resource->arg_num()]; const int core = arg_cores[resource->arg_num()]; - bool modified = resource->value() != resource->initial_value(); + bool modified = !resource->value().IsIdenticalTo(resource->initial_value()); // TensorArray gradients were modified if their values changed or there are // any newly created gradients.
for (const auto& grad : resource->tensor_array_gradients()) { - modified = modified || - grad.second->value() != grad.second->initial_value() || - arg.tensor_array_gradients.count(grad.first) == 0; + modified = + modified || + !grad.second->value().IsIdenticalTo(grad.second->initial_value()) || + arg.tensor_array_gradients.count(grad.first) == 0; } if (return_updated_values_for_all_resources || modified) { resource_updates->emplace_back(); @@ -408,7 +417,7 @@ Status BuildComputation( // Request that the value be returned on a specific core. xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? tensorflow::gtl::optional() + builder, core == -1 ? absl::optional() : xla::sharding_builder::AssignDevice(core)); xla::XlaOp handle; @@ -418,16 +427,20 @@ Status BuildComputation( // create a tuple/get-tuple-element combination so that sharding // assignment will be placed on this value, which will cause the resource // update to be returned from the same device that provided the resource. - handle = builder->GetTupleElement(builder->Tuple({handle}), 0); - + handle = xla::GetTupleElement(xla::Tuple(builder, {handle}), 0); elems.push_back(handle); } } *num_computation_outputs = elems.size(); - // Builds the XLA computation. - builder->Tuple(elems); + // Builds the XLA computation. We *always* form a tuple here to ensure that + // the output value is the last thing added into the XLA computation, even + // if there is only one output value. + auto tuple = xla::Tuple(builder, elems); + if (!always_return_tuple && elems.size() == 1) { + xla::GetTupleElement(tuple, 0); + } builder->ClearOpMetadata(); xla::StatusOr computation_status = builder->Build(); @@ -455,8 +468,6 @@ Status XlaCompiler::BuildArguments( // XLA computation as runtime parameters. input_mapping->clear(); input_mapping->reserve(args.size()); - std::vector resources; - resources.reserve(args.size()); // Fills in constant arguments, and computes non-constant argument order. 
for (std::vector::size_type i = 0; i < args.size(); @@ -475,8 +486,9 @@ Status XlaCompiler::BuildArguments( /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); arg_expression.set_resource(resource); if (arg.initialized) { - resources.push_back(i); + input_mapping->push_back(i); } + break; case XlaCompiler::Argument::kParameter: { input_mapping->push_back(i); @@ -486,14 +498,11 @@ Status XlaCompiler::BuildArguments( arg_expression.set_constant_value(arg.constant_value); break; case XlaCompiler::Argument::kInvalid: - return errors::Internal("Unreachable case in BuildArguments()"); + return errors::Internal( + "Unreachable case in BuildArguments() while filling constant args"); } } - // Append parameters containing variable values after the other runtime - // parameters. - input_mapping->insert(input_mapping->end(), resources.begin(), - resources.end()); if (input_mapping->empty()) { return Status::OK(); } @@ -554,25 +563,25 @@ Status XlaCompiler::BuildArguments( } xla::XlaScopedShardingAssignment assign_tuple_sharding(builder, tuple_sharding); - tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); + tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } else { - tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); + tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { const int core = (*arg_cores)[input_mapping->at(i)]; xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? tensorflow::gtl::optional() + builder, core == -1 ? absl::optional() : xla::sharding_builder::AssignDevice(core)); - arg_handles[i] = builder->GetTupleElement(tuple, i); + arg_handles[i] = xla::GetTupleElement(tuple, i); } } else { for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { const int core = (*arg_cores)[input_mapping->at(i)]; xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? 
tensorflow::gtl::optional() + builder, core == -1 ? absl::optional() : xla::sharding_builder::AssignDevice(core)); - arg_handles[i] = - builder->Parameter(i, (*input_shapes)[i], strings::StrCat("arg", i)); + arg_handles[i] = xla::Parameter(builder, i, (*input_shapes)[i], + strings::StrCat("arg", i)); } } @@ -603,14 +612,15 @@ Status XlaCompiler::BuildArguments( // return values of functions, and then reshape unconditionally. if (is_entry_computation) { arg_expression.set_handle( - builder->Reshape(arg_handles[i], arg.shape.dim_sizes())); + xla::Reshape(arg_handles[i], arg.shape.dim_sizes())); } else { arg_expression.set_handle(arg_handles[i]); } break; case XlaCompiler::Argument::kConstant: case XlaCompiler::Argument::kInvalid: - return errors::Internal("Unreachable case in BuildArguments()"); + return errors::Internal( + "Unreachable case in BuildArguments() while filling handles"); } } @@ -655,10 +665,65 @@ Status XlaCompiler::CompileSingleOp( .Finalize(graph.get(), &node); TF_RETURN_IF_ERROR(status); } + FixupSourceAndSinkEdges(graph.get()); return CompileGraph(options, name, std::move(graph), args, result); } +namespace { + +// Check that the ops of all non-functional nodes have been registered. +Status ValidateFunctionDef(const FunctionDef* fdef, + const FunctionLibraryDefinition& flib_def) { + for (const NodeDef& node : fdef->node_def()) { + const string& op = node.op(); + if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) { + continue; + } + const OpDef* op_def; + TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(op, &op_def)); + } + return Status::OK(); +} + +// Check that the graph doesn't have any invalid nodes (e.g. incompatible with +// given device_type, invalid data type, missing attributes...) 
+Status ValidateGraph(const Graph* graph, + const FunctionLibraryDefinition& flib_def, + const DeviceType& device_type, const string& name) { + auto maybe_error = [&](const Node* node, const Status& s) -> Status { + if (!s.ok()) { + return errors::InvalidArgument(strings::StrCat( + "Detected unsupported operations when trying to compile graph ", name, + " on ", device_type.type_string(), ": ", node->def().op(), " (", + s.error_message(), ")", FormatNodeForError(*node))); + } + return Status::OK(); + }; + + for (const Node* node : graph->nodes()) { + if (node->type_string() == FunctionLibraryDefinition::kGradientOp) { + continue; + } + const FunctionDef* fdef = flib_def.Find(node->def().op()); + Status s; + if (fdef) { + s = ValidateFunctionDef(fdef, flib_def); + TF_RETURN_IF_ERROR(maybe_error(node, s)); + continue; + } + const OpDef* op_def; + s = OpRegistry::Global()->LookUpOpDef(node->def().op(), &op_def); + TF_RETURN_IF_ERROR(maybe_error(node, s)); + TF_RETURN_IF_ERROR(ValidateNodeDef(node->def(), *op_def)); + s = FindKernelDef(device_type, node->def(), nullptr, nullptr); + TF_RETURN_IF_ERROR(maybe_error(node, s)); + } + return Status::OK(); +} + +} // namespace + Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, @@ -681,6 +746,11 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, FunctionalizeControlFlow(flib_runtime_->GetFunctionLibraryDefinition(), graph.get(), local_flib_def_.get())); + // Detect invalid nodes. + // FunctionalizeControlFlow may remove some nodes from the graph. 
+ TF_RETURN_IF_ERROR(ValidateGraph(graph.get(), *options_.flib_def, + options_.device_type, name)); + xla::XlaBuilder builder(name); XlaContext* context = new XlaContext( this, &builder, options_.allow_cpu_custom_calls, @@ -705,9 +775,10 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, result->outputs.resize(context->retvals().size()); TF_RETURN_IF_ERROR(BuildComputation( args, arg_cores, context->retvals(), context->resources(), - options.return_updated_values_for_all_resources, &builder, - result->computation.get(), &num_computation_outputs, - &num_nonconst_outputs, &result->outputs, &result->resource_updates)); + options.return_updated_values_for_all_resources, + options.always_return_tuple, &builder, result->computation.get(), + &num_computation_outputs, &num_nonconst_outputs, &result->outputs, + &result->resource_updates)); VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; @@ -721,14 +792,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, VLOG(2) << "XLA output shape: " << xla::ShapeUtil::HumanString(result->xla_output_shape); - // Copy the host transfer metadata to the result. - for (const auto& send : host_compute_sends_) { - *result->host_compute_metadata.add_device_to_host() = send.second; - } - for (const auto& recv : host_compute_recvs_) { - *result->host_compute_metadata.add_host_to_device() = recv.second; - } - // Tensorflow expects a major-to-minor order of results. 
xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape); @@ -746,10 +809,34 @@ Status XlaCompiler::GetChannelHandle(const string& key, return Status::OK(); } +Status XlaCompiler::GetHostToDeviceChannelHandle(const string& key, + xla::ChannelHandle* channel) { + auto result = channels_.emplace(key, xla::ChannelHandle()); + if (result.second) { + TF_ASSIGN_OR_RETURN(result.first->second, + client()->CreateHostToDeviceChannelHandle()); + } + *channel = result.first->second; + VLOG(1) << "Host to device channel: " << key << " " << channel->DebugString(); + return Status::OK(); +} + +Status XlaCompiler::GetDeviceToHostChannelHandle(const string& key, + xla::ChannelHandle* channel) { + auto result = channels_.emplace(key, xla::ChannelHandle()); + if (result.second) { + TF_ASSIGN_OR_RETURN(result.first->second, + client()->CreateDeviceToHostChannelHandle()); + } + *channel = result.first->second; + VLOG(1) << "Device to host channel: " << key << " " << channel->DebugString(); + return Status::OK(); +} + namespace { -void SetTransfer(const string& key, gtl::ArraySlice types, - gtl::ArraySlice shapes, +void SetTransfer(const string& key, absl::Span types, + absl::Span shapes, tf2xla::HostTransferMetadata* transfer) { transfer->set_key(key); CHECK(types.size() == shapes.size()); @@ -763,8 +850,8 @@ void SetTransfer(const string& key, gtl::ArraySlice types, } // namespace Status XlaCompiler::SetDeviceToHostMetadata( - const string& key, gtl::ArraySlice types, - gtl::ArraySlice shapes) { + const string& key, absl::Span types, + absl::Span shapes) { if (host_compute_sends_.find(key) != host_compute_sends_.end()) { return errors::InvalidArgument( "Duplicate calls to SetDeviceToHostMetadata with key ", key); @@ -790,8 +877,8 @@ Status XlaCompiler::GetDeviceToHostShapes( } Status XlaCompiler::SetHostToDeviceMetadata( - const string& key, gtl::ArraySlice types, - gtl::ArraySlice shapes) { + const string& key, absl::Span types, + absl::Span shapes) { if 
(host_compute_recvs_.find(key) != host_compute_recvs_.end()) { return errors::InvalidArgument( "Duplicate calls to SetHostToDeviceMetadata with key ", key); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index bf496bd8bc81e67056eba380288bca88737cc00d..8f4a9858ed63403b9d0f967b61d3f690f12df21a 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -18,7 +18,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/function.h" @@ -51,13 +54,7 @@ class XlaContext; // (kind kResource). // // Only kParameter and initialized kResource arguments become runtime parameters -// to the generated XLA computation. The XLA computation will have run-time -// parameters in the following order: -// +---------------------+-----------------------------------------+ -// | kParameter values | Initial values of kResource arguments | -// +---------------------+-----------------------------------------+ -// Within each block, the arguments are arranged by the _Arg index from which -// they were derived. +// to the generated XLA computation. // // The run-time outputs of the XLA computation are arranged in the following // order: @@ -76,10 +73,10 @@ class XlaContext; // tensors with a different shape to their representation inside the XLA // computation. // -// In both inputs and outputs, kResource values are placed the end. When +// In computation outputs, updated kResource values are placed at the end.
When // emitting While loop bodies, we must ensure that the loop body has -// identical input and output signatures. By moving variable values -// to the end of the argument list and using the +// identical input and output signatures. By passing variable values +// at the end of the argument list and using the // `return_updated_values_for_all_resources` option, we can ensure that the // input and output values of resources appear at the same positions. // @@ -174,6 +171,11 @@ class XlaCompiler { // computation. bool resolve_compile_time_constants = true; + // If 'always_return_tuple' is true, then the output of a computation will + // always be a tuple. Otherwise, a single-element output will not be wrapped + // in a tuple. + bool always_return_tuple = true; + // True when compiling the entry computation, false for subcomputations // (while, call, etc.) bool is_entry_computation = true; @@ -181,6 +183,8 @@ class XlaCompiler { struct OutputDescription { // Type and shape of the output. The shape is the unflattened shape. + // When `type` is DT_RESOURCE, `shape` is the shape of the resource + // variable's value. DataType type; TensorShape shape; @@ -188,6 +192,10 @@ class XlaCompiler { // 'Tensor' is in host memory. bool is_constant = false; Tensor constant_value; + + // When this output is a resource, i.e. `type == DT_RESOURCE`, this is + // the index of the input that contains the resource. Defaults to -1. + int input_index = -1; }; // Describes a variable write side effect of the computation. @@ -210,9 +218,9 @@ class XlaCompiler { struct CompilationResult { // Vector that maps from the parameters of the XLA computation to their - // original argument positions. To handle compile-time constant inputs and - // resources, the parameters to the XLA computation may be a subset of the - // original arguments, and are not necessarily in the same order.) + // original argument positions.
To handle compile-time constant inputs, the + // parameters to the XLA computation may be a subset of the original + // arguments. The relative ordering of parameters is maintained. std::vector input_mapping; // Input shapes of the computation. If we are flattening inputs, these are @@ -233,7 +241,8 @@ class XlaCompiler { tf2xla::HostComputeMetadata host_compute_metadata; // Resources whose values were updated by the computation, ordered - // by return value position. Resource updates follow the non-constant + // by return value position (which is the same as the order the resources + // were passed as arguments). Resource updates follow the non-constant // results in the outputs of XLA computation. std::vector resource_updates; @@ -241,12 +250,19 @@ class XlaCompiler { std::shared_ptr computation; }; - typedef std::function + typedef std::function(const TensorShape&, + DataType)> ShapeRepresentationFn; struct Options { - // Name of the compilation device to use. Needs to be live only during - // XlaCompiler's constructor. - const DeviceType* device_type = nullptr; + // Name of the compilation device to use. It must be set by the caller. + // The default empty value is invalid. + DeviceType device_type = DeviceType(""); + + // The device to use during compilation to execute instructions on, for + // example for auto-tuning. + // Valid values are defined by `xla::Backend::devices_ordinal_supported()`. + // -1 indicates the default device should be used. + int device_ordinal = -1; xla::Client* client = nullptr; @@ -313,7 +329,7 @@ class XlaCompiler { // See the class comment for more details about the argument passing // convention. Status XLAShapeForArgument(const Argument& arg, bool is_entry_computation, - xla::Shape* xla_shape); + xla::Shape* xla_shape) const; // Retrieves the channel handle associated with `key`. Allocates // a new channel handle if none exists. @@ -322,11 +338,21 @@ class XlaCompiler { // same XlaCompiler.
Status GetChannelHandle(const string& key, xla::ChannelHandle* channel); + // Retrieves the host-to-device channel handle associated with `key`. + // Allocates a new channel handle if none exists. + Status GetHostToDeviceChannelHandle(const string& key, + xla::ChannelHandle* channel); + + // Retrieves the device-to-host channel handle associated with `key`. + // Allocates a new channel handle if none exists. + Status GetDeviceToHostChannelHandle(const string& key, + xla::ChannelHandle* channel); + // Sets the shapes and types for the device to host transfer associated with // 'key'. Status SetDeviceToHostMetadata(const string& key, - gtl::ArraySlice types, - gtl::ArraySlice shapes); + absl::Span types, + absl::Span shapes); // Gets the shapes the device to host transfer associated with 'key'. Status GetDeviceToHostShapes(const string& key, @@ -335,8 +361,8 @@ class XlaCompiler { // Sets the shapes and types for the host to device transfer associated with // 'key'. Status SetHostToDeviceMetadata(const string& key, - gtl::ArraySlice types, - gtl::ArraySlice shapes); + absl::Span types, + absl::Span shapes); // In order to avoid deadlocks from dependencies in host computations, it can // be necessary to enforce a partial order on the execution of HostCompute diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 55772ca324872f6d5fac008de7819b7fae64966a..be3c93ae47bf16a67ed4fac34a99997cc7888559 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "absl/strings/match.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/data_flow_ops.h" #include "tensorflow/cc/ops/function_ops.h" @@ -23,7 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -34,10 +35,10 @@ limitations under the License. #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/version.h" @@ -45,8 +46,6 @@ namespace tensorflow { class XlaCompilerTest : public ::testing::Test { protected: - XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {} - void SetUp() override { client_ = xla::ClientLibrary::LocalClientOrDie(); @@ -58,7 +57,7 @@ class XlaCompilerTest : public ::testing::Test { XlaCompiler::Options DefaultOptions() { XlaCompiler::Options options; - options.device_type = &cpu_device_type_; + options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); options.client = client_; options.flib_def = flib_def_.get(); return options; @@ -68,7 +67,6 @@ class XlaCompilerTest : public ::testing::Test { return compiler->local_flib_def_.get(); } - DeviceType cpu_device_type_; xla::Client* client_; std::unique_ptr flib_def_; }; @@ -208,9 +206,9 @@ TEST_F(XlaCompilerTest, Simple) { // Tests that the generated computation works. 
std::unique_ptr param0_literal = - xla::Literal::CreateR1({7, 42}); + xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr param1_literal = - xla::Literal::CreateR1({-3, 101}); + xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = @@ -224,12 +222,112 @@ TEST_F(XlaCompilerTest, Simple) { client_->Transfer(*actual).ConsumeValueOrDie(); std::unique_ptr expected0 = - xla::Literal::CreateR1({4, 143}); + xla::LiteralUtil::CreateR1({4, 143}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({expected0.get()}); + xla::LiteralUtil::MakeTuple({expected0.get()}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } +// Tests compilation of a graph where the _Retval node is not necessarily last +// amongst the graph nodes in construction order, and always_return_tuple is +// false. Regression test for bug where the wrong value was returned. +TEST_F(XlaCompilerTest, OutOfOrderGraph) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1); + // The _Retval node is not last in construction order. + auto d = ops::_Retval(scope.WithOpName("D"), a, 0); + auto c = ops::Add(scope.WithOpName("C"), a, b); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[1].kind = XlaCompiler::Argument::kParameter; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + // Compiles the graph. 
+ XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompileOptions compile_options; + compile_options.always_return_tuple = false; + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + // Tests that the generated computation works. + std::unique_ptr param0_literal = + xla::LiteralUtil::CreateR1({7, 42}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client_ + ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client_->Transfer(*actual).ConsumeValueOrDie(); + + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*param0_literal, *actual_literal)); +} + +// Tests that the compiler doesn't reorder the parameters. +TEST_F(XlaCompilerTest, MixedOrderArguments) { + for (bool swap_order : {false, true}) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto var = + ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, swap_order ? 0 : 1); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, swap_order ? 1 : 0); + // Adds an identity op around the resource to make sure identity ops + // propagate resources correctly. + auto identity = ops::Identity(scope.WithOpName("VIdentity"), var); + auto write = ops::AssignAddVariableOp(scope, identity, a); + auto read = ops::ReadVariableOp( + scope.WithControlDependencies(std::vector{write}), var, + DT_INT32); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. 
+ std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + if (swap_order) { + // Even after swapping arguments, the compiler should maintain the new + // ordering of parameters. + std::swap(args[0], args[1]); + } + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompileOptions compile_options; + compile_options.always_return_tuple = false; + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + EXPECT_THAT(result.input_mapping, ::testing::ElementsAre(0, 1)); + } +} + TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { // Builds a graph that adds reshapes a tensor, but with the shape not // statically known. @@ -259,10 +357,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { std::move(graph), args, &result); EXPECT_FALSE(status.ok()); EXPECT_TRUE( - str_util::StrContains(status.error_message(), "depends on a parameter")) + absl::StrContains(status.error_message(), "depends on a parameter")) << status.error_message(); EXPECT_TRUE( - str_util::StrContains(status.error_message(), "[[Node: C = Reshape")) + absl::StrContains(status.error_message(), "[[{{node C}} = Reshape")) << status.error_message(); } @@ -308,7 +406,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { // Tests that the generated computation works. 
std::unique_ptr param0_literal = - xla::Literal::CreateR1({7, 42}); + xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -319,9 +417,9 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { client_->Transfer(*actual).ConsumeValueOrDie(); std::unique_ptr expected0 = - xla::Literal::CreateR1({-7, -42}); + xla::LiteralUtil::CreateR1({-7, -42}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({expected0.get()}); + xla::LiteralUtil::MakeTuple({expected0.get()}); EXPECT_TRUE( xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } @@ -343,7 +441,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { // Tests that the generated computation works. std::unique_ptr param0_literal = - xla::Literal::CreateR1({7, 42}); + xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -353,11 +451,12 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { std::unique_ptr actual_literal = client_->Transfer(*actual).ConsumeValueOrDie(); - std::unique_ptr expected0 = xla::Literal::CreateR0(7); + std::unique_ptr expected0 = + xla::LiteralUtil::CreateR0(7); std::unique_ptr expected1 = - xla::Literal::CreateR1({-7, -42}); + xla::LiteralUtil::CreateR1({-7, -42}); std::unique_ptr expected = - xla::Literal::MakeTuple({expected0.get(), expected1.get()}); + xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected, *actual_literal)); } } @@ -571,11 +670,11 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { // Tests that the generated computation works. 
std::unique_ptr input_base = - xla::Literal::CreateR1({7, 42}); + xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr input_grad2 = - xla::Literal::CreateR1({-3, 101}); + xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr input = - xla::Literal::MakeTuple({input_base.get(), input_grad2.get()}); + xla::LiteralUtil::MakeTuple({input_base.get(), input_grad2.get()}); std::unique_ptr param0_data = client_->TransferToServer(*input).ConsumeValueOrDie(); @@ -585,17 +684,18 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { std::unique_ptr actual_literal = client_->Transfer(*actual).ConsumeValueOrDie(); - std::unique_ptr output_read = xla::Literal::CreateR0(42); + std::unique_ptr output_read = + xla::LiteralUtil::CreateR0(42); std::unique_ptr output_base = - xla::Literal::CreateR1({7, 42}); + xla::LiteralUtil::CreateR1({7, 42}); std::unique_ptr output_grad1 = - xla::Literal::CreateR1({0, 1}); + xla::LiteralUtil::CreateR1({0, 1}); std::unique_ptr output_grad2 = - xla::Literal::CreateR1({-3, 101}); - std::unique_ptr output_resource = xla::Literal::MakeTuple( + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr output_resource = xla::LiteralUtil::MakeTuple( {output_base.get(), output_grad1.get(), output_grad2.get()}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({output_read.get(), output_resource.get()}); + xla::LiteralUtil::MakeTuple({output_read.get(), output_resource.get()}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } @@ -675,8 +775,7 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) { compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr, /*args=*/{}, &result); EXPECT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()), - "is not defined.")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined.")) << status.error_message(); } @@ -755,21 +854,49 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) { 
ASSERT_FALSE(status.ok()); // Flib lookup failure. - EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()), - "is not defined.")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined.")) << status.error_message(); // Local flib lookup failure. - EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()), - "Attr T is not found")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Attr T is not found")) << status.error_message(); } +void RunAndCheckVariablesComputation( + xla::Client* client, const XlaCompiler::CompilationResult& result) { + std::unique_ptr param0_literal = + xla::LiteralUtil::CreateR1({7, 42}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr param0_data = + client->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client + ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client->Transfer(*actual).ConsumeValueOrDie(); + + std::unique_ptr expected0 = + xla::LiteralUtil::CreateR1({5, 144}); + std::unique_ptr expected1 = + xla::LiteralUtil::CreateR1({4, 143}); + std::unique_ptr expected_literal = + xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); +} + // Tests a simple graph that reads and writes a variable. TEST_F(XlaCompilerTest, Variables) { Scope scope = Scope::NewRootScope().ExitOnError(); auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); - auto write = ops::AssignAddVariableOp(scope, var, a); + // Adds an identity op around the resource to make sure identity ops propagate + // resources correctly. 
+ auto identity = ops::Identity(scope.WithOpName("VIdentity"), var); + auto write = ops::AssignAddVariableOp(scope, identity, a); auto read = ops::ReadVariableOp( scope.WithControlDependencies(std::vector{write}), var, DT_INT32); @@ -792,36 +919,90 @@ TEST_F(XlaCompilerTest, Variables) { // Compiles the graph. XlaCompiler compiler(DefaultOptions()); + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", + std::move(graph), args, &result)); + RunAndCheckVariablesComputation(client_, result); +} + +// Tests a simple graph that reads and writes a variable. +TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 0); + auto d = ops::_Retval(scope.WithOpName("D"), var, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kResource; + args[0].resource_kind = XlaResource::kVariable; + args[0].initialized = true; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + XlaCompiler::CompilationResult result; TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", std::move(graph), args, &result)); // Tests that the generated computation works. 
- std::unique_ptr param0_literal = - xla::Literal::CreateR1({7, 42}); std::unique_ptr param1_literal = - xla::Literal::CreateR1({-3, 101}); - std::unique_ptr param0_data = - client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + xla::LiteralUtil::CreateR1({-3, 101}); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); std::unique_ptr actual = - client_ - ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + client_->Execute(*result.computation, {param1_data.get()}) .ConsumeValueOrDie(); std::unique_ptr actual_literal = client_->Transfer(*actual).ConsumeValueOrDie(); - std::unique_ptr expected0 = - xla::Literal::CreateR1({5, 144}); - std::unique_ptr expected1 = - xla::Literal::CreateR1({4, 143}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({expected0.get(), expected1.get()}); + xla::LiteralUtil::MakeTuple({}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } +TEST_F(XlaCompilerTest, ReturnResourceHandle) { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); + // Adds an identity op around the resource to make sure identity ops propagate + // resources correctly. + auto identity = ops::Identity(scope.WithOpName("VIdentity"), var); + auto write = ops::AssignAddVariableOp(scope, identity, a); + auto read = ops::ReadVariableOp( + scope.WithControlDependencies(std::vector{write}), var, + DT_INT32); + auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); + auto r = ops::_Retval(scope.WithOpName("R"), var, 0); + auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 1); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. 
+ std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + + // Compiles the graph. + XlaCompiler compiler(DefaultOptions()); + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", + std::move(graph), args, &result)); + RunAndCheckVariablesComputation(client_, result); +} + xla::StatusOr> BuildTestGraph() { Scope scope = Scope::NewRootScope().ExitOnError(); auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); @@ -886,9 +1067,9 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Tests that the generated computation works. std::unique_ptr param0_literal = - xla::Literal::CreateR2({{4, 55}, {1, -3}}); + xla::LiteralUtil::CreateR2({{4, 55}, {1, -3}}); std::unique_ptr param1_literal = - xla::Literal::CreateR1({22, 11, 33, 404}); + xla::LiteralUtil::CreateR1({22, 11, 33, 404}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = @@ -902,11 +1083,11 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { client_->Transfer(*actual).ConsumeValueOrDie(); std::unique_ptr expected0 = - xla::Literal::CreateR2({{27, 67}, {35, 402}}); + xla::LiteralUtil::CreateR2({{27, 67}, {35, 402}}); std::unique_ptr expected1 = - xla::Literal::CreateR1({26, 66, 34, 401}); + xla::LiteralUtil::CreateR1({26, 66, 34, 401}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({expected0.get(), expected1.get()}); + xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } @@ -955,9 +1136,9 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { // Tests that the 
generated computation works. std::unique_ptr param0_literal = - xla::Literal::CreateR1({4, 55, 1, -3}); + xla::LiteralUtil::CreateR1({4, 55, 1, -3}); std::unique_ptr param1_literal = - xla::Literal::CreateR1({22, 11, 33, 404}); + xla::LiteralUtil::CreateR1({22, 11, 33, 404}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = @@ -971,13 +1152,127 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { client_->Transfer(*actual).ConsumeValueOrDie(); std::unique_ptr expected0 = - xla::Literal::CreateR1({27, 67, 35, 402}); + xla::LiteralUtil::CreateR1({27, 67, 35, 402}); std::unique_ptr expected1 = - xla::Literal::CreateR1({26, 66, 34, 401}); + xla::LiteralUtil::CreateR1({26, 66, 34, 401}); std::unique_ptr expected_literal = - xla::Literal::MakeTuple({expected0.get(), expected1.get()}); + xla::LiteralUtil::MakeTuple({expected0.get(), expected1.get()}); EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } +// Tests a graph which has a function with an invalid op. 
+TEST_F(XlaCompilerTest, FunctionWithInvalidOp) { + XlaCompiler compiler(DefaultOptions()); + + FunctionDefLibrary flib; + FunctionDef fn = FillFn(); + NodeDef* node = fn.add_node_def(); + node->set_name("Invalid"); + node->set_op("InvalidOp"); /* unsupported op */ + node = fn.add_node_def(); + node->set_name("Switch"); + node->set_op("Switch"); /* control flow node */ + *flib.add_function() = fn; + + TF_ASSERT_OK(flib_def_->AddFunctionDef(fn)); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + Scope scope = Scope::NewRootScope().ExitOnError(); + auto value = ops::Const(scope.WithOpName("value"), 1, {}); + auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); + TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib)); + + NodeDef def; + TF_ASSERT_OK(NodeDefBuilder("fill_fn", "FillFn", flib_def_.get()) + .Input(value.name(), 0, DT_INT32) + .Input(shape.name(), 1, DT_INT32) + .Finalize(&def)); + Status status; + Node* fill = scope.graph()->AddNode(def, &status); + TF_ASSERT_OK(status); + TF_ASSERT_OK(scope.DoShapeInference(fill)); + scope.graph()->AddEdge(value.node(), 0, fill, 0); + scope.graph()->AddEdge(shape.node(), 0, fill, 1); + + auto retval = ops::_Retval(scope.WithOpName("retval"), Output(fill), 0); + + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + std::vector args; + XlaCompiler::CompilationResult result; + status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill", + std::move(graph), args, &result); + ASSERT_FALSE(status.ok()); + EXPECT_TRUE(absl::StrContains(status.error_message(), "InvalidOp")) + << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node fill_fn}}")) + << status.error_message(); +} + +// Tests a graph which has a node with invalid data type. 
+TEST_F(XlaCompilerTest, NodeWithInvalidDataType) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + NodeDef shape; + shape.set_name("Shape"); + shape.set_op("Shape"); + (*shape.mutable_attr())["T"].set_type(DT_INT32); + (*shape.mutable_attr())["out_type"].set_type(DT_BOOL); /* invalid type */ + Status status; + Node* shape_node = graph->AddNode(shape, &status); + TF_ASSERT_OK(status); + graph->AddControlEdge(graph->source_node(), shape_node); + + std::vector args; + XlaCompiler::CompilationResult result; + XlaCompiler compiler(DefaultOptions()); + status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type", + std::move(graph), args, &result); + ASSERT_FALSE(status.ok()); + EXPECT_TRUE(absl::StrContains(status.error_message(), + "is not in the list of allowed values")) + << status.error_message(); + EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node Shape}}")) + << status.error_message(); +} + +TEST_F(XlaCompilerTest, SingleOpWithoutInputs) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + NodeDef no_op; + no_op.set_name("NoOp"); + no_op.set_op("NoOp"); + Status status; + graph->AddNode(no_op, &status); + TF_ASSERT_OK(status); + + std::vector args; + XlaCompiler compiler(DefaultOptions()); + // No control edge linking NoOp with source/sink. + { + std::unique_ptr graph_copy(new Graph(OpRegistry::Global())); + CopyGraph(*graph, graph_copy.get()); + XlaCompiler::CompilationResult result; + status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp", + std::move(graph_copy), args, &result); + ASSERT_FALSE(status.ok()); + EXPECT_TRUE( + absl::StrContains(status.error_message(), + "The following nodes are unreachable " + "from the source in the graph: {{node NoOp}}")) + << status.error_message(); + } + + // Fix control edges for NoOp. 
+ { + std::unique_ptr graph_copy(new Graph(OpRegistry::Global())); + CopyGraph(*graph, graph_copy.get()); + EXPECT_TRUE(FixupSourceAndSinkEdges(graph_copy.get())); + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp", + std::move(graph_copy), args, &result)); + EXPECT_EQ(0, result.resource_updates.size()); + } +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 098072d33cd4eb7f7dec0ec4196b43eca0220d4a..24a4b92b45a3f3563e435fa074fce595d6c0b263 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -19,18 +19,19 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/common_runtime/dma_helper.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -66,8 +67,8 @@ XlaContext::XlaContext( XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, bool is_entry_computation, - const std::function* - shape_representation_fn) + const std::function( + const TensorShape&, DataType)>* 
shape_representation_fn) : compiler_(compiler), builder_(builder), allow_cpu_custom_calls_(allow_cpu_custom_calls), @@ -92,7 +93,7 @@ void XlaContext::AddRetval(int retval_index, DataType type, } Status XlaContext::AddConstRetval(int retval_index, DataType dtype, - const xla::Literal& literal) { + const xla::LiteralSlice& literal) { VLOG(1) << "Adding retval index " << retval_index << " with non-data-dependent tensor to XLA computation"; if (retvals_.size() <= retval_index) { @@ -106,6 +107,19 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, return Status::OK(); } +Status XlaContext::AddResourceRetval(int retval_index, XlaResource* resource) { + VLOG(1) << "Adding retval index " << retval_index << " with resource " + << resource->name() << ":" << resource->shape().DebugString() + << " to XLA computation"; + if (retvals_.size() <= retval_index) { + retvals_.resize(retval_index + 1); + } + XlaExpression e; + e.set_resource(resource); + retvals_[retval_index] = Retval{DT_RESOURCE, resource->shape(), e}; + return Status::OK(); +} + xla::XlaBuilder* XlaContext::builder() { return builder_; } Status XlaContext::CreateResource( @@ -119,8 +133,8 @@ Status XlaContext::CreateResource( return Status::OK(); } -TensorShape XlaContext::RepresentationShape(const TensorShape& shape, - DataType type) const { +xla::StatusOr XlaContext::RepresentationShape( + const TensorShape& shape, DataType type) const { return (*shape_representation_fn_)(shape, type); } @@ -131,9 +145,11 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { xla::XlaBuilder b("max<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); - auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); - auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); - b.Max(x, y); + auto x = + xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); + auto y = + xla::Parameter(&b, 1, 
xla::ShapeUtil::MakeShape(xla_type, {}), "y"); + xla::Max(x, y); return b.Build().ConsumeValueOrDie(); }); } @@ -145,9 +161,11 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) { xla::XlaBuilder b("min<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); - auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); - auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); - b.Min(x, y); + auto x = + xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); + auto y = + xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); + xla::Min(x, y); return b.Build().ConsumeValueOrDie(); }); } @@ -159,9 +177,11 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) { xla::XlaBuilder b("add<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); - auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); - auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); - b.Add(x, y); + auto x = + xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); + auto y = + xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); + xla::Add(x, y); return b.Build().ConsumeValueOrDie(); }); } @@ -173,9 +193,11 @@ const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) { xla::XlaBuilder b("mul<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); - auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); - auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); - b.Mul(x, y); + auto x = + xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); + auto y = + xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla_type, {}), "y"); + xla::Mul(x, y); return b.Build().ConsumeValueOrDie(); }); } diff --git 
a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 341bf6ff1f37fa7cd81f41c02a941214067b1bd1..4da891634e97dd67af0ef09ef33dbc7a4d19743b 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -22,8 +22,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -47,8 +48,8 @@ class XlaContext : public ResourceBase { XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, bool is_entry_computation, - const std::function* - shape_representation_fn); + const std::function( + const TensorShape&, DataType)>* shape_representation_fn); // Virtual method defined by ResourceBase. string DebugString() override; @@ -83,7 +84,10 @@ class XlaContext : public ResourceBase { // As for Retval, but for return values that are compile-time constants. Status AddConstRetval(int retval_index, DataType dtype, - const xla::Literal& literal); + const xla::LiteralSlice& literal); + + // As for Retval, but for return values that are resource handles. + Status AddResourceRetval(int retval_index, XlaResource* resource); // Creates a resource with resource `kind` and initial value `handle`. `name` // is a descriptive name for use in error messages. 
See the `XlaResource` @@ -101,8 +105,8 @@ class XlaContext : public ResourceBase { // Returns the XLA shape to be used to represent a variable of TF `shape` // and `type`, or of an argument or return value of a top-level computation. - TensorShape RepresentationShape(const TensorShape& shape, - DataType type) const; + xla::StatusOr RepresentationShape(const TensorShape& shape, + DataType type) const; // Get an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a @@ -160,7 +164,7 @@ class XlaContext : public ResourceBase { // should be represented in XLA. Parameters/return values will be shaped // according to this function, and reshaped back to/from their declared shapes // for computations. Must be non-null. - const std::function* + const std::function(const TensorShape&, DataType)>* shape_representation_fn_; // Cache of prebuilt computations indexed by their type. diff --git a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc index ead229aaccc292d4944db0c1eaf98c82583533cd..23d04d43b358e858ad1ab2463322ce0ab93b23c2 100644 --- a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc +++ b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc @@ -31,6 +31,10 @@ bool CpuOpFilter(KernelDef* kdef) { DT_FLOAT); return true; } + // TODO(b/26783907): The CPU backend currently does not implement sort. + if (kdef->op() == "XlaSort" || kdef->op() == "TopKV2") { + return false; + } if (kdef->op() == "Const") { AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef); } diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc index 62168b648331844bfe2db1a4d5dcad895c8726f3..1398e9ee536a9675e5b703ec3fabf4a8b9d89cbf 100644 --- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc +++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc @@ -20,12 +20,6 @@ limitations under the License. 
namespace tensorflow { bool GpuOpFilter(KernelDef* kdef) { - // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to - // slow code. - if (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" || - kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") { - return false; - } if (kdef->op() == "Const") { AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef); } diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index f1594193af09c7193f03b4685d3a7d4510d654dd..9a34cd8c6ae2dc6d52a3cc69168df96f5322c6da 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -18,117 +18,90 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/lib/util.h" +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { namespace { -Status ArgMinMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, - const xla::XlaOp& input, const TensorShape& input_shape, - DataType input_type, DataType output_type, int axis, - bool is_min, xla::XlaOp* argminmax) { - xla::XlaOp init_value; - const 
xla::XlaComputation* reducer; - if (is_min) { - init_value = XlaHelpers::MaxValue(builder, input_type); - reducer = ctx->GetOrCreateMin(input_type); - } else { - init_value = XlaHelpers::MinValue(builder, input_type); - reducer = ctx->GetOrCreateMax(input_type); - } - - xla::PrimitiveType xla_output_type; - TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(output_type, &xla_output_type)); - - xla::XlaOp input_max = builder->Reduce(input, init_value, *reducer, - /*dimensions_to_reduce=*/{axis}); - std::vector broadcast_dims(input_shape.dims() - 1); - std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); - std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1); - // Compute a mask that has 1s for elements equal to the maximum. - xla::XlaOp partial_mask = builder->ConvertElementType( - builder->Eq(input, input_max, broadcast_dims), xla_output_type); - - // In order to make identity elements for a bitwise And, we: - // Left shift the 1 to the leftmost bit, yielding 0x10...0 - // Arithmetic right shift the 1 back to the rightmost bit, yielding - // 0xFF...F - int32 bits_in_type = - xla::ShapeUtil::ByteSizeOfPrimitiveType(xla_output_type) * 8 - 1; - xla::XlaOp shift_amount = - XlaHelpers::IntegerLiteral(builder, output_type, bits_in_type); - xla::XlaOp full_mask = builder->ShiftRightArithmetic( - builder->ShiftLeft(partial_mask, shift_amount), shift_amount); - - // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its - // index. - xla::XlaOp iota; - - const int64 axis_size = input_shape.dim_size(axis); - TF_RETURN_IF_ERROR(XlaHelpers::Iota(builder, output_type, axis_size, &iota)); - xla::XlaOp product = - builder->And(full_mask, iota, /*broadcast_dimensions=*/{axis}); - - // If there are multiple maximum elements, choose the one with the highest - // index. 
- xla::XlaOp output = - builder->Reduce(product, XlaHelpers::MinValue(builder, output_type), - *ctx->GetOrCreateMax(output_type), - /*dimensions_to_reduce=*/{axis}); - *argminmax = output; - return Status::OK(); +xla::XlaOp ArgMinMax(xla::XlaOp input, xla::PrimitiveType output_type, int axis, + bool is_min) { + xla::XlaBuilder* builder = input.builder(); + return builder->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input)); + xla::XlaOp init_value; + xla::XlaComputation reducer; + if (is_min) { + init_value = xla::MaxValue(builder, input_shape.element_type()); + reducer = + xla::CreateScalarMinComputation(input_shape.element_type(), builder); + } else { + init_value = xla::MinValue(builder, input_shape.element_type()); + reducer = + xla::CreateScalarMaxComputation(input_shape.element_type(), builder); + } + + xla::XlaOp input_max = xla::Reduce(input, init_value, reducer, + /*dimensions_to_reduce=*/{axis}); + std::vector broadcast_dims(xla::ShapeUtil::Rank(input_shape) - 1); + std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); + std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1); + // Compute a mask that has 1s for elements equal to the maximum. + xla::XlaOp partial_mask = xla::ConvertElementType( + xla::Eq(input, input_max, broadcast_dims), output_type); + + // In order to make identity elements for a bitwise And, we: + // Left shift the 1 to the leftmost bit, yielding 0x10...0 + // Arithmetic right shift the 1 back to the rightmost bit, yielding + // 0xFF...F + int32 bits_in_type = + xla::ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1; + xla::XlaOp shift_amount = + xla::ConstantR0WithType(builder, output_type, bits_in_type); + xla::XlaOp full_mask = xla::ShiftRightArithmetic( + xla::ShiftLeft(partial_mask, shift_amount), shift_amount); + + // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its + // index. 
+ + const int64 axis_size = xla::ShapeUtil::GetDimension(input_shape, axis); + xla::XlaOp iota = xla::Iota(builder, output_type, axis_size); + xla::XlaOp product = + xla::And(full_mask, iota, /*broadcast_dimensions=*/{axis}); + + // If there are multiple maximum elements, choose the one with the highest + // index. + return xla::Reduce(product, xla::MinValue(builder, output_type), + xla::CreateScalarMaxComputation(output_type, builder), + /*dimensions_to_reduce=*/{axis}); + }); } } // namespace -xla::XlaOp XlaHelpers::MinValue(xla::XlaBuilder* b, DataType data_type) { - xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); - return b->ConstantLiteral(xla::Literal::MinValue(type)); -} - -xla::XlaOp XlaHelpers::MaxValue(xla::XlaBuilder* b, DataType data_type) { - xla::PrimitiveType type; - TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); - return b->ConstantLiteral(xla::Literal::MaxValue(type)); -} - xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); - return b->ConstantLiteral(xla::Literal::Zero(type)); + return xla::ConstantLiteral(b, xla::LiteralUtil::Zero(type)); } xla::XlaOp XlaHelpers::One(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); - return b->ConstantLiteral(xla::Literal::One(type)); -} - -xla::XlaOp XlaHelpers::Epsilon(xla::XlaBuilder* b, DataType data_type) { - switch (data_type) { - case DT_HALF: - return b->ConstantR0( - static_cast(Eigen::NumTraits::epsilon())); - case DT_BFLOAT16: - return b->ConstantR0(bfloat16::epsilon()); - case DT_FLOAT: - return b->ConstantR0(std::numeric_limits::epsilon()); - case DT_DOUBLE: - return b->ConstantR0(std::numeric_limits::epsilon()); - default: - LOG(FATAL) << "Unsupported type in XlaHelpers::Epsilon: " - << DataTypeString(data_type); - } + return xla::ConstantLiteral(b, xla::LiteralUtil::One(type)); } 
xla::XlaOp XlaHelpers::IntegerLiteral(xla::XlaBuilder* b, DataType data_type, @@ -146,7 +119,7 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type, } /* static */ Status XlaHelpers::ReshapeLiteral( - const xla::Literal& input, gtl::ArraySlice dimensions, + const xla::Literal& input, absl::Span dimensions, xla::Literal* output) { if (xla::ShapeUtil::IsTuple(input.shape())) { return errors::InvalidArgument("ReshapeLiteral does not support tuples."); @@ -176,44 +149,14 @@ static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) { return linspace; } -Status XlaHelpers::ArgMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, - const xla::XlaOp& input, - const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, xla::XlaOp* argmax) { - return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type, - axis, /*is_min=*/false, argmax); -} - -Status XlaHelpers::ArgMin(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, - const xla::XlaOp& input, - const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, xla::XlaOp* argmin) { - return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type, - axis, /*is_min=*/true, argmin); +xla::XlaOp XlaHelpers::ArgMax(xla::XlaOp input, xla::PrimitiveType output_type, + int axis) { + return ArgMinMax(input, output_type, axis, /*is_min=*/false); } -Status XlaHelpers::Iota(xla::XlaBuilder* builder, DataType dtype, int64 size, - xla::XlaOp* iota) { - TensorShape linspace_shape({size}); - Tensor linspace; - switch (dtype) { - case DT_UINT8: - linspace = MakeLinspaceTensor(linspace_shape, size); - break; - case DT_INT32: - linspace = MakeLinspaceTensor(linspace_shape, size); - break; - case DT_INT64: - linspace = MakeLinspaceTensor(linspace_shape, size); - break; - default: - return errors::InvalidArgument("Invalid argument type ", - DataTypeString(dtype)); - } - xla::Literal linspace_literal; - 
TF_RETURN_IF_ERROR(HostTensorToLiteral(linspace, &linspace_literal)); - *iota = builder->ConstantLiteral(linspace_literal); - return Status::OK(); +xla::XlaOp XlaHelpers::ArgMin(xla::XlaOp input, xla::PrimitiveType output_type, + int axis) { + return ArgMinMax(input, output_type, axis, /*is_min=*/true); } Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis, @@ -245,25 +188,28 @@ Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis, return errors::InvalidArgument("Invalid argument type ", DataTypeString(index_type)); } - xla::Literal linspace_literal; - TF_RETURN_IF_ERROR(HostTensorToLiteral(linspace, &linspace_literal)); + + xla::BorrowingLiteral linspace_literal; + TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal)); // Broadcast the linspace constant across the indices along the new axis, // and test equality at each position. std::vector broadcast_dims(indices_shape.dims()); std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1); - xla::XlaOp one_hot_bool = builder->Eq( - indices, builder->ConstantLiteral(linspace_literal), broadcast_dims); + xla::XlaOp one_hot_bool = xla::Eq( + indices, xla::ConstantLiteral(builder, linspace_literal), broadcast_dims); // Selects the user-provided off_value and on_value values. - *one_hot = builder->Select( - one_hot_bool, builder->Broadcast(on_value, output_shape.dim_sizes()), - builder->Broadcast(off_value, output_shape.dim_sizes())); + *one_hot = xla::Select(one_hot_bool, + xla::Broadcast(on_value, output_shape.dim_sizes()), + xla::Broadcast(off_value, output_shape.dim_sizes())); return Status::OK(); } DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { + // Upcast 16 bit sum reductions to 32 bit to reduce the precision loss from + // repeated floating point additions. 
if (dtype == DT_BFLOAT16 || dtype == DT_HALF) { return DT_FLOAT; } @@ -275,7 +221,7 @@ xla::XlaOp XlaHelpers::ConvertElementType(xla::XlaBuilder* const builder, const DataType new_element_type) { xla::PrimitiveType convert_to; TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); - return builder->ConvertElementType(operand, convert_to); + return xla::ConvertElementType(operand, convert_to); } } // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index c3fdc5252e74363fe289eeabb2cb0d68298ee291..39578144caaadf293d24ea91aa874e56e27ecc01 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -18,24 +18,16 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ #define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/xla_context.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { // Helper methods for building XLA computations. class XlaHelpers { public: - // Returns a handle representing the minimum value of a scalar - // element of data_type. - static xla::XlaOp MinValue(xla::XlaBuilder* b, DataType data_type); - - // Returns a handle representing the maximum value of a scalar - // element of data_type. - static xla::XlaOp MaxValue(xla::XlaBuilder* b, DataType data_type); - // Returns a handle representing the zero value of a scalar // element of data_type. static xla::XlaOp Zero(xla::XlaBuilder* b, DataType data_type); @@ -44,10 +36,6 @@ class XlaHelpers { // element of data_type. 
static xla::XlaOp One(xla::XlaBuilder* b, DataType data_type); - // Returns the machine epsilon for floating-point type `data_type`, i.e., - // the difference between 1.0 and the next representable value. - static xla::XlaOp Epsilon(xla::XlaBuilder* b, DataType data_type); - // Returns a handle representing the given value of an integer scalar // element of data_type. // Note that unlike One and Zero, does not work on boolean types. @@ -62,28 +50,18 @@ class XlaHelpers { // Reshapes literal 'input' to have 'shape'. Both the original shape and // 'shape' must contain the same number of elements. static Status ReshapeLiteral(const xla::Literal& input, - gtl::ArraySlice shape, + absl::Span shape, xla::Literal* output); - // Sets `argmax` to the argmax of `input` along `axis`. `input_shape` and - // `input_dtype` are the shape and dtype of `input` respectively, and - // `output_type` is the dtype to use for `argmax`. - static Status ArgMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, - const xla::XlaOp& input, const TensorShape& input_shape, - DataType input_type, DataType output_type, int axis, - xla::XlaOp* argmax); - - // Sets `argmin` to the argmin of `input` along `axis`. `input_shape` and - // `input_dtype` are the shape and dtype of `input` respectively, and - // `output_type` is the dtype to use for `argmin`. - static Status ArgMin(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, - const xla::XlaOp& input, const TensorShape& input_shape, - DataType input_type, DataType output_type, int axis, - xla::XlaOp* argmin); - - // Sets *iota to a rank 1 tensor with values [0, 1, 2, ...] of `dtype`. - static Status Iota(xla::XlaBuilder* builder, DataType dtype, int64 size, - xla::XlaOp* iota); + // Returns the argmax of `input` along `axis`. `output_type` is the type to + // use for the output. + static xla::XlaOp ArgMax(xla::XlaOp input, xla::PrimitiveType output_type, + int axis); + + // Returns the argmin of `input` along `axis`. 
`output_type` is the type to + // use for the output. + static xla::XlaOp ArgMin(xla::XlaOp input, xla::PrimitiveType output_type, + int axis); // Converts `indices` into a one-hot representation. `depth` is the size // of the new axis to add. `axis` is the position at which to add the new diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 9e17756b27733e2453ea1688d13e1d718c25cfc8..86a78ee429e8913edb4a948727fa692083c472f4 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -23,6 +23,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -34,41 +36,6 @@ limitations under the License. namespace tensorflow { namespace { - -// Returns a vector of positional argument buffer sizes. -xla::StatusOr> ComputeArgSizes( - const xla::ProgramShape& program_shape) { - std::vector arg_sizes; - const size_t num_args = program_shape.parameters_size(); - arg_sizes.reserve(num_args); - for (int i = 0; i < num_args; ++i) { - const xla::Shape& arg_shape = program_shape.parameters(i); - constexpr size_t kPointerSize = sizeof(void*); - arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize)); - } - return std::move(arg_sizes); -} - -// Returns a vector of positional temporary buffer sizes. 
-xla::StatusOr> ComputeTempSizes( - const xla::BufferAssignment& buffer_assignment) { - const std::vector& allocations = - buffer_assignment.Allocations(); - std::vector temp_sizes; - temp_sizes.reserve(allocations.size()); - for (const xla::BufferAllocation& allocation : allocations) { - // Callers don't allocate temporary buffers for parameters. Nor for - // thread-local buffers, which are lowered to alloca. - if (allocation.is_entry_computation_parameter() || - allocation.is_thread_local()) { - temp_sizes.push_back(-1); - } else { - temp_sizes.push_back(allocation.size()); - } - } - return std::move(temp_sizes); -} - // Returns the index of the result in the temp buffers. xla::StatusOr ComputeResultIndex( const xla::BufferAssignment& buffer_assignment) { @@ -152,11 +119,11 @@ XlaJitCompiledCpuFunction::Compile( const xla::BufferAssignment& buffer_assignment = cpu_executable->buffer_assignment(); - // Compute buffer sizes and the result index, needed to run the raw function. - TF_ASSIGN_OR_RETURN(std::vector arg_sizes, - ComputeArgSizes(*program_shape)); - TF_ASSIGN_OR_RETURN(std::vector temp_sizes, - ComputeTempSizes(buffer_assignment)); + // Compute buffer infos and the result index, needed to run the raw function. 
+ std::vector buffer_infos = + xla::cpu::CreateBufferInfosFromBufferAssignment(buffer_assignment); + std::vector arg_index_table = + xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); TF_ASSIGN_OR_RETURN(size_t result_index, ComputeResultIndex(buffer_assignment)); @@ -164,28 +131,28 @@ XlaJitCompiledCpuFunction::Compile( new XlaJitCompiledCpuFunction); XlaJitCompiledCpuFunction* jit = jit_unique_ptr.get(); jit->executable_ = std::move(executable); - jit->arg_sizes_ = std::move(arg_sizes); - jit->temp_sizes_ = std::move(temp_sizes); + jit->buffer_infos_ = std::move(buffer_infos); + jit->arg_index_table_ = std::move(arg_index_table); jit->program_shape_ = std::move(program_shape); - jit->static_data_.raw_function = std::move(raw_function); - jit->static_data_.arg_sizes = jit->arg_sizes_.data(); - jit->static_data_.num_args = jit->arg_sizes_.size(); - jit->static_data_.temp_sizes = jit->temp_sizes_.data(); - jit->static_data_.num_temps = jit->temp_sizes_.size(); - jit->static_data_.result_index = result_index; + jit->static_data_.set_raw_function(raw_function); + jit->static_data_.set_buffer_infos(jit->buffer_infos_.data()); + jit->static_data_.set_num_buffers(jit->buffer_infos_.size()); + jit->static_data_.set_arg_index_table(jit->arg_index_table_.data()); + jit->static_data_.set_num_args(jit->arg_index_table_.size()); + jit->static_data_.set_result_index(result_index); // Optional metadata is collected and set below. 
CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_); CollectNames(config.fetch(), &jit->nonempty_result_names_, &jit->result_names_); - jit->static_data_.arg_names = jit->arg_names_.data(); - jit->static_data_.result_names = jit->result_names_.data(); - jit->static_data_.program_shape = jit->program_shape_.get(); + jit->static_data_.set_arg_names(jit->arg_names_.data()); + jit->static_data_.set_result_names(jit->result_names_.data()); + jit->static_data_.set_program_shape(jit->program_shape_.get()); if (cpu_executable->hlo_profiling_enabled()) { - jit->static_data_.hlo_profile_printer_data = - &cpu_executable->hlo_profile_printer_data(); - jit->static_data_.profile_counters_size = - cpu_executable->hlo_profile_printer_data().profile_counters_size(); + jit->static_data_.set_hlo_profile_printer_data( + &cpu_executable->hlo_profile_printer_data()); + jit->static_data_.set_profile_counters_size( + cpu_executable->hlo_profile_printer_data().profile_counters_size()); } return std::move(jit_unique_ptr); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index af307ae4eff74927242c4650d8a43710e991cc52..d3c8f22a8078d03d15447ed200c914390f40b04f 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -66,9 +66,11 @@ class XlaJitCompiledCpuFunction { // The static data is backed by the rest of the state in this class. XlaCompiledCpuFunction::StaticData static_data_; - // The backing arrays of arg and temp buffer sizes. - std::vector arg_sizes_; - std::vector temp_sizes_; + // The backing array for buffer infos. + std::vector buffer_infos_; + + // The backing array for the arg index table. + std::vector arg_index_table_; // The backing arrays of arg and result names. 
We hold the actual strings in // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 76c68d81af4dd9ec40fe6b1c33b03a876a0c6dc6..1499c99ed15eceaf6bfa2ef0dd1d5885b1e5fc58 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -19,7 +19,12 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/common_runtime/dma_helper.h" namespace tensorflow { @@ -38,8 +43,7 @@ xla::XlaBuilder* XlaOpKernelContext::builder() const { static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { const XlaExpression* expression = reinterpret_cast(tensor.tensor_data().data()); - CHECK(expression->handle().builder() != nullptr || - expression->resource() != nullptr); + CHECK(expression->handle().valid() || expression->resource() != nullptr); VLOG(1) << "Fetched T" << expression->handle(); return expression; } @@ -48,7 +52,7 @@ static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) { const XlaExpression* expression = reinterpret_cast(tensor->tensor_data().data()); - CHECK_EQ(expression->handle().builder(), nullptr); + CHECK(!expression->handle().valid()); return const_cast(expression); } @@ -63,18 +67,59 @@ const xla::XlaOp& XlaOpKernelContext::Input(int index) { return GetComputationFromTensor(context_->input(index)); } +const xla::XlaOp& XlaOpKernelContext::Input(StringPiece name) { + return 
GetComputationFromTensor(GetInputTensorByName(name)); +} + TensorShape XlaOpKernelContext::InputShape(int index) { return context_->input(index).shape(); } +TensorShape XlaOpKernelContext::InputShape(StringPiece name) { + return GetInputTensorByName(name).shape(); +} + +DataType XlaOpKernelContext::input_type(int index) const { + return context_->input(index).dtype(); +} + +xla::PrimitiveType XlaOpKernelContext::input_xla_type(int index) { + xla::PrimitiveType type; + Status status = DataTypeToPrimitiveType(input_type(index), &type); + if (!status.ok()) { + SetStatus(status); + return xla::PRIMITIVE_TYPE_INVALID; + } + return type; +} + Status XlaOpKernelContext::ConstantInput(int index, xla::Literal* constant_literal) { return ConstantInputReshaped( index, context_->input(index).shape().dim_sizes(), constant_literal); } +static xla::StatusOr InputIndex(XlaOpKernelContext* context, + StringPiece name) { + int start, stop; + TF_RETURN_IF_ERROR(context->op_kernel().InputRange(name, &start, &stop)); + if (stop != start + 1) { + return errors::InvalidArgument("OpKernel used list-valued input name '", + name, + "' when single-valued input was " + "expected"); + } + return start; +} + +Status XlaOpKernelContext::ConstantInput(StringPiece name, + xla::Literal* constant_literal) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + return ConstantInput(index, constant_literal); +} + Status XlaOpKernelContext::ConstantInputReshaped( - int index, gtl::ArraySlice new_dims, + int index, absl::Span new_dims, xla::Literal* constant_literal) { const Tensor& tensor = context_->input(index); TensorShape new_shape(new_dims); @@ -87,6 +132,25 @@ Status XlaOpKernelContext::ConstantInputReshaped( } const XlaExpression* expression = CastExpressionFromTensor(tensor); + auto copy_tensor_to_literal = [](const Tensor& tensor, + xla::Literal* literal) { + xla::Shape literal_shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), &literal_shape)); + + 
*literal = xla::Literal(literal_shape); + + // memcpy over the payload ... + // TODO(phawkins): handle string types. + size_t total_bytes = tensor.TotalBytes(); + if (total_bytes > 0) { + void* dst_ptr = literal->untyped_data(); + const void* src_ptr = DMAHelper::base(&tensor); + memcpy(dst_ptr, src_ptr, total_bytes); + } + return Status::OK(); + }; + // If the tensor has a known constant value, there is no need to invoke XLA. if (expression->has_constant_value()) { Tensor temp(tensor.dtype()); @@ -95,19 +159,21 @@ Status XlaOpKernelContext::ConstantInputReshaped( // with the enclosing Tensor. return errors::Internal("Incompatible shapes in ConstantInputReshaped."); } - return HostTensorToLiteral(temp, constant_literal); + + return copy_tensor_to_literal(temp, constant_literal); } // Make sure we treat zero-element tensors as constant. if (new_shape.num_elements() == 0) { Tensor temp(tensor.dtype(), new_shape); - return HostTensorToLiteral(temp, constant_literal); + + return copy_tensor_to_literal(temp, constant_literal); } xla::XlaOp handle = expression->handle(); if (new_shape != tensor.shape()) { // Reshape the handle to the desired shape. - handle = builder()->Reshape(handle, new_shape.dim_sizes()); + handle = xla::Reshape(handle, new_shape.dim_sizes()); } // The XLA layout is specified minor to major, and TensorFlow's minor @@ -162,7 +228,8 @@ Status XlaOpKernelContext::ConstantInputReshaped( } // Converts an int32 or int64 scalar literal to an int64. -static Status LiteralToInt64Scalar(const xla::Literal& literal, int64* out) { +static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, + int64* out) { if (xla::ShapeUtil::Rank(literal.shape()) != 0) { return errors::InvalidArgument("value is not a scalar"); } @@ -177,7 +244,8 @@ static Status LiteralToInt64Scalar(const xla::Literal& literal, int64* out) { } // Converts an float32 or float64 scalar literal to a float64. 
-static Status LiteralToFloat64Scalar(const xla::Literal& literal, double* out) { +static Status LiteralToFloat64Scalar(const xla::LiteralSlice& literal, + double* out) { if (xla::ShapeUtil::Rank(literal.shape()) != 0) { return errors::InvalidArgument("value is not a scalar"); } @@ -197,6 +265,12 @@ Status XlaOpKernelContext::ConstantInputAsIntScalar(int index, int64* out) { return LiteralToInt64Scalar(literal, out); } +Status XlaOpKernelContext::ConstantInputAsIntScalar(StringPiece name, + int64* out) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + return ConstantInputAsIntScalar(index, out); +} + Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) { xla::Literal literal; TF_RETURN_IF_ERROR(ConstantInput(index, &literal)); @@ -204,7 +278,7 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) { } // Converts an int32 or int64 1D literal to an int64 vector. -static Status LiteralToInt64Vector(const xla::Literal& literal, +static Status LiteralToInt64Vector(const xla::LiteralSlice& literal, std::vector* out) { if (xla::ShapeUtil::Rank(literal.shape()) != 1) { return errors::InvalidArgument("value is not 1D"); @@ -231,6 +305,20 @@ Status XlaOpKernelContext::ConstantInputAsIntVector(int index, return LiteralToInt64Vector(literal, out); } +Status XlaOpKernelContext::ConstantInputAsIntVector(StringPiece name, + std::vector* out) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + return ConstantInputAsIntVector(index, out); +} + +Status XlaOpKernelContext::ConstantInputReshapedToIntVector( + int index, std::vector* out) { + xla::Literal literal; + TF_RETURN_IF_ERROR(ConstantInputReshaped( + index, {InputShape(index).num_elements()}, &literal)); + return LiteralToInt64Vector(literal, out); +} + Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index, xla::Literal* out) { xla::Literal literal; @@ -256,6 +344,12 @@ Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index, } } 
+Status XlaOpKernelContext::ConstantInputAsInt64Literal(StringPiece name, + xla::Literal* out) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + return ConstantInputAsInt64Literal(index, out); +} + // TODO(phawkins): validate that the dimensions form a valid shape, fail // gracefully if they do not. Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) { @@ -292,10 +386,11 @@ Status XlaOpKernelContext::ConstantInputList( return Status::OK(); } -Status XlaOpKernelContext::ReadVariableInput(int index, DataType type, - TensorShape* shape, - xla::XlaOp* value) { - const Tensor& tensor = context_->input(index); +namespace { + +Status ReadVariableInputTensor(const Tensor& tensor, DataType type, + const OpKernelContext* ctx, TensorShape* shape, + xla::XlaOp* value) { const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); @@ -313,18 +408,34 @@ Status XlaOpKernelContext::ReadVariableInput(int index, DataType type, *shape = variable->shape(); } - XlaContext& xla_context = XlaContext::Get(context_); - TensorShape representation_shape = - xla_context.RepresentationShape(variable->shape(), variable->type()); + XlaContext& xla_context = XlaContext::Get(ctx); + TF_ASSIGN_OR_RETURN( + TensorShape representation_shape, + xla_context.RepresentationShape(variable->shape(), variable->type())); if (representation_shape == variable->shape()) { *value = variable->value(); } else { - *value = - builder()->Reshape(variable->value(), variable->shape().dim_sizes()); + *value = xla::Reshape(variable->value(), variable->shape().dim_sizes()); } return Status::OK(); } +} // namespace + +Status XlaOpKernelContext::ReadVariableInput(int index, DataType type, + TensorShape* shape, + xla::XlaOp* value) { + return ReadVariableInputTensor(context_->input(index), type, context_, shape, + value); +} + +Status XlaOpKernelContext::ReadVariableInput(StringPiece name, DataType 
type, + TensorShape* shape, + xla::XlaOp* value) { + return ReadVariableInputTensor(GetInputTensorByName(name), type, context_, + shape, value); +} + Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, TensorShape* shape) const { const Tensor& tensor = context_->input(index); @@ -368,10 +479,11 @@ void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) { void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { const TensorShape& shape = constant.shape(); - xla::Literal literal; - OP_REQUIRES_OK(context_, HostTensorToLiteral(constant, &literal)); - xla::XlaOp handle = builder()->ConstantLiteral(literal); - CHECK_NE(handle.builder(), nullptr); + xla::BorrowingLiteral literal; + OP_REQUIRES_OK(context_, HostTensorToBorrowingLiteral(constant, &literal)); + + xla::XlaOp handle = xla::ConstantLiteral(builder(), literal); + CHECK(handle.valid()); // Make the Tensor that will refer to the expression. Tensor* output = nullptr; @@ -414,17 +526,17 @@ Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { return Status::OK(); } -Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, - xla::XlaOp handle) { - TF_RET_CHECK(handle.builder() != nullptr); +namespace { - const XlaExpression* expression = - CastExpressionFromTensor(context_->input(input_index)); +Status AssignVariableTensor(const Tensor& tensor, DataType type, + const OpKernelContext* ctx, xla::XlaOp handle, + xla::XlaBuilder* builder) { + const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); - auto shape_or_status = builder()->GetShape(handle); + auto shape_or_status = builder->GetShape(handle); if (!shape_or_status.ok()) { return shape_or_status.status(); } @@ -434,15 +546,31 @@ Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, 
TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); - XlaContext& xla_context = XlaContext::Get(context_); - TensorShape representation_shape = - xla_context.RepresentationShape(shape, type); + XlaContext& xla_context = XlaContext::Get(ctx); + TF_ASSIGN_OR_RETURN(TensorShape representation_shape, + xla_context.RepresentationShape(shape, type)); if (shape != representation_shape) { - handle = builder()->Reshape(handle, representation_shape.dim_sizes()); + handle = xla::Reshape(handle, representation_shape.dim_sizes()); } return variable->SetValue(handle); } +} // namespace + +Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, + xla::XlaOp handle) { + TF_RET_CHECK(handle.valid()); + return AssignVariableTensor(context_->input(input_index), type, context_, + handle, builder()); +} + +Status XlaOpKernelContext::AssignVariable(StringPiece name, DataType type, + xla::XlaOp handle) { + TF_RET_CHECK(handle.valid()); + return AssignVariableTensor(GetInputTensorByName(name), type, context_, + handle, builder()); +} + XlaCompiler* XlaOpKernelContext::compiler() const { return XlaContext::Get(context_).compiler(); } @@ -482,6 +610,12 @@ const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul( return XlaContext::Get(context_).GetOrCreateMul(type); } +const Tensor& XlaOpKernelContext::GetInputTensorByName(StringPiece name) { + const Tensor* tensor; + CHECK(context_->input(name, &tensor).ok()); + return *tensor; +} + XlaOpKernel::XlaOpKernel(OpKernelConstruction* context) : OpKernel(context) {} void XlaOpKernel::Compute(OpKernelContext* context) { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 667dc262ca03ca716ffbf015a78fc14c7a8b7c1a..45cfa7da740c38afde0158568a019a4426992b64 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -17,7 +17,9 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ #include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/macros.h" @@ -66,16 +68,26 @@ class XlaOpKernelContext { // Returns the number of inputs to the operator. int num_inputs() const { return context_->num_inputs(); } - // Returns the type of input 'index'. - DataType input_type(int index) { return context_->input(index).dtype(); } + // Returns the type of input `index`. + DataType input_type(int index) const; - // Returns the shape of input 'index'. + // Returns the type of input `index` as an xla::PrimitiveType. If the type + // is not representable as an XLA type, sets an error status and returns + // xla::PRIMITIVE_TYPE_INVALID. + xla::PrimitiveType input_xla_type(int index); + + // Returns the shape of input `index`. TensorShape InputShape(int index); - // Returns input 'index' as a XlaOp. Unlike + // Returns the shape of input `name`. + TensorShape InputShape(StringPiece name); + + // Returns input `index` as a XlaOp. Unlike // OpKernelContext::Input returns a symbolic value rather than a concrete // Tensor. const xla::XlaOp& Input(int index); + // Returns input `name` as a XlaOp. + const xla::XlaOp& Input(StringPiece name); // Returns true if all inputs are the same shape, otherwise sets the // status to a non-OK value and returns false. @@ -90,30 +102,38 @@ class XlaOpKernelContext { // Helper methods for constant inputs. - // Evaluates input 'index' and stores it in '*constant_literal'. If the + // Evaluates input `index` and stores it in `*constant_literal`. If the // expression cannot be evaluated, e.g., because it depends on unbound // parameters, returns a non-OK status. 
Status ConstantInput(int index, xla::Literal* constant_literal); + Status ConstantInput(StringPiece name, xla::Literal* constant_literal); - // Evaluates input 'index', reshapes it to 'new_shape' if new_shape != - // InputShape(index), and stores it in '*constant_literal'. If the input + // Evaluates input `index`, reshapes it to `new_shape` if new_shape != + // InputShape(index), and stores it in `*constant_literal`. If the input // cannot be evaluated, e.g., because it depends on unbound parameters, // returns a non-Ok status. If InputShape(index).num_elements() != // new_shape.num_elements(), returns an error status. - Status ConstantInputReshaped(int index, gtl::ArraySlice new_shape, + Status ConstantInputReshaped(int index, absl::Span new_dims, xla::Literal* constant_literal); // Converts a constant scalar int32 or int64 tensor into an int64. Status ConstantInputAsIntScalar(int index, int64* out); + Status ConstantInputAsIntScalar(StringPiece name, int64* out); // Converts a constant scalar float32 or float64 tensor into a float64. Status ConstantInputAsFloatScalar(int index, double* out); // Converts a constant 1D int32 or int64 tensor into a vector of int64s. Status ConstantInputAsIntVector(int index, std::vector* out); + Status ConstantInputAsIntVector(StringPiece name, std::vector* out); + + // Reshapes and converts a constant int32 or int64 tensor into a vector of + // int64s. + Status ConstantInputReshapedToIntVector(int index, std::vector* out); // Converts a constant int32 or int64 Tensor into an xla int64 Literal. Status ConstantInputAsInt64Literal(int index, xla::Literal* out); + Status ConstantInputAsInt64Literal(StringPiece name, xla::Literal* out); // Converts a constant 1D int32 or int64 tensor into a TensorShape. Status ConstantInputAsShape(int index, TensorShape* shape); @@ -131,17 +151,17 @@ class XlaOpKernelContext { return context_->expected_output_dtype(index); } - // Sets output 'index' to the XlaOp 'handle'. 
+ // Sets output `index` to the XlaOp `handle`. // All outputs should be set using SetOutput and SetConstantOutput, not // via the underlying OpKernelContext. void SetOutput(int index, const xla::XlaOp& handle); - // Sets output 'index' to compile-time constant 'host_tensor', where - // 'host_tensor' is a tensor in host memory. It is preferable to use + // Sets output `index` to compile-time constant `host_tensor`, where + // `host_tensor` is a tensor in host memory. It is preferable to use // SetConstantOutput where possible. void SetConstantOutput(int index, const Tensor& host_tensor); - // Sets output 'index' to an invalid value. + // Sets output `index` to an invalid value. // Any subsequent attempt to consume this output will cause an error. void SetInvalidOutput(int index); @@ -151,10 +171,10 @@ class XlaOpKernelContext { // Variables - // Sets '*resource' to the resource associated with input `index`. + // Sets `*resource` to the resource associated with input `index`. Status GetResourceInput(int index, XlaResource** resource); - // Sets output 'index' to be a reference to resource 'resource'. + // Sets output `index` to be a reference to resource `resource`. void SetResourceOutput(int index, XlaResource* resource); // Sets `*type` and `*shape` to the current type and shape of a variable's @@ -163,17 +183,23 @@ class XlaOpKernelContext { TensorShape* shape) const; // Reads the current value of the resouce variable referred to by input - // 'index'. If `shape` is not nullptr, sets `*shape` to the shape of the + // `index`. If `shape` is not nullptr, sets `*shape` to the shape of the // variable. Returns an error if the variable has not been initialized, or if // its type does not match `type`. Status ReadVariableInput(int index, DataType type, TensorShape* shape, xla::XlaOp* value); + // Reads the current value of the resouce variable referred to by input + // `name`. 
+ Status ReadVariableInput(StringPiece name, DataType type, TensorShape* shape, + xla::XlaOp* value); // Assigns the value `handle` to the variable referenced by input // `input_index`. The variable must be of `type`. Returns an error if the // variable has been initialized with a different type or with a // different shape. Status AssignVariable(int input_index, DataType type, xla::XlaOp handle); + // Assigns the value `handle` to the variable referenced by input `name`. + Status AssignVariable(StringPiece name, DataType type, xla::XlaOp handle); // Helper routines for the OP_REQUIRES macros void CtxFailure(const Status& s); @@ -221,6 +247,9 @@ class XlaOpKernelContext { const xla::XlaComputation* GetOrCreateMul(const DataType type); private: + // Returns the tensor of input `name`. + const Tensor& GetInputTensorByName(StringPiece name); + OpKernelContext* const context_; }; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 4692038b61f6871a8a16299fd4d11e963eb46a57..dae2d956ca61a18f7da61fcd0a569a55a6286663 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -71,16 +71,18 @@ XlaOpRegistry::~XlaOpRegistry() = default; << " have incompatible allow_resource_types settings."; return false; } - if (!x.has_device_whitelist || !y.has_device_whitelist) { - LOG(WARNING) << "Registrations of " << x.name - << " do not both have device whitelists."; + if (!x.has_device_whitelist && !y.has_device_whitelist) { + LOG(WARNING) << "Duplicate registrations of " << x.name + << "with no device whitelists."; return false; } - for (const auto& device : x.device_whitelist) { - if (y.device_whitelist.count(device) != 0) { - LOG(WARNING) << "Multiple registrations of " << x.name << " on device " - << device; - return false; + if (x.has_device_whitelist && y.has_device_whitelist) { + for (const auto& device : x.device_whitelist) { + if (y.device_whitelist.count(device) 
!= 0) { + LOG(WARNING) << "Multiple registrations of " << x.name << " on device " + << device; + return false; + } } } if (x.compile_time_constant_inputs != y.compile_time_constant_inputs) { @@ -103,7 +105,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; /* static */ void XlaOpRegistry::RegisterBackend( const string& compilation_device_name, - gtl::ArraySlice supported_types, BackendOpFilter op_filter) { + absl::Span supported_types, BackendOpFilter op_filter) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); auto result = registry.backends_.emplace(compilation_device_name, Backend()); @@ -157,97 +159,143 @@ void XlaOpRegistry::RegisterCompilationKernels() { registry.jit_kernels_registered_ = true; OpRegistryInterface* op_registry = OpRegistry::Global(); - for (const auto& op : registry.ops_) { - const string& op_name = op.first; - const std::unique_ptr& op_registration = op.second; - const OpDef* op_def; - Status lookup_status = op_registry->LookUpOpDef(op_name, &op_def); - if (!lookup_status.ok()) { - LOG(ERROR) << lookup_status.error_message(); - XLA_LOG_LINES( - ERROR, "Ops registered: \n" + - dynamic_cast(op_registry)->DebugString(true)); + // Order of op registration: + // The goal is to allow the co-existence of backend-specific kernels and + // generic kernels. To achieve this, we enforce the following order of + // registrations for one op: + // 1. Process op registration with device whitelists: + // this pass registers backend-specific kernels for this op. + // 2. Process op registration without device whitelists: + // this pass registers the kernels for all the other supported backends. + for (auto& ops : registry.ops_) { + const string& op_name = ops.first; + std::vector>& op_registrations = ops.second; + // Partition the op registration so that the ones with device whitelists + // precede the one without device whitelist. 
+ std::partition(op_registrations.begin(), op_registrations.end(), + [](const std::unique_ptr& op_reg) { + return op_reg->has_device_whitelist; + }); + + // Collect a set of backend registered by ops with device whitelists. + // The op registration without whitelists will register a generic kernel + // for all other backends not in this set. + std::unordered_set whitelisted_backend; + for (auto& op_registration : op_registrations) { + if (op_registration->has_device_whitelist) { + whitelisted_backend.insert(op_registration->device_whitelist.begin(), + op_registration->device_whitelist.end()); + } } - TF_CHECK_OK(lookup_status); - std::unordered_set type_attrs; - for (const OpDef::AttrDef& attr_def : op_def->attr()) { - if (attr_def.type() == "type" || attr_def.type() == "list(type)") { - type_attrs.insert(attr_def.name()); + for (auto& op_registration : op_registrations) { + const OpDef* op_def; + Status lookup_status = op_registry->LookUpOpDef(op_name, &op_def); + if (!lookup_status.ok()) { + LOG(ERROR) << lookup_status.error_message(); + XLA_LOG_LINES( + ERROR, + "Ops registered: \n" + + dynamic_cast(op_registry)->DebugString(true)); } - } + TF_CHECK_OK(lookup_status); - // Checks there are no type constraints referring to unknown attributes. - for (const auto& constraint : op_registration->type_constraints) { - if (type_attrs.find(constraint.first) == type_attrs.end()) { - LOG(FATAL) << "Unknown type attribute " << constraint.first - << " in XLA op registration for " << op_name; + std::unordered_set type_attrs; + for (const OpDef::AttrDef& attr_def : op_def->attr()) { + if (attr_def.type() == "type" || attr_def.type() == "list(type)") { + type_attrs.insert(attr_def.name()); + } } - } - for (auto& backend : registry.backends_) { - // If the operator has a device whitelist, only register on whitelisted - // devices. 
- if (op_registration->has_device_whitelist && - op_registration->device_whitelist.find(backend.first) == - op_registration->device_whitelist.end()) { - continue; + // Checks there are no type constraints referring to unknown attributes. + for (const auto& constraint : op_registration->type_constraints) { + if (type_attrs.find(constraint.first) == type_attrs.end()) { + LOG(FATAL) << "Unknown type attribute " << constraint.first + << " in XLA op registration for " << op_name; + } } - std::unique_ptr kdef(new KernelDef); - kdef->set_op(op_registration->name); - kdef->set_device_type(backend.first); - - // Constrain each type attribute to the intersection of: - // a) the types supported by the backend, and - // b) the types allowed by the OpDef, and - // c) the type constraints. - for (const string& type_attr : type_attrs) { - KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint(); - attr_constraint->set_name(type_attr); - auto* allowed_values = - attr_constraint->mutable_allowed_values()->mutable_list(); - - const OpDef::AttrDef& op_def_attr = *FindAttr(type_attr, *op_def); - const auto* op_def_allowed_types = - op_def_attr.has_allowed_values() - ? &op_def_attr.allowed_values().list().type() - : nullptr; - auto constraint_it = op_registration->type_constraints.find(type_attr); - const std::set* type_constraints = - constraint_it != op_registration->type_constraints.end() - ? &constraint_it->second - : nullptr; - for (DataType dtype : backend.second.supported_types) { - // Filter out types that aren't allowed by the OpDef. - if (op_def_allowed_types != nullptr && - std::find(op_def_allowed_types->begin(), - op_def_allowed_types->end(), - dtype) == op_def_allowed_types->end()) { - continue; + for (auto& backend : registry.backends_) { + // If the operator has a device whitelist, only register on whitelisted + // devices. 
+ if (op_registration->has_device_whitelist && + op_registration->device_whitelist.find(backend.first) == + op_registration->device_whitelist.end()) { + continue; + } + + // If the operator does NOT has a device whitelist, skip all devices + // that has already been registered. + if (!op_registration->has_device_whitelist && + whitelisted_backend.find(backend.first) != + whitelisted_backend.end()) { + continue; + } + + std::unique_ptr kdef(new KernelDef); + kdef->set_op(op_registration->name); + kdef->set_device_type(backend.first); + + // Constrain each type attribute to the intersection of: + // a) the types supported by the backend, and + // b) the types allowed by the OpDef, and + // c) the type constraints. + bool unsatisfiable_type_constraint = false; + for (const string& type_attr : type_attrs) { + KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint(); + attr_constraint->set_name(type_attr); + auto* allowed_values = + attr_constraint->mutable_allowed_values()->mutable_list(); + + const OpDef::AttrDef& op_def_attr = *FindAttr(type_attr, *op_def); + const auto* op_def_allowed_types = + op_def_attr.has_allowed_values() + ? &op_def_attr.allowed_values().list().type() + : nullptr; + auto constraint_it = + op_registration->type_constraints.find(type_attr); + const std::set* type_constraints = + constraint_it != op_registration->type_constraints.end() + ? &constraint_it->second + : nullptr; + for (DataType dtype : backend.second.supported_types) { + // Filter out types that aren't allowed by the OpDef. + if (op_def_allowed_types != nullptr && + std::find(op_def_allowed_types->begin(), + op_def_allowed_types->end(), + dtype) == op_def_allowed_types->end()) { + continue; + } + // Filter out types based on the type constraints. + if (type_constraints != nullptr && + type_constraints->find(dtype) == type_constraints->end()) { + continue; + } + // Passed all the filters, this type is allowed. 
+ allowed_values->add_type(dtype); + } + if (op_registration->allow_resource_types) { + allowed_values->add_type(DT_RESOURCE); } - // Filter out types based on the type constraints. - if (type_constraints != nullptr && - type_constraints->find(dtype) == type_constraints->end()) { - continue; + // Don't build KernelDefs that have unsatisfiable type constraints. + if (allowed_values->type().empty()) { + unsatisfiable_type_constraint = true; + break; } - // Passed all the filters, this type is allowed. - allowed_values->add_type(dtype); } - if (op_registration->allow_resource_types) { - allowed_values->add_type(DT_RESOURCE); + if (unsatisfiable_type_constraint) continue; + + if (backend.second.op_filter != nullptr && + !backend.second.op_filter(kdef.get())) { + continue; } + VLOG(2) << "XLA op registration: device: " << backend.first + << " op: " << op_name; + registry.kernel_registrars_.emplace_back( + new kernel_factory::OpKernelRegistrar( + new KernelDef(*kdef), "XlaJitOp", op_registration->factory)); + backend.second.kernel_defs.push_back(std::move(kdef)); } - if (backend.second.op_filter != nullptr && - !backend.second.op_filter(kdef.get())) { - continue; - } - VLOG(2) << "XLA op registration: device: " << backend.first - << " op: " << op_name; - registry.kernel_registrars_.emplace_back( - new kernel_factory::OpKernelRegistrar( - new KernelDef(*kdef), "XlaJitOp", op_registration->factory)); - backend.second.kernel_defs.push_back(std::move(kdef)); } } } @@ -265,27 +313,41 @@ std::vector XlaOpRegistry::DeviceKernels( << "Unknown backend " << compilation_device_name; for (const std::unique_ptr& k : it->second.kernel_defs) { auto op_iter = registry.ops_.find(k->op()); - CHECK(op_iter != registry.ops_.end()); + CHECK(op_iter != registry.ops_.end() && !op_iter->second.empty()); // The test in IsCompatible ensures that if there are multiple matching // registrations for this op name, they all have the same value of // compilation_only, so only the first match needs to 
be tested. if (include_compilation_only_kernels || - !op_iter->second->compilation_only) { + !op_iter->second.front()->compilation_only) { kernels.push_back(k.get()); } } return kernels; } +/*static*/ std::vector XlaOpRegistry::GetAllRegisteredOps() { + std::vector ops; + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + for (const auto& pair : registry.ops_) { + ops.push_back(pair.first); + } + std::sort(ops.begin(), ops.end()); + return ops; +} + /* static */ const std::unordered_set* XlaOpRegistry::CompileTimeConstantInputs(const string& op) { XlaOpRegistry& registry = Instance(); mutex_lock lock(registry.mutex_); auto it = registry.ops_.find(op); - if (it == registry.ops_.end()) { + if (it == registry.ops_.end() || it->second.empty()) { return nullptr; } - return &it->second->compile_time_constant_inputs; + // The test in IsCompatible ensures that if there are multiple matching + // registrations for this op name, they all have the same value of + // compile_time_constant_inputs, so only the first match is returned. 
+ return &it->second.front()->compile_time_constant_inputs; } std::vector XlaOpRegistry::BackendNames() { @@ -311,7 +373,7 @@ XlaOpRegistry& XlaOpRegistry::Instance() { XlaOpRegistrationBuilder::XlaOpRegistrationBuilder(StringPiece name) { registration_.reset(new XlaOpRegistry::OpRegistration); - registration_->name = std::string(name); + registration_->name = string(name); } XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name(StringPiece name) { @@ -320,17 +382,17 @@ XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name(StringPiece name) { } XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device( - gtl::ArraySlice devices) { + absl::Span devices) { registration_->has_device_whitelist = true; for (StringPiece device : devices) { - registration_->device_whitelist.insert(std::string(device)); + registration_->device_whitelist.emplace(device); } return *this; } XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device(StringPiece device) { registration_->has_device_whitelist = true; - registration_->device_whitelist.insert(std::string(device)); + registration_->device_whitelist.emplace(device); return *this; } @@ -347,15 +409,15 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowResourceTypes() { XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( StringPiece attr_name, DataType allowed) { std::set& types = - registration_->type_constraints[std::string(attr_name)]; + registration_->type_constraints[string(attr_name)]; types.insert(allowed); return *this; } XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( - StringPiece attr_name, gtl::ArraySlice allowed) { + StringPiece attr_name, absl::Span allowed) { std::set& types = - registration_->type_constraints[std::string(attr_name)]; + registration_->type_constraints[string(attr_name)]; for (DataType t : allowed) { types.insert(t); } @@ -364,7 +426,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( XlaOpRegistrationBuilder& 
XlaOpRegistrationBuilder::CompileTimeConstInput( StringPiece input_name) { - registration_->compile_time_constant_inputs.insert(std::string(input_name)); + registration_->compile_time_constant_inputs.emplace(input_name); return *this; } @@ -378,23 +440,22 @@ XlaOpRegistrar::XlaOpRegistrar( std::unique_ptr registration) { XlaOpRegistry& registry = XlaOpRegistry::Instance(); mutex_lock lock(registry.mutex_); - auto existing_ops = registry.ops_.equal_range(registration->name); - for (auto existing = existing_ops.first; existing != existing_ops.second; - ++existing) { - if (!XlaOpRegistry::IsCompatible(*existing->second, *registration)) { + auto& existing_ops = registry.ops_[registration->name]; + for (auto& existing : existing_ops) { + if (!XlaOpRegistry::IsCompatible(*existing, *registration)) { LOG(FATAL) << "XLA op registration " << registration->name << " is incompatible with existing registration of the same name."; } } - registry.ops_.emplace(registration->name, std::move(registration)); + existing_ops.emplace_back(std::move(registration)); } XlaBackendRegistrar::XlaBackendRegistrar( - StringPiece name, gtl::ArraySlice types, + StringPiece name, absl::Span types, XlaOpRegistry::BackendOpFilter op_filter) { XlaOpRegistry& registry = XlaOpRegistry::Instance(); - registry.RegisterBackend(std::string(name), types, op_filter); + registry.RegisterBackend(string(name), types, op_filter); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index e255b01dd7fdcb095c7992d4352d2d9bb7d36ac3..c640842dc0d4fb3aff64d8388b4ffd3fdcee9faf 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -94,7 +94,7 @@ class XlaOpRegistry { // the device; it may optionally modify the KernelDef. 
typedef bool (*BackendOpFilter)(KernelDef* kdef); static void RegisterBackend(const string& compilation_device_name, - gtl::ArraySlice supported_types, + absl::Span supported_types, BackendOpFilter op_filter); // Returns the names of the registered backends. @@ -128,6 +128,9 @@ class XlaOpRegistry { const string& compilation_device_name, bool include_compilation_only_kernels); + // Returns all operations for which there are XLA kernels on any device. + static std::vector GetAllRegisteredOps(); + // Returns the set of compile-time constant inputs to 'op'. Returns nullptr // if the op is not registered. static const std::unordered_set* CompileTimeConstantInputs( @@ -203,7 +206,7 @@ class XlaOpRegistry { // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP. // Registrations present under the same key must satisfy IsCompatible above, // and this is checked during registration. - std::unordered_multimap> ops_ + std::unordered_map>> ops_ GUARDED_BY(mutex_); // Have we already registered the JIT kernels on the JIT devices? @@ -233,7 +236,7 @@ class XlaOpRegistrationBuilder { // Specifies a whitelist of devices on which the operator may run. XlaOpRegistrationBuilder& Device(StringPiece devices); - XlaOpRegistrationBuilder& Device(gtl::ArraySlice devices); + XlaOpRegistrationBuilder& Device(absl::Span devices); // Specifies a type constraint for a type variable attribute. Each constraint // specifies the set of types that the type variable may assume. @@ -241,7 +244,7 @@ class XlaOpRegistrationBuilder { DataType allowed); XlaOpRegistrationBuilder& TypeConstraint(StringPiece attr_name, - gtl::ArraySlice allowed); + absl::Span allowed); // Specifies that a dummy copy of this operator should not be registered on // XLA_* devices, but may be used during compilation. 
@@ -279,13 +282,13 @@ class XlaOpRegistrar { #define REGISTER_XLA_OP_UNIQ(CTR, BUILDER, OP) \ static ::tensorflow::XlaOpRegistrar xla_op_registrar__body__##CTR##__object( \ - XlaOpRegistrationBuilder::BUILDER.Build( \ + ::tensorflow::XlaOpRegistrationBuilder::BUILDER.Build( \ [](::tensorflow::OpKernelConstruction* context) \ -> ::tensorflow::OpKernel* { return new OP(context); })); class XlaBackendRegistrar { public: - XlaBackendRegistrar(StringPiece name, gtl::ArraySlice types, + XlaBackendRegistrar(StringPiece name, absl::Span types, XlaOpRegistry::BackendOpFilter op_filter = nullptr); }; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry_test.cc b/tensorflow/compiler/tf2xla/xla_op_registry_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7b3b15b1af7636fddd4c29477cbfe6f9761f2c47 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_op_registry_test.cc @@ -0,0 +1,119 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +// This test is to verify the correctness of XLA op registration with specific +// backend overrides. + +// A dummy backend-specific OpKernel for CPU. 
+class DummyCPUOp : public XlaOpKernel { + public: + explicit DummyCPUOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + ctx->SetOutput(0, ctx->Input(0)); + } +}; + +// A dummy generic OpKernel for all backends. +class DummyGenericOp : public XlaOpKernel { + public: + explicit DummyGenericOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + ctx->SetOutput(0, ctx->Input(0)); + } +}; + +REGISTER_OP("DummyDuplicateOp") + .Attr("T: {float, int32}") + .Input("input: int32") + .Output("output: int32") + .Doc(R"doc( +A dummy Op. + +input: dummy input. +output: dummy output. +)doc"); + +// Register the DummyCPUOp kernel for CPU with type INT32. +REGISTER_XLA_OP(Name("DummyDuplicateOp") + .Device(DEVICE_CPU_XLA_JIT) + .TypeConstraint("T", DT_INT32), + DummyCPUOp); +// Register the DummyGeneric kernel for all registered device (except CPU since +// it is already registered), with type FLOAT. +REGISTER_XLA_OP(Name("DummyDuplicateOp").TypeConstraint("T", DT_FLOAT), + DummyGenericOp); + +// Test the correctness of registered kernels. The kernel registered for CPU +// should have type INT32 while all other kernels should have type FLOAT. +TEST(XlaOpRegistryTest, XlaOpRegistrationWithOverride) { + XlaOpRegistry::RegisterCompilationKernels(); + auto registered_kernels = GetAllRegisteredKernels().kernel(); + for (const auto& kernels : registered_kernels) { + if (kernels.op() == "DummyDuplicateOp") { + EXPECT_EQ(kernels.constraint_size(), 1); + EXPECT_EQ(kernels.constraint(0).name(), "T"); + if (kernels.device_type() == "XLA_CPU_JIT") { + EXPECT_EQ(kernels.constraint(0).allowed_values().list().type(0), + DT_INT32); + } else { + EXPECT_EQ(kernels.constraint(0).allowed_values().list().type(0), + DT_FLOAT); + } + } + } +} + +// A dummy generic OpKernel for all backends. 
+class DummyInfeasibleTypeConstraintOp : public XlaOpKernel { + public: + explicit DummyInfeasibleTypeConstraintOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) {} + void Compile(XlaOpKernelContext* ctx) override { + LOG(FATAL) << "unreachable"; + } +}; + +REGISTER_OP("DummyInfeasibleTypeConstraintOp") + .Attr("T: {float, string}") + .Input("input: T") + .Output("output: T") + .Doc(R"doc( +A dummy Op. + +input: dummy input. +output: dummy output. +)doc"); +REGISTER_XLA_OP( + Name("DummyInfeasibleTypeConstraintOp").TypeConstraint("T", DT_STRING), + DummyInfeasibleTypeConstraintOp); + +TEST(XlaOpRegistryTest, OpWithInfeasibleTypeConstraintIsNotRegistered) { + XlaOpRegistry::RegisterCompilationKernels(); + auto registered_kernels = GetAllRegisteredKernels().kernel(); + for (const auto& kernels : registered_kernels) { + // The operator should not be registered. + EXPECT_NE(kernels.op(), "DummyInfeasibleTypeConstraintOp"); + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 540c65c597f20d5bb26494e56c09ff2187cfb0db..7928fa034725206a752cbfe086d01f15cd235df9 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { @@ -89,16 +90,16 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { } switch (kind_) { case kVariable: { - value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), - shape_.dim_sizes()); + value_ = + xla::Broadcast(XlaHelpers::Zero(builder, type_), shape_.dim_sizes()); break; } case kTensorArray: { TensorShape ta_shape; ta_shape.AddDim(tensor_array_size_); ta_shape.AppendShape(shape_); - value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), - ta_shape.dim_sizes()); + value_ = xla::Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()); break; } case kStack: { @@ -106,9 +107,9 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { ta_shape.AddDim(tensor_array_size_); ta_shape.AppendShape(shape_); value_ = - builder->Tuple({builder->Broadcast(XlaHelpers::Zero(builder, type_), - ta_shape.dim_sizes()), - builder->ConstantR0(0)}); + xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()), + xla::ConstantR0(builder, 0)}); break; } @@ -130,8 +131,8 @@ Status XlaResource::GetOrCreateTensorArrayGradient(const string& source, TensorShape ta_shape; ta_shape.AddDim(tensor_array_size_); ta_shape.AppendShape(shape_); - xla::XlaOp gradient_value = builder->Broadcast( - XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); + xla::XlaOp gradient_value = + xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); gradient.reset( new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/strings::StrCat("TensorArrayGrad: ", name_), @@ -152,7 +153,7 @@ Status XlaResource::Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const { for (const auto& gradient : tensor_array_gradients_) { elems.push_back(gradient.second->value_); } - *pack = 
builder->Tuple(elems); + *pack = xla::Tuple(builder, elems); } return Status::OK(); } @@ -168,7 +169,7 @@ Status XlaResource::SetFromPack(const std::set& gradient_sources, } else { TF_RET_CHECK(kind_ == kTensorArray); int pos = 0; - auto v = builder->GetTupleElement(pack, pos++); + auto v = xla::GetTupleElement(pack, pos++); if (!initialized()) { initial_value_ = v; } @@ -178,7 +179,7 @@ Status XlaResource::SetFromPack(const std::set& gradient_sources, XlaResource* gradient; TF_RETURN_IF_ERROR( GetOrCreateTensorArrayGradient(source, builder, &gradient)); - auto v = builder->GetTupleElement(pack, pos++); + auto v = xla::GetTupleElement(pack, pos++); if (!gradient->initialized()) { gradient->initial_value_ = v; } diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index 9ce36d1aa7622334b2acfbe9aa85d7419c4772ed..2438490be13809b9f3571a362900b44cb838e76b 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" @@ -75,7 +75,7 @@ class XlaResource { const xla::XlaOp& initial_value() const { return initial_value_; } // A variable is initialized if it has a value. - bool initialized() const { return value_.builder() != nullptr; } + bool initialized() const { return value_.valid(); } // Sets the type and shape of the resource. The type and shape of a resource // must not change once the variable has been initialized. 
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index c08db7e3fb0907cc3f2756dd0c5af9de6dc286c6..76e36f3c46b22742b6cf0c86e89d17899338a60f 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -53,7 +53,6 @@ xla_proto_library( deps = [ ":xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/compiler/xla/service:session_proto", ], ) @@ -114,6 +113,7 @@ cc_library( ":statusor", ":types", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -143,30 +143,15 @@ cc_library( cc_library( name = "statusor", - srcs = ["statusor.cc"], hdrs = [ "statusor.h", - "statusor_internals.h", ], visibility = ["//visibility:public"], deps = [ ":status", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - ], -) - -tf_cc_test( - name = "statusor_test", - size = "small", - srcs = ["statusor_test.cc"], - deps = [ - ":statusor", - ":test", - ":types", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", + "//tensorflow/stream_executor", ], ) @@ -176,7 +161,7 @@ cc_library( hdrs = [ "iterator_util.h", "map_util.h", - "ptr_util.h", + "overflow_util.h", "util.h", ], visibility = ["//visibility:public"], @@ -187,7 +172,11 @@ cc_library( ":types", ":xla_data_proto", "//tensorflow/core:lib", - "//tensorflow/core:ptr_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -225,6 +214,7 @@ tf_cc_test( ":test", ":util", "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", ], ) @@ -251,10 +241,13 @@ cc_library( ":types", ":util", ":xla_data_proto", - "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -269,7 +262,9 @@ tf_cc_test( ":types", ":util", ":xla_data_proto", + "//tensorflow/core:lib", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -296,9 +291,9 @@ tf_cc_test( ) cc_library( - name = "literal_util", - srcs = ["literal_util.cc"], - hdrs = ["literal_util.h"], + name = "literal", + srcs = ["literal.cc"], + hdrs = ["literal.h"], visibility = ["//visibility:public"], deps = [ ":array2d", @@ -310,17 +305,21 @@ cc_library( ":types", ":util", ":xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) tf_cc_test( - name = "literal_util_test", - srcs = ["literal_util_test.cc"], + name = "literal_test", + srcs = ["literal_test.cc"], deps = [ ":array3d", ":array4d", + ":literal", ":literal_util", ":shape_util", ":test", @@ -329,6 +328,31 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "literal_util", + srcs = ["literal_util.cc"], + hdrs = ["literal_util.h"], + visibility = ["//visibility:public"], + deps = [ + ":array2d", + ":array3d", + ":array4d", + ":literal", + ":shape_util", + ":sparse_index_array", + ":status_macros", + ":types", + ":util", + ":xla_data_proto", + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -343,9 +367,12 @@ cc_library( hdrs = ["literal_comparison.h"], deps = [ ":error_spec", + ":literal", ":literal_util", ":util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -357,6 +384,8 @@ cc_library( deps = [ ":util", "//tensorflow/core:lib", + 
"@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -366,8 +395,8 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":types", - "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", ], ) @@ -378,6 +407,8 @@ cc_library( ":status", ":types", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -398,8 +429,9 @@ cc_library( deps = [ ":array", ":types", - ":util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -444,6 +476,8 @@ cc_library( ":array2d", ":types", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -455,6 +489,7 @@ tf_cc_test( ":test", "//tensorflow/core:lib", "//tensorflow/core:test_main", + "@com_google_absl//absl/types:span", ], ) @@ -474,7 +509,7 @@ cc_library( hdrs = ["packed_literal_reader.h"], visibility = [":internal"], deps = [ - ":literal_util", + ":literal", ":shape_util", ":status_macros", ":statusor", @@ -482,6 +517,8 @@ cc_library( ":util", ":xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/base", + "@com_google_absl//absl/memory", ], ) @@ -496,37 +533,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:regexp_internal", "//tensorflow/core:test", - ], -) - -cc_library( - name = "scanner", - srcs = ["scanner.cc"], - hdrs = ["scanner.h"], - visibility = [":internal"], - deps = [ - ":status", - ":status_macros", - ":types", - ":util", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - ], -) - -tf_cc_test( - name = "scanner_test", - srcs = ["scanner_test.cc"], - deps = [ - ":scanner", - ":status", - ":status_macros", - ":test", - ":types", - ":util", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -536,7 +543,7 @@ cc_library( hdrs = 
["text_literal_reader.h"], visibility = [":internal"], deps = [ - ":literal_util", + ":literal", ":shape_util", ":status_macros", ":statusor", @@ -545,6 +552,8 @@ cc_library( ":xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -552,7 +561,7 @@ tf_cc_test( name = "text_literal_reader_test", srcs = ["text_literal_reader_test.cc"], deps = [ - ":literal_util", + ":literal", ":shape_util", ":test", ":text_literal_reader", @@ -569,12 +578,14 @@ cc_library( hdrs = ["text_literal_writer.h"], visibility = [":internal"], deps = [ - ":literal_util", + ":literal", ":shape_util", ":status_macros", ":types", ":xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -582,6 +593,7 @@ tf_cc_test( name = "text_literal_writer_test", srcs = ["text_literal_writer_test.cc"], deps = [ + ":literal", ":literal_util", ":test", ":test_helpers", @@ -599,10 +611,12 @@ cc_library( deps = [ ":shape_util", ":status_macros", - ":util", ":xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -616,6 +630,7 @@ tf_cc_test( ":xla_data_proto", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", ], ) @@ -642,6 +657,8 @@ cc_library( ":types", ":xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -654,16 +671,19 @@ cc_library( ":array2d", ":array3d", ":array4d", + ":literal_util", ":util", ":window_util", ":xla_data_proto", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_evaluator", 
"//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -674,7 +694,7 @@ tf_cc_test( ":array2d", ":array3d", ":array4d", - ":literal_util", + ":literal", ":reference_util", ":test", ":util", @@ -682,6 +702,7 @@ tf_cc_test( "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", ], ) @@ -693,7 +714,8 @@ cc_library( ":array2d", ":shape_util", ":xla_data_proto", - "//tensorflow/core:lib", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index ea75ad32d5df7bbadd37e89de6144b264ab6d5d1..58cc1575858201b4508d7340cb47e59c4f4c5783 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -27,12 +27,12 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/bits.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -97,12 +97,11 @@ class Array { using value_type = T; // Creates a new array with the specified dimensions. - explicit Array(tensorflow::gtl::ArraySlice sizes) - : Array(sizes, T()) {} + explicit Array(absl::Span sizes) : Array(sizes, T()) {} // Creates a new array with the specified dimensions and specified value for // every cell. 
- Array(tensorflow::gtl::ArraySlice sizes, T value) + Array(absl::Span sizes, T value) : sizes_(sizes.begin(), sizes.end()), values_(new T[num_elements()]) { Fill(value); } @@ -301,7 +300,7 @@ class Array { // Invokes a callback with the (indices, value_ptr) for each cell in the // array. - void Each(std::function, T*)> f) { + void Each(std::function, T*)> f) { std::vector index(sizes_.size()); for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) { f(index, &values_[i]); @@ -309,8 +308,7 @@ class Array { } // Invokes a callback with the (indices, value) for each cell in the array. - void Each( - std::function, T)> f) const { + void Each(std::function, T)> f) const { std::vector index(sizes_.size()); for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) { f(index, values_[i]); @@ -320,8 +318,7 @@ class Array { // Invokes a callback with the (indices, value_ptr) for each cell in the // array. If a callback returns a non-OK status, returns that else returns // Status::OK(). - Status EachStatus( - std::function, T*)> f) { + Status EachStatus(std::function, T*)> f) { std::vector index(sizes_.size()); for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) { Status s = f(index, &values_[i]); @@ -335,8 +332,7 @@ class Array { // Invokes a callback with the (indices, value) for each cell in the array. // If a callback returns a non-OK status, returns that else returns // Status::OK(). - Status EachStatus( - std::function, T)> f) const { + Status EachStatus(std::function, T)> f) const { std::vector index(sizes_.size()); for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) { Status s = f(index, values_[i]); @@ -377,13 +373,13 @@ class Array { // Returns the value at the cell specified by the indexes. The number of // arguments have to match with the number of dimensions for the array. 
- const T& operator()(tensorflow::gtl::ArraySlice indexes) const { + const T& operator()(absl::Span indexes) const { return values_[calculate_index(indexes)]; } // Returns the value at the cell specified by the indexes. The number of // arguments have to match with the number of dimensions for the array. - T& operator()(tensorflow::gtl::ArraySlice indexes) { + T& operator()(absl::Span indexes) { return values_[calculate_index(indexes)]; } @@ -409,7 +405,7 @@ class Array { // Returns the total number of elements in the array. int64 num_elements() const { - return std::accumulate(sizes_.begin(), sizes_.end(), 1, + return std::accumulate(sizes_.begin(), sizes_.end(), 1LL, std::multiplies()); } @@ -438,8 +434,8 @@ class Array { bool operator!=(const Array& other) const { return !(*this == other); } // Performs the equivalent of a slice operation on this array. - Array Slice(tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits) const { + Array Slice(absl::Span starts, + absl::Span limits) const { CHECK_EQ(starts.size(), num_dimensions()); CHECK_EQ(limits.size(), num_dimensions()); @@ -464,7 +460,7 @@ class Array { // Performs the equivalent of a DynamicUpdateSlice in-place on this array. void UpdateSlice(const Array& from, - tensorflow::gtl::ArraySlice start_indices) { + absl::Span start_indices) { CHECK_EQ(from.num_dimensions(), num_dimensions()); std::vector limit_indices; std::transform(start_indices.begin(), start_indices.end(), @@ -484,7 +480,7 @@ class Array { // Performs an in-place reshape, modifying the dimensions but not the // underlying data. 
- void Reshape(tensorflow::gtl::ArraySlice new_dimensions) { + void Reshape(absl::Span new_dimensions) { int64 old_num_elements = num_elements(); sizes_ = std::vector(new_dimensions.begin(), new_dimensions.end()); CHECK_EQ(num_elements(), old_num_elements); @@ -507,9 +503,7 @@ class Array { } } - pieces.push_back( - tensorflow::strings::AlphaNum(values_[calculate_index(index)]) - .data()); + pieces.push_back(absl::StrCat(values_[calculate_index(index)])); // Emit comma if it isn't the last element if (index.back() != sizes_.back() - 1) { @@ -527,7 +521,7 @@ class Array { } } } while (next_index(&index)); - return tensorflow::str_util::Join(pieces, ""); + return absl::StrJoin(pieces, ""); } private: diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index a17e81f44832f272fd93dce9f854042b4a84fde4..782c966b4c57672d137569a318fb20ace14d493b 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -24,12 +24,11 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/array.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/bits.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -101,7 +100,7 @@ class Array2D : public Array { template std::unique_ptr> MakeLinspaceArray2D(double from, double to, int64 n1, int64 n2) { - auto array = MakeUnique>(n1, n2); + auto array = absl::make_unique>(n1, n2); int64 count = n1 * n2; NativeT step = static_cast((count > 1) ? 
(to - from) / (count - 1) : 0); diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h index a75fffc605aa0df3e1e2eeb6d3129718cbbba0e4..e23d317baf9aca7b3705a93d6be952fb9a17762b 100644 --- a/tensorflow/compiler/xla/array4d.h +++ b/tensorflow/compiler/xla/array4d.h @@ -26,13 +26,11 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/array4d_test.cc b/tensorflow/compiler/xla/array4d_test.cc index 927733ea1eab43feff643c35535cc6d9ea59ba5a..918872a7a03a022c72d22dfb8f0da9e9d3820e41 100644 --- a/tensorflow/compiler/xla/array4d_test.cc +++ b/tensorflow/compiler/xla/array4d_test.cc @@ -18,8 +18,8 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace { @@ -27,8 +27,7 @@ namespace { // Given an Array4D and a 4-tuple index, computes the linear index into the // array idx represents. 
template -int64 Array4DLinearIndex(const Array4D& arr, - tensorflow::gtl::ArraySlice idx) { +int64 Array4DLinearIndex(const Array4D& arr, absl::Span idx) { EXPECT_EQ(4, idx.size()); return (idx[3] + idx[2] * arr.n4() + idx[1] * arr.n3() * arr.n4() + idx[0] * arr.n2() * arr.n3() * arr.n4()); @@ -51,9 +50,8 @@ TEST(Array4dTest, FillCtor) { EXPECT_EQ(fullof7.n3(), 4); EXPECT_EQ(fullof7.n4(), 5); - fullof7.Each([](tensorflow::gtl::ArraySlice idx, int* cell) { - EXPECT_EQ(*cell, 7); - }); + fullof7.Each( + [](absl::Span idx, int* cell) { EXPECT_EQ(*cell, 7); }); } TEST(Array4dTest, ContainerCtor) { @@ -69,7 +67,7 @@ TEST(Array4dTest, ContainerCtor) { EXPECT_EQ(arr.n3(), 4); EXPECT_EQ(arr.n4(), 5); - arr.Each([&arr](tensorflow::gtl::ArraySlice idx, int* cell) { + arr.Each([&arr](absl::Span idx, int* cell) { EXPECT_EQ(*cell, Array4DLinearIndex(arr, idx)); }); } @@ -129,21 +127,19 @@ TEST(Array3dTest, InitializerListCtorHalf) { TEST(Array4dTest, Fill) { Array4D fullof7(2, 3, 4, 5, 7); - fullof7.Each([](tensorflow::gtl::ArraySlice idx, int* cell) { - EXPECT_EQ(*cell, 7); - }); + fullof7.Each( + [](absl::Span idx, int* cell) { EXPECT_EQ(*cell, 7); }); fullof7.Fill(11); - fullof7.Each([](tensorflow::gtl::ArraySlice idx, int* cell) { - EXPECT_EQ(*cell, 11); - }); + fullof7.Each( + [](absl::Span idx, int* cell) { EXPECT_EQ(*cell, 11); }); } TEST(Array4dTest, FillWithMultiples) { Array4D arr(2, 3, 4, 5); arr.FillWithMultiples(2.0f); - arr.Each([&arr](tensorflow::gtl::ArraySlice idx, float* cell) { + arr.Each([&arr](absl::Span idx, float* cell) { EXPECT_EQ(*cell, 2.0f * Array4DLinearIndex(arr, idx)); }); } diff --git a/tensorflow/compiler/xla/array_test.cc b/tensorflow/compiler/xla/array_test.cc index e8356c9832d34135f5ffb1a5c7a9d6db6db3a051..2d0ac98bd4ee27004295c4189cb190bb2c9739c9 100644 --- a/tensorflow/compiler/xla/array_test.cc +++ b/tensorflow/compiler/xla/array_test.cc @@ -163,7 +163,7 @@ TEST(ArrayTest, Each) { arr.FillWithMultiples(1); int64 each_count = 0, each_sum = 0; 
- arr.Each([&](tensorflow::gtl::ArraySlice idx, int cell) { + arr.Each([&](absl::Span idx, int cell) { int64 lin_idx = idx[0] * 12 + idx[1] * 4 + idx[2]; EXPECT_EQ(lin_idx, cell); each_count++; diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index aacb394ae5f92aa0d87ee3a23bcc3d4ec5cd99a3..f825f67b447514a416f3a49ac8aad9dcf505f5a7 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -45,6 +45,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -64,19 +65,21 @@ cc_library( hdrs = ["client.h"], deps = [ ":global_data", + ":xla_computation", "//tensorflow/compiler/xla:execution_options_util", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:service_interface", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -86,9 +89,13 @@ cc_library( hdrs = ["executable_build_options.h"], deps = [ "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", ], ) @@ -99,21 +106,23 @@ cc_library( deps = [ ":client", ":executable_build_options", + ":xla_computation", 
"//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:source_map_util", - "//tensorflow/core:lib", + "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", "@llvm//:support", ], ) @@ -124,14 +133,14 @@ cc_library( hdrs = ["compile_only_client.h"], deps = [ ":client", + ":xla_computation", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:compile_only_service", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", "@llvm//:support", ], ) @@ -156,6 +165,7 @@ cc_library( "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", ], ) @@ -172,3 +182,65 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", ], ) + +cc_library( + name = "xla_computation", + srcs = ["xla_computation.cc"], + hdrs = ["xla_computation.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + 
"//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_proto", + "@com_google_absl//absl/memory", + ], +) + +cc_library( + name = "xla_builder", + srcs = ["xla_builder.cc"], + hdrs = ["xla_builder.h"], + visibility = ["//visibility:public"], + deps = [ + ":padding", + ":sharding_builder", + ":xla_computation", + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "xla_builder_test", + srcs = ["xla_builder_test.cc"], + deps = [ + ":xla_builder", + ":xla_computation", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index 3d596a6e65430b6e9692aabd65fc8aa84b7b873d..8818f813127230d3b39d4b48d874b7cfb24b8abc 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -18,14 +18,15 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -88,7 +89,7 @@ StatusOr> Client::TransferToServer( "TransferToServer request"); } - return MakeUnique(stub_, response.data()); + return absl::make_unique(stub_, response.data()); } Status Client::TransferToInfeed(const LiteralSlice& literal, int64 replica_id, @@ -162,8 +163,7 @@ Status Client::ResetDevice() { } StatusOr> Client::ExecuteAndTransfer( - const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + const XlaComputation& computation, absl::Span arguments, const ExecutionOptions* execution_options, ExecutionProfile* execution_profile) { TF_ASSIGN_OR_RETURN( @@ -211,8 +211,7 @@ StatusOr Client::LoadSnapshot(const HloSnapshot& module) { } StatusOr> Client::Execute( - const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + const XlaComputation& computation, absl::Span arguments, const ExecutionOptions* execution_options, ExecutionProfile* execution_profile) { ExecuteGraphRequest request; @@ -247,11 +246,11 @@ StatusOr> Client::Execute( } } - return MakeUnique(stub_, response.output()); + return absl::make_unique(stub_, response.output()); } StatusOr>> Client::ExecuteParallel( - tensorflow::gtl::ArraySlice computations) { + absl::Span computations) { ExecuteGraphParallelRequest request; 
for (const XlaComputationInstance& computation : computations) { @@ -277,7 +276,7 @@ StatusOr>> Client::ExecuteParallel( std::vector> outputs; for (size_t i = 0; i < computations.size(); ++i) { outputs.push_back( - MakeUnique(stub_, response.responses(i).output())); + absl::make_unique(stub_, response.responses(i).output())); if (computations[i].execution_profile != nullptr) { *computations[i].execution_profile = response.responses(i).profile(); } @@ -339,7 +338,7 @@ StatusOr>> Client::DeconstructTuple( std::vector> handles; for (auto& handle : response.element_handles()) { - handles.push_back(MakeUnique(stub_, handle)); + handles.push_back(absl::make_unique(stub_, handle)); } return std::move(handles); } @@ -368,7 +367,7 @@ StatusOr Client::GetComputationStats( StatusOr> Client::GetComputationShape( const XlaComputation& computation) { TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape()); - return MakeUnique(result); + return absl::make_unique(result); } StatusOr Client::GetShape(const GlobalData& data) { @@ -399,7 +398,7 @@ StatusOr Client::ExecutionStatsAsString( int64 nanoseconds = profile.compute_time_ns(); int64 cycle_count = profile.compute_cycle_count(); double gflops = total_flops / nanoseconds; - return tensorflow::strings::StrCat( + return absl::StrCat( "[Execution Statistics] flop count: ", computation_stats.flop_count(), ", transcendental count: ", computation_stats.transcendental_count(), ", compute execution time: ", nanoseconds, " nsec", @@ -409,8 +408,10 @@ StatusOr Client::ExecutionStatsAsString( return string("[Execution Statistics] not available."); } -StatusOr Client::CreateChannelHandle() { +StatusOr Client::CreateChannelHandleByType( + ChannelHandle::ChannelType type) { CreateChannelHandleRequest request; + request.set_channel_type(type); CreateChannelHandleResponse response; VLOG(1) << "making create channel handle request"; @@ -424,4 +425,16 @@ StatusOr Client::CreateChannelHandle() { return response.channel(); } +StatusOr 
Client::CreateChannelHandle() { + return CreateChannelHandleByType(ChannelHandle::DEVICE_TO_DEVICE); +} + +StatusOr Client::CreateHostToDeviceChannelHandle() { + return CreateChannelHandleByType(ChannelHandle::HOST_TO_DEVICE); +} + +StatusOr Client::CreateDeviceToHostChannelHandle() { + return CreateChannelHandleByType(ChannelHandle::DEVICE_TO_HOST); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index cda8a71f718ed0681a1d2e076f51bfd9bff80fc8..7960b078686e611a6439af495d266f9084992d29 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -19,16 +19,16 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service_interface.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -53,7 +53,7 @@ class Client { // will be filled with profile data from the execution. StatusOr> Execute( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutionOptions* execution_options = nullptr, ExecutionProfile* execution_profile = nullptr); @@ -82,7 +82,7 @@ class Client { // from each computation. // StatusOr>> ExecuteParallel( - tensorflow::gtl::ArraySlice computations); + absl::Span computations); // Requests device_count device handles available on the target. 
The returned // device handles are used to specify the devices to execute the computations @@ -134,7 +134,7 @@ class Client { // Execute() and Transfer(). StatusOr> ExecuteAndTransfer( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutionOptions* execution_options = nullptr, ExecutionProfile* execution_profile = nullptr); @@ -153,8 +153,6 @@ class Client { // // If output_layout is non-null, then the output of the computation will be // stored using that layout. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> ComputeConstant( const XlaComputation& computation, const Layout* output_layout = nullptr) const; @@ -180,10 +178,15 @@ class Client { StatusOr> GetComputationShape( const XlaComputation& computation); - // Creates a channel handle that can be used to transfer data between - // two computations via a pair of Send and Recv instructions. + // Creates a channel handle that can be used to transfer data between two + // computations on different devices via a pair of Send and Recv instructions. StatusOr CreateChannelHandle(); + // Create a channel for communicating with the host via a SendtoHost or + // RecvFromHost operation. + StatusOr CreateHostToDeviceChannelHandle(); + StatusOr CreateDeviceToHostChannelHandle(); + StatusOr LoadSnapshot(const HloSnapshot& module); ServiceInterface* stub() { return stub_; } @@ -194,6 +197,9 @@ class Client { StatusOr ExecutionStatsAsString(const XlaComputation& computation, const ExecutionProfile& profile); + StatusOr CreateChannelHandleByType( + ChannelHandle::ChannelType type); + ServiceInterface* stub_; // Stub that this client is connected on. 
TF_DISALLOW_COPY_AND_ASSIGN(Client); diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc index 803a9e40094391ba47ed27713f4538caf875c4f6..27b7fa7b29206affa9f9c2e4becd9e4ea66484ab 100644 --- a/tensorflow/compiler/xla/client/client_library.cc +++ b/tensorflow/compiler/xla/client/client_library.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -94,10 +95,10 @@ ClientLibrary::~ClientLibrary() = default; service_options.set_intra_op_parallelism_threads( options.intra_op_parallelism_threads()); - auto instance = MakeUnique(); + auto instance = absl::make_unique(); TF_ASSIGN_OR_RETURN(instance->service, LocalService::NewService(service_options)); - instance->client = MakeUnique(instance->service.get()); + instance->client = absl::make_unique(instance->service.get()); LocalClient* cl = instance->client.get(); client_library.local_instances_.insert( @@ -134,10 +135,11 @@ ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) { return it->second->client.get(); } - auto instance = MakeUnique(); + auto instance = absl::make_unique(); TF_ASSIGN_OR_RETURN(instance->service, CompileOnlyService::NewService(platform)); - instance->client = MakeUnique(instance->service.get()); + instance->client = + absl::make_unique(instance->service.get()); CompileOnlyClient* cl = instance->client.get(); client_library.compile_only_instances_.insert( diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc index dc69d2097ebe14ca0e14a39849d4fcae99024fdc..a6c58cb17571b63cd0f45d0d95376a02bc4a72e2 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.cc +++ 
b/tensorflow/compiler/xla/client/compile_only_client.cc @@ -15,16 +15,17 @@ limitations under the License. #include "tensorflow/compiler/xla/client/compile_only_client.h" +#include "absl/memory/memory.h" #include "llvm/ADT/Triple.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" namespace xla { StatusOr>> CompileOnlyClient::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options) { + const absl::Span computations, + const AotCompilationOptions& options, + std::unique_ptr* metadata) { std::vector service_instances; service_instances.reserve(computations.size()); for (const AotXlaComputationInstance& instance : computations) { @@ -36,10 +37,11 @@ CompileOnlyClient::CompileAheadOfTime( service_instance.argument_layouts = instance.argument_layouts; service_instance.result_layout = instance.result_layout; } - return compiler_service_->CompileAheadOfTime(service_instances, options); + return compiler_service_->CompileAheadOfTime(service_instances, options, + metadata); } -int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) { +int64 CompileOnlyClient::PointerSizeForTriple(absl::string_view triple) { llvm::Triple llvm_triple( llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size()))); if (llvm_triple.isArch64Bit()) { diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h index f9a7c31270c7a11175f47a537639a97d0c9211af..9e3ed23734941d98d622c38028cd44d48d3e620a 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.h +++ b/tensorflow/compiler/xla/client/compile_only_client.h @@ -17,7 +17,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/compile_only_service.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/statusor.h" @@ -46,16 +46,18 @@ class CompileOnlyClient : public Client { const Shape* result_layout; }; - // Compiles a list of xla computations for ahead-of-time execution. This is - // intended for use in static compilation. The |options| parameter describes - // the target for which the compiler should emit code. + // Compiles a list of xla computations for ahead-of-time execution. + // This is intended for use in static compilation. The |options| + // parameter describes the target for which the compiler should emit + // code. |metadata|, if provided, is populated during compilation. StatusOr>> CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options); + const absl::Span computations, + const AotCompilationOptions& options, + std::unique_ptr* metadata = nullptr); // Returns the size of a pointer in bytes for a given triple. - static int64 PointerSizeForTriple(tensorflow::StringPiece triple); + static int64 PointerSizeForTriple(absl::string_view triple); private: CompileOnlyService* compiler_service_; diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 6e3c5cb484b8f1ef053fa287a4d462aeb886e530..0f1745366b7c33e573aff2e66d85431b01488c49 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -15,8 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -59,10 +59,10 @@ string ExecutableBuildOptions::ToString() const { if (generate_hlo_graph_.has_value()) { generate_hlo_graph = generate_hlo_graph_.value(); } - return tensorflow::strings::Printf( + return absl::StrFormat( "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, " "generate_hlo_graph=%s}", - device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str()); + device_ordinal_, result_layout, generate_hlo_graph); } ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( @@ -71,29 +71,41 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( return *this; } -const tensorflow::gtl::optional& -ExecutableBuildOptions::generate_hlo_graph() const { +const absl::optional& ExecutableBuildOptions::generate_hlo_graph() + const { return generate_hlo_graph_; } ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to( - tensorflow::StringPiece dirpath) { - dump_optimized_hlo_proto_to_ = dirpath.ToString(); + absl::string_view dirpath) { + dump_optimized_hlo_proto_to_ = string(dirpath); return *this; } -const tensorflow::gtl::optional& +const absl::optional& ExecutableBuildOptions::dump_optimized_hlo_proto_to() const { return dump_optimized_hlo_proto_to_; } +ExecutableBuildOptions& +ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to( + absl::string_view dirpath) { + dump_unoptimized_hlo_proto_to_ = string(dirpath); + return *this; +} + +const absl::optional& +ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const { + return dump_unoptimized_hlo_proto_to_; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to( - tensorflow::StringPiece dirpath) { - dump_per_pass_hlo_proto_to_ = dirpath.ToString(); + absl::string_view dirpath) { + 
dump_per_pass_hlo_proto_to_ = string(dirpath); return *this; } -const tensorflow::gtl::optional& +const absl::optional& ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const { return dump_per_pass_hlo_proto_to_; } @@ -103,7 +115,7 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) { return *this; } -tensorflow::gtl::optional ExecutableBuildOptions::hlo_profile() const { +absl::optional ExecutableBuildOptions::hlo_profile() const { return hlo_profile_; } diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 11f10983606fe02b1edb11a260edde8e5f9a726f..93334db88bc24f2ffbf3c7a57ee45ef238286739 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -16,10 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/optional.h" namespace xla { @@ -56,39 +57,54 @@ class ExecutableBuildOptions { // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions). ExecutableBuildOptions& set_generate_hlo_graph(string regex); - const tensorflow::gtl::optional& generate_hlo_graph() const; + const absl::optional& generate_hlo_graph() const; // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO // protobuf to (as in DebugOptions). 
ExecutableBuildOptions& set_dump_optimized_hlo_proto_to( - tensorflow::StringPiece dirpath); - const tensorflow::gtl::optional& dump_optimized_hlo_proto_to() const; + absl::string_view dirpath); + const absl::optional& dump_optimized_hlo_proto_to() const; + + // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO + // protobuf to (as in DebugOptions). + ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to( + absl::string_view dirpath); + const absl::optional& dump_unoptimized_hlo_proto_to() const; // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs // to (as in DebugOptions). ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to( - tensorflow::StringPiece dirpath); - const tensorflow::gtl::optional& dump_per_pass_hlo_proto_to() const; + absl::string_view dirpath); + const absl::optional& dump_per_pass_hlo_proto_to() const; // If true, specifies that we should record an HLO profile during execution // and log it after execution (as in DebugOptions). If nullopt the default is // used. ExecutableBuildOptions& set_hlo_profile(bool enabled); - tensorflow::gtl::optional hlo_profile() const; + absl::optional hlo_profile() const; + + void add_disabled_hlo_pass(absl::string_view pass_name) { + disabled_hlo_passes_.push_back(std::string(pass_name)); + } + const absl::Span disabled_hlo_passes() const { + return disabled_hlo_passes_; + } // Returns a string representation of the build options, suitable for // debugging. 
string ToString() const; private: - tensorflow::gtl::optional hlo_profile_; + absl::optional hlo_profile_; int device_ordinal_ = -1; Shape result_layout_; bool result_layout_set_ = false; - tensorflow::gtl::optional generate_hlo_graph_; - tensorflow::gtl::optional dump_optimized_hlo_proto_to_; - tensorflow::gtl::optional dump_per_pass_hlo_proto_to_; + absl::optional generate_hlo_graph_; + absl::optional dump_optimized_hlo_proto_to_; + absl::optional dump_unoptimized_hlo_proto_to_; + absl::optional dump_per_pass_hlo_proto_to_; DeviceMemoryAllocator* device_allocator_ = nullptr; + std::vector disabled_hlo_passes_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index d49d959a6c8112d3701857a70cecb24701c7b6d9..a18c94c4e695a6cdcb9dcc60b64b617cecd276d8 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -13,28 +13,203 @@ filegroup( ]), ) +load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") +load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites") + +# Generate test_suites for all backends, named "${backend}_tests". 
+generate_backend_suites() + cc_library( name = "arithmetic", srcs = ["arithmetic.cc"], hdrs = ["arithmetic.h"], deps = [ + ":constants", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "constants", + srcs = ["constants.cc"], + hdrs = ["constants.h"], + deps = [ + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + ], +) + +xla_test( + name = "constants_test", + srcs = ["constants_test.cc"], + tags = ["enable_for_xla_interpreter"], + deps = [ + ":constants", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "conv_grad_size_util", + srcs = ["conv_grad_size_util.cc"], + hdrs = ["conv_grad_size_util.h"], + deps = [ + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/client:padding", "//tensorflow/core:lib", ], ) +cc_library( + name = "math", + srcs = ["math.cc"], + hdrs = ["math.h"], + deps = [ + ":constants", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/client:xla_builder", + ], +) + +xla_test( + name = "math_test", + srcs = ["math_test.cc"], + tags = ["enable_for_xla_interpreter"], + deps = [ + ":math", 
+ "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "numeric", + srcs = ["numeric.cc"], + hdrs = ["numeric.h"], + deps = [ + ":arithmetic", + ":constants", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "@com_google_absl//absl/types:span", + ], +) + +xla_test( + name = "numeric_test", + srcs = ["numeric_test.cc"], + tags = ["enable_for_xla_interpreter"], + deps = [ + ":numeric", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "pooling", + srcs = ["pooling.cc"], + hdrs = ["pooling.h"], + deps = [ + ":arithmetic", + ":constants", + ":conv_grad_size_util", + "//tensorflow/compiler/xla/client:xla_builder", + "@com_google_absl//absl/container:inlined_vector", + ], +) + +xla_test( + name = "pooling_test", + srcs = ["pooling_test.cc"], + deps = [ + ":pooling", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/container:inlined_vector", + ], +) + +cc_library( + name = "prng", + srcs = ["prng.cc"], + hdrs = ["prng.h"], + deps = [ + ":constants", + ":math", + ":numeric", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "sorting", + srcs = 
["sorting.cc"], + hdrs = ["sorting.h"], + deps = [ + ":numeric", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + ], +) + +xla_test( + name = "sorting_test", + srcs = ["sorting_test.cc"], + blacklisted_backends = [ + "cpu", + "gpu", + ], + tags = ["enable_for_xla_interpreter"], + deps = [ + ":sorting", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "testing", srcs = ["testing.cc"], hdrs = ["testing.h"], deps = [ "//tensorflow/compiler/xla:execution_options_util", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -42,9 +217,10 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index a1d34796ccfd86f2025eff0ecb51338eb6a9b1da..e86c10f030f3990d67e5a6638100640f73c82307 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -17,13 +17,14 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace { @@ -38,12 +39,12 @@ XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, b = builder->CreateSubBuilder(name); } else { b = builder->CreateSubBuilder( - tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type))); + absl::StrCat(name, "_", PrimitiveType_Name(type))); } const Shape scalar = ShapeUtil::MakeShape(type, {}); - auto lhs = b->Parameter(0, scalar, "lhs"); - auto rhs = b->Parameter(1, scalar, "rhs"); + auto lhs = Parameter(b.get(), 0, scalar, "lhs"); + auto rhs = Parameter(b.get(), 1, scalar, "rhs"); generator(b.get(), lhs, rhs); return b->BuildAndNoteError(); } @@ -55,7 +56,7 @@ XlaComputation CreateScalarAddComputation(PrimitiveType type, return CreateScalarComputation( "add", type, builder, [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Add(lhs, rhs); + return Add(lhs, rhs); }); } @@ -64,17 +65,15 @@ XlaComputation CreateScalarMultiplyComputation(PrimitiveType type, return CreateScalarComputation( "mul", type, builder, [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Mul(lhs, rhs); + return Mul(lhs, rhs); }); } XlaComputation CreateScalarGeComputation(PrimitiveType type, XlaBuilder* builder) { - return CreateScalarComputation( - "ge", type, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Ge(lhs, rhs); - }); + return 
CreateScalarComputation("ge", type, builder, + [](XlaBuilder* b, const XlaOp& lhs, + const XlaOp& rhs) { return Ge(lhs, rhs); }); } XlaComputation CreateScalarMaxComputation(PrimitiveType type, @@ -82,7 +81,7 @@ XlaComputation CreateScalarMaxComputation(PrimitiveType type, return CreateScalarComputation( "max", type, builder, [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Max(lhs, rhs); + return Max(lhs, rhs); }); } @@ -91,34 +90,37 @@ XlaComputation CreateScalarMinComputation(PrimitiveType type, return CreateScalarComputation( "min", type, builder, [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Min(lhs, rhs); + return Min(lhs, rhs); }); } -XlaComputation CreateScalarAndComputation(XlaBuilder* builder) { +XlaComputation CreateScalarAndComputation(PrimitiveType type, + XlaBuilder* builder) { return CreateScalarComputation( - "and", PRED, builder, + "and", type, builder, [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->And(lhs, rhs); + return And(lhs, rhs); }); } -XlaComputation CreateScalarOrComputation(XlaBuilder* builder) { - return CreateScalarComputation( - "or", PRED, builder, - [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) { - return b->Or(lhs, rhs); - }); +XlaComputation CreateScalarOrComputation(PrimitiveType type, + XlaBuilder* builder) { + return CreateScalarComputation("or", type, builder, + [](XlaBuilder* b, const XlaOp& lhs, + const XlaOp& rhs) { return Or(lhs, rhs); }); } -StatusOr Any(const XlaOp& predicates, XlaBuilder* builder) { - auto f = builder->ConstantR0(false); - XlaComputation logical_or = CreateScalarOrComputation(builder); - TF_ASSIGN_OR_RETURN(const Shape& predicates_shape, - builder->GetShape(predicates)); - std::vector all_dimensions(ShapeUtil::Rank(predicates_shape)); - std::iota(all_dimensions.begin(), all_dimensions.end(), 0); - return builder->Reduce(predicates, f, logical_or, all_dimensions); +XlaOp Any(XlaOp predicates) { + XlaBuilder* builder = 
predicates.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + auto f = ConstantR0(builder, false); + XlaComputation logical_or = CreateScalarOrComputation(PRED, builder); + TF_ASSIGN_OR_RETURN(const Shape& predicates_shape, + builder->GetShape(predicates)); + std::vector all_dimensions(ShapeUtil::Rank(predicates_shape)); + std::iota(all_dimensions.begin(), all_dimensions.end(), 0); + return Reduce(predicates, f, logical_or, all_dimensions); + }); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index 64b6b7d63353165e45bf12d35126a7eeef9e56e4..632e8cc8bc64fad236a0226c6e93079aadde7050 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -45,15 +45,17 @@ XlaComputation CreateScalarMinComputation(PrimitiveType type, XlaBuilder* builder); // Creates a scalar logical AND computation and returns it. -XlaComputation CreateScalarAndComputation(XlaBuilder* builder); +XlaComputation CreateScalarAndComputation(PrimitiveType type, + XlaBuilder* builder); // Creates a scalar logical OR computation and returns it. -XlaComputation CreateScalarOrComputation(XlaBuilder* builder); +XlaComputation CreateScalarOrComputation(PrimitiveType type, + XlaBuilder* builder); // Returns whether any predicate in "predicates" is set. // // Note: if predicates is zero-sized, Any() vacuously returns false. 
-StatusOr Any(const XlaOp& predicates, XlaBuilder* builder); +XlaOp Any(XlaOp predicates); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ada7b4a964ccf7ca400b937abbe425bef083468 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/constants.cc @@ -0,0 +1,103 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/constants.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +XlaOp Zero(XlaBuilder* builder, PrimitiveType type) { + return ConstantLiteral(builder, LiteralUtil::Zero(type)); +} + +XlaOp Zeros(XlaBuilder* builder, const Shape& shape) { + return Broadcast(Zero(builder, shape.element_type()), + AsInt64Slice(shape.dimensions())); +} + +XlaOp ZerosLike(XlaOp prototype) { + XlaBuilder* builder = prototype.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(prototype)); + return Zeros(builder, shape); + }); +} + +XlaOp One(XlaBuilder* builder, PrimitiveType type) { + return ConstantLiteral(builder, LiteralUtil::One(type)); +} + +XlaOp Epsilon(XlaBuilder* builder, PrimitiveType type) { + switch (type) { + case F16: + return ConstantR0( + builder, + static_cast(Eigen::NumTraits::epsilon())); + case BF16: + return ConstantR0(builder, bfloat16::epsilon()); + case F32: + return ConstantR0(builder, std::numeric_limits::epsilon()); + case F64: + return ConstantR0(builder, + std::numeric_limits::epsilon()); + default: + return builder->ReportError(InvalidArgument( + "Invalid type for Epsilon (%s).", PrimitiveType_Name(type))); + } +} + +XlaOp MinValue(XlaBuilder* builder, PrimitiveType type) { + return ConstantLiteral(builder, LiteralUtil::MinValue(type)); +} + +XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) { + switch (type) { + case F16: + return ConstantR0(builder, + Eigen::NumTraits::lowest()); + case BF16: + return ConstantR0(builder, bfloat16::lowest()); + case F32: + return ConstantR0(builder, -std::numeric_limits::max()); + case F64: + return ConstantR0(builder, -std::numeric_limits::max()); + default: + return MinValue(builder, type); + } +} + +XlaOp MaxValue(XlaBuilder* builder, 
PrimitiveType type) { + return ConstantLiteral(builder, LiteralUtil::MaxValue(type)); +} + +XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) { + switch (type) { + case F16: + return ConstantR0(builder, + Eigen::NumTraits::highest()); + case BF16: + return ConstantR0(builder, bfloat16::highest()); + case F32: + return ConstantR0(builder, std::numeric_limits::max()); + case F64: + return ConstantR0(builder, std::numeric_limits::max()); + default: + return MaxValue(builder, type); + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h new file mode 100644 index 0000000000000000000000000000000000000000..81624614c1e3599dfe116eb61d9e2edcd5230684 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/constants.h @@ -0,0 +1,124 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_ + +#include + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Returns scalar 'value' as a scalar of 'type'. 
Unlike ConstantR0, 'type' is +// determined at C++ run-time, rather than C++ compile-time. +// If 'value' is floating point but 'type' is not, or if 'value' is complex but +// 'type' is not, an error will be returned. This is to catch accidental +// truncation; in such cases, use an explicit cast. +template +XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) { + if (std::is_floating_point::value && + !(primitive_util::IsFloatingPointType(type) || + primitive_util::IsComplexType(type))) { + return builder->ReportError(InvalidArgument( + "Invalid cast from floating point type to %s in ConstantR0WithType.", + PrimitiveType_Name(type))); + } + if (std::is_same::value && + !primitive_util::IsComplexType(type)) { + return builder->ReportError(InvalidArgument( + "Invalid cast from complex type to %s in ConstantR0WithType.", + PrimitiveType_Name(type))); + } + switch (type) { + case F16: + return ConstantR0(builder, static_cast(value)); + case BF16: + return ConstantR0(builder, static_cast(value)); + case F32: + return ConstantR0(builder, static_cast(value)); + case F64: + return ConstantR0(builder, static_cast(value)); + case C64: + return ConstantR0(builder, static_cast(value)); + case U8: + return ConstantR0(builder, static_cast(value)); + case U32: + return ConstantR0(builder, static_cast(value)); + case U64: + return ConstantR0(builder, static_cast(value)); + case S8: + return ConstantR0(builder, static_cast(value)); + case S32: + return ConstantR0(builder, static_cast(value)); + case S64: + return ConstantR0(builder, static_cast(value)); + default: + return builder->ReportError( + InvalidArgument("Invalid type for ConstantR0WithType (%s).", + PrimitiveType_Name(type))); + } +} + +// Returns a scalar containing 'value' cast to the same run-time type as +// 'prototype'. +// If 'value' is floating point but 'prototype' is not, or if 'value' is complex +// 'prototype' is not, an error will be returned. 
+template +XlaOp ScalarLike(XlaOp prototype, T value) { + XlaBuilder* builder = prototype.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(prototype)); + return ConstantR0WithType(builder, shape.element_type(), value); + }); +} + +// Returns a scalar with value '0' of 'type'. +XlaOp Zero(XlaBuilder* builder, PrimitiveType type); + +// Returns a zero-filled tensor with shape `shape`. +XlaOp Zeros(XlaBuilder* builder, const Shape& shape); + +// Returns a zero-filled tensor with the same shape as `prototype`. +XlaOp ZerosLike(XlaOp prototype); + +// Returns a scalar with value '1' of 'type'. +XlaOp One(XlaBuilder* builder, PrimitiveType type); + +// Returns the machine epsilon for floating-point type `type`, i.e., +// the difference between 1.0 and the next representable value. +XlaOp Epsilon(XlaBuilder* builder, PrimitiveType type); + +// Returns the minimum representable finite or infinite value for 'type'. +// Returns '-inf' for floating-point types. +XlaOp MinValue(XlaBuilder* builder, PrimitiveType type); + +// Returns the minimum representable finite value for 'type'. For a floating +// point type, this is equal to -MaxFiniteValue(). +XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type); + +// Returns the maximum representable finite or infinite value for 'type'. +// Returns 'inf' for floating-point types. +XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type); + +// Returns the maximum representable finite value for 'type'. 
+XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_ diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4320f65c1f76d4d4c384110b39d6606773aaf01 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/constants_test.cc @@ -0,0 +1,159 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace { + +using ConstantsTest = ClientLibraryTestBase; + +using ::testing::HasSubstr; + +XLA_TEST_F(ConstantsTest, ConstantR0WithTypeS32) { + XlaBuilder builder(TestName()); + ConstantR0WithType(&builder, xla::S32, 4); + ComputeAndCompareR0(&builder, 4, {}); +} + +XLA_TEST_F(ConstantsTest, ConstantR0WithTypeS32DoesNotAcceptFloats) { + XlaBuilder builder(TestName()); + ConstantR0WithType(&builder, xla::S32, 4.5); + auto statusor = builder.Build(); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("Invalid cast")); +} + +XLA_TEST_F(ConstantsTest, ConstantR0WithTypeF32) { + XlaBuilder builder(TestName()); + ConstantR0WithType(&builder, xla::F32, -7); + ComputeAndCompareR0(&builder, -7, {}); + ConstantR0WithType(&builder, xla::F32, 0.5); + ComputeAndCompareR0(&builder, 0.5, {}); +} + +XLA_TEST_F(ConstantsTest, ScalarLikeS32) { + XlaBuilder builder(TestName()); + ScalarLike(ConstantR0(&builder, 42), -3); + ComputeAndCompareR0(&builder, -3, {}); +} + +XLA_TEST_F(ConstantsTest, ScalarLikeF32) { + XlaBuilder builder(TestName()); + ScalarLike(ConstantR0(&builder, 42.75), -3.2); + ComputeAndCompareR0(&builder, -3.2, {}); +} + +XLA_TEST_F(ConstantsTest, ZeroS32) { + XlaBuilder builder(TestName()); + Zero(&builder, S32); + ComputeAndCompareR0(&builder, 0, {}); +} + +XLA_TEST_F(ConstantsTest, ZeroF32) { + XlaBuilder builder(TestName()); + Zero(&builder, F32); + ComputeAndCompareR0(&builder, 0.0, {}); +} + +XLA_TEST_F(ConstantsTest, ZerosS32) { + XlaBuilder 
builder(TestName()); + Zeros(&builder, ShapeUtil::MakeShape(S32, {2, 2})); + ComputeAndCompareR2(&builder, {{0, 0}, {0, 0}}, {}); +} + +XLA_TEST_F(ConstantsTest, ZerosLikeF32) { + XlaBuilder builder(TestName()); + ZerosLike(ConstantR1(&builder, {1., 2., 3.})); + ComputeAndCompareR1(&builder, {0., 0., 0.}, {}); +} + +XLA_TEST_F(ConstantsTest, OneS32) { + XlaBuilder builder(TestName()); + One(&builder, S32); + ComputeAndCompareR0(&builder, 1, {}); +} + +XLA_TEST_F(ConstantsTest, OneF32) { + XlaBuilder builder(TestName()); + One(&builder, F32); + ComputeAndCompareR0(&builder, 1., {}); +} + +XLA_TEST_F(ConstantsTest, EpsilonF32) { + XlaBuilder builder(TestName()); + Epsilon(&builder, F32); + ComputeAndCompareR0(&builder, std::numeric_limits::epsilon(), + {}); +} + +XLA_TEST_F(ConstantsTest, MinFiniteValueS32) { + XlaBuilder builder(TestName()); + MinFiniteValue(&builder, S32); + ComputeAndCompareR0(&builder, std::numeric_limits::min(), {}); +} + +XLA_TEST_F(ConstantsTest, MaxFiniteValueS32) { + XlaBuilder builder(TestName()); + MaxFiniteValue(&builder, S32); + ComputeAndCompareR0(&builder, std::numeric_limits::max(), {}); +} + +XLA_TEST_F(ConstantsTest, MinFiniteValueF32) { + XlaBuilder builder(TestName()); + MinFiniteValue(&builder, F32); + ComputeAndCompareR0(&builder, -std::numeric_limits::max(), {}); +} + +XLA_TEST_F(ConstantsTest, MaxFiniteValueF32) { + XlaBuilder builder(TestName()); + MaxFiniteValue(&builder, F32); + ComputeAndCompareR0(&builder, std::numeric_limits::max(), {}); +} + +XLA_TEST_F(ConstantsTest, MinValueS32) { + XlaBuilder builder(TestName()); + MinValue(&builder, S32); + ComputeAndCompareR0(&builder, std::numeric_limits::min(), {}); +} + +XLA_TEST_F(ConstantsTest, MaxValueS32) { + XlaBuilder builder(TestName()); + MaxValue(&builder, S32); + ComputeAndCompareR0(&builder, std::numeric_limits::max(), {}); +} + +XLA_TEST_F(ConstantsTest, MinValueF32) { + XlaBuilder builder(TestName()); + MinValue(&builder, F32); + ComputeAndCompareR0(&builder, 
-std::numeric_limits::infinity(), + {}); +} + +XLA_TEST_F(ConstantsTest, MaxValueF32) { + XlaBuilder builder(TestName()); + MaxValue(&builder, F32); + ComputeAndCompareR0(&builder, std::numeric_limits::infinity(), + {}); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4c50a5491803bc62d2de758177f8f5d050f441d --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc @@ -0,0 +1,96 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { + +namespace { + +StatusOr GetWindowedOutputSize( + int64 input_size, int64 filter_size, int64 dilation_rate, int64 stride, + Padding padding_type) { + if (stride <= 0) { + return tensorflow::errors::InvalidArgument("Stride must be > 0, but got ", + stride); + } + if (dilation_rate < 1) { + return tensorflow::errors::InvalidArgument( + "Dilation rate must be >= 1, but got ", dilation_rate); + } + + int64 effective_filter_size = (filter_size - 1) * dilation_rate + 1; + SpatialDimensionOutputSizeAndPadding dim; + switch (padding_type) { + case Padding::kValid: + dim.output_size = (input_size - effective_filter_size + stride) / stride; + dim.pad_before = dim.pad_after = 0; + break; + case Padding::kSame: + dim.output_size = (input_size + stride - 1) / stride; + const int64 padding_needed = + std::max(int64{0}, (dim.output_size - 1) * stride + + effective_filter_size - input_size); + // For odd values of total padding, add more padding on the "after" side + // of the given dimension. 
+ dim.pad_before = padding_needed / 2; + dim.pad_after = padding_needed - dim.pad_before; + break; + } + if (dim.output_size < 0) { + return tensorflow::errors::InvalidArgument( + "Computed output size would be negative: ", dim.output_size, + " [input_size: ", input_size, + ", effective_filter_size: ", effective_filter_size, + ", stride: ", stride, "]"); + } + return dim; +} + +} // namespace + +StatusOr +ConvGradExtractAndVerifyDimension(int64 input_size, int64 filter_size, + int64 output_size, int64 dilation, + int64 stride, Padding padding) { + TF_ASSIGN_OR_RETURN(SpatialDimensionOutputSizeAndPadding output_dim, + GetWindowedOutputSize(input_size, filter_size, dilation, + stride, padding)); + if (output_size != output_dim.output_size) { + return tensorflow::errors::InvalidArgument( + "Size of out_backprop doesn't match computed: ", "actual = ", + output_size, ", computed = ", output_dim.output_size, + " input: ", input_size, " filter: ", filter_size, + " output: ", output_size, " stride: ", stride, " dilation: ", dilation); + } + + SpatialDimensionOutputSizeAndPadding dim; + int64 effective_filter_size = (filter_size - 1) * dilation + 1; + dim.output_size = (output_dim.output_size - 1) * stride + 1; + const auto padded_out_size = input_size + effective_filter_size - 1; + dim.pad_before = effective_filter_size - 1 - output_dim.pad_before; + dim.pad_after = padded_out_size - dim.output_size - dim.pad_before; + VLOG(2) << "expanded_out = " << dim.output_size + << ", effective_filter_size = " << effective_filter_size + << ", padded_out = " << padded_out_size + << ", pad_before = " << dim.pad_before + << ", pad_after = " << dim.pad_after << ", dilation = " << dilation + << ", strides = " << stride; + return dim; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h new file mode 100644 index 
0000000000000000000000000000000000000000..0ad01728e6e828240b9ac4b948777e5d970d09e0 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h @@ -0,0 +1,44 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_ + +#include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Information about a single spatial dimension for a convolution gradients and +// windowed operations. +struct SpatialDimensionOutputSizeAndPadding { + // Effective size of the operation output (potentially expanded). + int64 output_size; + // Number of padding elements to be added before/after this dimension of + // the input when computing the input gradient. + int64 pad_before; + int64 pad_after; +}; + +// Verifies that the dimensions all match, and computes the size and padding of +// a spatial dimension for convolution gradient operations. 
+StatusOr +ConvGradExtractAndVerifyDimension(int64 input_size, int64 filter_size, + int64 output_size, int64 dilation, + int64 stride, Padding padding); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_ diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc new file mode 100644 index 0000000000000000000000000000000000000000..d3d7edb42a38595bbf9fdb36e0dd946ae5df51f9 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -0,0 +1,307 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/math.h" + +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { + +XlaOp Sqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, 0.5)); } + +XlaOp Rsqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, -0.5)); } + +XlaOp Square(XlaOp operand) { return operand * operand; } + +XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; } + +namespace { + +// Polynomials for computing erf/erfc. Originally from cephes. +// Note we use float for compatibility across devices, at the cost of some +// precision for 64 bit computations. 
+// +// Coefficients are in descending order. +std::array kErfcPCoefficient = { + 2.46196981473530512524E-10, 5.64189564831068821977E-1, + 7.46321056442269912687E0, 4.86371970985681366614E1, + 1.96520832956077098242E2, 5.26445194995477358631E2, + 9.34528527171957607540E2, 1.02755188689515710272E3, + 5.57535335369399327526E2}; +std::array kErfcQCoefficient = { + 1.00000000000000000000E0, 1.32281951154744992508E1, + 8.67072140885989742329E1, 3.54937778887819891062E2, + 9.75708501743205489753E2, 1.82390916687909736289E3, + 2.24633760818710981792E3, 1.65666309194161350182E3, + 5.57535340817727675546E2}; +std::array kErfcRCoefficient = { + 5.64189583547755073984E-1, 1.27536670759978104416E0, + 5.01905042251180477414E0, 6.16021097993053585195E0, + 7.40974269950448939160E0, 2.97886665372100240670E0}; +std::array kErfcSCoefficient = { + 1.00000000000000000000E0, 2.26052863220117276590E0, + 9.39603524938001434673E0, 1.20489539808096656605E1, + 1.70814450747565897222E1, 9.60896809063285878198E0, + 3.36907645100081516050E0}; +std::array kErfTCoefficient = { + 9.60497373987051638749E0, 9.00260197203842689217E1, + 2.23200534594684319226E3, 7.00332514112805075473E3, + 5.55923013010394962768E4}; +std::array kErfUCoefficient = { + 1.00000000000000000000E0, 3.35617141647503099647E1, + 5.21357949780152679795E2, 4.59432382970980127987E3, + 2.26290000613890934246E4, 4.92673942608635921086E4}; +} // namespace + +// Evaluate the polynomial given coefficients and `x`. +// N.B. Coefficients should be supplied in decreasing order. +XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients) { + XlaOp poly = ScalarLike(x, 0.0); + for (float c : coefficients) { + poly = poly * x + ScalarLike(x, c); + } + return poly; +} + +// Compute an approximation of the error function complement (1 - erf(x)). 
+XlaOp Erfc(XlaOp x) { + XlaOp abs_x = Abs(x); + XlaOp z = Exp(-x * x); + + XlaOp pp = EvaluatePolynomial(abs_x, kErfcPCoefficient); + XlaOp pq = EvaluatePolynomial(abs_x, kErfcQCoefficient); + XlaOp pr = EvaluatePolynomial(abs_x, kErfcRCoefficient); + XlaOp ps = EvaluatePolynomial(abs_x, kErfcSCoefficient); + + XlaOp y = Select(Lt(abs_x, ScalarLike(x, 8.0)), z * pp / pq, z * pr / ps); + + return Select(Lt(x, ScalarLike(x, 0.0)), ScalarLike(x, 2.0) - y, y); +} + +// Compute a polynomial approximation of the error function. +XlaOp Erf(XlaOp x) { + XlaOp z = x * x; + XlaOp pt = EvaluatePolynomial(z, kErfTCoefficient); + XlaOp pu = EvaluatePolynomial(z, kErfUCoefficient); + return x * pt / pu; +} + +// Approximation for the inverse error function from +// Giles, M., "Approximating the erfinv function". +// The approximation has the form: +// w = -log((1 - x) * (1 + x)) +// if ( w < 5 ) { +// w = w - 2.5 +// p = sum_{i=1}^n lq[i]*w^i +// } else { +// w = sqrt(w) - 3 +// p = sum_{i=1}^n gq[i]*w^i +// } +// return p*x +XlaOp ErfInv(XlaOp x) { + XlaBuilder* b = x.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, b->GetShape(x)); + constexpr int kDegree = 9; + constexpr std::array w_less_than_5_constants = { + 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, + -4.39150654e-06f, 0.00021858087f, -0.00125372503f, + -0.00417768164f, 0.246640727f, 1.50140941f}; + constexpr std::array w_greater_than_5_constants = { + -0.000200214257f, 0.000100950558f, 0.00134934322f, + -0.00367342844f, 0.00573950773f, -0.0076224613f, + 0.00943887047f, 1.00167406f, 2.83297682f}; + + auto one = ScalarLike(x, 1.0); + auto w = -Log((one - x) * (one + x)); + + auto lt = Lt(w, ScalarLike(x, 5.0)); + auto coefficient = [&](int i) { + return Select(lt, + Broadcast(ScalarLike(x, w_less_than_5_constants[i]), + AsInt64Slice(shape.dimensions())), + Broadcast(ScalarLike(x, w_greater_than_5_constants[i]), + AsInt64Slice(shape.dimensions()))); + }; + w = 
Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0)); + auto p = coefficient(0); + for (int i = 1; i < kDegree; ++i) { + p = coefficient(i) + p * w; + } + return p * x; + }); +} + +namespace { +// Coefficients for the Lanczos approximation of the gamma function. The +// coefficients are uniquely determined by the choice of g and n (kLanczosGamma +// and kLanczosCoefficients.size() + 1). The coefficients below correspond to +// [7, 9]. [5, 7], [7, 9], [9, 10], and [607/128.0, 15] were evaluated and [7, +// 9] seemed to be the least sensitive to the quality of the log function. In +// particular, [5, 7] is the only choice where -1.5e-5 <= lgamma(2) <= 1.5e-5 +// for a particularly inaccurate log function. +static constexpr double kLanczosGamma = 7; // aka g +static constexpr double kBaseLanczosCoeff = 0.99999999999980993227684700473478; +static constexpr std::array kLanczosCoefficients = { + 676.520368121885098567009190444019, -1259.13921672240287047156078755283, + 771.3234287776530788486528258894, -176.61502916214059906584551354, + 12.507343278686904814458936853, -0.13857109526572011689554707, + 9.984369578019570859563e-6, 1.50563273514931155834e-7}; +} // namespace + +// Compute the Lgamma function using Lanczos' approximation from "A Precision +// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis +// series B. Vol. 
1: +// lgamma(z + 1) = (log(2) + log(pi)) / 2 + (z + 1/2) * log(t(z)) - t(z) + A(z) +// t(z) = z + kLanczosGamma + 1/2 +// A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k)) +XlaOp Lgamma(XlaOp input) { + XlaOp one_half = ScalarLike(input, 0.5); + XlaOp one = ScalarLike(input, 1); + + XlaOp pi = ScalarLike(input, M_PI); + XlaOp log_pi = ScalarLike(input, std::log(M_PI)); + XlaOp log_sqrt_two_pi = ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2); + + XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5); + XlaOp log_lanczos_gamma_plus_one_half = + ScalarLike(input, std::log(kLanczosGamma + 0.5)); + + XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff); + + // If the input is less than 0.5 use Gauss's reflection formula: + // gamma(x) = pi / sin(pi * x) * gamma(1 - x) + XlaOp need_to_reflect = Lt(Real(input), one_half); + XlaOp z = Select(need_to_reflect, -input, input - one); + + XlaOp x = base_lanczos_coeff; + for (int i = 0; i < kLanczosCoefficients.size(); ++i) { + XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]); + XlaOp index = ScalarLike(input, i); + x = x + lanczos_coefficient / (z + index + one); + } + + // To improve accuracy on platforms with less-precise log implementations, + // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on + // the device. + // log(t) = log(kLanczosGamma + 0.5 + z) + // = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5)) + XlaOp t = lanczos_gamma_plus_one_half + z; + XlaOp log_t = + log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half); + + XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x); + + // If z = a + 0j, the analytic continuation of log reduces to taking the + // absolute value of the real part. 
+ // Re(log(z)) = Re(log|z| + arg(z)j) + // = log|a| + XlaOp reflection = log_pi - Log(Abs(Sin(pi * input))) - log_y; + XlaOp result = Select(need_to_reflect, reflection, log_y); + return result; +} + +// Compute the Digamma function using Lanczos' approximation from "A Precision +// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis +// series B. Vol. 1: +// digamma(z + 1) = log(t(z)) + A'(z) / A(z) - kLanczosGamma / t(z) +// t(z) = z + kLanczosGamma + 1/2 +// A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k)) +// A'(z) = sigma(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k)) +XlaOp Digamma(XlaOp input) { + XlaOp zero = ScalarLike(input, 0); + XlaOp one_half = ScalarLike(input, 0.5); + XlaOp one = ScalarLike(input, 1); + + XlaOp pi = ScalarLike(input, M_PI); + + XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma); + XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5); + XlaOp log_lanczos_gamma_plus_one_half = + ScalarLike(input, std::log(kLanczosGamma + 0.5)); + + XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff); + + // If the input is less than 0.5 use Gauss's reflection formula: + // digamma(x) = digamma(1 - x) - pi * cot(pi * x) + XlaOp need_to_reflect = Lt(Real(input), one_half); + XlaOp z = Select(need_to_reflect, -input, input - one); + + XlaOp num = zero; + XlaOp denom = base_lanczos_coeff; + for (int i = 0; i < kLanczosCoefficients.size(); ++i) { + XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]); + XlaOp index = ScalarLike(input, i); + num = num - lanczos_coefficient / ((z + index + one) * (z + index + one)); + denom = denom + lanczos_coefficient / (z + index + one); + } + + // To improve accuracy on platforms with less-precise log implementations, + // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on + // the device. 
+ // log(t) = log(kLanczosGamma + 0.5 + z) + // = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5)) + XlaOp t = lanczos_gamma_plus_one_half + z; + XlaOp log_t = + log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half); + + XlaOp y = log_t + num / denom - lanczos_gamma / t; + XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input); + XlaOp result = Select(need_to_reflect, reflection, y); + return result; +} + +// Trigonometric functions. + +// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) +XlaOp Acos(XlaOp x) { + return ScalarLike(x, 2.0) * + Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), ScalarLike(x, 1.0) + x); +} + +// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) +XlaOp Asin(XlaOp x) { + return ScalarLike(x, 2.0) * + Atan2(x, ScalarLike(x, 1.0) + Sqrt(ScalarLike(x, 1.0) - x * x)); +} + +XlaOp Atan(XlaOp x) { return Atan2(x, ScalarLike(x, 1.0)); } + +XlaOp Tan(XlaOp x) { return Sin(x) / Cos(x); } + +// Hyperbolic trigonometric functions. + +// acosh(x) = log(x + sqrt(x^2 - 1)) +// = log(x + sqrt((x+1)*(x-1))) +XlaOp Acosh(XlaOp x) { + return Log(x + Sqrt((x + ScalarLike(x, 1.0)) * (x - ScalarLike(x, 1.0)))); +} + +// asinh(x) = log(x + sqrt(x^2 + 1)) +XlaOp Asinh(XlaOp x) { return Log(x + Sqrt(x * x + ScalarLike(x, 1.0))); } + +// atanh(x) = 0.5 * log((1 + x) / (1 - x)) +XlaOp Atanh(XlaOp x) { + return Log((ScalarLike(x, 1.0) + x) / (ScalarLike(x, 1.0) - x)) * + ScalarLike(x, 0.5); +} + +XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); } + +XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); } + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h new file mode 100644 index 0000000000000000000000000000000000000000..a6cafd42077367bf23ffa1f45eab31c01dc31b16 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -0,0 +1,87 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" + +namespace xla { + +// Computes the square root of 'operand'. +XlaOp Sqrt(XlaOp operand); + +// Computes the reciprocal of the square root of 'operand'. +XlaOp Rsqrt(XlaOp operand); + +// Computes the square of 'operand'. +XlaOp Square(XlaOp operand); + +// Computes the reciprocal of 'operand'. +XlaOp Reciprocal(XlaOp operand); + +// Evaluates a polynomial given coefficients and `x`. +// N.B. Coefficients should be supplied in decreasing order. +XlaOp EvaluatePolynomial(XlaOp x, absl::Span coefficients); + +// Computes an approximation of the error function complement (1 - erf(x)). +XlaOp Erfc(XlaOp x); + +// Computes an approximation of the error function. +XlaOp Erf(XlaOp x); + +// Computes an approximation of the inverse of the error function. +XlaOp ErfInv(XlaOp x); + +// Computes an approximation of the lgamma function. +XlaOp Lgamma(XlaOp input); + +// Computes an approximation of the digamma function. +XlaOp Digamma(XlaOp input); + +// Trigonometric functions + +// Computes the arc cosine of 'x'. +XlaOp Acos(XlaOp x); + +// Computes the arc sine of 'x'. +XlaOp Asin(XlaOp x); + +// Computes the arc tangent of 'x'. +XlaOp Atan(XlaOp x); + +// Computes the tangent of 'x'. 
+XlaOp Tan(XlaOp x); + +// Hyperbolic trigonometric functions + +// Computes the inverse hyperbolic cosine of 'x'. +XlaOp Acosh(XlaOp x); + +// Computes the inverse hyperbolic sine of 'x'. +XlaOp Asinh(XlaOp x); + +// Computes the inverse hyperbolic tangent of 'x'. +XlaOp Atanh(XlaOp x); + +// Computes the hyperbolic cosine of 'x'. +XlaOp Cosh(XlaOp x); + +// Computes the hyperbolic sine of 'x'. +XlaOp Sinh(XlaOp x); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_ diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..14c259a7fa2a47642663b65d2785e5bbdc040cfd --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -0,0 +1,140 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace { + +class MathTest : public ClientLibraryTestBase { + public: + ErrorSpec error_spec_{0.0001}; +}; + +XLA_TEST_F(MathTest, SqrtF32) { + XlaBuilder builder(TestName()); + Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F32); + + std::unique_ptr zero_data = + client_->TransferToServer(zero_literal).ConsumeValueOrDie(); + + XlaOp zero = Parameter(&builder, 0, zero_literal.shape(), "zero"); + Sqrt(zero); + + ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); +} + +XLA_TEST_F(MathTest, SquareTenValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Square(x); + + std::vector expected = {4.41, 6.76, 6.76, 16., 4.41, + 5.29, 25., 0.81, 5.76, 2.56}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, ReciprocalTenValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Reciprocal(x); + + std::vector expected = { + 0.47619048, -0.38461538, 0.38461538, -0.25, 0.47619048, + 0.43478261, -0.2, -1.11111111, -0.41666667, 0.625}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, SqrtZeroes) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {0.0, -0.0}); + Sqrt(x); + + ComputeAndCompareR1(&builder, {0, 0}, {}, error_spec_); +} + +XLA_TEST_F(MathTest, SqrtSixValues) { + 
XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {16.0, 1.0, 1024.0, 0.16, 0.2, 12345}); + Sqrt(x); + + std::vector expected = {4, 1, 32, 0.4, 0.4472, 111.1080}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, Lgamma) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.5, 1.5, + 2.5, -1.5, -3.5, -5.5}); + Lgamma(x); + + std::vector expected = { + 0, + 0, + static_cast(std::log(2)), + static_cast(std::log(6)), + static_cast(std::log(24)), + static_cast(std::log(120)), + static_cast(std::log(M_PI) / 2), + static_cast(std::log(M_PI) / 2 - std::log(2)), + static_cast(std::log(M_PI) / 2 - std::log(4) + std::log(3)), + static_cast(std::log(M_PI) / 2 - std::log(3) + std::log(4)), + static_cast(std::log(M_PI) / 2 - std::log(105) + std::log(16)), + static_cast(std::log(M_PI) / 2 - std::log(10395) + std::log(64))}; + error_spec_ = ErrorSpec{0.001}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, Digamma) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {1.0, 0.5, 1 / 3.0, 0.25, 1 / 6.0, 0.125, + 2.0, 3.0, 4.0, 6.0, 8.0, 9.0}); + Digamma(x); + + constexpr double euler_mascheroni = + 0.57721566490153286060651209008240243104215933593992; + std::vector expected = { + static_cast(-euler_mascheroni), + static_cast(-2 * std::log(2) - euler_mascheroni), + static_cast(-M_PI / 2 / std::sqrt(3) - 3 * std::log(3) / 2 - + euler_mascheroni), + static_cast(-M_PI / 2 - 3 * std::log(2) - euler_mascheroni), + static_cast(-M_PI * std::sqrt(3) / 2 - 2 * std::log(2) - + 3 * std::log(3) / 2 - euler_mascheroni), + static_cast( + -M_PI / 2 - 4 * std::log(2) - + (M_PI + std::log(2 + std::sqrt(2)) - std::log(2 - std::sqrt(2))) / + std::sqrt(2) - + euler_mascheroni), + static_cast(1 - euler_mascheroni), + static_cast(1.5 - euler_mascheroni), + static_cast(11 / 6.0 - euler_mascheroni), + static_cast(137 / 60.0 - euler_mascheroni), + 
static_cast(363 / 140.0 - euler_mascheroni), + static_cast(761 / 280.0 - euler_mascheroni)}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc new file mode 100644 index 0000000000000000000000000000000000000000..377654220b5df4487e9e194361473d54ff46a54e --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/numeric.cc @@ -0,0 +1,89 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include
+#include
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/numeric.h"
+
+namespace xla {
+
+// Builds an m x n matrix of `type` whose (i, j) entry is 1 when i == j and 0
+// otherwise.
+//
+// The row/column indices are generated as U32, like the masks in
+// GetMatrixDiagonal and Triangle below, and only the final equality mask is
+// converted to `type`. Generating the indices directly in `type` would make
+// the comparison depend on `type` being able to represent every index value
+// exactly.
+XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
+                     int64 n) {
+  auto row_index = Iota(builder, U32, m);
+  auto col_index = Iota(builder, U32, n);
+  // Broadcast(col_index, {m}) has shape [m, n]; row_index is matched against
+  // dimension 0 of it, so indicator[i][j] == (i == j).
+  auto indicator =
+      Eq(row_index, Broadcast(col_index, {m}), /*broadcast_dimensions=*/{0});
+  return ConvertElementType(indicator, type);
+}
+
+// Extracts the diagonal of the two minor-most dimensions of `x`.
+//
+// Reports an error through the builder if the shape of `x` cannot be obtained
+// or if `x` has rank below 2.
+XlaOp GetMatrixDiagonal(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    // indicator[i][j] == (i == j); it is then replicated over the major
+    // dimensions so that it lines up with `x`.
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    auto mask = Broadcast(indicator, major_dims);
+
+    // TPUs don't support S64 add reduction at the moment. But fortunately
+    // OR-reductions work just as well for integers.
+    XlaComputation reducer =
+        primitive_util::IsIntegralType(shape.element_type())
+            ? CreateScalarOrComputation(shape.element_type(), builder)
+            : CreateScalarAddComputation(shape.element_type(), builder);
+
+    // Off-diagonal entries are zeroed, then the larger of the two minor
+    // dimensions is reduced away, leaving min(m, n) diagonal entries.
+    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+  });
+}
+
+// Keeps the lower (`lower` == true) or upper (`lower` == false) triangle of
+// the two minor-most dimensions of `x`, diagonal included in both cases, and
+// replaces every other element with zero.
+//
+// Reports an error through the builder if the shape of `x` cannot be obtained
+// or if `x` has rank below 2.
+XlaOp Triangle(XlaOp x, bool lower) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    // indicator[i][j] compares row index i against column index j.
+    xla::XlaOp indicator;
+    if (lower) {
+      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    } else {
+      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    }
+    auto mask = Broadcast(indicator, major_dims);
+
+    return Select(mask, x, Zeros(builder, shape));
+  });
+}
+
+// Convenience wrapper: Triangle(x, /*lower=*/false).
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+// Convenience wrapper: Triangle(x, /*lower=*/true).
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h
new file mode 100644
index 0000000000000000000000000000000000000000..efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/numeric.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...]. +XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size); + +// Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere +// else. +XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n); + +// Get the diagonals of the last two dimensions. If 'x' has shape +// [..., M, N], then the output has shape [..., min(M, N)], containing the +// diagonal elements (i.e., with indices [..., i, i]). +XlaOp GetMatrixDiagonal(XlaOp x); + +// Get the upper or lower triangle part of the last two dimensions +XlaOp Triangle(XlaOp x, bool lower); + +// Get the upper triangle part of the last two dimensions +XlaOp UpperTriangle(XlaOp x); + +// Get the lower triangle part of the last two dimensions +XlaOp LowerTriangle(XlaOp x); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_ diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/numeric_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d6aedd49462bd4f075f90d0b0f85c40f1191aa1 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/numeric_test.cc @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace { + +class NumericTest : public ClientLibraryTestBase { + protected: + template + void TestMatrixDiagonal(); +}; + +XLA_TEST_F(NumericTest, Triangle) { + XlaBuilder builder(TestName()); + Array3D input(2, 3, 4); + input.FillIota(0); + + XlaOp a; + auto a_data = CreateR3Parameter(input, 0, "a", &builder, &a); + LowerTriangle(a); + Array3D expected({{{0, 0, 0, 0}, {4, 5, 0, 0}, {8, 9, 10, 0}}, + {{12, 0, 0, 0}, {16, 17, 0, 0}, {20, 21, 22, 0}}}); + + ComputeAndCompareR3(&builder, expected, {a_data.get()}); +} + +template +void NumericTest::TestMatrixDiagonal() { + XlaBuilder builder("GetMatrixDiagonal"); + Array3D input(2, 3, 4); + input.FillIota(0); + + XlaOp a; + auto a_data = CreateR3Parameter(input, 0, "a", &builder, &a); + GetMatrixDiagonal(a); + Array2D expected({{0, 5, 10}, {12, 17, 22}}); + + ComputeAndCompareR2(&builder, expected, {a_data.get()}); +} + +XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal(); } + +XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal(); } + +XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal(); } + +} // 
namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/pooling.cc b/tensorflow/compiler/xla/client/lib/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..1979c867a4c3be438f8b997c566799fe84b43053 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling.cc @@ -0,0 +1,289 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" + +namespace xla { + +namespace { + +// Common computation shared between AvgPool and AvgPoolGrad. Divide each +// element of an image by the count of elements that contributed to that +// element during pooling. +XlaOp AvgPoolDivideByCountWithGeneralPadding( + XlaOp sums, PrimitiveType dtype, absl::Span input_shape, + absl::Span> spatial_padding, + absl::Span ksize, absl::Span stride, + const TensorFormat& data_format) { + // The padding shouldn't be included in the counts. We use another + // ReduceWindow to find the right counts. 
+ const int num_spatial_dims = spatial_padding.size(); + + std::vector input_dim_sizes(num_spatial_dims); + std::vector window_dims(num_spatial_dims); + std::vector window_ksize(num_spatial_dims); + std::vector window_stride(num_spatial_dims); + CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims) + << "Invalid number of spatial dimentions in data format specification"; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + input_dim_sizes[i] = input_shape[dim]; + window_dims[i] = dim; + window_ksize[i] = ksize[dim]; + window_stride[i] = stride[dim]; + } + + XlaBuilder* b = sums.builder(); + // Build a matrix of all 1s, with the same width/height as the input. + auto ones = Broadcast(One(b, dtype), input_dim_sizes); + PaddingConfig padding_config; + for (int i = 0; i < num_spatial_dims; ++i) { + auto dims = padding_config.add_dimensions(); + dims->set_edge_padding_low(spatial_padding[i].first); + dims->set_edge_padding_high(spatial_padding[i].second); + } + auto zero = Zero(b, dtype); + auto padded_ones = Pad(ones, zero, padding_config); + + // Perform a ReduceWindow with the same window size, strides, and padding + // to count the number of contributions to each result element. + auto counts = + ReduceWindow(padded_ones, zero, CreateScalarAddComputation(dtype, b), + window_ksize, window_stride, Padding::kValid); + + return Div(sums, counts, window_dims); +} + +// Sums all elements in the window specified by 'kernel_size' and 'stride'. 
+XlaOp ComputeSums(XlaOp operand, XlaOp init_value,
+                  absl::Span kernel_size,
+                  absl::Span stride,
+                  const TensorFormat& data_format) {
+  // NOTE: `data_format` is not read here; the window is applied exactly as
+  // described by `kernel_size` and `stride`, with no implicit padding.
+  XlaBuilder* b = operand.builder();
+  return b->ReportErrorOrReturn([&]() -> StatusOr {
+    // `operand_shape` is not used below; fetching it still surfaces an error
+    // early when the shape of `operand` is unavailable.
+    TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand));
+    TF_ASSIGN_OR_RETURN(Shape init_shape, b->GetShape(init_value));
+    // The sum is accumulated in the element type of `init_value`.
+    PrimitiveType accumulation_type = init_shape.element_type();
+    auto add_computation = CreateScalarAddComputation(accumulation_type, b);
+    return ReduceWindow(operand, init_value, add_computation, kernel_size,
+                        stride, Padding::kValid);
+  });
+}
+
+// Creates a padding configuration out of spatial padding values.
+//
+// The result has 2 + `num_spatial_dims` dimensions. Only the dimensions that
+// `data_format` names as spatial receive the low/high edge padding taken from
+// `spatial_padding`; the remaining dimensions are left at their defaults.
+// `stride` is not read. CHECK-fails if `data_format` does not describe exactly
+// `num_spatial_dims` spatial dimensions.
+PaddingConfig MakeSpatialPaddingConfig(
+    absl::Span> spatial_padding,
+    int num_spatial_dims, absl::Span stride,
+    const TensorFormat& data_format) {
+  PaddingConfig padding_config;
+  for (int i = 0; i < 2 + num_spatial_dims; ++i) {
+    padding_config.add_dimensions();
+  }
+  CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims)
+      << "Invalid number of spatial dimensions in data format specification";
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    int dim = data_format.spatial_dimension(i);
+    auto padding_dimension = padding_config.mutable_dimensions(dim);
+    padding_dimension->set_edge_padding_low(spatial_padding[i].first);
+    padding_dimension->set_edge_padding_high(spatial_padding[i].second);
+  }
+  return padding_config;
+}
+
+// Turns window sums (`pooled`) into window averages.
+//
+// When `counts_include_padding` is true every window is divided by the full
+// window volume; otherwise the per-window element counts are computed by
+// AvgPoolDivideByCountWithGeneralPadding so that padding is not counted.
+XlaOp AvgPoolDivideByCount(XlaOp pooled, absl::Span input_size,
+                           absl::Span window_dimensions,
+                           absl::Span window_strides,
+                           absl::Span> padding,
+                           PrimitiveType dtype, const TensorFormat& data_format,
+                           bool counts_include_padding) {
+  if (counts_include_padding) {
+    // If counts include padding, all windows have the same number of elements
+    // contributing to each average. Divide by the window size everywhere to get
+    // the average.
+    //
+    // The product is accumulated directly in an int64. Seeding
+    // std::accumulate with the int literal `1` would carry the running
+    // product in an `int` and truncate it on every step.
+    int64 window_size = 1;
+    for (const int64 window_dimension : window_dimensions) {
+      window_size *= window_dimension;
+    }
+    auto divisor = ConstantR0WithType(pooled.builder(), dtype, window_size);
+
+    return pooled / divisor;
+  } else {
+    return AvgPoolDivideByCountWithGeneralPadding(pooled, dtype, input_size,
+                                                  padding, window_dimensions,
+                                                  window_strides, data_format);
+  }
+}
+
+}  // namespace
+
+// Max-pools `operand` with a ReduceWindow whose reduction is a scalar max and
+// whose initial value is MinValue for the operand's element type.
+// `data_format` is not read by this function.
+XlaOp MaxPool(XlaOp operand, absl::Span kernel_size,
+              absl::Span stride, Padding padding,
+              const TensorFormat& data_format) {
+  XlaBuilder* b = operand.builder();
+  return b->ReportErrorOrReturn([&]() -> StatusOr {
+    TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand));
+    PrimitiveType dtype = operand_shape.element_type();
+    auto max_computation = CreateScalarMaxComputation(dtype, b);
+    auto init_value = MinValue(b, dtype);
+    return ReduceWindow(operand, init_value, max_computation, kernel_size,
+                        stride, padding);
+  });
+}
+
+// Average-pools `operand`: zero-pads the spatial dimensions as requested by
+// `padding`, sums every window, then divides each sum by its element count
+// (see AvgPoolDivideByCount for the meaning of `counts_include_padding`).
+//
+// The number of spatial dimensions is taken to be `kernel_size.size() - 2`.
+XlaOp AvgPool(XlaOp operand, absl::Span kernel_size,
+              absl::Span stride,
+              absl::Span> padding,
+              const TensorFormat& data_format,
+              const bool counts_include_padding) {
+  XlaBuilder* b = operand.builder();
+  return b->ReportErrorOrReturn([&]() -> StatusOr {
+    TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand));
+    PrimitiveType dtype = operand_shape.element_type();
+    auto init_value = Zero(b, dtype);
+    // Unpadded input extents; needed later to count the real (non-padding)
+    // elements per window.
+    std::vector input_size(operand_shape.dimensions().begin(),
+                           operand_shape.dimensions().end());
+    const int num_dims = kernel_size.size();
+    const int num_spatial_dims = num_dims - 2;
+    auto padding_config = MakeSpatialPaddingConfig(padding, num_spatial_dims,
+                                                   stride, data_format);
+    auto padded_operand = Pad(operand, Zero(b, dtype), padding_config);
+    auto pooled = ComputeSums(padded_operand, init_value, kernel_size, stride,
+                              data_format);
+    return AvgPoolDivideByCount(pooled, input_size, kernel_size, stride,
+                                padding, dtype, data_format,
+                                counts_include_padding);
+  });
+}
+
+std::vector> 
MakeSpatialPadding( + absl::Span input_size, absl::Span kernel_size, + absl::Span stride, Padding padding, + const TensorFormat& data_format) { + const int num_spatial_dims = kernel_size.size() - 2; + std::vector input_spatial_dimensions; + std::vector kernel_size_spatial_dimensions; + std::vector stride_spatial_dimensions; + CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims) + << "Invalid number of spatial dimentions in data format specification"; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + input_spatial_dimensions.push_back(input_size[dim]); + kernel_size_spatial_dimensions.push_back(kernel_size[dim]); + stride_spatial_dimensions.push_back(stride[dim]); + } + return MakePadding(input_spatial_dimensions, kernel_size_spatial_dimensions, + stride_spatial_dimensions, padding); +} + +XlaOp AvgPoolGrad(XlaOp out_backprop, absl::Span gradients_size, + absl::Span kernel_size, + absl::Span stride, + absl::Span> spatial_padding, + const TensorFormat& data_format, + const bool counts_include_padding) { + XlaBuilder* b = out_backprop.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + const int num_dims = kernel_size.size(); + + if (gradients_size.size() != num_dims) { + return tensorflow::errors::InvalidArgument("gradients must be ", num_dims, + "-dimensional"); + } + + TF_ASSIGN_OR_RETURN(Shape out_backprop_xla_shape, + b->GetShape(out_backprop)); + if (out_backprop_xla_shape.dimensions().size() != num_dims) { + return tensorflow::errors::InvalidArgument("out_backprop must be ", + num_dims, "-dimensional"); + } + + // We can think of average-pooling as: + // * a convolution with a kernel consisting entirely of 1s, where the + // input feature and output feature are equal, and 0s everywhere else. + // * followed by dividing by the counts. 
+ // + // This then gives us an algorithm to build the gradient: + // * divide out_backprop by the counts, followed by + // * Conv2DBackpropInput specialized for that kernel, which simplifies to + // a Pad and a ReduceWindow. + // + // For an explanation of backpropagation for convolution, see the comments + // in third_party/tensorflow/core/kernels/conv_grad_ops.h + + // TF filter shape is [ H, W, ..., inC, outC ] + + // The input gradients are computed by a convolution of the output gradients + // and the filter, with some appropriate padding. See the comment at the top + // of conv_grad_ops.h for details. + PrimitiveType dtype = out_backprop_xla_shape.element_type(); + auto out_backprop_div = AvgPoolDivideByCount( + out_backprop, gradients_size, kernel_size, stride, spatial_padding, + dtype, data_format, counts_include_padding); + + // Pad the gradients in the spatial dimensions. We use the same padding + // as Conv2DBackpropInput. + PaddingConfig padding_config = MakeNoPaddingConfig(num_dims); + std::vector padded_gradients_size(gradients_size.begin(), + gradients_size.end()); + // First, pad the output gradients the same way as the input. The additional + // padding will be removed as a last step before returning the input + // gradients. 
+ const int num_spatial_dims = num_dims - 2; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + padded_gradients_size[dim] += + (spatial_padding[i].first + spatial_padding[i].second); + } + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + TF_ASSIGN_OR_RETURN( + SpatialDimensionOutputSizeAndPadding conv_backprop_spatial_dim, + ConvGradExtractAndVerifyDimension( + /*input_size=*/padded_gradients_size[dim], + /*filter_size=*/kernel_size[dim], + /*output_size=*/out_backprop_xla_shape.dimensions(dim), + /*dilation=*/1, + /*stride=*/stride[dim], /*padding=*/Padding::kValid)); + auto* padding = padding_config.mutable_dimensions(dim); + padding->set_edge_padding_low(conv_backprop_spatial_dim.pad_before); + padding->set_edge_padding_high(conv_backprop_spatial_dim.pad_after); + padding->set_interior_padding(stride[dim] - 1); + } + + auto zero = Zero(b, dtype); + auto padded_gradients = Pad(out_backprop_div, zero, padding_config); + + // in_backprop = padded_gradients ones + std::vector ones(num_dims, 1LL); + auto in_backprop = + ReduceWindow(padded_gradients, Zero(b, dtype), + CreateScalarAddComputation(dtype, b), kernel_size, + /*window_strides=*/ones, Padding::kValid); + // The input padding doesn't contribute to the gradient, remove it. 
+ std::vector> neg_spatial_padding; + neg_spatial_padding.reserve(spatial_padding.size()); + for (const std::pair& spatial_padding_dim : spatial_padding) { + neg_spatial_padding.emplace_back(-spatial_padding_dim.first, + -spatial_padding_dim.second); + } + auto remove_padding_config = MakeSpatialPaddingConfig( + neg_spatial_padding, num_spatial_dims, stride, data_format); + return Pad(in_backprop, zero, remove_padding_config); + }); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/pooling.h b/tensorflow/compiler/xla/client/lib/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..5c0054857d072dc7f36e259a29b9b24fd70796ac --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling.h @@ -0,0 +1,80 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ + +#include "absl/container/inlined_vector.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" + +namespace xla { + +// Tensor format for reduce window operations. 
+class TensorFormat { + public: + TensorFormat(int batch_dimension, int feature_dimension, + absl::Span spatial_dimensions) + : batch_dimension_(batch_dimension), + feature_dimension_(feature_dimension), + spatial_dimensions_(spatial_dimensions.begin(), + spatial_dimensions.end()) {} + + int batch_dimension() const { return batch_dimension_; } + + int feature_dimension() const { return feature_dimension_; } + + int spatial_dimension(int dim) const { return spatial_dimensions_[dim]; } + + int num_spatial_dims() const { return spatial_dimensions_.size(); } + + private: + // The number of the dimension that represents the batch. + int batch_dimension_; + // The number of the dimension that represents the features. + int feature_dimension_; + // The dimension numbers for the spatial dimensions. + absl::InlinedVector spatial_dimensions_; +}; + +// Computes the max pool of 'operand'. +XlaOp MaxPool(XlaOp operand, absl::Span kernel_size, + absl::Span stride, Padding padding, + const TensorFormat& data_format); + +// Computes the average pool of 'operand'. +XlaOp AvgPool(XlaOp operand, absl::Span kernel_size, + absl::Span stride, + absl::Span> padding, + const TensorFormat& data_format, + const bool counts_include_padding); + +// Returns the list of low and high padding elements in each spatial dimension +// for the given 'padding' specification. +std::vector> MakeSpatialPadding( + absl::Span input_size, absl::Span kernel_size, + absl::Span stride, Padding padding, + const TensorFormat& data_format); + +// Computes the average pool gradient. 
+XlaOp AvgPoolGrad(XlaOp out_backprop, absl::Span gradients_size, + absl::Span kernel_size, + absl::Span stride, + absl::Span> spatial_padding, + const TensorFormat& data_format, + const bool counts_include_padding); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ diff --git a/tensorflow/compiler/xla/client/lib/pooling_test.cc b/tensorflow/compiler/xla/client/lib/pooling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..30adb9b1ad7fa03b40ce3802a2172680b60a9ad7 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling_test.cc @@ -0,0 +1,290 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include "absl/container/inlined_vector.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +TensorFormat MakeNCHWFormat(int num_spatial_dims) { + absl::InlinedVector spatial_dimensions; + for (int i = 0; i < num_spatial_dims; ++i) { + spatial_dimensions.push_back(i + 2); + } + return TensorFormat(/*batch_dimension=*/0, /*feature_dimension=*/1, + /*spatial_dimensions=*/spatial_dimensions); +} + +std::vector> MakeGeneralPadding( + XlaOp input, absl::Span kernel_size, + absl::Span stride, Padding padding, + const xla::TensorFormat& data_format) { + XlaBuilder* b = input.builder(); + Shape operand_shape = b->GetShape(input).ValueOrDie(); + std::vector input_size(operand_shape.dimensions().begin(), + operand_shape.dimensions().end()); + return MakeSpatialPadding(input_size, kernel_size, stride, padding, + data_format); +} + +// Add singleton batch and feature dimensions to spatial dimensions, according +// to 'data_format' specification. 
+std::vector ExpandWithBatchAndFeatureDimensions( + absl::Span spatial_dim_sizes, + const xla::TensorFormat& data_format) { + const int num_spatial_dims = spatial_dim_sizes.size(); + std::vector tensor_sizes(num_spatial_dims + 2, 1); + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + tensor_sizes[dim] = spatial_dim_sizes[i]; + } + return tensor_sizes; +} + +class PoolingTest : public ClientLibraryTestBase { + public: + ErrorSpec error_spec_{0.0001}; +}; + +XLA_TEST_F(PoolingTest, MaxPool2D) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + MaxPool(input, kernel_size, stride, Padding::kValid, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, MaxPool2DWithPadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + MaxPool(input, kernel_size, stride, Padding::kSame, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4, 5}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, MaxPool2DWithPaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + MaxPool(input, kernel_size, stride, Padding::kSame, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4, 4, 5, 5}, {5, 4, 3, 2, 1}}}}, + {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, 
AvgPool2D) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kValid, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/true); + + ComputeAndCompareR4(&builder, {{{{3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithPadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kSame, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{3, 3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithPaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kSame, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, + {{{{3, 3, 3, 3, 3}, {4.5, 3.5, 2.5, 1.5, 1}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithGeneralPaddingCountNotIncludePadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 
2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({3, 3}, data_format); + auto stride = kernel_size; + AvgPool(input, kernel_size, stride, {{1, 1}, {2, 1}}, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, + AvgPool2DWithGeneralPaddingCountNotIncludePaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({3, 3}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + AvgPool(input, kernel_size, stride, {{2, 1}, {1, 1}}, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{1.5, 3, 4.5}, {3, 3, 3}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DGradNoPadding) { + XlaBuilder builder(TestName()); + for (bool counts_include_padding : {false, true}) { + XlaOp out_backprop = ConstantR4FromArray4D(&builder, {{{{1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, + {{0, 0}, {0, 0}}, MakeNCHWFormat(2), + /*counts_include_padding=*/counts_include_padding); + // Without padding, counts_include_padding makes no difference. 
+ ComputeAndCompareR4( + &builder, {{{{0.25, 0.25, 0.}, {0.25, 0.25, 0.}, {0., 0., 0.}}}}, {}, + error_spec_); + } +} + +XLA_TEST_F(PoolingTest, AvgPool2DGradNoPaddingWithStride) { + XlaBuilder builder(TestName()); + for (bool counts_include_padding : {false, true}) { + XlaOp out_backprop = + ConstantR4FromArray4D(&builder, {{{{1., 1.}, {1., 1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, + {{0, 0}, {0, 0}}, MakeNCHWFormat(2), + /*counts_include_padding=*/counts_include_padding); + // Without padding, counts_include_padding makes no difference. + ComputeAndCompareR4( + &builder, {{{{0.25, 0.5, 0.25}, {0.5, 1., 0.5}, {0.25, 0.5, 0.25}}}}, + {}, error_spec_); + } +} + +XLA_TEST_F(PoolingTest, AvgPool2DGradWithPadding) { + XlaBuilder builder(TestName()); + + XlaOp out_backprop = + ConstantR4FromArray4D(&builder, {{{{1., 1.}, {1., 1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}}, + MakeNCHWFormat(2), + /*counts_include_padding=*/true); + ComputeAndCompareR4( + &builder, + {{{{0.25, 0.25, 0.25}, {0.25, 0.25, 0.25}, {0.25, 0.25, 0.25}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DGradWithPaddingCountNotIncludePadding) { + XlaBuilder builder(TestName()); + + XlaOp out_backprop = + ConstantR4FromArray4D(&builder, {{{{1., 1.}, {1., 1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}}, + 
MakeNCHWFormat(2), false); + ComputeAndCompareR4( + &builder, {{{{1., 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DGradWithPaddingCountWithStride) { + XlaBuilder builder(TestName()); + + XlaOp out_backprop = + ConstantR4FromArray4D(&builder, {{{{1., 1., 1., 1.}, + {1., 1., 1., 1.}, + {1., 1., 1., 1.}, + {1., 1., 1., 1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}}, + MakeNCHWFormat(2), true); + ComputeAndCompareR4(&builder, + {{{{1., 1., 1.}, {1., 1., 1.}, {1., 1., 1.}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, + AvgPool2DGradWithPaddingCountWithStrideNotIncludePadding) { + XlaBuilder builder(TestName()); + + XlaOp out_backprop = + ConstantR4FromArray4D(&builder, {{{{1., 1., 1., 1.}, + {1., 1., 1., 1.}, + {1., 1., 1., 1.}, + {1., 1., 1., 1.}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}}, + MakeNCHWFormat(2), false); + ComputeAndCompareR4( + &builder, {{{{2.25, 1.5, 2.25}, {1.5, 1., 1.5}, {2.25, 1.5, 2.25}}}}, {}, + error_spec_); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ef81689489d8117d5951bcb75693c2e3413e4d6 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -0,0 +1,150 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/casts.h" + +namespace xla { +namespace { + +// Rotates a 32-bit integer 'v' left by 'distance' bits. +XlaOp RotateLeftS32(XlaOp v, int distance) { + return (v << ConstantR0(v.builder(), distance)) | + ShiftRightLogical(v, ConstantR0(v.builder(), 32 - distance)); +} + +using ThreeFry2x32State = std::array; + +// Implements the ThreeFry counter-based PRNG algorithm. +// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf +ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) { + XlaBuilder* builder = input[0].builder(); + // Rotation distances specified by the Threefry2x32 algorithm. + constexpr std::array rotations = {13, 15, 26, 6, 17, 29, 16, 24}; + ThreeFry2x32State x; + + std::array ks; + // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm. 
+ ks[2] = ConstantR0(builder, 0x1BD11BDA); + for (int i = 0; i < 2; ++i) { + ks[i] = key[i]; + x[i] = input[i]; + ks[2] = ks[2] ^ key[i]; + } + + x[0] = x[0] + ks[0]; + x[1] = x[1] + ks[1]; + + // Performs a single round of the Threefry2x32 algorithm, with a rotation + // amount 'rotation'. + auto round = [](ThreeFry2x32State v, int rotation) { + v[0] = v[0] + v[1]; + v[1] = RotateLeftS32(v[1], rotation); + v[1] = v[0] ^ v[1]; + return v; + }; + + // There are no known statistical flaws with 13 rounds of Threefry2x32. + // We are conservative and use 20 rounds. + x = round(x, rotations[0]); + x = round(x, rotations[1]); + x = round(x, rotations[2]); + x = round(x, rotations[3]); + x[0] = x[0] + ks[1]; + x[1] = x[1] + ks[2] + ConstantR0(builder, 1); + + x = round(x, rotations[4]); + x = round(x, rotations[5]); + x = round(x, rotations[6]); + x = round(x, rotations[7]); + x[0] = x[0] + ks[2]; + x[1] = x[1] + ks[0] + ConstantR0(builder, 2); + + x = round(x, rotations[0]); + x = round(x, rotations[1]); + x = round(x, rotations[2]); + x = round(x, rotations[3]); + x[0] = x[0] + ks[0]; + x[1] = x[1] + ks[1] + ConstantR0(builder, 3); + + x = round(x, rotations[4]); + x = round(x, rotations[5]); + x = round(x, rotations[6]); + x = round(x, rotations[7]); + x[0] = x[0] + ks[1]; + x[1] = x[1] + ks[2] + ConstantR0(builder, 4); + + x = round(x, rotations[0]); + x = round(x, rotations[1]); + x = round(x, rotations[2]); + x = round(x, rotations[3]); + x[0] = x[0] + ks[2]; + x[1] = x[1] + ks[0] + ConstantR0(builder, 5); + + return x; +} + +} // namespace + +XlaOp StatelessRngUniform(std::array seeds, const Shape& shape, + XlaOp minval, XlaOp maxval) { + XlaBuilder* builder = seeds[0].builder(); + if (shape.element_type() != F32) { + return builder->ReportError(Unimplemented( + "Types other than F32 are not implemented by StatelessRngUniform.")); + } + ThreeFry2x32State key = seeds; + const int64 size = ShapeUtil::ElementsIn(shape); + + const int64 half_size = CeilOfRatio(size, 
2); + const bool size_is_odd = (half_size * 2 != size); + + // Fill the generator inputs with unique counter values. + ThreeFry2x32State inputs; + inputs[0] = Iota(builder, S32, half_size); + inputs[1] = inputs[0] + ConstantR0(builder, half_size); + ThreeFry2x32State outputs = ThreeFry2x32(inputs, key); + + if (size_is_odd) { + outputs[1] = Slice(outputs[1], {0}, {half_size - 1}, {1}); + } + + auto bits = Reshape(ConcatInDim(builder, outputs, 0), + AsInt64Slice(shape.dimensions())); + + // Form 23 random mantissa bits, with a leading 1 bit. The leading 1 bit + // forces the random bits into the mantissa. + constexpr int kFloatBits = 32; + constexpr int kMantissaBits = 23; + bits = ShiftRightLogical( + bits, ConstantR0(builder, kFloatBits - kMantissaBits)) | + ConstantR0(builder, tensorflow::bit_cast(1.0f)); + auto floats = BitcastConvertType(bits, F32); + + // We have a floating point number in the range [1.0, 2.0). + // Subtract 1.0f to shift to the range [0.0, 1.0) + floats = floats - ConstantR0(builder, 1.0f); + // Multiply and add to shift to the range [minval, maxval). + return floats * (maxval - minval) + minval; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h new file mode 100644 index 0000000000000000000000000000000000000000..ad000b1fa1d0655c8fccc0bb33379f2499b77f26 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/prng.h @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ + +#include + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Returns a tensor containing 'shape' random values uniformly distributed in +// the range [minval, maxval). Requires 2 32-bit integer seeds. +// Currently only 'shape's of type F32 are implemented. +XlaOp StatelessRngUniform(std::array seeds, const Shape& shape, + XlaOp minval, XlaOp maxval); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc new file mode 100644 index 0000000000000000000000000000000000000000..a904be259a3870a679b2c4699ec01e2a11b1ce46 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/lib/numeric.h" + +namespace xla { + +XlaOp TopK(XlaOp input, int64 k) { + XlaBuilder* const builder = input.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); + int last_dim = input_shape.dimensions_size() - 1; + int last_dim_size = input_shape.dimensions(last_dim); + + XlaOp iota_s32 = Iota(builder, S32, last_dim_size); + auto input_dims = input_shape.dimensions(); + std::vector broadcast_dims(input_dims.begin(), input_dims.end() - 1); + XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims); + XlaOp sort_result = Sort(Neg(input), broadcast_s32); + std::vector start_indices(input_shape.dimensions_size(), 0); + std::vector limit_indices(input_dims.begin(), input_dims.end()); + limit_indices[last_dim] = k; + std::vector strides(input_shape.dimensions_size(), 1); + + XlaOp values = Neg(Slice(GetTupleElement(sort_result, 0), start_indices, + limit_indices, strides)); + XlaOp indices = Slice(GetTupleElement(sort_result, 1), start_indices, + limit_indices, strides); + return Tuple(builder, {values, indices}); + }); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/sorting.h b/tensorflow/compiler/xla/client/lib/sorting.h new file mode 100644 index 0000000000000000000000000000000000000000..b9dfafdd6f957ae050e0f5dbd076d5288235b490 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/sorting.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Returns a tuple composed of the top `k` values and corresponding indices in +// `input`. Output values are in descending order, from largest to smallest. +XlaOp TopK(XlaOp input, int64 k); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_ diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fef98c9923096e21a755c6d730de2c7c10852b2d --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -0,0 +1,60 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace { + +using SortingTest = ClientLibraryTestBase; + +XLA_TEST_F(SortingTest, TopK3From8Values) { + XlaBuilder builder(TestName()); + auto x = + ConstantR1(&builder, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); + xla::GetTupleElement(xla::TopK(x, 3), 0); + ComputeAndCompareR1(&builder, {7.0, 6.0, 5.0}, {}); +} + +XLA_TEST_F(SortingTest, TopK3From8Indices) { + XlaBuilder builder(TestName()); + auto x_rev = + ConstantR1(&builder, {7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0}); + xla::GetTupleElement(xla::TopK(x_rev, 3), 1); + ComputeAndCompareR1(&builder, {0, 1, 2}, {}); +} + +XLA_TEST_F(SortingTest, TopKFullSort) { + XlaBuilder builder(TestName()); + const int kSize = 16; + std::mt19937 eng; + std::uniform_real_distribution u_dist(0.0, 100.0); + auto gen = std::bind(u_dist, eng); + std::vector inputs(kSize); + std::generate(inputs.begin(), inputs.end(), gen); + auto x = ConstantR1(&builder, inputs); + xla::GetTupleElement(xla::TopK(x, kSize), 0); + + std::sort(inputs.begin(), inputs.end(), std::greater()); + ComputeAndCompareR1(&builder, inputs, {}); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc index 3380af9f303b1dc2cec09aa37410ec40cdeaa526..6861521acc0db1d640666a6793b898a183ab6a17 100644 --- a/tensorflow/compiler/xla/client/lib/testing.cc +++ b/tensorflow/compiler/xla/client/lib/testing.cc @@ -15,15 +15,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/testing.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -48,21 +48,20 @@ int64 DataSizeOfShape(const Shape& shape) { // Creates a XlaOp for an op what generates fake data with the given shape. XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) { if (ShapeUtil::IsArray(shape)) { - return builder->Broadcast( - builder->ConstantLiteral(Literal::One(shape.element_type())), + return Broadcast( + ConstantLiteral(builder, LiteralUtil::One(shape.element_type())), AsInt64Slice(shape.dimensions())); } std::vector parts; for (const Shape& s : shape.tuple_shapes()) { parts.push_back(BuildFakeDataOpOnDevice(s, builder)); } - return builder->Tuple(parts); + return Tuple(builder, parts); } std::unique_ptr MakeFakeDataViaDeviceOrDie(const Shape& shape, Client* client) { - XlaBuilder b( - tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape))); + XlaBuilder b(absl::StrCat("make_fake_", ShapeUtil::HumanString(shape))); BuildFakeDataOpOnDevice(shape, &b); XlaComputation computation = b.Build().ConsumeValueOrDie(); @@ -98,14 +97,13 @@ std::vector> MakeFakeArgumentsOrDie( << "Computation should have progran shape."; auto program_shape = computation.proto().program_shape(); - // For every (unbound) parameter that the computation wants, we manufacture - // some arbitrary 
data so that we can invoke the computation. - std::vector> fake_arguments; - for (const Shape& parameter : program_shape.parameters()) { - fake_arguments.push_back(MakeFakeDataOrDie(parameter, client)); - } - - return fake_arguments; + // Create and run a program which produces a tuple with one element per + // parameter, then return the tuple's constituent buffers. + std::vector param_shapes(program_shape.parameters().begin(), + program_shape.parameters().end()); + auto fake_input_tuple = + MakeFakeDataOrDie(ShapeUtil::MakeTupleShape(param_shapes), client); + return client->DeconstructTuple(*fake_input_tuple).ValueOrDie(); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h index dc613099e2b42a60d0c11a654ab5cd41f8bd4f6f..03695ce2a339735e3e49522f4fe1bbf2d83a3834 100644 --- a/tensorflow/compiler/xla/client/lib/testing.h +++ b/tensorflow/compiler/xla/client/lib/testing.h @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index a7c55c6b2b7fe2b5541ce71bf3eaa24114522fc5..4402ba8762c1538951c326c880fc3b6dd63ef0c6 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -17,11 +17,13 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" #include "llvm/ADT/Triple.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/service/source_map_util.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/status_macros.h" using xla::source_map_util::InvalidParameterArgument; @@ -29,8 +31,8 @@ using xla::source_map_util::InvalidParameterArgument; namespace xla { namespace { -StatusOr BorrowStreamForDevice(int device_ordinal, - Backend* backend) { +StatusOr BorrowStreamForDevice(int device_ordinal, + Backend* backend) { if (device_ordinal < 0) { device_ordinal = backend->default_device_ordinal(); } @@ -49,26 +51,19 @@ LocalExecutable::LocalExecutable(std::unique_ptr executable, } Status LocalExecutable::ValidateExecutionOptions( - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, const ExecutableRunOptions& run_options, const Backend& backend) { - const ComputationLayout& host_computation_layout = - executable_->module_config().host_entry_computation_layout(); - const ComputationLayout& device_computation_layout = - executable_->module_config().device_entry_computation_layout(); + const ComputationLayout& computation_layout = + executable_->module_config().entry_computation_layout(); // Check argument number, shapes, and layouts. 
- if (arguments.size() != host_computation_layout.parameter_count()) { + if (arguments.size() != computation_layout.parameter_count()) { return InvalidArgument( - "invalid number of arguments for computation: expected %d, got %zu", - host_computation_layout.parameter_count(), arguments.size()); - } - if (arguments.size() != device_computation_layout.parameter_count()) { - return InvalidArgument( - "invalid number of arguments for computation: expected %d, got %zu", - device_computation_layout.parameter_count(), arguments.size()); + "invalid number of arguments for computation: expected %d, got %u", + computation_layout.parameter_count(), arguments.size()); } for (int i = 0; i < arguments.size(); ++i) { - if (!host_computation_layout.parameter_layout(i).MatchesLayoutInShape( + if (!computation_layout.parameter_layout(i).MatchesLayoutInShape( arguments[i]->on_host_shape())) { return InvalidParameterArgument( executable_.get(), i, @@ -77,22 +72,8 @@ Status LocalExecutable::ValidateExecutionOptions( "%d: want %s, got %s", i, ShapeUtil::HumanString( - host_computation_layout.parameter_layout(i).shape()) - .c_str(), - ShapeUtil::HumanString(arguments[i]->on_host_shape()).c_str()); - } - if (!device_computation_layout.parameter_layout(i).MatchesLayoutInShape( - arguments[i]->on_device_shape())) { - return InvalidParameterArgument( - executable_.get(), i, - "Argument does not match device shape or layout of computation " - "parameter " - "%d: want %s, got %s", - i, - ShapeUtil::HumanString( - device_computation_layout.parameter_layout(i).shape()) - .c_str(), - ShapeUtil::HumanString(arguments[i]->on_device_shape()).c_str()); + computation_layout.parameter_layout(i).shape()), + ShapeUtil::HumanString(arguments[i]->on_host_shape())); } } @@ -107,8 +88,7 @@ Status LocalExecutable::ValidateExecutionOptions( if (stream_platform != backend_->platform()) { return InvalidArgument( "stream is for platform %s, but service targets platform %s", - stream_platform->Name().c_str(), - 
backend_->platform()->Name().c_str()); + stream_platform->Name(), backend_->platform()->Name()); } // Cannot specify device_ordinal with a stream. The stream determines these @@ -120,11 +100,14 @@ Status LocalExecutable::ValidateExecutionOptions( } } - // Verify that the device the executable was built for is equivalent to the - // device it will run on. - int run_device_ordinal = run_options.device_ordinal() == -1 - ? backend_->default_device_ordinal() - : run_options.device_ordinal(); + // Verify that the device the executable was built for is equivalent + // to the device it will run on. + int run_device_ordinal = run_options.device_ordinal(); + if (run_device_ordinal == -1) { + run_device_ordinal = run_options.stream() != nullptr + ? run_options.stream()->parent()->device_ordinal() + : backend_->default_device_ordinal(); + } TF_ASSIGN_OR_RETURN(bool devices_equivalent, backend_->devices_equivalent( run_device_ordinal, build_options_.device_ordinal())); @@ -136,10 +119,10 @@ Status LocalExecutable::ValidateExecutionOptions( return InvalidArgument( "executable is built for device %s of type \"%s\"; cannot run it on " "device %s of type \"%s\"", - backend_->device_name(build_device_ordinal()).c_str(), - build_executor->GetDeviceDescription().name().c_str(), - backend_->device_name(run_device_ordinal).c_str(), - run_executor->GetDeviceDescription().name().c_str()); + backend_->device_name(build_device_ordinal()), + build_executor->GetDeviceDescription().name(), + backend_->device_name(run_device_ordinal), + run_executor->GetDeviceDescription().name()); } if (!run_options.allocator()) { @@ -149,20 +132,20 @@ Status LocalExecutable::ValidateExecutionOptions( if (run_options.allocator()->platform() != backend.platform()) { return InvalidArgument( "allocator platform (%s) does not match service platform (%s)", - run_options.allocator()->platform()->Name().c_str(), - backend.platform()->Name().c_str()); + run_options.allocator()->platform()->Name(), + 
backend.platform()->Name()); } return Status::OK(); } StatusOr LocalExecutable::Run( - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, ExecutableRunOptions run_options) { TF_RETURN_IF_ERROR( ValidateExecutionOptions(arguments, run_options, *backend_)); - Backend::StreamPtr stream; + StreamPool::Ptr stream; if (run_options.stream() == nullptr) { // NB! The lifetime of `stream` needs to match the lifetime of // `actual_options` (otherwise we will end up using a returned stream in @@ -185,7 +168,7 @@ StatusOr LocalExecutable::Run( run_options, backend_->StreamBorrower(), backend_->eigen_intra_op_thread_pool()); - if (executable_->dumping()) { + if (executable_->dumping_snapshot()) { return ExecuteAndDump(&service_options, arguments); } return executable_->ExecuteOnStreamWrapper( @@ -194,46 +177,45 @@ StatusOr LocalExecutable::Run( StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, - const tensorflow::gtl::ArraySlice arguments) { - executable_->session_module()->set_execution_platform( + const absl::Span arguments) { + executable_->hlo_snapshot()->set_execution_platform( backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module())); + TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module())); - TF_RETURN_IF_ERROR(executable_->DumpSessionModule()); + TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); + TF_RETURN_IF_ERROR(executable_->DumpHloSnapshot()); return std::move(result); } Status LocalExecutable::RecordArguments( - const tensorflow::gtl::ArraySlice arguments, - SessionModule* session_module) { - session_module->clear_arguments(); + const absl::Span arguments, + HloSnapshot* hlo_snapshot) { + 
hlo_snapshot->clear_arguments(); for (const ShapedBuffer* argument : arguments) { TF_ASSIGN_OR_RETURN(std::unique_ptr literal, LiteralFromShapedBuffer(*argument)); - *session_module->add_arguments() = literal->ToProto(); + *hlo_snapshot->add_arguments() = literal->ToProto(); } return Status::OK(); } Status LocalExecutable::RecordResult(const ShapedBuffer* result, - SessionModule* session_module) { - session_module->clear_result(); + HloSnapshot* hlo_snapshot) { + hlo_snapshot->clear_result(); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, LiteralFromShapedBuffer(*result)); - *session_module->mutable_result() = literal->ToProto(); + *hlo_snapshot->mutable_result() = literal->ToProto(); return Status::OK(); } StatusOr> LocalExecutable::LiteralFromShapedBuffer( const ShapedBuffer& shaped_buffer) { - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - backend_->stream_executor(shaped_buffer.device_ordinal())); - return backend_->transfer_manager()->TransferLiteralFromDevice(executor, + TF_ASSIGN_OR_RETURN(auto stream, + backend_->BorrowStream(shaped_buffer.device_ordinal())); + return backend_->transfer_manager()->TransferLiteralFromDevice(stream.get(), shaped_buffer); } @@ -263,7 +245,7 @@ Backend* LocalClient::mutable_backend() { StatusOr> LocalClient::Compile( const XlaComputation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, + const absl::Span argument_layouts, const ExecutableBuildOptions& options) { ExecutableBuildOptions updated_options = options; if (options.device_ordinal() == -1) { @@ -274,9 +256,9 @@ StatusOr> LocalClient::Compile( TF_ASSIGN_OR_RETURN(std::unique_ptr executable, local_service_->CompileExecutable( computation, argument_layouts, updated_options)); - return WrapUnique(new LocalExecutable(std::move(executable), - local_service_->mutable_backend(), - updated_options)); + return absl::WrapUnique(new LocalExecutable(std::move(executable), + local_service_->mutable_backend(), + updated_options)); } StatusOr 
LocalClient::LiteralToShapedBuffer( @@ -288,22 +270,26 @@ StatusOr LocalClient::LiteralToShapedBuffer( TF_ASSIGN_OR_RETURN(auto scoped_buffer, backend().transfer_manager()->AllocateScopedShapedBuffer( literal.shape(), allocator, device_ordinal)); - TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, - backend().stream_executor(device_ordinal)); + TF_ASSIGN_OR_RETURN(auto stream, + mutable_backend()->BorrowStream(device_ordinal)); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - executor, literal, scoped_buffer)); + stream.get(), literal, scoped_buffer)); return std::move(scoped_buffer); } StatusOr> LocalClient::ShapedBufferToLiteral( const ShapedBuffer& shaped_buffer) { - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - backend().stream_executor(shaped_buffer.device_ordinal())); - return backend().transfer_manager()->TransferLiteralFromDevice(executor, + TF_ASSIGN_OR_RETURN(auto stream, mutable_backend()->BorrowStream( + shaped_buffer.device_ordinal())); + return backend().transfer_manager()->TransferLiteralFromDevice(stream.get(), shaped_buffer); } +StatusOr LocalClient::GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number) { + return local_service_->GlobalDataToShapedBuffer(data, replica_number); +} + Status LocalClient::TransferToInfeedLocal(const Literal& literal, int device_ordinal) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, @@ -316,7 +302,7 @@ StatusOr> LocalClient::TransferFromOutfeedLocal( const Shape& shape, int device_ordinal) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device_ordinal)); - auto literal = MakeUnique(); + auto literal = Literal::CreateFromShape(shape); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromOutfeed( executor, shape, literal.get())); return std::move(literal); diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 
3f23e52fc2126cf07e9a1b0b0a4f0a9532214868..56c3a3da023ebf124b4bd91c2c608d0cd00a2381 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -18,18 +18,19 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -39,7 +40,7 @@ class LocalExecutable { // Run the compiled computation with the given arguments and options and // return the result. StatusOr Run( - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, ExecutableRunOptions run_options); // Return the options used to build the executable. @@ -62,7 +63,7 @@ class LocalExecutable { // The given ExecutableRunOptions override any values from legacy_flags // (TF_XLA_FLAGS environment variable). Status ValidateExecutionOptions( - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, const ExecutableRunOptions& run_options, const Backend& backend); // Records the computation in a SessionModule proto with the arguments used to @@ -72,17 +73,15 @@ class LocalExecutable { // (TF_XLA_FLAGS environment variable). 
StatusOr ExecuteAndDump( const ServiceExecutableRunOptions* run_options, - const tensorflow::gtl::ArraySlice arguments); + const absl::Span arguments); // Records the arguments used to invoke the computation in a SessionModule // proto. - Status RecordArguments( - const tensorflow::gtl::ArraySlice arguments, - SessionModule* session_module); + Status RecordArguments(const absl::Span arguments, + HloSnapshot* hlo_snapshot); // Records the result of the computation in a SessionModule proto. - Status RecordResult(const ShapedBuffer* result, - SessionModule* session_module); + Status RecordResult(const ShapedBuffer* result, HloSnapshot* hlo_snapshot); // Returns a literal containing the contents of the given ShapedBuffer. StatusOr> LiteralFromShapedBuffer( @@ -120,7 +119,7 @@ class LocalClient : public Client { // (TF_XLA_FLAGS environment variable). StatusOr> Compile( const XlaComputation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, + const absl::Span argument_layouts, const ExecutableBuildOptions& options); // Copy the literal data to the device with the given ordinal and return as a @@ -136,6 +135,11 @@ class LocalClient : public Client { StatusOr> ShapedBufferToLiteral( const ShapedBuffer& shaped_buffer); + // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid + // as long as the handle is valid. + StatusOr GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number); + // Transfer the given literal to the infeed queue of the given device. 
// TODO(b/69670845): Remove the 'Local' from the name when LocalClient does // not inherit from Client and there is no possibility of confusion with diff --git a/tensorflow/compiler/xla/client/padding.cc b/tensorflow/compiler/xla/client/padding.cc index 6a9cf466ac0a43ce214ef0e6aae9e6295f137b0f..992b13139c480900e7b983825be61ce88f14e11b 100644 --- a/tensorflow/compiler/xla/client/padding.cc +++ b/tensorflow/compiler/xla/client/padding.cc @@ -23,16 +23,15 @@ limitations under the License. namespace xla { -Status ValidatePaddingValues( - tensorflow::gtl::ArraySlice input_dimensions, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides) { +Status ValidatePaddingValues(absl::Span input_dimensions, + absl::Span window_dimensions, + absl::Span window_strides) { bool ok = input_dimensions.size() == window_dimensions.size() && input_dimensions.size() == window_strides.size(); if (!ok) { return InvalidArgument( - "Want input dimensions size %zu = window dimensions size %zu = window " - "strides size %zu", + "Want input dimensions size %u = window dimensions size %u = window " + "strides size %u", input_dimensions.size(), window_dimensions.size(), window_strides.size()); } @@ -40,9 +39,9 @@ Status ValidatePaddingValues( } std::vector> MakePadding( - tensorflow::gtl::ArraySlice input_dimensions, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding) { + absl::Span input_dimensions, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding) { TF_CHECK_OK(ValidatePaddingValues(input_dimensions, window_dimensions, window_strides)); std::vector> low_high_padding; diff --git a/tensorflow/compiler/xla/client/padding.h b/tensorflow/compiler/xla/client/padding.h index e23b0b3a90a091bf80973525810793c3eda4a036..5c009bd49e48b158550a32e64b0d63e2840dd1a9 100644 --- a/tensorflow/compiler/xla/client/padding.h +++ b/tensorflow/compiler/xla/client/padding.h @@ -19,9 +19,9 @@ 
limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -41,10 +41,9 @@ enum class Padding { // Validates that the slices are acceptable for determining padding -- this can // be used to check the preconditions of MakePadding below to produce an error // message that can be returned to the user. -Status ValidatePaddingValues( - tensorflow::gtl::ArraySlice input_dimensions, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides); +Status ValidatePaddingValues(absl::Span input_dimensions, + absl::Span window_dimensions, + absl::Span window_strides); // Returns the padding needed for the base area, given the base area dimensions, // window dimensions, strides, and the type of padding. @@ -58,9 +57,9 @@ Status ValidatePaddingValues( // window_dimensions, and strides must match, which is equal to the number // of elements in the result vector. 
std::vector> MakePadding( - tensorflow::gtl::ArraySlice input_dimensions, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding); + absl::Span input_dimensions, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding); } // namespace xla diff --git a/tensorflow/compiler/xla/client/sharding_builder.h b/tensorflow/compiler/xla/client/sharding_builder.h index 34763e54d946690289ff42a7712b980168933eee..59df3a8762c755848982bc8e2590de968ed2adb6 100644 --- a/tensorflow/compiler/xla/client/sharding_builder.h +++ b/tensorflow/compiler/xla/client/sharding_builder.h @@ -56,4 +56,4 @@ OpSharding Tuple(const ShapeTree& shardings); } // namespace sharding_builder } // namespace xla -#endif +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_SHARDING_BUILDER_H_ diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..e639028ccda11ae7e873f601c2f95749bce178c0 --- /dev/null +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -0,0 +1,3043 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/client/xla_builder.h" + +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/client/sharding_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/mutex.h" + +namespace xla { + +using absl::StrCat; + +namespace { + +int64 GetUniqueId() { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + static int64 built_counter = 0; + tensorflow::mutex_lock loc(mu); + const int64 id = built_counter++; + return id; +} + +} // namespace + +XlaOp operator-(const XlaOp& x) { return Neg(x); } +XlaOp operator+(const XlaOp& x, const XlaOp& y) { return Add(x, y); } +XlaOp operator-(const XlaOp& x, const XlaOp& y) { return Sub(x, y); } +XlaOp operator*(const XlaOp& x, const XlaOp& y) { return Mul(x, y); } +XlaOp operator/(const XlaOp& x, const XlaOp& y) { return Div(x, y); } +XlaOp operator%(const XlaOp& x, const XlaOp& y) { return Rem(x, y); } + +XlaOp operator~(const XlaOp& x) { return Not(x); } +XlaOp operator&(const XlaOp& x, const XlaOp& y) { return And(x, y); } +XlaOp operator|(const XlaOp& x, const XlaOp& y) { return Or(x, y); } +XlaOp operator^(const XlaOp& x, const XlaOp& y) { return Xor(x, y); } +XlaOp operator<<(const XlaOp& x, const XlaOp& y) { return ShiftLeft(x, y); } + +XlaOp operator>>(const XlaOp& x, const XlaOp& y) { + XlaBuilder* builder = x.builder(); + 
return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + if (!ShapeUtil::ElementIsIntegral(shape)) { + return InvalidArgument( + "Argument to >> operator does not have an integral type (%s).", + ShapeUtil::HumanString(shape)); + } + if (ShapeUtil::ElementIsSigned(shape)) { + return ShiftRightArithmetic(x, y); + } else { + return ShiftRightLogical(x, y); + } + }); +} + +StatusOr XlaBuilder::GetShape(const XlaOp& op) const { + TF_RETURN_IF_ERROR(first_error_); + + TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op)); + return instr->shape(); +} + +StatusOr> XlaBuilder::GetOperandShapes( + absl::Span operands) const { + std::vector operand_shapes; + for (const XlaOp& operand : operands) { + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + operand_shapes.push_back(shape); + } + return operand_shapes; +} + +XlaBuilder::XlaBuilder(const string& computation_name) + : name_(computation_name) {} + +XlaBuilder::~XlaBuilder() {} + +XlaOp XlaBuilder::ReportError(const Status& error) { + CHECK(!error.ok()); + if (die_immediately_on_error_) { + LOG(FATAL) << "error building computation: " << error; + } + + if (first_error_.ok()) { + first_error_ = error; + first_error_backtrace_.CreateCurrent(/*skip_count=*/1); + } + return XlaOp(this); +} + +XlaOp XlaBuilder::ReportErrorOrReturn(const StatusOr& op) { + if (!first_error_.ok()) { + return XlaOp(this); + } + if (!op.ok()) { + return ReportError(op.status()); + } + return op.ValueOrDie(); +} + +XlaOp XlaBuilder::ReportErrorOrReturn( + const std::function()>& op_creator) { + return ReportErrorOrReturn(op_creator()); +} + +StatusOr XlaBuilder::GetProgramShape(int64 root_id) const { + TF_RETURN_IF_ERROR(first_error_); + TF_RET_CHECK((root_id >= 0) && (root_id < instructions_.size())); + + ProgramShape program_shape; + + *program_shape.mutable_result() = instructions_[root_id].shape(); + + // Check that the parameter numbers are continuous from 0, and add 
parameter + // shapes and names to the program shape. + const int64 param_count = parameter_numbers_.size(); + for (int64 i = 0; i < param_count; i++) { + program_shape.add_parameters(); + program_shape.add_parameter_names(); + } + for (const HloInstructionProto& instr : instructions_) { + // Parameter number uniqueness is guaranteed in XlaBuilder::Parameter(). So + // to verify continuity, we just need to verify that every parameter is in + // the right range. + if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter)) { + const int64 index = instr.parameter_number(); + TF_RET_CHECK(index >= 0 && index < param_count) + << "invalid parameter number: " << index; + *program_shape.mutable_parameters(index) = instr.shape(); + *program_shape.mutable_parameter_names(index) = instr.name(); + } + } + return program_shape; +} + +StatusOr XlaBuilder::GetProgramShape() const { + TF_RET_CHECK(!instructions_.empty()); + return GetProgramShape(instructions_.back().id()); +} + +StatusOr XlaBuilder::GetProgramShape(XlaOp root) const { + if (root.builder_ != this) { + return InvalidArgument("Given root operation is not in this computation."); + } + return GetProgramShape(root.handle()); +} + +void XlaBuilder::IsConstantVisitor(const int64 op_handle, + std::set* visited, + bool* is_constant) const { + if (visited->count(op_handle) != 0 || !*is_constant) { + return; + } + + CHECK(op_handle < instructions_.size() && op_handle >= 0); + + const HloInstructionProto& instr = instructions_[op_handle]; + const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie(); + switch (opcode) { + default: + for (const int64 operand_id : instr.operand_ids()) { + IsConstantVisitor(operand_id, visited, is_constant); + } + // TODO(b/32495713): We aren't checking the called computations. + break; + + // Non functional ops. + case HloOpcode::kRng: + case HloOpcode::kCrossReplicaSum: + // TODO(b/33009255): Implmement constant folding for cross replica sum. 
+ case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kCall: + // TODO(b/32495713): We aren't checking the to_apply computation itself, + // so we conservatively say that computations containing the Call op + // cannot be constant. We cannot set is_functional=false in other similar + // cases since we're already relying on IsConstant to return true. + case HloOpcode::kCustomCall: + case HloOpcode::kWhile: + // TODO(b/32495713): We aren't checking the condition and body + // computations themselves. + case HloOpcode::kSend: + case HloOpcode::kRecv: + case HloOpcode::kParameter: + *is_constant = false; + break; + } + if (!*is_constant) { + VLOG(1) << "Non-constant: " << instr.name(); + } + visited->insert(op_handle); +} + +XlaComputation XlaBuilder::BuildAndNoteError() { + DCHECK(parent_builder_ != nullptr); + auto build_status = Build(); + if (!build_status.ok()) { + parent_builder_->ReportError( + AddStatus(build_status.status(), absl::StrCat("error from: ", name_))); + return {}; + } + return build_status.ConsumeValueOrDie(); +} + +StatusOr XlaBuilder::Build() { + if (!first_error_.ok()) { + string backtrace; + first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); + return AppendStatus(first_error_, backtrace); + } + return Build(instructions_.back().id()); +} + +StatusOr XlaBuilder::Build(XlaOp root) { + if (root.builder_ != this) { + return InvalidArgument("Given root operation is not in this computation."); + } + return Build(root.handle()); +} + +StatusOr XlaBuilder::Build(int64 root_id) { + if (!first_error_.ok()) { + string backtrace; + first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); + return AppendStatus(first_error_, backtrace); + } + + HloComputationProto entry; + entry.set_id(GetUniqueId()); // Give the computation a global unique id. + entry.set_name(StrCat(name_, entry.id())); // Ensure that the name is unique. 
+ + TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id)); + entry.set_root_id(root_id); + + for (auto& instruction : instructions_) { + // Ensures that the instruction names are unique among the whole graph. + const string& new_name = + StrCat(instruction.name(), ".", entry.id(), ".", instruction.id()); + instruction.set_name(new_name); + entry.add_instructions()->Swap(&instruction); + } + + XlaComputation computation(entry.id()); + HloModuleProto* module = computation.mutable_proto(); + module->set_name(entry.name()); + module->set_id(entry.id()); + module->set_entry_computation_name(entry.name()); + module->set_entry_computation_id(entry.id()); + *module->mutable_program_shape() = entry.program_shape(); + for (auto& e : embedded_) { + module->add_computations()->Swap(&e.second); + } + module->add_computations()->Swap(&entry); + + // Clear data held by this builder. + this->instructions_.clear(); + this->embedded_.clear(); + this->parameter_numbers_.clear(); + + return std::move(computation); +} + +StatusOr XlaBuilder::InDimBroadcast( + const Shape& shape, const XlaOp& operand, + absl::Span broadcast_dimensions) { + TF_RETURN_IF_ERROR(first_error_); + + HloInstructionProto instr; + *instr.mutable_shape() = shape; + for (int64 dim : broadcast_dimensions) { + instr.add_dimensions(dim); + } + return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand}); +} + +StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, + const XlaOp& operand) { + TF_RETURN_IF_ERROR(first_error_); + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + + CHECK(ShapeUtil::IsScalar(operand_shape) || + ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)); + Shape broadcast_shape = + ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type()); + + // Do explicit broadcast for scalar. 
+ if (ShapeUtil::IsScalar(operand_shape)) { + return InDimBroadcast(broadcast_shape, operand, {}); + } + + // Do explicit broadcast for degenerate broadcast. + std::vector broadcast_dimensions; + std::vector reshaped_dimensions; + for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) { + if (operand_shape.dimensions(i) == output_shape.dimensions(i)) { + broadcast_dimensions.push_back(i); + reshaped_dimensions.push_back(operand_shape.dimensions(i)); + } else { + TF_RET_CHECK(operand_shape.dimensions(i) == 1) + << "An explicit broadcast sequence requires the broadcasted " + "dimensions to be trivial; operand shape: " + << operand_shape << "; output_shape: " << output_shape; + } + } + // Eliminate the size one dimensions. + TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, + Reshape(ShapeUtil::MakeShape(operand_shape.element_type(), + reshaped_dimensions), + operand)); + // Broadcast 'reshape' up to the larger size. + return InDimBroadcast(broadcast_shape, reshaped_operand, + broadcast_dimensions); +} + +XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferUnaryOpShape(unop, operand_shape)); + return AddInstruction(std::move(instr), unop, {operand}); + }); +} + +XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferBinaryOpShape( + binop, lhs_shape, rhs_shape, broadcast_dimensions)); + + const int64 lhs_rank = ShapeUtil::Rank(lhs_shape); + const int64 rhs_rank = ShapeUtil::Rank(rhs_shape); + + XlaOp updated_lhs = lhs; 
+ XlaOp updated_rhs = rhs; + + if (!broadcast_dimensions.empty() && lhs_rank != rhs_rank) { + const bool should_broadcast_lhs = lhs_rank < rhs_rank; + XlaOp from = should_broadcast_lhs ? lhs : rhs; + const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape; + + std::vector to_size; + for (int64 size : instr.shape().dimensions()) { + to_size.push_back(size); + } + for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape); + from_dim++) { + int64 to_dim = broadcast_dimensions[from_dim]; + to_size[to_dim] = from_shape.dimensions(from_dim); + } + + const Shape& broadcasted_shape = + ShapeUtil::MakeShape(from_shape.element_type(), to_size); + TF_ASSIGN_OR_RETURN( + XlaOp broadcasted_operand, + InDimBroadcast(broadcasted_shape, from, broadcast_dimensions)); + + updated_lhs = should_broadcast_lhs ? broadcasted_operand : lhs; + updated_rhs = !should_broadcast_lhs ? broadcasted_operand : rhs; + } + + TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs)); + if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) { + TF_ASSIGN_OR_RETURN(updated_lhs, + AddBroadcastSequence(instr.shape(), updated_lhs)); + } + TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs)); + if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) { + TF_ASSIGN_OR_RETURN(updated_rhs, + AddBroadcastSequence(instr.shape(), updated_rhs)); + } + + return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs}); + }); +} + +XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, + const XlaOp& ehs) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferTernaryOpShape( + triop, lhs_shape, rhs_shape, ehs_shape)); + XlaOp updated_lhs = 
lhs; + XlaOp updated_rhs = rhs; + XlaOp updated_ehs = ehs; + if (!ShapeUtil::IsTuple(instr.shape())) { + if (!ShapeUtil::IsTuple(lhs_shape) && + !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) { + // lhs is being implicitly broadcasted. Change to explicit. + TF_ASSIGN_OR_RETURN(updated_lhs, + AddBroadcastSequence(instr.shape(), lhs)); + } + if (!ShapeUtil::IsTuple(rhs_shape) && + !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) { + // rhs is being implicitly broadcasted. Change to explicit. + TF_ASSIGN_OR_RETURN(updated_rhs, + AddBroadcastSequence(instr.shape(), rhs)); + } + if (!ShapeUtil::IsTuple(ehs_shape) && + !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) { + // ehs is being implicitly broadcasted. Change to explicit. + TF_ASSIGN_OR_RETURN(updated_ehs, + AddBroadcastSequence(instr.shape(), ehs)); + } + } + return AddInstruction(std::move(instr), triop, + {updated_lhs, updated_rhs, updated_ehs}); + }); +} + +XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = literal.shape(); + *instr.mutable_literal() = literal.ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kConstant); + }); +} + +XlaOp XlaBuilder::Iota(const Shape& shape, int64 iota_dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = shape; + instr.add_dimensions(iota_dimension); + return AddInstruction(std::move(instr), HloOpcode::kIota); + }); +} + +XlaOp XlaBuilder::Iota(PrimitiveType type, int64 size) { + return Iota(ShapeUtil::MakeShape(type, {size}), 
/*iota_dimension=*/0); +} + +XlaOp XlaBuilder::Call(const XlaComputation& computation, + absl::Span operands) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); + absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, + computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferCallShape(operand_shape_ptrs, + /*to_apply=*/called_program_shape)); + + AddCalledComputation(computation, &instr); + + return AddInstruction(std::move(instr), HloOpcode::kCall, operands); + }); +} + +XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape, + const string& name) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + if (!parameter_numbers_.insert(parameter_number).second) { + return InvalidArgument("parameter %d already registered", + parameter_number); + } + instr.set_parameter_number(parameter_number); + instr.set_name(name); + *instr.mutable_shape() = shape; + return AddInstruction(std::move(instr), HloOpcode::kParameter); + }); +} + +XlaOp XlaBuilder::Broadcast(const XlaOp& operand, + absl::Span broadcast_sizes) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + const Shape& shape, + ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes)); + + // The client-level broadcast op just appends dimensions on the left (adds + // lowest numbered dimensions). The HLO broadcast instruction is more + // flexible and can add new dimensions anywhere. 
The instruction's + // dimensions field maps operand dimensions to dimensions in the broadcast + // output, so to append dimensions on the left the instruction's dimensions + // should just be the n highest dimension numbers of the output shape where + // n is the number of input dimensions. + const int64 operand_rank = ShapeUtil::Rank(operand_shape); + std::vector dimensions(operand_rank); + for (int i = 0; i < operand_rank; ++i) { + dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank; + } + return InDimBroadcast(shape, operand, dimensions); + }); +} + +XlaOp XlaBuilder::BroadcastInDim( + const XlaOp& operand, const Shape& shape, + const absl::Span broadcast_dimensions) { + return ReportErrorOrReturn([&]() -> StatusOr { + return InDimBroadcast(shape, operand, broadcast_dimensions); + }); +} + +StatusOr XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) { + TF_RETURN_IF_ERROR(first_error_); + + HloInstructionProto instr; + *instr.mutable_shape() = shape; + return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand}); +} + +XlaOp XlaBuilder::Slice(const XlaOp& operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferSliceShape(operand_shape, start_indices, + limit_indices, strides)); + for (int i = 0; i < start_indices.size(); i++) { + auto* slice_config = instr.add_slice_dimensions(); + slice_config->set_start(start_indices[i]); + slice_config->set_limit(limit_indices[i]); + slice_config->set_stride(strides[i]); + } + + return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand}); + }); +} + +XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index, + int64 limit_index, int64 stride, int64 dimno) { + return ReportErrorOrReturn([&]() -> StatusOr { + 
TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + std::vector starts(ShapeUtil::Rank(shape), 0); + std::vector limits(shape.dimensions().begin(), + shape.dimensions().end()); + std::vector strides(ShapeUtil::Rank(shape), 1); + starts[dimno] = start_index; + limits[dimno] = limit_index; + strides[dimno] = stride; + return Slice(operand, starts, limits, strides); + }); +} + +XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, + absl::Span slice_sizes) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, + GetShape(start_indices)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferDynamicSliceShape( + operand_shape, start_indices_shape, slice_sizes)); + + for (int64 size : slice_sizes) { + instr.add_dynamic_slice_sizes(size); + } + + return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, + {operand, start_indices}); + }); +} + +XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, + const XlaOp& start_indices) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update)); + TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, + GetShape(start_indices)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferDynamicUpdateSliceShape( + operand_shape, update_shape, start_indices_shape)); + + return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, + {operand, update, start_indices}); + }); +} + +XlaOp XlaBuilder::ConcatInDim(absl::Span operands, + int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& operand_shapes, 
GetOperandShapes(operands)); + absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension)); + + instr.add_dimensions(dimension); + + return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands); + }); +} + +XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value, + const PaddingConfig& padding_config) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape, + GetShape(padding_value)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferPadShape(operand_shape, padding_value_shape, + padding_config)); + + *instr.mutable_padding_config() = padding_config; + + return AddInstruction(std::move(instr), HloOpcode::kPad, + {operand, padding_value}); + }); +} + +XlaOp XlaBuilder::Reshape(const XlaOp& operand, + absl::Span dimensions, + absl::Span new_sizes) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& shape, + ShapeInference::InferReshapeShape( + operand_shape, dimensions, new_sizes)); + XlaOp transposed = IsIdentityPermutation(dimensions) + ? 
operand + : Transpose(operand, dimensions); + return Reshape(shape, transposed); + }); +} + +XlaOp XlaBuilder::Reshape(const XlaOp& operand, + absl::Span new_sizes) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand)); + std::vector dimensions(shape.dimensions_size()); + std::iota(dimensions.begin(), dimensions.end(), 0); + return Reshape(operand, dimensions, new_sizes); + }); +} + +XlaOp XlaBuilder::Collapse(const XlaOp& operand, + absl::Span dimensions) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (dimensions.size() <= 1) { + // Not collapsing anything, trivially we can return the operand versus + // enqueueing a trivial reshape. + return operand; + } + + // Out-of-order collapse is not supported. + // Checks that the collapsed dimensions are in order and consecutive. + for (absl::Span::size_type i = 1; i < dimensions.size(); ++i) { + if (dimensions[i] - 1 != dimensions[i - 1]) { + return InvalidArgument( + "Collapsed dimensions are not in consecutive order."); + } + } + + // Create a new sizes vector from the old shape, replacing the collapsed + // dimensions by the product of their sizes. 
+ TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand)); + + VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape); + VLOG(3) << "dims to collapse: " << absl::StrJoin(dimensions, ","); + + std::vector new_sizes; + for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) { + if (i <= dimensions.front() || i > dimensions.back()) { + new_sizes.push_back(original_shape.dimensions(i)); + } else { + new_sizes.back() *= original_shape.dimensions(i); + } + } + + VLOG(3) << "new sizes: [" << absl::StrJoin(new_sizes, ",") << "]"; + + return Reshape(operand, new_sizes); + }); +} + +void XlaBuilder::Trace(const string& tag, const XlaOp& operand) { + ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = ShapeUtil::MakeNil(); + *instr.mutable_literal() = LiteralUtil::CreateR1U8(tag)->ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand}); + }); +} + +XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true, + const XlaOp& on_false) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& true_shape, GetShape(on_true)); + TF_ASSIGN_OR_RETURN(const Shape& false_shape, GetShape(on_false)); + TF_RET_CHECK(ShapeUtil::IsTuple(true_shape) == + ShapeUtil::IsTuple(false_shape)); + HloOpcode opcode = ShapeUtil::IsTuple(true_shape) ? 
HloOpcode::kTupleSelect + : HloOpcode::kSelect; + return TernaryOp(opcode, pred, on_true, on_false); + }); +} + +XlaOp XlaBuilder::Tuple(absl::Span elements) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements)); + absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferVariadicOpShape( + HloOpcode::kTuple, operand_shape_ptrs)); + return AddInstruction(std::move(instr), HloOpcode::kTuple, elements); + }); +} + +XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data)); + if (!ShapeUtil::IsTuple(tuple_shape)) { + return InvalidArgument( + "Operand to GetTupleElement() is not a tuple; got %s", + ShapeUtil::HumanString(tuple_shape)); + } + *instr.mutable_shape() = + ShapeUtil::GetTupleElementShape(tuple_shape, index); + + instr.set_tuple_index(index); + + return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement, + {tuple_data}); + }); +} + +XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Le(const XlaOp& 
lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs, + const PrecisionConfigProto* precision_config_proto) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + + DotDimensionNumbers dimension_numbers; + dimension_numbers.add_lhs_contracting_dimensions( + lhs_shape.dimensions_size() == 1 ? 0 : 1); + dimension_numbers.add_rhs_contracting_dimensions(0); + return DotGeneral(lhs, rhs, dimension_numbers, precision_config_proto); + }); +} + +XlaOp XlaBuilder::DotGeneral( + const XlaOp& lhs, const XlaOp& rhs, + const DotDimensionNumbers& dimension_numbers, + const PrecisionConfigProto* precision_config_proto) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, + dimension_numbers)); + *instr.mutable_dot_dimension_numbers() = dimension_numbers; + if (precision_config_proto != nullptr) { + *instr.mutable_precision_config() = *precision_config_proto; + } + return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs}); + }); +} + +Status XlaBuilder::VerifyConvolution( + const Shape& lhs_shape, const Shape& rhs_shape, + const ConvolutionDimensionNumbers& dimension_numbers) const { + if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) { + return InvalidArgument( + "Convolution arguments must have same number of " + "dimensions. 
Got: %s and %s", + ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape)); + } + int num_dims = ShapeUtil::Rank(lhs_shape); + if (num_dims < 2) { + return InvalidArgument( + "Convolution expects argument arrays with >= 3 dimensions. " + "Got: %s and %s", + ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape)); + } + int num_spatial_dims = num_dims - 2; + + const auto check_spatial_dimensions = + [&](const char* const field_name, + const tensorflow::protobuf::RepeatedField& + numbers) { + if (numbers.size() != num_spatial_dims) { + return InvalidArgument("Expected %d elements for %s, but got %d.", + num_spatial_dims, field_name, numbers.size()); + } + for (int i = 0; i < numbers.size(); ++i) { + if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) { + return InvalidArgument("Convolution %s[%d] is out of bounds: %d", + field_name, i, numbers.Get(i)); + } + } + return Status::OK(); + }; + TF_RETURN_IF_ERROR( + check_spatial_dimensions("input_spatial_dimensions", + dimension_numbers.input_spatial_dimensions())); + TF_RETURN_IF_ERROR( + check_spatial_dimensions("kernel_spatial_dimensions", + dimension_numbers.kernel_spatial_dimensions())); + return check_spatial_dimensions( + "output_spatial_dimensions", + dimension_numbers.output_spatial_dimensions()); +} + +XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return ConvWithGeneralDimensions( + lhs, rhs, window_strides, padding, + CreateDefaultConvDimensionNumbers(window_strides.size()), + feature_group_count, precision_config_proto); +} + +XlaOp XlaBuilder::ConvWithGeneralPadding( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return ConvGeneral(lhs, rhs, window_strides, padding, + 
CreateDefaultConvDimensionNumbers(window_strides.size()), + feature_group_count, precision_config_proto); +} + +XlaOp XlaBuilder::ConvWithGeneralDimensions( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + Padding padding, const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + + TF_RETURN_IF_ERROR( + VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers)); + + std::vector base_area_dimensions( + dimension_numbers.input_spatial_dimensions_size()); + for (std::vector::size_type i = 0; i < base_area_dimensions.size(); + ++i) { + base_area_dimensions[i] = + lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i)); + } + + std::vector window_dimensions( + dimension_numbers.kernel_spatial_dimensions_size()); + for (std::vector::size_type i = 0; i < window_dimensions.size(); + ++i) { + window_dimensions[i] = + rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i)); + } + + return ConvGeneral(lhs, rhs, window_strides, + MakePadding(base_area_dimensions, window_dimensions, + window_strides, padding), + dimension_numbers, feature_group_count, + precision_config_proto); + }); +} + +XlaOp XlaBuilder::ConvGeneral( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {}, + dimension_numbers, feature_group_count, + precision_config_proto); +} + +XlaOp XlaBuilder::ConvGeneralDilated( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const 
ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + TF_RETURN_IF_ERROR( + VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers)); + + std::vector window_dimensions( + dimension_numbers.kernel_spatial_dimensions_size()); + for (std::vector::size_type i = 0; i < window_dimensions.size(); + ++i) { + window_dimensions[i] = + rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + MakeWindow(window_dimensions, window_strides, padding, + lhs_dilation, rhs_dilation)); + + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferConvolveShape( + lhs_shape, rhs_shape, instr.window(), + dimension_numbers, feature_group_count)); + + *instr.mutable_convolution_dimension_numbers() = dimension_numbers; + instr.set_feature_group_count(feature_group_count); + + if (precision_config_proto != nullptr) { + *instr.mutable_precision_config() = *precision_config_proto; + } + + return AddInstruction(std::move(instr), HloOpcode::kConvolution, + {lhs, rhs}); + }); +} + +StatusOr XlaBuilder::MakeWindow( + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation) const { + const auto verify_size = [&](const size_t x, const char* x_name) { + if (x == 0 || x == window_dimensions.size()) { + return Status::OK(); + } else { + return InvalidArgument( + "%s", absl::StrCat( + "Window has different number of window dimensions than of ", + x_name, + "\nNumber of window dimensions: ", window_dimensions.size(), + "\nNumber of ", x_name, ": ", x, "\n")); + } + }; + TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides")); + 
TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries")); + TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors")); + TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors")); + + Window window; + for (size_t i = 0; i < window_dimensions.size(); i++) { + auto dim = window.add_dimensions(); + dim->set_size(window_dimensions[i]); + if (!window_strides.empty()) { + dim->set_stride(window_strides[i]); + } else { + dim->set_stride(1); + } + if (!padding.empty()) { + dim->set_padding_low(padding[i].first); + dim->set_padding_high(padding[i].second); + } else { + dim->set_padding_low(0); + dim->set_padding_high(0); + } + if (!lhs_dilation.empty()) { + dim->set_base_dilation(lhs_dilation[i]); + } else { + dim->set_base_dilation(1); + } + if (!rhs_dilation.empty()) { + dim->set_window_dilation(rhs_dilation[i]); + } else { + dim->set_window_dilation(1); + } + dim->set_window_reversal(false); + } + return window; +} + +XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type, + const absl::Span fft_length) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferFftShape(operand_shape, fft_type, fft_length)); + + instr.set_fft_type(fft_type); + for (int64 i : fft_length) { + instr.add_fft_length(i); + } + + return AddInstruction(std::move(instr), HloOpcode::kFft, {operand}); + }); +} + +XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + if (!LayoutUtil::HasLayout(shape)) { + return InvalidArgument("Given shape to Infeed must have a layout"); + } + const Shape infeed_instruction_shape = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + *instr.mutable_shape() = infeed_instruction_shape; + instr.set_infeed_config(config); + + if 
(ShapeUtil::IsArray(shape) && sharding() && + sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) { + // TODO(b/110793772): Support tiled array-shaped infeeds. + return InvalidArgument( + "Tiled sharding is not yet supported for array-shaped infeeds"); + } + + if (sharding() && + sharding()->type() == OpSharding::Type::OpSharding_Type_REPLICATED) { + return InvalidArgument( + "Replicated sharding is not yet supported for infeeds"); + } + + // Infeed takes a single token operand. Generate the token to pass to the + // infeed. + XlaOp token; + auto make_token = [&]() { + HloInstructionProto token_instr; + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {}); + }; + if (sharding()) { + // Arbitrarily assign token to device 0. + OpSharding sharding = sharding_builder::AssignDevice(0); + XlaScopedShardingAssignment scoped_sharding(this, sharding); + TF_ASSIGN_OR_RETURN(token, make_token()); + } else { + TF_ASSIGN_OR_RETURN(token, make_token()); + } + + // The sharding is set by the client according to the data tuple shape. + // However, the shape of the infeed instruction is a tuple containing the + // data and a token. For tuple sharding type, the sharding must be changed + // to accommodate the token. + XlaOp infeed; + if (sharding() && + sharding()->type() == OpSharding::Type::OpSharding_Type_TUPLE) { + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. + OpSharding infeed_instruction_sharding = *sharding(); + // Arbitrarily assign the token to device 0. 
+ *infeed_instruction_sharding.add_tuple_shardings() = + sharding_builder::AssignDevice(0); + XlaScopedShardingAssignment scoped_sharding(this, + infeed_instruction_sharding); + TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr), + HloOpcode::kInfeed, {token})); + } else { + TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr), + HloOpcode::kInfeed, {token})); + } + + // The infeed instruction produces a tuple of the infed data and a token + // type. Return XLA op containing the data. + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. + HloInstructionProto infeed_data; + *infeed_data.mutable_shape() = shape; + infeed_data.set_tuple_index(0); + return AddInstruction(std::move(infeed_data), HloOpcode::kGetTupleElement, + {infeed}); + }); +} + +XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape, + const string& config) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + if (!LayoutUtil::HasLayout(shape)) { + return InvalidArgument("Given shape to Infeed must have a layout"); + } + const Shape infeed_instruction_shape = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + *instr.mutable_shape() = infeed_instruction_shape; + instr.set_infeed_config(config); + + if (ShapeUtil::IsArray(shape) && sharding() && + sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) { + // TODO(b/110793772): Support tiled array-shaped infeeds. 
+ return InvalidArgument( + "Tiled sharding is not yet supported for array-shaped infeeds"); + } + + if (sharding() && + sharding()->type() == OpSharding::Type::OpSharding_Type_REPLICATED) { + return InvalidArgument( + "Replicated sharding is not yet supported for infeeds"); + } + + return AddInstruction(std::move(instr), HloOpcode::kInfeed, {token}); + }); +} + +void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, + const string& outfeed_config) { + ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + + // Check and set outfeed shape. + if (!LayoutUtil::HasLayout(shape_with_layout)) { + return InvalidArgument("Given shape to Outfeed must have a layout"); + } + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { + return InvalidArgument( + "Outfeed shape %s must be compatible with operand shape %s", + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); + } + *instr.mutable_outfeed_shape() = shape_with_layout; + + instr.set_outfeed_config(outfeed_config); + + // Outfeed takes a token as its second operand. Generate the token to pass + // to the outfeed. + HloInstructionProto token_instr; + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), + HloOpcode::kAfterAll, {})); + + TF_RETURN_IF_ERROR( + AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand, token}) + .status()); + + // The outfeed instruction produces a token. However, existing users expect + // a nil shape (empty tuple). This should only be relevant if the outfeed is + // the root of a computation. + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. 
+ HloInstructionProto tuple_instr; + *tuple_instr.mutable_shape() = ShapeUtil::MakeNil(); + + // The dummy tuple should have no sharding. + { + XlaScopedShardingAssignment scoped_sharding(this, OpSharding()); + TF_ASSIGN_OR_RETURN( + XlaOp empty_tuple, + AddInstruction(std::move(tuple_instr), HloOpcode::kTuple, {})); + return empty_tuple; + } + }); +} + +XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const string& outfeed_config) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + + // Check and set outfeed shape. + if (!LayoutUtil::HasLayout(shape_with_layout)) { + return InvalidArgument("Given shape to Outfeed must have a layout"); + } + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { + return InvalidArgument( + "Outfeed shape %s must be compatible with operand shape %s", + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); + } + *instr.mutable_outfeed_shape() = shape_with_layout; + + instr.set_outfeed_config(outfeed_config); + + return AddInstruction(std::move(instr), HloOpcode::kOutfeed, + {operand, token}); + }); +} + +XlaOp XlaBuilder::CreateToken() { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + return AddInstruction(std::move(instr), HloOpcode::kAfterAll); + }); +} + +XlaOp XlaBuilder::AfterAll(absl::Span tokens) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (tokens.empty()) { + return InvalidArgument("AfterAll requires at least one operand"); + } + HloInstructionProto instr; + *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + return AddInstruction(std::move(instr), HloOpcode::kAfterAll, tokens); + }); +} + +XlaOp XlaBuilder::CustomCall(const string& 
call_target_name, + absl::Span operands, + const Shape& shape) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + if (absl::StartsWith(call_target_name, "$")) { + return InvalidArgument( + "Invalid custom_call_target \"%s\": Call targets that start with '$' " + "are reserved for internal use.", + call_target_name); + } + *instr.mutable_shape() = shape; + instr.set_custom_call_target(call_target_name); + return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); + }); +} + +XlaOp XlaBuilder::Complex(const XlaOp& real, const XlaOp& imag, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions); +} + +XlaOp XlaBuilder::Conj(const XlaOp& operand) { + return Complex(Real(operand), Neg(Imag(operand))); +} + +XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kOr, lhs, rhs, 
broadcast_dimensions); +} + +XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kXor, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::Not(const XlaOp& operand) { + return UnaryOp(HloOpcode::kNot, operand); +} + +XlaOp XlaBuilder::ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::ShiftRightArithmetic( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs, + broadcast_dimensions); +} + +XlaOp XlaBuilder::ShiftRightLogical( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs, + broadcast_dimensions); +} + +XlaOp XlaBuilder::Abs(const XlaOp& operand) { + return UnaryOp(HloOpcode::kAbs, operand); +} + +XlaOp XlaBuilder::Atan2(const XlaOp& y, const XlaOp& x, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions); +} + +XlaOp XlaBuilder::Exp(const XlaOp& operand) { + return UnaryOp(HloOpcode::kExp, operand); +} + +XlaOp XlaBuilder::Expm1(const XlaOp& operand) { + return UnaryOp(HloOpcode::kExpm1, operand); +} + +XlaOp XlaBuilder::Floor(const XlaOp& operand) { + return UnaryOp(HloOpcode::kFloor, operand); +} + +XlaOp XlaBuilder::Ceil(const XlaOp& operand) { + return UnaryOp(HloOpcode::kCeil, operand); +} + +XlaOp XlaBuilder::Round(const XlaOp& operand) { + return UnaryOp(HloOpcode::kRoundNearestAfz, operand); +} + +XlaOp XlaBuilder::Log(const XlaOp& operand) { + return UnaryOp(HloOpcode::kLog, operand); +} + +XlaOp XlaBuilder::Log1p(const XlaOp& operand) { + return UnaryOp(HloOpcode::kLog1p, operand); +} + +XlaOp XlaBuilder::Sign(const XlaOp& operand) { + return UnaryOp(HloOpcode::kSign, operand); +} + +XlaOp XlaBuilder::Clz(const XlaOp& operand) { + return 
UnaryOp(HloOpcode::kClz, operand); +} + +XlaOp XlaBuilder::Cos(const XlaOp& operand) { + return UnaryOp(HloOpcode::kCos, operand); +} + +XlaOp XlaBuilder::Sin(const XlaOp& operand) { + return UnaryOp(HloOpcode::kSin, operand); +} + +XlaOp XlaBuilder::Tanh(const XlaOp& operand) { + return UnaryOp(HloOpcode::kTanh, operand); +} + +XlaOp XlaBuilder::Real(const XlaOp& operand) { + return UnaryOp(HloOpcode::kReal, operand); +} + +XlaOp XlaBuilder::Imag(const XlaOp& operand) { + return UnaryOp(HloOpcode::kImag, operand); +} + +XlaOp XlaBuilder::IsFinite(const XlaOp& operand) { + return UnaryOp(HloOpcode::kIsFinite, operand); +} + +XlaOp XlaBuilder::Transpose(const XlaOp& operand, + absl::Span permutation) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferTransposeShape(operand_shape, permutation)); + for (int64 dim : permutation) { + instr.add_dimensions(dim); + } + return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand}); + }); +} + +XlaOp XlaBuilder::Rev(const XlaOp& operand, + absl::Span dimensions) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferReverseShape(operand_shape, dimensions)); + for (int64 dim : dimensions) { + instr.add_dimensions(dim); + } + return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand}); + }); +} + +XlaOp XlaBuilder::Sort(XlaOp keys, absl::optional values, + int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys)); + operand_shape_ptrs.push_back(&keys_shape); + Shape values_shape; + if (values.has_value()) { + 
TF_ASSIGN_OR_RETURN(values_shape, GetShape(*values)); + operand_shape_ptrs.push_back(&values_shape); + } + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferVariadicOpShape( + HloOpcode::kSort, operand_shape_ptrs)); + if (dimension == -1) { + TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys)); + dimension = ShapeUtil::Rank(keys_shape) - 1; + } + instr.add_dimensions(dimension); + return values.has_value() + ? AddInstruction(std::move(instr), HloOpcode::kSort, + {keys, *values}) + : AddInstruction(std::move(instr), HloOpcode::kSort, {keys}); + }); +} + +XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions); +} + +XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand, + PrimitiveType new_element_type) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferConvertShape(operand_shape, new_element_type)); + return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand}); + }); +} + +XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand, + PrimitiveType new_element_type) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferConvertShape(operand_shape, new_element_type)); + return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert, + {operand}); + }); +} + +XlaOp XlaBuilder::Neg(const XlaOp& operand) { + return UnaryOp(HloOpcode::kNegate, operand); +} + +XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand, + const XlaOp& max) { + return TernaryOp(HloOpcode::kClamp, min, operand, max); +} + +XlaOp XlaBuilder::Map(absl::Span operands, + const XlaComputation& computation, + 
absl::Span dimensions, + absl::Span static_operands) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (!static_operands.empty()) { + return Unimplemented("static_operands is not supported in Map"); + } + + HloInstructionProto instr; + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); + absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, + computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape, + dimensions)); + + const Shape& output_shape = instr.shape(); + const int64 output_rank = ShapeUtil::Rank(output_shape); + AddCalledComputation(computation, &instr); + std::vector new_operands(operands.begin(), operands.end()); + for (XlaOp& new_operand : new_operands) { + TF_ASSIGN_OR_RETURN(Shape shape, GetShape(new_operand)); + const int64 rank = ShapeUtil::Rank(shape); + if (rank != output_rank) { + TF_ASSIGN_OR_RETURN(new_operand, + InDimBroadcast(output_shape, new_operand, {})); + TF_ASSIGN_OR_RETURN(shape, GetShape(new_operand)); + } + if (!ShapeUtil::SameDimensions(output_shape, shape)) { + TF_ASSIGN_OR_RETURN(new_operand, + AddBroadcastSequence(output_shape, new_operand)); + } + } + + return AddInstruction(std::move(instr), HloOpcode::kMap, new_operands); + }); +} + +XlaOp XlaBuilder::RngOp(RandomDistribution distribution, + absl::Span parameters, + const Shape& shape) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + // Check the number of parameters per RNG distribution. 
+ switch (distribution) { + case RandomDistribution::RNG_NORMAL: + case RandomDistribution::RNG_UNIFORM: + if (parameters.size() != 2) { + return InvalidArgument( + "RNG distribution (%s) expects 2 parameters, but got %ld", + RandomDistribution_Name(distribution), parameters.size()); + } + break; + default: + LOG(FATAL) << "unhandled distribution " << distribution; + } + + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); + *instr.mutable_shape() = shape; + + instr.set_distribution(distribution); + + return AddInstruction(std::move(instr), HloOpcode::kRng, parameters); + }); +} + +XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma, + const Shape& shape) { + return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape); +} + +XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b, + const Shape& shape) { + return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape); +} + +XlaOp XlaBuilder::While(const XlaComputation& condition, + const XlaComputation& body, const XlaOp& init) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + // Infer shape. + TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape()); + TF_ASSIGN_OR_RETURN(const auto& condition_program_shape, + condition.GetProgramShape()); + TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferWhileShape(condition_program_shape, + body_program_shape, init_shape)); + // Body comes before condition computation in the vector. 
+ AddCalledComputation(body, &instr); + AddCalledComputation(condition, &instr); + return AddInstruction(std::move(instr), HloOpcode::kWhile, {init}); + }); +} + +XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); + TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, + GetShape(start_indices)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferGatherShape(input_shape, start_indices_shape, + dimension_numbers, slice_sizes)); + + *instr.mutable_gather_dimension_numbers() = dimension_numbers; + for (int64 bound : slice_sizes) { + instr.add_gather_slice_sizes(bound); + } + + return AddInstruction(std::move(instr), HloOpcode::kGather, + {input, start_indices}); + }); +} + +XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, + const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); + TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, + GetShape(scatter_indices)); + TF_ASSIGN_OR_RETURN(const Shape& updates_shape, GetShape(updates)); + TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, + update_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferScatterShape( + input_shape, scatter_indices_shape, updates_shape, + to_apply_shape, dimension_numbers)); + + *instr.mutable_scatter_dimension_numbers() = dimension_numbers; + + AddCalledComputation(update_computation, &instr); + return AddInstruction(std::move(instr), HloOpcode::kScatter, + {input, scatter_indices, updates}); + }); +} + +XlaOp XlaBuilder::Conditional(const 
XlaOp& predicate, const XlaOp& true_operand, + const XlaComputation& true_computation, + const XlaOp& false_operand, + const XlaComputation& false_computation) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate)); + TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape, + GetShape(true_operand)); + TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape, + true_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape, + GetShape(false_operand)); + TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape, + false_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferConditionalShape( + predicate_shape, true_operand_shape, false_operand_shape, + true_computation_shape, false_computation_shape)); + + // The index of true_computation must be 0 and that of false computation + // must be 1. + AddCalledComputation(true_computation, &instr); + AddCalledComputation(false_computation, &instr); + + return AddInstruction(std::move(instr), HloOpcode::kConditional, + {predicate, true_operand, false_operand}); + }); +} + +XlaOp XlaBuilder::Reduce(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span dimensions_to_reduce) { + return Reduce(absl::Span({operand}), + absl::Span({init_value}), computation, + dimensions_to_reduce); +} + +XlaOp XlaBuilder::Reduce(absl::Span operands, + absl::Span init_values, + const XlaComputation& computation, + absl::Span dimensions_to_reduce) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, + computation.GetProgramShape()); + + std::vector all_operands; + all_operands.insert(all_operands.end(), operands.begin(), operands.end()); + all_operands.insert(all_operands.end(), init_values.begin(), + init_values.end()); + + 
std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& operand_shapes, + GetOperandShapes(all_operands)); + absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), + [](const Shape& shape) { return &shape; }); + + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferReduceShape( + operand_shape_ptrs, dimensions_to_reduce, called_program_shape)); + + for (int64 dim : dimensions_to_reduce) { + instr.add_dimensions(dim); + } + + AddCalledComputation(computation, &instr); + + return AddInstruction(std::move(instr), HloOpcode::kReduce, all_operands); + }); +} + +XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + std::vector all_dimnos(ShapeUtil::Rank(operand_shape)); + std::iota(all_dimnos.begin(), all_dimnos.end(), 0); + return Reduce(operand, init_value, computation, all_dimnos); + }); +} + +XlaOp XlaBuilder::ReduceWindow(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + Padding padding) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_RETURN_IF_ERROR( + ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()), + window_dimensions, window_strides)); + + std::vector> padding_values = + MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions, + window_strides, padding); + return ReduceWindowWithGeneralPadding(operand, init_value, computation, + window_dimensions, window_strides, + padding_values); + }); +} + +XlaOp XlaBuilder::ReduceWindowWithGeneralPadding( + const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> 
padding) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value)); + TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, + computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + MakeWindow(window_dimensions, window_strides, padding, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferReduceWindowShape(operand_shape, init_shape, + instr.window(), to_apply_shape)); + + AddCalledComputation(computation, &instr); + return AddInstruction(std::move(instr), HloOpcode::kReduceWindow, + {operand, init_value}); + }); +} + +XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, float epsilon, + int64 feature_index) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); + TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferBatchNormTrainingShape( + operand_shape, scale_shape, offset_shape, feature_index)); + + instr.set_epsilon(epsilon); + instr.set_feature_index(feature_index); + + return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining, + {operand, scale, offset}); + }); +} + +XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, const XlaOp& mean, + const XlaOp& variance, float epsilon, + int64 feature_index) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); + TF_ASSIGN_OR_RETURN(const Shape& 
offset_shape, GetShape(offset)); + TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean)); + TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferBatchNormInferenceShape( + operand_shape, scale_shape, offset_shape, + mean_shape, variance_shape, feature_index)); + + instr.set_epsilon(epsilon); + instr.set_feature_index(feature_index); + + return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference, + {operand, scale, offset, mean, variance}); + }); +} + +XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale, + const XlaOp& batch_mean, const XlaOp& batch_var, + const XlaOp& grad_output, float epsilon, + int64 feature_index) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); + TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean)); + TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var)); + TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferBatchNormGradShape( + operand_shape, scale_shape, batch_mean_shape, + batch_var_shape, grad_output_shape, feature_index)); + + instr.set_epsilon(epsilon); + instr.set_feature_index(feature_index); + + return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad, + {operand, scale, batch_mean, batch_var, grad_output}); + }); +} + +XlaOp XlaBuilder::CrossReplicaSum( + const XlaOp& operand, absl::Span replica_groups) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {}); + auto b = CreateSubBuilder("sum"); + b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"), + 
b->Parameter(/*parameter_number=*/1, scalar_shape, "y")); + TF_ASSIGN_OR_RETURN(auto computation, b->Build()); + return CrossReplicaSum(operand, computation, replica_groups, + /*channel_id=*/absl::nullopt); + }); +} + +XlaOp XlaBuilder::CrossReplicaSum( + const XlaOp& operand, const XlaComputation& computation, + absl::Span replica_groups, + const absl::optional& channel_id) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferCrossReplicaSumShape({&operand_shape})); + + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + + if (channel_id.has_value()) { + instr.set_all_reduce_id(channel_id->handle()); + } + + AddCalledComputation(computation, &instr); + + return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum, + {operand}); + }); +} + +XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + + // The HloInstruction for Alltoall currently only handles the data + // communication: it accepts N already split parts and scatters them to N + // cores, and each core gathers the N received parts into a tuple as the + // output. So here we explicitly split the operand before the hlo alltoall, + // and concat the tuple elements. + // + // First, run shape inference to make sure the shapes are valid. + TF_RETURN_IF_ERROR( + ShapeInference::InferAllToAllShape(operand_shape, split_dimension, + concat_dimension, split_count) + .status()); + + // Split into N parts. 
+ std::vector slices; + slices.reserve(split_count); + const int64 block_size = + operand_shape.dimensions(split_dimension) / split_count; + for (int i = 0; i < split_count; i++) { + slices.push_back(SliceInDim(operand, /*start_index=*/i * block_size, + /*limit_index=*/(i + 1) * block_size, + /*stride=*/1, /*dimno=*/split_dimension)); + } + + // Handle data communication. + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(auto slice_shapes, this->GetOperandShapes(slices)); + std::vector slice_shape_ptrs; + absl::c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs)); + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + TF_ASSIGN_OR_RETURN( + XlaOp alltoall, + AddInstruction(std::move(instr), HloOpcode::kAllToAll, slices)); + + // Concat the N received parts. + std::vector received; + received.reserve(split_count); + for (int i = 0; i < split_count; i++) { + received.push_back(this->GetTupleElement(alltoall, i)); + } + return this->ConcatInDim(received, concat_dimension); + }); +} + +XlaOp XlaBuilder::CollectivePermute( + const XlaOp& operand, + const std::vector>& source_target_pairs) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferCollectivePermuteShape(operand_shape)); + + for (const auto& pair : source_target_pairs) { + auto* proto_pair = instr.add_source_target_pairs(); + proto_pair->set_source(pair.first); + proto_pair->set_target(pair.second); + } + + return AddInstruction(std::move(instr), HloOpcode::kCollectivePermute, + {operand}); + }); +} + +XlaOp XlaBuilder::SelectAndScatter(const XlaOp& operand, + const XlaComputation& select, + absl::Span window_dimensions, + absl::Span 
window_strides, + Padding padding, const XlaOp& source, + const XlaOp& init_value, + const XlaComputation& scatter) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + return SelectAndScatterWithGeneralPadding( + operand, select, window_dimensions, window_strides, + MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions, + window_strides, padding), + source, init_value, scatter); + }); +} + +XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding( + const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const XlaOp& source, + const XlaOp& init_value, const XlaComputation& scatter) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source)); + TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value)); + TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape, + select.GetProgramShape()); + TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape, + scatter.GetProgramShape()); + TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + MakeWindow(window_dimensions, window_strides, padding, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferSelectAndScatterShape( + operand_shape, select_shape, instr.window(), + source_shape, init_shape, scatter_shape)); + + AddCalledComputation(select, &instr); + AddCalledComputation(scatter, &instr); + + return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter, + {operand, source, init_value}); + }); +} + +XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits, + const int mantissa_bits) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, 
GetShape(operand)); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferReducePrecisionShape( + operand_shape, exponent_bits, mantissa_bits)); + instr.set_exponent_bits(exponent_bits); + instr.set_mantissa_bits(mantissa_bits); + return AddInstruction(std::move(instr), HloOpcode::kReducePrecision, + {operand}); + }); +} + +void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) { + ReportErrorOrReturn([&]() -> StatusOr { + // Send HLO takes two operands: a data operand and a token. Generate the + // token to pass into the send. + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. + HloInstructionProto token_instr; + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), + HloOpcode::kAfterAll, {})); + + return SendWithToken(operand, token, handle); + }); +} + +XlaOp XlaBuilder::SendWithToken(const XlaOp& operand, const XlaOp& token, + const ChannelHandle& handle) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (handle.type() != ChannelHandle::DEVICE_TO_DEVICE) { + return InvalidArgument("Send must use a device-to-device channel"); + } + + // Send instruction produces a tuple of {aliased operand, U32 context, + // token}. 
+ HloInstructionProto send_instr; + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + send_instr.set_channel_id(handle.handle()); + TF_ASSIGN_OR_RETURN(XlaOp send, + AddInstruction(std::move(send_instr), HloOpcode::kSend, + {operand, token})); + + HloInstructionProto send_done_instr; + *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + send_done_instr.set_channel_id(handle.handle()); + return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone, + {send}); + }); +} + +XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) { + return ReportErrorOrReturn([&]() -> StatusOr { + // Recv HLO takes a single token operand. Generate the token to pass into + // the Recv and RecvDone instructions. + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. + HloInstructionProto token_instr; + *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr), + HloOpcode::kAfterAll, {})); + + XlaOp recv = RecvWithToken(token, shape, handle); + + // The RecvDone instruction produces a tuple of the data and a token + // type. Return XLA op containing the data. + // TODO(b/80000000): Remove this when clients have been updated to handle + // tokens. 
+ HloInstructionProto recv_data; + *recv_data.mutable_shape() = shape; + recv_data.set_tuple_index(0); + return AddInstruction(std::move(recv_data), HloOpcode::kGetTupleElement, + {recv}); + }); +} + +XlaOp XlaBuilder::RecvWithToken(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (handle.type() != ChannelHandle::DEVICE_TO_DEVICE) { + return InvalidArgument("Recv must use a device-to-device channel"); + } + + // Recv instruction produces a tuple of {receive buffer, U32 context, + // token}. + HloInstructionProto recv_instr; + *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + recv_instr.set_channel_id(handle.handle()); + TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr), + HloOpcode::kRecv, {token})); + + HloInstructionProto recv_done_instr; + *recv_done_instr.mutable_shape() = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + recv_done_instr.set_channel_id(handle.handle()); + return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone, + {recv}); + }); +} + +XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const ChannelHandle& handle) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (!LayoutUtil::HasLayout(shape_with_layout)) { + return InvalidArgument("Shape passed to SendToHost must have a layout"); + } + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { + return InvalidArgument( + "SendToHost shape %s must be compatible with operand shape %s", + ShapeUtil::HumanStringWithLayout(shape_with_layout), + ShapeUtil::HumanStringWithLayout(operand_shape)); + } + // TODO(b/111544877): Support tuple shapes. 
+ if (!ShapeUtil::IsArray(operand_shape)) { + return InvalidArgument("SendToHost only supports array shapes, shape: %s", + ShapeUtil::HumanString(operand_shape)); + } + + if (handle.type() != ChannelHandle::DEVICE_TO_HOST) { + return InvalidArgument("SendToHost must use a device-to-host channel"); + } + + // Send instruction produces a tuple of {aliased operand, U32 context, + // token}. + HloInstructionProto send_instr; + *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape( + {shape_with_layout, ShapeUtil::MakeShape(U32, {}), + ShapeUtil::MakeTokenShape()}); + send_instr.set_channel_id(handle.handle()); + send_instr.set_is_host_transfer(true); + TF_ASSIGN_OR_RETURN(XlaOp send, + AddInstruction(std::move(send_instr), HloOpcode::kSend, + {operand, token})); + + HloInstructionProto send_done_instr; + *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape(); + send_done_instr.set_channel_id(handle.handle()); + send_done_instr.set_is_host_transfer(true); + return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone, + {send}); + }); +} + +XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (!LayoutUtil::HasLayout(shape)) { + return InvalidArgument("Shape passed to RecvFromHost must have a layout"); + } + + // TODO(b/111544877): Support tuple shapes. + if (!ShapeUtil::IsArray(shape)) { + return InvalidArgument( + "RecvFromHost only supports array shapes, shape: %s", + ShapeUtil::HumanString(shape)); + } + + if (handle.type() != ChannelHandle::HOST_TO_DEVICE) { + return InvalidArgument("RecvFromHost must use a host-to-device channel"); + } + + // Recv instruction produces a tuple of {receive buffer, U32 context, + // token}. 
+ HloInstructionProto recv_instr; + *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape( + {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()}); + recv_instr.set_channel_id(handle.handle()); + recv_instr.set_is_host_transfer(true); + TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr), + HloOpcode::kRecv, {token})); + + HloInstructionProto recv_done_instr; + *recv_done_instr.mutable_shape() = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()}); + recv_done_instr.set_channel_id(handle.handle()); + recv_done_instr.set_is_host_transfer(true); + return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone, + {recv}); + }); +} + +StatusOr XlaBuilder::IsConstant(const XlaOp& operand) const { + TF_RETURN_IF_ERROR(first_error_); + + // Verify that the handle is valid. + TF_RETURN_IF_ERROR(LookUpInstruction(operand).status()); + + bool is_constant = true; + std::set visited; + IsConstantVisitor(operand.handle(), &visited, &is_constant); + return is_constant; +} + +StatusOr XlaBuilder::BuildConstantSubGraph( + const XlaOp& root_op) const { + TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op)); + if (!is_constant) { + auto op_status = LookUpInstruction(root_op); + string op_string = + op_status.ok() ? op_status.ValueOrDie()->name() : ""; + return InvalidArgument( + "Operand to BuildConstantSubGraph depends on a parameter.\n\n" + " op requested for constant subgraph: %s\n\n" + "This is an internal error that typically happens when the XLA user " + "(e.g. TensorFlow) is attempting to determine a value that must be a " + "compile-time constant (e.g. an array dimension) but it is not capable " + "of being evaluated at XLA compile time.\n\n" + "Please file a usability bug with the framework being used (e.g. 
" + "TensorFlow).", + op_string); + } + + TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, + LookUpInstruction(root_op)); + + HloComputationProto entry; + entry.set_id(GetUniqueId()); // Give the computation a global unique id. + entry.set_name(StrCat(name_, entry.id(), "_compute_constant")); + entry.set_root_id(root->id()); + ProgramShape* program_shape = entry.mutable_program_shape(); + *program_shape->mutable_result() = root->shape(); + + // We use std::set to keep the instruction ids in ascending order (which is + // also a valid denpendency order). The related ops will be added to the + // subgraph in the same order. + std::set related_ops; + tensorflow::gtl::FlatSet related_calls; // Related computations. + std::queue worklist; + worklist.push(root->id()); + related_ops.insert(root->id()); + while (!worklist.empty()) { + int64 node = worklist.front(); + worklist.pop(); + for (int64 id : instructions_[node].operand_ids()) { + if (related_ops.insert(id).second) { + worklist.push(id); + } + } + for (int64 called_id : instructions_[node].called_computation_ids()) { + related_calls.insert(called_id); + } + } + + // Add related ops to the computation. + for (int64 id : related_ops) { + auto* instr = entry.add_instructions(); + *instr = instructions_[id]; + // Ensures that the instruction names are unique among the graph. 
+ const string& new_name = + StrCat(instr->name(), ".", entry.id(), ".", instr->id()); + instr->set_name(new_name); + } + + XlaComputation computation(entry.id()); + HloModuleProto* module = computation.mutable_proto(); + module->set_name(entry.name()); + module->set_id(entry.id()); + module->set_entry_computation_name(entry.name()); + module->set_entry_computation_id(entry.id()); + *module->mutable_program_shape() = *program_shape; + for (auto& e : embedded_) { + if (related_calls.find(e.second.id()) != related_calls.end()) { + *module->add_computations() = e.second; + } + } + *module->add_computations() = std::move(entry); + + return std::move(computation); +} + +std::unique_ptr XlaBuilder::CreateSubBuilder( + const string& computation_name) { + auto sub_builder = absl::make_unique(computation_name); + sub_builder->parent_builder_ = this; + sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_; + return sub_builder; +} + +/* static */ ConvolutionDimensionNumbers +XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { + ConvolutionDimensionNumbers dimension_numbers; + dimension_numbers.set_input_batch_dimension(kConvBatchDimension); + dimension_numbers.set_input_feature_dimension(kConvFeatureDimension); + dimension_numbers.set_output_batch_dimension(kConvBatchDimension); + dimension_numbers.set_output_feature_dimension(kConvFeatureDimension); + dimension_numbers.set_kernel_output_feature_dimension( + kConvKernelOutputDimension); + dimension_numbers.set_kernel_input_feature_dimension( + kConvKernelInputDimension); + for (int i = 0; i < num_spatial_dims; ++i) { + dimension_numbers.add_input_spatial_dimensions(i + 2); + dimension_numbers.add_kernel_spatial_dimensions(i + 2); + dimension_numbers.add_output_spatial_dimensions(i + 2); + } + return dimension_numbers; +} + +/* static */ Status XlaBuilder::Validate( + const ConvolutionDimensionNumbers& dnum) { + if (dnum.input_spatial_dimensions_size() < 2) { + return 
FailedPrecondition("input spacial dimension < 2: %d", + dnum.input_spatial_dimensions_size()); + } + if (dnum.kernel_spatial_dimensions_size() < 2) { + return FailedPrecondition("kernel spacial dimension < 2: %d", + dnum.kernel_spatial_dimensions_size()); + } + if (dnum.output_spatial_dimensions_size() < 2) { + return FailedPrecondition("output spacial dimension < 2: %d", + dnum.output_spatial_dimensions_size()); + } + + if (std::set( + {dnum.input_batch_dimension(), dnum.input_feature_dimension(), + dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)}) + .size() != 4) { + return FailedPrecondition( + "dimension numbers for the input are not unique: (%d, %d, %d, " + "%d)", + dnum.input_batch_dimension(), dnum.input_feature_dimension(), + dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)); + } + if (std::set({dnum.kernel_output_feature_dimension(), + dnum.kernel_input_feature_dimension(), + dnum.kernel_spatial_dimensions(0), + dnum.kernel_spatial_dimensions(1)}) + .size() != 4) { + return FailedPrecondition( + "dimension numbers for the weight are not unique: (%d, %d, %d, " + "%d)", + dnum.kernel_output_feature_dimension(), + dnum.kernel_input_feature_dimension(), + dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1)); + } + if (std::set({dnum.output_batch_dimension(), + dnum.output_feature_dimension(), + dnum.output_spatial_dimensions(0), + dnum.output_spatial_dimensions(1)}) + .size() != 4) { + return FailedPrecondition( + "dimension numbers for the output are not unique: (%d, %d, %d, " + "%d)", + dnum.output_batch_dimension(), dnum.output_feature_dimension(), + dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1)); + } + return Status::OK(); +} + +StatusOr XlaBuilder::AddInstruction(HloInstructionProto&& instr, + HloOpcode opcode, + absl::Span operands) { + TF_RETURN_IF_ERROR(first_error_); + + const int64 handle = instructions_.size(); + instr.set_id(handle); + 
instr.set_opcode(HloOpcodeString(opcode)); + if (instr.name().empty()) { + instr.set_name(StrCat(instr.opcode())); + } + for (const auto& operand : operands) { + if (operand.builder_ == nullptr) { + return InvalidArgument("invalid XlaOp with handle %d", operand.handle()); + } + if (operand.builder_ != this) { + return InvalidArgument("Do not add XlaOp from builder %s to builder %s", + operand.builder_->name(), this->name()); + } + instr.add_operand_ids(operand.handle()); + } + + *instr.mutable_metadata() = metadata_; + if (sharding_) { + *instr.mutable_sharding() = *sharding_; + } + + instructions_.push_back(instr); + + XlaOp op(handle, this); + return op; +} + +void XlaBuilder::AddCalledComputation(const XlaComputation& computation, + HloInstructionProto* instr) { + instr->add_called_computation_ids(computation.proto().entry_computation_id()); + for (const HloComputationProto& e : computation.proto().computations()) { + embedded_.insert({e.id(), e}); + } +} + +StatusOr XlaBuilder::LookUpInstruction( + const XlaOp& op) const { + TF_RETURN_IF_ERROR(first_error_); + + if (op.builder_ == nullptr) { + return InvalidArgument( + "invalid XlaOp with handle %d; the builder of this op is freed", + op.handle()); + } + if (op.builder_ != this) { + return InvalidArgument( + "XlaOp with handle %d is built by builder '%s', but is trying to use " + "it in builder '%s'", + op.handle(), op.builder_->name(), this->name()); + } + + if (op.handle() >= instructions_.size() || op.handle() < 0) { + return InvalidArgument("no XlaOp value %d", op.handle()); + } + return &instructions_[op.handle()]; +} + +// Enqueues a "retrieve parameter value" instruction for a parameter that was +// passed to the computation. +XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape, + const string& name) { + return builder->Parameter(parameter_number, shape, name); +} + +// Enqueues a constant with the value of the given literal onto the +// computation. 
+XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal) { + return builder->ConstantLiteral(literal); +} + +XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes) { + return operand.builder()->Broadcast(operand, broadcast_sizes); +} + +XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, + const absl::Span broadcast_dimensions) { + return operand.builder()->BroadcastInDim(operand, shape, + broadcast_dimensions); +} + +XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, + const PaddingConfig& padding_config) { + return operand.builder()->Pad(operand, padding_value, padding_config); +} + +XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, + absl::Span new_sizes) { + return operand.builder()->Reshape(operand, dimensions, new_sizes); +} + +XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes) { + return operand.builder()->Reshape(operand, new_sizes); +} + +XlaOp Collapse(const XlaOp& operand, absl::Span dimensions) { + return operand.builder()->Collapse(operand, dimensions); +} + +XlaOp Slice(const XlaOp& operand, absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) { + return operand.builder()->Slice(operand, start_indices, limit_indices, + strides); +} + +XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, + int64 stride, int64 dimno) { + return operand.builder()->SliceInDim(operand, start_index, limit_index, + stride, dimno); +} + +XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, + absl::Span slice_sizes) { + return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); +} + +XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, + const XlaOp& start_indices) { + return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); +} + +XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, + int64 dimension) { + return builder->ConcatInDim(operands, dimension); +} + +void Trace(const string& tag, 
const XlaOp& operand) { + return operand.builder()->Trace(tag, operand); +} + +XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false) { + return pred.builder()->Select(pred, on_true, on_false); +} + +XlaOp Tuple(XlaBuilder* builder, absl::Span elements) { + return builder->Tuple(elements); +} + +XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index) { + return tuple_data.builder()->GetTupleElement(tuple_data, index); +} + +XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Eq(lhs, rhs, broadcast_dimensions); +} + +XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Ne(lhs, rhs, broadcast_dimensions); +} + +XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Ge(lhs, rhs, broadcast_dimensions); +} + +XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Gt(lhs, rhs, broadcast_dimensions); +} + +XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Lt(lhs, rhs, broadcast_dimensions); +} + +XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Le(lhs, rhs, broadcast_dimensions); +} + +XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->Dot(lhs, rhs, precision_config_proto); +} + +XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, + const DotDimensionNumbers& dimension_numbers, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->DotGeneral(lhs, rhs, dimension_numbers, + precision_config_proto); +} + +XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->Conv(lhs, rhs, window_strides, 
padding, + feature_group_count, precision_config_proto); +} + +XlaOp ConvWithGeneralPadding( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->ConvWithGeneralPadding(lhs, rhs, window_strides, + padding, feature_group_count, + precision_config_proto); +} + +XlaOp ConvWithGeneralDimensions( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + Padding padding, const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->ConvWithGeneralDimensions( + lhs, rhs, window_strides, padding, dimension_numbers, feature_group_count, + precision_config_proto); +} + +XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->ConvGeneral(lhs, rhs, window_strides, padding, + dimension_numbers, feature_group_count, + precision_config_proto); +} + +XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto) { + return lhs.builder()->ConvGeneralDilated( + lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation, + dimension_numbers, feature_group_count, precision_config_proto); +} + +XlaOp Fft(const XlaOp& operand, FftType fft_type, + absl::Span fft_length) { + return operand.builder()->Fft(operand, fft_type, fft_length); +} + +XlaOp Infeed(XlaBuilder* builder, const Shape& shape, const string& config) { + return builder->Infeed(shape, config); +} + +void Outfeed(const 
XlaOp& operand, const Shape& shape_with_layout, + const string& outfeed_config) { + return operand.builder()->Outfeed(operand, shape_with_layout, outfeed_config); +} + +XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, + absl::Span operands) { + return builder->Call(computation, operands); +} + +XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape) { + return builder->CustomCall(call_target_name, operands, shape); +} + +XlaOp Complex(const XlaOp& real, const XlaOp& imag, + absl::Span broadcast_dimensions) { + return real.builder()->Complex(real, imag, broadcast_dimensions); +} + +XlaOp Conj(const XlaOp& operand) { return operand.builder()->Conj(operand); } + +XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Add(lhs, rhs, broadcast_dimensions); +} + +XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Sub(lhs, rhs, broadcast_dimensions); +} + +XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Mul(lhs, rhs, broadcast_dimensions); +} + +XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Div(lhs, rhs, broadcast_dimensions); +} + +XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Rem(lhs, rhs, broadcast_dimensions); +} + +XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Max(lhs, rhs, broadcast_dimensions); +} + +XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Min(lhs, rhs, broadcast_dimensions); +} + +XlaOp And(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->And(lhs, rhs, broadcast_dimensions); +} + +XlaOp Or(const XlaOp& lhs, const XlaOp& rhs, + 
absl::Span broadcast_dimensions) { + return lhs.builder()->Or(lhs, rhs, broadcast_dimensions); +} + +XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Xor(lhs, rhs, broadcast_dimensions); +} + +XlaOp Not(const XlaOp& operand) { return operand.builder()->Not(operand); } + +XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->ShiftLeft(lhs, rhs, broadcast_dimensions); +} + +XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->ShiftRightArithmetic(lhs, rhs, broadcast_dimensions); +} + +XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->ShiftRightLogical(lhs, rhs, broadcast_dimensions); +} + +XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span dimensions_to_reduce) { + return operand.builder()->Reduce(operand, init_value, computation, + dimensions_to_reduce); +} + +// Reduces several arrays simultaneously among the provided dimensions, given +// "computation" as a reduction operator. 
+XlaOp Reduce(XlaBuilder* builder, absl::Span operands, + absl::Span init_values, + const XlaComputation& computation, + absl::Span dimensions_to_reduce) { + return builder->Reduce(operands, init_values, computation, + dimensions_to_reduce); +} + +XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation) { + return operand.builder()->ReduceAll(operand, init_value, computation); +} + +XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding) { + return operand.builder()->ReduceWindow(operand, init_value, computation, + window_dimensions, window_strides, + padding); +} + +XlaOp ReduceWindowWithGeneralPadding( + const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding) { + return operand.builder()->ReduceWindowWithGeneralPadding( + operand, init_value, computation, window_dimensions, window_strides, + padding); +} + +XlaOp CrossReplicaSum(const XlaOp& operand, + absl::Span replica_groups) { + return operand.builder()->CrossReplicaSum(operand, replica_groups); +} + +XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation, + absl::Span replica_groups, + const absl::optional& channel_id) { + return operand.builder()->CrossReplicaSum(operand, computation, + replica_groups, channel_id); +} + +XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups) { + return operand.builder()->AllToAll(operand, split_dimension, concat_dimension, + split_count, replica_groups); +} + +XlaOp CollectivePermute( + const XlaOp& operand, + const std::vector>& source_target_pairs) { + return operand.builder()->CollectivePermute(operand, source_target_pairs); +} + +XlaOp SelectAndScatter(const XlaOp& operand, const 
XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding, + const XlaOp& source, const XlaOp& init_value, + const XlaComputation& scatter) { + return operand.builder()->SelectAndScatter(operand, select, window_dimensions, + window_strides, padding, source, + init_value, scatter); +} + +XlaOp SelectAndScatterWithGeneralPadding( + const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const XlaOp& source, + const XlaOp& init_value, const XlaComputation& scatter) { + return operand.builder()->SelectAndScatterWithGeneralPadding( + operand, select, window_dimensions, window_strides, padding, source, + init_value, scatter); +} + +XlaOp Abs(const XlaOp& operand) { return operand.builder()->Abs(operand); } + +XlaOp Atan2(const XlaOp& y, const XlaOp& x, + absl::Span broadcast_dimensions) { + return y.builder()->Atan2(y, x, broadcast_dimensions); +} + +XlaOp Exp(const XlaOp& operand) { return operand.builder()->Exp(operand); } + +XlaOp Expm1(const XlaOp& operand) { return operand.builder()->Expm1(operand); } + +XlaOp Floor(const XlaOp& operand) { return operand.builder()->Floor(operand); } + +XlaOp Ceil(const XlaOp& operand) { return operand.builder()->Ceil(operand); } + +XlaOp Round(const XlaOp& operand) { return operand.builder()->Round(operand); } + +XlaOp Log(const XlaOp& operand) { return operand.builder()->Log(operand); } + +XlaOp Log1p(const XlaOp& operand) { return operand.builder()->Log1p(operand); } + +XlaOp Sign(const XlaOp& operand) { return operand.builder()->Sign(operand); } + +XlaOp Clz(const XlaOp& operand) { return operand.builder()->Clz(operand); } + +XlaOp Cos(const XlaOp& operand) { return operand.builder()->Cos(operand); } + +XlaOp Sin(const XlaOp& operand) { return operand.builder()->Sin(operand); } + +XlaOp Tanh(const XlaOp& operand) { return operand.builder()->Tanh(operand); } + +XlaOp Real(const XlaOp& operand) { return 
operand.builder()->Real(operand); } + +XlaOp Imag(const XlaOp& operand) { return operand.builder()->Imag(operand); } + +XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions) { + return lhs.builder()->Pow(lhs, rhs, broadcast_dimensions); +} + +XlaOp IsFinite(const XlaOp& operand) { + return operand.builder()->IsFinite(operand); +} + +XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type) { + return operand.builder()->ConvertElementType(operand, new_element_type); +} + +XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type) { + return operand.builder()->BitcastConvertType(operand, new_element_type); +} + +XlaOp Neg(const XlaOp& operand) { return operand.builder()->Neg(operand); } + +XlaOp Transpose(const XlaOp& operand, absl::Span permutation) { + return operand.builder()->Transpose(operand, permutation); +} + +XlaOp Rev(const XlaOp& operand, absl::Span dimensions) { + return operand.builder()->Rev(operand, dimensions); +} + +XlaOp Sort(XlaOp keys, absl::optional values, int64 dimension) { + return keys.builder()->Sort(keys, std::move(values), dimension); +} + +XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) { + return min.builder()->Clamp(min, operand, max); +} + +XlaOp Map(XlaBuilder* builder, absl::Span operands, + const XlaComputation& computation, absl::Span dimensions, + absl::Span static_operands) { + return builder->Map(operands, computation, dimensions, static_operands); +} + +XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape) { + return mu.builder()->RngNormal(mu, sigma, shape); +} + +XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape) { + return a.builder()->RngUniform(a, b, shape); +} + +XlaOp While(const XlaComputation& condition, const XlaComputation& body, + const XlaOp& init) { + return init.builder()->While(condition, body, init); +} + +XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, + const 
XlaComputation& true_computation, + const XlaOp& false_operand, + const XlaComputation& false_computation) { + return predicate.builder()->Conditional(predicate, true_operand, + true_computation, false_operand, + false_computation); +} + +XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, + const int mantissa_bits) { + return operand.builder()->ReducePrecision(operand, exponent_bits, + mantissa_bits); +} + +XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes) { + return input.builder()->Gather(input, start_indices, dimension_numbers, + slice_sizes); +} + +XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers) { + return input.builder()->Scatter(input, scatter_indices, updates, + update_computation, dimension_numbers); +} + +void Send(const XlaOp& operand, const ChannelHandle& handle) { + return operand.builder()->Send(operand, handle); +} + +XlaOp Recv(XlaBuilder* builder, const Shape& shape, + const ChannelHandle& handle) { + return builder->Recv(shape, handle); +} + +XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, + const ChannelHandle& handle) { + return operand.builder()->SendWithToken(operand, token, handle); +} + +XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle) { + return token.builder()->RecvWithToken(token, shape, handle); +} + +XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, const ChannelHandle& handle) { + return operand.builder()->SendToHost(operand, token, shape_with_layout, + handle); +} + +XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle) { + return token.builder()->RecvFromHost(token, shape, handle); +} + +XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, + const string& 
config) { + return token.builder()->InfeedWithToken(token, shape, config); +} + +XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const string& outfeed_config) { + return operand.builder()->OutfeedWithToken(operand, token, shape_with_layout, + outfeed_config); +} + +XlaOp CreateToken(XlaBuilder* builder) { return builder->CreateToken(); } + +XlaOp AfterAll(XlaBuilder* builder, absl::Span tokens) { + return builder->AfterAll(tokens); +} + +XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, float epsilon, + int64 feature_index) { + return operand.builder()->BatchNormTraining(operand, scale, offset, epsilon, + feature_index); +} + +XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, const XlaOp& mean, + const XlaOp& variance, float epsilon, + int64 feature_index) { + return operand.builder()->BatchNormInference( + operand, scale, offset, mean, variance, epsilon, feature_index); +} + +XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, + const XlaOp& batch_mean, const XlaOp& batch_var, + const XlaOp& grad_output, float epsilon, + int64 feature_index) { + return operand.builder()->BatchNormGrad(operand, scale, batch_mean, batch_var, + grad_output, epsilon, feature_index); +} + +XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size) { + return builder->Iota(type, size); +} + +XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension) { + return builder->Iota(shape, iota_dimension); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..59fbc664f2b35fd00f9b9094d6147847d03797ea --- /dev/null +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -0,0 +1,2285 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_ + +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stacktrace.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +class XlaBuilder; + +// This represents an instruction that has been enqueued using the XlaBuilder. +// This is used to pass to subsequent computations that depends upon the +// instruction as an operand. 
+class XlaOp { + public: + XlaOp() : handle_(-1), builder_(nullptr) { + static_assert(std::is_trivially_destructible::value, + "XlaOp should be trivially destructible"); + } + ~XlaOp() = default; + + // Precondition: !IsUninitialized(). + // + // It's very common to do foo.builder()->bar(). Without this precondition, if + // foo.builder() is null, the call to bar will segfault at some point possibly + // deep in the callstack when we finally dereference `this`. The precondition + // lets us avoid this tricky-to-debug problem. + XlaBuilder* builder() const { + CHECK(builder_ != nullptr); + return builder_; + } + + // Returns true if the XlaOp represents valid, non-erroneous value. + bool valid() const { return handle_ >= 0; } + + // Returns true if the XlaOp was created by the XlaOp() constructor and + // not returned by a builder. + bool IsUninitialized() const { return builder_ == nullptr; } + + bool IsIdenticalTo(const XlaOp& rhs) const { + return handle_ == rhs.handle_ && builder_ == rhs.builder_; + } + + friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) { + out << op.handle(); + return out; + } + + private: + explicit XlaOp(XlaBuilder* builder) : handle_(-1), builder_(builder) {} + XlaOp(int64 handle, XlaBuilder* builder) + : handle_(handle), builder_(builder) {} + + int64 handle() const { return handle_; } + + friend class XlaBuilder; + + // < 0 means "invalid handle". + int64 handle_; + + // Not owned. Non-null for any handle returned by XlaBuilder, even if the + // handle is invalid. + XlaBuilder* builder_; +}; + +// Arithmetic operator overloads for the XlaOp type. +XlaOp operator-(const XlaOp& x); +XlaOp operator+(const XlaOp& x, const XlaOp& y); +XlaOp operator-(const XlaOp& x, const XlaOp& y); +XlaOp operator*(const XlaOp& x, const XlaOp& y); +XlaOp operator/(const XlaOp& x, const XlaOp& y); +XlaOp operator%(const XlaOp& x, const XlaOp& y); + +// Bitwise operator overloads for the XlaOp type. 
+XlaOp operator~(const XlaOp& x); +XlaOp operator&(const XlaOp& x, const XlaOp& y); +XlaOp operator|(const XlaOp& x, const XlaOp& y); +XlaOp operator^(const XlaOp& x, const XlaOp& y); +XlaOp operator<<(const XlaOp& x, const XlaOp& y); +// Performs a right arithmetic shift if 'x' is a signed type, otherwise performs +// a right logical shift. +XlaOp operator>>(const XlaOp& x, const XlaOp& y); + +// We don't overload the relational operators (==, !=, <, <=, >, >=) because the +// semantics might be surprising since their result types are usually 'bool'. +// Furthermore, programmers may expect == to be a structural equality. +// We also choose not to overload any of the mutating operators (e.g., +=, -=) +// because the semantics might be misleading — XLA computations are immutable. + +// A convenient interface for building up computations. +// +// Thread-compatible. +class XlaBuilder { + public: + // computation_name: name to use for the built computation. + XlaBuilder(const string& computation_name); + + XlaBuilder(const XlaBuilder&) = delete; + XlaBuilder& operator=(const XlaBuilder&) = delete; + + ~XlaBuilder(); + + // Returns the computation name. + const string& name() const { return name_; } + + // Sets OpMetadata that will be added to all instructions until cleared. + // + // OpMetadata is often applied to a series of XLA HLO instructions. As a + // result, OpMetadata is set on the Computation Builder. All subsequent + // instructions generated via this Computation Builder will have the same + // OpMetadata attached until a call to ClearOpMetadata. + void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; } + + // Clears the OpMetadata state. + void ClearOpMetadata() { metadata_.Clear(); } + + // Sets an OpSharding that will be attached to all instructions until cleared. + void SetSharding(const OpSharding& sharding) { sharding_ = sharding; } + + // Clears the sharding. Ops will be sharded according to the default placement + // policy.
+ void ClearSharding() { sharding_ = absl::nullopt; } + + // Returns the OpSharding that will be attached to all instructions. + const absl::optional& sharding() const { return sharding_; } + + // Sets the builder to a mode where it will die immediately when an error is + // encountered, rather than producing it in a deferred fashion when Build() is + // called (which is the default). + void set_die_immediately_on_error(bool enabled) { + die_immediately_on_error_ = enabled; + } + + // Default dimension numbers used for a 2D convolution. + static constexpr int64 kConvBatchDimension = 0; + static constexpr int64 kConvFeatureDimension = 1; + static constexpr int64 kConvFirstSpatialDimension = 2; + static constexpr int64 kConvSecondSpatialDimension = 3; + static constexpr int64 kConvKernelOutputDimension = 0; + static constexpr int64 kConvKernelInputDimension = 1; + static constexpr int64 kConvKernelFirstSpatialDimension = 2; + static constexpr int64 kConvKernelSecondSpatialDimension = 3; + + // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for + // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for + // the kernel operand + // {output_feature, input_feature, height, width} = {0, 1, 2, 3}. + static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers( + int num_spatial_dims = 2); + + // Returns an error if the convolution dimension numbers have conflicts. + static Status Validate(const ConvolutionDimensionNumbers& dnum); + + // Returns a new XlaBuilder whose resultant Computation is used only by this + // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error + // behavior as the parent. + std::unique_ptr CreateSubBuilder(const string& computation_name); + + // Builds the computation with the requested operations, or returns a non-ok + // status. Note that all ops that have been enqueued will be moved to the + // computation being returned. 
The root of the computation will be the last + // added operation. + StatusOr Build(); + + // Overload of Build which specifies a particular root instruction for the + // computation. + StatusOr Build(XlaOp root); + + // Builds the computation with the requested operations, or notes an error in + // the parent XlaBuilder and returns an empty computation if building failed. + // This function is intended to be used where the returned XlaComputation is + // only used by the parent XlaBuilder and hence further operation on the + // returned XlaComputation will simply be error'ed out if an error occurred + // while building this computation. If the built computation is to be used by + // a XlaBuilder other than the parent XlaBuilder then Build() should be used + // instead. + XlaComputation BuildAndNoteError(); + + // Returns a subgraph that roots on the given root. If the root is not a + // compile-time constant (see `IsConstant`), returns an error. + // + // This will copy the needed ops/computations to the subgraph. + StatusOr BuildConstantSubGraph(const XlaOp& root_op) const; + + // Returns the first error that was encountered while building the + // computation. When an error is encountered, by default we return a vacuous + // XlaOp and inform the user of the error that occurred while + // building the computation when they make a final call to Build(). + // + // See also set_die_immediately_on_error(). + Status first_error() const { return first_error_; } + + // Returns the shape of the given op. + StatusOr GetShape(const XlaOp& op) const; + + // Returns the (inferred) result for the current computation's shape. This + // assumes the root instruction is the last added instruction. + StatusOr GetProgramShape() const; + + // Returns the (inferred) result for the current computation's shape using the + // given operation as the root. 
+ StatusOr GetProgramShape(XlaOp root) const; + + // Reports an error to the builder, by + // * storing it internally and capturing a backtrace if it's the first error + // (this deferred value will be produced on the call to + // Build()/GetShape()/...) + // * dying if die_immediately_on_error_ is true. + // Returns an XlaOp with an invalid handle but a valid builder. This value can + // be returned in place of a value in APIs that return an XlaOp. + XlaOp ReportError(const Status& error); + + // A helper function that converts a StatusOr into an XlaOp. + // If the Status was an error, reports the error to builder and returns an + // invalid XlaOp handle. + XlaOp ReportErrorOrReturn(const StatusOr& op); + + // A helper function that runs a function that returns a StatusOr and + // returns an XlaOp. + XlaOp ReportErrorOrReturn(const std::function()>& op_creator); + + // Returns true if 'operand' is a compile-time constant. A compile-time + // constant does not depend on any parameters, or on stateful operators such + // as `RngNormal` or `Infeed`. + // + // This tests whether a computation is a compile-time constant without + // evaluating the computation. + StatusOr IsConstant(const XlaOp& operand) const; + + private: + // Build helper which takes the id of the root operation. + StatusOr Build(int64 root_id); + + // Enqueues a "retrieve parameter value" instruction for a parameter that was + // passed to the computation. + XlaOp Parameter(int64 parameter_number, const Shape& shape, + const string& name); + + // Enqueues a constant with the value of the given literal onto the + // computation. + XlaOp ConstantLiteral(const LiteralSlice& literal); + + // Enqueues a constant onto the computation.
Methods are templated on the + // native host type (NativeT) which corresponds to a specific XLA + // PrimitiveType as given in the following table: + // + // Native Type PrimitiveType + // ----------------------------- + // bool PRED + // int32 S32 + // int64 S64 + // uint32 U32 + // uint64 U64 + // float F32 + // double F64 + // + // Note: not all primitive types defined in xla_data.proto have a + // corresponding native type yet. + template + XlaOp ConstantR0(NativeT value); + template + XlaOp ConstantR1(absl::Span values); + XlaOp ConstantR1(const tensorflow::core::Bitmap& values); + template + XlaOp ConstantR2( + std::initializer_list> values); + template + XlaOp ConstantFromArrayWithLayout(const Array& values, + const Layout& layout); + template + XlaOp ConstantFromArray(const Array& values); + template + XlaOp ConstantR2FromArray2DWithLayout(const Array2D& values, + const Layout& layout); + template + XlaOp ConstantR2FromArray2D(const Array2D& values); + template + XlaOp ConstantR3FromArray3DWithLayout(const Array3D& values, + const Layout& layout); + template + XlaOp ConstantR3FromArray3D(const Array3D& values); + template + XlaOp ConstantR4FromArray4DWithLayout(const Array4D& values, + const Layout& layout); + template + XlaOp ConstantR4FromArray4D(const Array4D& values); + + // Enqueues a rank one constant (vector) onto the computation. The vector has + // size 'length' and every element has the value 'value'. + template + XlaOp ConstantR1(int64 length, NativeT value); + + // Adds dimensions to an array by duplicating the data in the array. + // + // The new dimensions are inserted on the left, i.e. if + // broadcast_sizes has values {a0, ..., aN} and the operand shape + // has dimensions {b0, ..., bM} then the shape of the output has + // dimensions {a0, ..., aN, b0, ..., bM}. + // + // The new dimensions index into copies of the operand, i.e. 
+ // + // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] + XlaOp Broadcast(const XlaOp& operand, + absl::Span broadcast_sizes); + + // Performs in-dimension-style broadcast. + // + // Operand specifies the input to be broadcast. "shape" is expected output + // shape. "broadcast_dimensions" are the dimensions to be broadcasting into. + // Dimension numbers in broadcast_dimensions map to individual dimensions + // of the operand, and specify what dimension of the output shape they + // should be broadcast. + // e.g. + // Say operand = [1, 2], i.e., a 1D tensor with 2 elements. + // and dimension of shape is [2,2]. + // Specifying {1} as broadcast_dimension will generate output + // [1 , 2] + // [1 , 2] + // On the other hand, specifying {0} as broadcast_dimension + // will generate output + // [1 , 1] + // [2 , 2] + XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, + const absl::Span broadcast_dimensions); + + // Enqueues a pad operation onto the computation that pads the given value on + // the edges as well as between the elements of the input. padding_config + // specifies the padding amount for each dimension. + XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, + const PaddingConfig& padding_config); + + // Enqueues an operation onto the computation that flattens the operand based + // on the dimension order (major/slowest-varying to minor/fastest-varying) + // given, followed by reshaping it into the shape with the given dimension + // sizes (also major to minor). Conceptually, this is a limited form of + // "shape casting". + XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, + absl::Span new_sizes); + + // Enqueues an operation onto the computation that collapses the operand, from + // first to last dimension (C order), then reshapes it to the given dimension + // sizes. Conceptually, this is a limited form of "shape casting". + XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes); + + // Wrapper for Reshape.
+ // Enqueues an operation to collapse the provided dimensions; e.g. an + // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to + // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must + // be a consecutive, in-order subsequence of the operand dimensions. + // + // Note that collapsing a single dimension does nothing: + // + // {256} collapsing {0} => {256} + // {1} collapsing {0} => {1} + // + // Collapsing multiple dimensions produces a single result dimension: + // + // {256, 2} collapsing {0,1} => {512} + // {256, 2, 3} collapsing {0,1} => {512, 3} + // + // This could potentially cause data to be moved -- it provides a more + // structured form of reshaping than an arbitrary Reshape operation. + XlaOp Collapse(const XlaOp& operand, absl::Span dimensions); + + // Enqueues a slice operation onto the computation that slices the operand + // from the start indices to the limit indices; e.g. + // + // x + // [ 0 1 2 3 ] + // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] + // [ 8 9 a b ] + // + // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D + // range notation. + // The strides parameter determines the stride over the slice + XlaOp Slice(const XlaOp& operand, absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); + + // Enqueues a slice operation in a given dimension, taking all other + // dimensions as they are; e.g. if dimno is 1 from start_index 2 to + // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand + // for: + // + // array[:, 2:4:1, :] + XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, + int64 stride, int64 dimno); + + // Enqueues a slice operation onto the computation that slices the 'operand' + // from dynamic start indices which are passed in 'start_indices'. 
+ // The size of the slice in each dimension is passed in 'slice_sizes', + // which specify the end point of exclusive slice intervals in each + // dimension [start, start + size). + // The shape of 'start_indices' must be rank == 1, with dimension size + // equal to the rank of the 'operand'. + // Slice index calculations are computed modulo input dimension sizes to + // prevent dynamic start indices from generating out-of-bound array accesses. + XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, + absl::Span slice_sizes); + + // Enqueues a dynamic update slice operation onto the computation, which + // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. + // The shape of 'update' determines the shape of the slice of 'operand' + // which is updated. + // The indices specified in 'start_indices' specify the offset of the slice + // of 'operand' which is updated. + // + // update = {10, 11} // calculated at runtime. + // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] + // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] + // [7 8 9] [7 8 9 ] + // + // The shape of 'start_indices' must be rank == 1, with dimension size + // equal to the rank of the 'operand'. + // Slice index calculations are computed modulo update dimension sizes to + // prevent dynamic start indices from generating out-of-bound array accesses. + XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, + const XlaOp& start_indices); + + // Enqueues a concatenate instruction onto the computation. 'operands' must + // have >= 1 entry. + XlaOp ConcatInDim(absl::Span operands, int64 dimension); + + // Enqueue a tracing operation onto the computation; the computation will emit + // a logging message with the operand. + void Trace(const string& tag, const XlaOp& operand); + + // Enqueues a conditional-move-like select operation onto the computation; + // predicated on pred, selects between on_true and on_false. 
+ XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false); + + // Enqueues a tuple-creation instruction onto the computation. + XlaOp Tuple(absl::Span elements); + + // Enqueues a tuple-element-get instruction onto the computation. + XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); + + // Enqueues an equal-to comparison instruction onto the computation. + XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a not-equal comparison instruction onto the computation. + XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a greater-or-equal comparison instruction onto the computation. + XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a greater-than comparison instruction onto the computation. + XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a less-than comparison instruction onto the computation. + XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a less-or-equal comparison instruction onto the computation. + XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a dot instruction onto the computation. + XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a general dot instruction onto the computation. + XlaOp DotGeneral( + const XlaOp& lhs, const XlaOp& rhs, + const DotDimensionNumbers& dimension_numbers, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a convolution instruction onto the computation, which uses the + // default convolution dimension numbers. 
+ XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a convolution instruction onto the computation, with the caller + // provided padding configuration in the format returned by MakePadding(). + XlaOp ConvWithGeneralPadding( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a convolution instruction onto the computation, with the caller + // provided dimension numbers configuration. + XlaOp ConvWithGeneralDimensions( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a convolution instruction onto the computation, with the caller + // provided padding configuration as well as the dimension numbers. + XlaOp ConvGeneral( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues a convolution instruction onto the computation, with the caller + // provided padding configuration, dilation factors and dimension numbers. + XlaOp ConvGeneralDilated( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + + // Enqueues an FFT instruction onto the computation, of the given type and + // with the given FFT length. 
+ XlaOp Fft(const XlaOp& operand, FftType fft_type, + absl::Span fft_length); + + // Enqueues an infeed instruction onto the computation, which writes data of + // the given shape to the infeed buffer of the device. + XlaOp Infeed(const Shape& shape, const string& config = ""); + XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, + const string& config = ""); + + // Enqueues an outfeed instruction onto the computation. This instruction + // generates outgoing data transfers for the given data. + // + // shape_with_layout communicates the laid out shape that we want to outfeed + // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error + // will occur. + void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, + const string& outfeed_config); + XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const string& outfeed_config); + + // Enqueues a call instruction onto the computation. + XlaOp Call(const XlaComputation& computation, + absl::Span operands); + + // Enqueues a custom call instruction onto the computation. + // During code generation, a call instruction is emitted which targets a + // symbol with the name |call_target_name|. The |operands| are passed to the + // call instruction. |shape| is the resultant shape. + XlaOp CustomCall(const string& call_target_name, + absl::Span operands, const Shape& shape); + + // The following methods enqueue element-wise binary arithmetic operations + // onto the computation. The shapes of the operands have to match unless one + // of the operands is a scalar, or an explicit broadcast dimension is given + // (see g3doc for more details). + + // Enqueues a complex compose instruction onto the computation. + XlaOp Complex(const XlaOp& real, const XlaOp& imag, + absl::Span broadcast_dimensions = {}); + + // Enqueues a complex conjugate instruction onto the computation. 
+ XlaOp Conj(const XlaOp& operand); + + // Enqueues an add instruction onto the computation. + XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a subtract instruction onto the computation. + XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a multiply instruction onto the computation. + XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a divide instruction onto the computation. + XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a remainder instruction onto the computation. + XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a max instruction onto the computation. + XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues a min instruction onto the computation. + XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Element-wise logical operators + XlaOp And(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + XlaOp Or(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + XlaOp Not(const XlaOp& operand); + + XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Reduces an array among the provided dimensions, given "computation" as a + // reduction operator. 
+ XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + + // Reduces several arrays simultaneously among the provided dimensions, given + // "computation" as a reduction operator. + XlaOp Reduce(absl::Span operands, + absl::Span init_values, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + + // Convenience wrapper around the above that reduces all the dimensions in the + // operand shape. + XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation); + + // Enqueues a windowed reduce instruction onto the computation. + XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding); + + // As ReduceWindow(), but the padding is given in the format + // returned by MakePadding(). + XlaOp ReduceWindowWithGeneralPadding( + const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding); + + // Returns the sum of the operand value within each subgroup of replicas. All + // replicas supply one input to the sum and all replicas receive the resulting + // sum for each subgroup. + XlaOp CrossReplicaSum(const XlaOp& operand, + absl::Span replica_groups = {}); + + // Enqueues an operation that do an AllReduce of the operand cross cores. Here + // AllReduce means doing a reduction on the input operand cross cores and then + // broadcasting the reduction result to those cores. The reduction function is + // defined by `computation`, which should be a commutative computation on + // scalars, e.g., add, min, or max. The way that AllReduce is applied is + // configured by: + // + // - `replica_groups`: each ReplicaGroup contains a list of replica id. If + // empty, all replicas belong to one group. 
Allreduce will be applied within + // subgroups. For example, we have 4 replicas, then + // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0, + // replica 1 and 3 are in subgroup 1. + // + // - `channel_id`: for Allreduce nodes from different modules, if they have + // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will + // not be applied cross modules. + // + // TODO(b/79737069): Rename this to AllReduce when it's ready to use. + XlaOp CrossReplicaSum( + const XlaOp& operand, const XlaComputation& computation, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt); + + // Enqueues an operation that does an AllToAll of the operand across cores. + XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups); + + // Enqueues an operation that does a CollectivePermute of the operand across + // cores. + XlaOp CollectivePermute( + const XlaOp& operand, + const std::vector>& source_target_pairs); + + // Enqueues an operation that scatters the `source` array to the selected + // indices of each window. + XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + Padding padding, const XlaOp& source, + const XlaOp& init_value, + const XlaComputation& scatter); + + // As SelectAndScatter(), but the padding is given in the format + // returned by MakePadding(). + XlaOp SelectAndScatterWithGeneralPadding( + const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const XlaOp& source, + const XlaOp& init_value, const XlaComputation& scatter); + + // Enqueues an abs instruction onto the computation. + XlaOp Abs(const XlaOp& operand); + + // Enqueues an atan2 instruction onto the computation.
+ XlaOp Atan2(const XlaOp& y, const XlaOp& x, + absl::Span broadcast_dimensions = {}); + + // Enqueues an exp instruction onto the computation. + XlaOp Exp(const XlaOp& operand); + + // Enqueues an expm1 instruction onto the computation. + XlaOp Expm1(const XlaOp& operand); + + // Enqueues a floor instruction onto the computation. + XlaOp Floor(const XlaOp& operand); + + // Enqueues a ceil instruction onto the computation. + XlaOp Ceil(const XlaOp& operand); + + // Enqueues a round instruction onto the computation, rounding to the nearest + // integer with half-way cases rounding away from zero. + XlaOp Round(const XlaOp& operand); + + // Enqueues a log instruction (natural logarithm) onto the computation. + XlaOp Log(const XlaOp& operand); + + // Enqueues a log1p instruction (log(x+1)) onto the computation. + XlaOp Log1p(const XlaOp& operand); + + // Enqueues a sign instruction onto the computation. + XlaOp Sign(const XlaOp& operand); + + // Enqueues a count leading zeros instruction onto the computation. + XlaOp Clz(const XlaOp& operand); + + // Enqueues a cosine instruction onto the computation. + XlaOp Cos(const XlaOp& operand); + + // Enqueues a sine instruction onto the computation. + XlaOp Sin(const XlaOp& operand); + + // Enqueues a tanh instruction onto the computation. + XlaOp Tanh(const XlaOp& operand); + + // Enqueues a real-part instruction onto the computation. + XlaOp Real(const XlaOp& operand); + + // Enqueues an imaginary-part instruction onto the computation. + XlaOp Imag(const XlaOp& operand); + + // Enqueues a lhs^rhs computation onto the computation. + XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + + // Enqueues an operator that tests if the operand's values are finite, i.e., + // not Inf or NaN. Defined only for floating-point types. Returns an array of + // booleans with the same shape where entries are true iff the corresponding + // entry was finite.
+ XlaOp IsFinite(const XlaOp& operand); + + // Enqueues an iota operation onto the computation. + XlaOp Iota(const Shape& shape, int64 iota_dimension); + + // Enqueues a rank-1 iota operation onto the computation. + XlaOp Iota(PrimitiveType type, int64 size); + + // Enqueues a convert instruction onto the computation that changes the + // element type of the operand array to new_element_type. + XlaOp ConvertElementType(const XlaOp& operand, + PrimitiveType new_element_type); + + // Enqueues a no-op instruction onto the computation that changes + // the element type of the operand array to new_element_type. The + // bit-widths of the source and destination element types must be + // identical. + XlaOp BitcastConvertType(const XlaOp& operand, + PrimitiveType new_element_type); + + // Enqueues a negate instruction onto the computation. + XlaOp Neg(const XlaOp& operand); + + // Enqueues a transpose instruction onto the computation. + XlaOp Transpose(const XlaOp& operand, absl::Span permutation); + + // Enqueues a reverse instruction onto the computation. The order of the + // elements in the given dimensions is reversed (i.e., the element at index i + // is moved to index dimension_size - 1 - i). + XlaOp Rev(const XlaOp& operand, absl::Span dimensions); + + // Enqueues a sort (in increasing order) instruction onto the computation. + // If only keys are provided: + // * If the keys are a rank-1 tensor (an array), the result is a sorted array + // of keys, in ascending order. + // * If the keys have higher rank, the keys are sorted along the provided + // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension + // value of 0 will independently sort every column, and a dimension value of 1 + // will independently sort each row. If no dimension number is provided, then + // the last dimension is chosen by default. + // + // If both keys and values are provided: + // * The keys and the values must be tensors with the same dimensions.
The + // element types of the tensors may be different. + // * The result is a tuple that consists of a sorted tensor of keys (along the + // provided dimension, as above) as the first element, and a tensor with their + // corresponding values as the second element. + XlaOp Sort(XlaOp keys, absl::optional values = absl::nullopt, + int64 dimension = -1); + + // Enqueues a clamp instruction onto the computation. + XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); + + // Enqueues a map instruction onto the computation. + XlaOp Map(absl::Span operands, const XlaComputation& computation, + absl::Span dimensions, + absl::Span static_operands = {}); + + // Enqueues a N(mu, sigma) random number generation instruction onto the + // computation. + XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape); + + // Enqueues a U(a, b) random number generation instruction onto the + // computation. Returns values in the semi-open interval [a, b). + XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); + + // Enqueues a while node onto the computation. + XlaOp While(const XlaComputation& condition, const XlaComputation& body, + const XlaOp& init); + + // Enqueues a conditional node onto the computation. + XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, + const XlaComputation& true_computation, + const XlaOp& false_operand, + const XlaComputation& false_computation); + + // Enqueues a ReducePrecision node onto the computation. + XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, + const int mantissa_bits); + + // Enqueues a Gather node onto the computation. + XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes); + + // Enqueues a Scatter node onto the computation. 
+ XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); + + // Enqueues a Send node onto the computation for device-to-device + // communication, to send the given operand to a Recv instruction that shares + // the same channel handle. + void Send(const XlaOp& operand, const ChannelHandle& handle); + XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, + const ChannelHandle& handle); + + // Enqueues a Send node which sends data to the host. + XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, const ChannelHandle& handle); + + // Enqueues a Recv node which receives data from the host. + XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + + // Enqueues an AfterAll operation with no operands producing a token-shaped + // value. + XlaOp CreateToken(); + + // Enqueues an AfterAll operation over the given tokens, producing a + // token-shaped value. + XlaOp AfterAll(absl::Span tokens); + + // Enqueues a Recv node onto the computation. The data comes from a Send + // instruction that shares the same channel handle and its shape must + // be the same as the given shape. + XlaOp Recv(const Shape& shape, const ChannelHandle& handle); + XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + + // Normalizes operand across spatial and batch dimensions for each feature.
+ // + // `BatchNormInference` is equivalent to calling `BatchNormTraining` without + // computing `mean` and `variance` for each batch inside the operation. It + // uses the input `mean` and `variance` instead as estimated values. The + // purpose of this op is to reduce latency in inference, hence the name + // `BatchNormInference`. + // + // The output has the same shape as `operand`, and contains the normalized + // values for each batch. + XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, const XlaOp& mean, + const XlaOp& variance, float epsilon, + int64 feature_index); + + // Calculates the gradients of a batch norm op. + // + // The inputs `batch_mean` and `batch_var` represent the mean and variance + // across the batch. + // + // Returns a tuple of three elements: + // - grad_operand: Gradient with respect to input `operand` + // - grad_offset: Gradient with respect to input `offset` + // - grad_scale: Gradient with respect to input `scale` + XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, + const XlaOp& batch_mean, const XlaOp& batch_var, + const XlaOp& grad_output, float epsilon, + int64 feature_index); + + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, + absl::Span operands = {}); + + void AddCalledComputation(const XlaComputation& computation, + HloInstructionProto* instr); + + StatusOr LookUpInstruction(const XlaOp& op) const; + + // Internal helper method that does the building for an arbitrary unary op. + XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand); + + // Internal helper method that does the building for an arbitrary binary op. + // broadcast_dimensions specifies which dimensions to use for broadcasting + // when the operation is between tensors of different ranks. + XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + + // Internal helper method that does the building for an arbitrary ternary op. 
+ XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, + const XlaOp& ehs); + + XlaOp RngOp(RandomDistribution distribution, + absl::Span parameters, const Shape& shape); + + StatusOr InDimBroadcast(const Shape& shape, const XlaOp& operand, + absl::Span broadcast_dimensions); + + // Internal helper method that creates a sequence of instructions that + // performs an explicit broadcast of the operand to the target shape. + StatusOr AddBroadcastSequence(const Shape& output_shape, + const XlaOp& operand); + + // Internal helper method for creating a Reshape op with the already inferred + // shape. + StatusOr Reshape(const Shape& shape, const XlaOp& operand); + + // Returns the (inferred) result for the program shape using the given root. + StatusOr GetProgramShape(int64 root_id) const; + + // Returns shapes for the operands. + StatusOr> GetOperandShapes( + absl::Span operands) const; + + // A visitor which checks whether an operation is a compile-time constant, + // meaning that it doesn't depend on any parameters, or on any stateful + // operation such as `RngNormal` or `Infeed`. The visitor walks the + // computation starting at a given operation and sets is_constant to false iff + // a parameter or stateful operation is encountered. + void IsConstantVisitor(const int64 op_handle, std::set* visited, + bool* is_constant) const; + + // Checks bounds for convolution parameters. + Status VerifyConvolution( + const Shape& lhs_shape, const Shape& rhs_shape, + const ConvolutionDimensionNumbers& dimension_numbers) const; + + // Helper function for creating a Window proto from user-supplied data. + // Returns error if the user-supplied data was invalid. + StatusOr MakeWindow(absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation) const; + + string name_; // Name to use for the built computation. + + // The first error encountered while building the computation. 
+ // This is OK until the first error is encountered. + Status first_error_; + + // The saved stack trace from the point at which the first error occurred. + tensorflow::SavedStackTrace first_error_backtrace_; + + // The instructions of this computation. + std::vector instructions_; + + // The embedded computations used by this computation. Each computation was + // the entry computation of some XlaComputation, the key is the unique id of + // that XlaComputation. + std::map embedded_; + + // The unique parameter numbers. + tensorflow::gtl::FlatSet parameter_numbers_; + + // The metadata to attach to each op. This is structured as a "modal"-like + // operation, in order to simplify client code (and not sprinkle this metadata + // throughout the TensorFlow op kernel implementations). + OpMetadata metadata_; + + // Sharding for this operator. This is structured as a "modal"-like operation, + // in order to simplify client code, similar to metadata_. + absl::optional sharding_; + + // Mode bit that indicates whether to die when a first error is encountered.
+ bool die_immediately_on_error_ = false; + + XlaBuilder* parent_builder_{nullptr}; + + friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, + const Shape& shape, const string& name); + friend XlaOp ConstantLiteral(XlaBuilder* builder, + const LiteralSlice& literal); + template + friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value); + template + friend XlaOp ConstantR1(XlaBuilder* builder, + absl::Span values); + friend XlaOp ConstantR1(XlaBuilder* builder, + const tensorflow::core::Bitmap& values); + template + friend XlaOp ConstantR2( + XlaBuilder* builder, + std::initializer_list> values); + template + friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder, + const Array& values, + const Layout& layout); + template + friend XlaOp ConstantFromArray(XlaBuilder* builder, + const Array& values); + template + friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder, + const Array2D& values, + const Layout& layout); + template + friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder, + const Array2D& values); + template + friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder, + const Array3D& values, + const Layout& layout); + template + friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder, + const Array3D& values); + template + friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder, + const Array4D& values, + const Layout& layout); + template + friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder, + const Array4D& values); + + template + friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value); + + friend XlaOp Broadcast(const XlaOp& operand, + absl::Span broadcast_sizes); + + friend XlaOp BroadcastInDim( + const XlaOp& operand, const Shape& shape, + const absl::Span broadcast_dimensions); + + friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, + const PaddingConfig& padding_config); + + friend XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, + absl::Span 
new_sizes); + + friend XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes); + + friend XlaOp Collapse(const XlaOp& operand, + absl::Span dimensions); + + friend XlaOp Slice(const XlaOp& operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); + + friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index, + int64 limit_index, int64 stride, int64 dimno); + + friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, + absl::Span slice_sizes); + + friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, + const XlaOp& start_indices); + + friend XlaOp ConcatInDim(XlaBuilder* builder, + absl::Span operands, int64 dimension); + + friend void Trace(const string& tag, const XlaOp& operand); + + friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true, + const XlaOp& on_false); + friend XlaOp Tuple(XlaBuilder* builder, absl::Span elements); + friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); + friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, + const DotDimensionNumbers& dimension_number, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto); + 
friend XlaOp ConvWithGeneralPadding( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp ConvWithGeneralDimensions( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp ConvGeneralDilated( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, + const PrecisionConfigProto* precision_config_proto); + friend XlaOp Fft(const XlaOp& operand, FftType fft_type, + absl::Span fft_length); + friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape, + const string& config); + friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, + const string& outfeed_config); + friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, + absl::Span operands); + friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape); + friend XlaOp Complex(const XlaOp& real, const XlaOp& imag, + absl::Span broadcast_dimensions); + friend XlaOp Conj(const XlaOp& operand); + friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Div(const 
XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Not(const XlaOp& operand); + friend XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp ShiftRightArithmetic( + const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + friend XlaOp Reduce(XlaBuilder* builder, absl::Span operands, + absl::Span init_values, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation); + friend XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + Padding padding); + friend XlaOp ReduceWindowWithGeneralPadding( + const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding); + friend XlaOp CrossReplicaSum(const XlaOp& operand, + absl::Span replica_groups); + friend XlaOp CrossReplicaSum(const XlaOp& operand, + const XlaComputation& computation, + absl::Span replica_groups, + 
const absl::optional& channel_id); + friend XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups); + friend XlaOp CollectivePermute( + const XlaOp& operand, + const std::vector>& source_target_pairs); + friend XlaOp SelectAndScatter(const XlaOp& operand, + const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + Padding padding, const XlaOp& source, + const XlaOp& init_value, + const XlaComputation& scatter); + friend XlaOp SelectAndScatterWithGeneralPadding( + const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const XlaOp& source, + const XlaOp& init_value, const XlaComputation& scatter); + friend XlaOp Abs(const XlaOp& operand); + friend XlaOp Atan2(const XlaOp& y, const XlaOp& x, + absl::Span broadcast_dimensions); + friend XlaOp Exp(const XlaOp& operand); + friend XlaOp Expm1(const XlaOp& operand); + friend XlaOp Floor(const XlaOp& operand); + friend XlaOp Ceil(const XlaOp& operand); + friend XlaOp Round(const XlaOp& operand); + friend XlaOp Log(const XlaOp& operand); + friend XlaOp Log1p(const XlaOp& operand); + friend XlaOp Sign(const XlaOp& operand); + friend XlaOp Clz(const XlaOp& operand); + friend XlaOp Cos(const XlaOp& operand); + friend XlaOp Sin(const XlaOp& operand); + friend XlaOp Tanh(const XlaOp& operand); + friend XlaOp Real(const XlaOp& operand); + friend XlaOp Imag(const XlaOp& operand); + friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions); + friend XlaOp IsFinite(const XlaOp& operand); + friend XlaOp Iota(XlaBuilder* builder, const Shape& shape, + int64 iota_dimension); + friend XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size); + friend XlaOp ConvertElementType(const XlaOp& operand, + PrimitiveType new_element_type); + friend XlaOp BitcastConvertType(const XlaOp& operand, + PrimitiveType 
new_element_type); + friend XlaOp Neg(const XlaOp& operand); + friend XlaOp Transpose(const XlaOp& operand, + absl::Span permutation); + friend XlaOp Rev(const XlaOp& operand, absl::Span dimensions); + friend XlaOp Sort(XlaOp keys, absl::optional values, int64 dimension); + friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); + friend XlaOp Map(XlaBuilder* builder, absl::Span operands, + const XlaComputation& computation, + absl::Span dimensions, + absl::Span static_operands); + friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, + const Shape& shape); + friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); + friend XlaOp While(const XlaComputation& condition, + const XlaComputation& body, const XlaOp& init); + friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, + const XlaComputation& true_computation, + const XlaOp& false_operand, + const XlaComputation& false_computation); + friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, + const int mantissa_bits); + friend XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes); + friend XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, + const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); + friend void Send(const XlaOp& operand, const ChannelHandle& handle); + friend XlaOp Recv(XlaBuilder* builder, const Shape& shape, + const ChannelHandle& handle); + friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, float epsilon, + int64 feature_index); + friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, const XlaOp& mean, + const XlaOp& variance, float epsilon, + int64 feature_index); + friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, + const XlaOp& batch_mean, 
const XlaOp& batch_var, + const XlaOp& grad_output, float epsilon, + int64 feature_index); + friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, + const ChannelHandle& handle); + friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const ChannelHandle& handle); + friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, + const string& config); + friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const string& outfeed_config); + friend XlaOp CreateToken(XlaBuilder* builder); + friend XlaOp AfterAll(XlaBuilder* builder, absl::Span tokens); +}; + +// RAII-style object: sets the current sharding assignment in builder on +// construction, and sets back to the previous assignment on destruction. +// If `sharding` holds no value, the builder's sharding is cleared instead. +// The copy constructor and copy assignment operator are deleted. +class XlaScopedShardingAssignment { + public: + XlaScopedShardingAssignment(xla::XlaBuilder* builder, + absl::optional sharding) + : builder_(builder), prev_sharding_(builder->sharding()) { + SetSharding(sharding); + } + + XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete; + XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) = + delete; + + ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); } + + private: + // Applies `sharding` to builder_ when it has a value; otherwise clears the + // builder's sharding. + void SetSharding(const absl::optional& sharding) { + if (sharding.has_value()) { + builder_->SetSharding(sharding.value()); + } else { + builder_->ClearSharding(); + } + } + + xla::XlaBuilder* const builder_; + // Sharding read from the builder at construction; reapplied by the + // destructor. + absl::optional prev_sharding_; +}; + +// Free functions for building XlaOps. The intention is that these will +// become the public API for building XlaOps rather than calling methods on +// XlaBuilder directly.
+ +// Enqueues a "retrieve parameter value" instruction for a parameter that was +// passed to the computation. +XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape, + const string& name); + +// Enqueues a constant with the value of the given literal onto the +// computation. +XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal); + +// Enqueues a constant onto the computation. Methods are templated on the +// native host type (NativeT) which corresponds to a specific XLA +// PrimitiveType as given in the following table: +// +// Native Type PrimitiveType +// ----------------------------- +// bool PRED +// int32 S32 +// int64 S64 +// uint32 U32 +// uint64 U64 +// float F32 +// double F64 +// +// Note: not all primitive types defined in xla_data.proto have a +// corresponding native type yet. +template +XlaOp ConstantR0(XlaBuilder* builder, NativeT value); +template +XlaOp ConstantR1(XlaBuilder* builder, absl::Span values); +XlaOp ConstantR1(XlaBuilder* builder, const tensorflow::core::Bitmap& values); +template +XlaOp ConstantR2(XlaBuilder* builder, + std::initializer_list> values); +template +XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder, + const Array& values, + const Layout& layout); +template +XlaOp ConstantFromArray(XlaBuilder* builder, const Array& values); +template +XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder, + const Array2D& values, + const Layout& layout); +template +XlaOp ConstantR2FromArray2D(XlaBuilder* builder, + const Array2D& values); +template +XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder, + const Array3D& values, + const Layout& layout); +template +XlaOp ConstantR3FromArray3D(XlaBuilder* builder, + const Array3D& values); +template +XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder, + const Array4D& values, + const Layout& layout); +template +XlaOp ConstantR4FromArray4D(XlaBuilder* builder, + const Array4D& values); + +// Enqueues a rank one constant 
(vector) onto the +// computation. The vector has size 'length' and every element has the value +// 'value'. +template +XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value); + +// Adds dimensions to an array by duplicating the data in the array. +// +// The new dimensions are inserted on the left, i.e. if +// broadcast_sizes has values {a0, ..., aN} and the operand shape +// has dimensions {b0, ..., bM} then the shape of the output has +// dimensions {a0, ..., aN, b0, ..., bM}. +// +// The new dimensions index into copies of the operand, i.e. +// +// output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] +XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes); + +// Performs in-dimension-style broadcast. +// +// Operand specifies the input to be broadcast. "shape" is the expected output +// shape. "broadcast_dimensions" are the dimensions to be broadcast into. +// Dimension numbers in broadcast_dimensions map to individual dimensions +// of the operand, and specify what dimension of the output shape they +// should be broadcast. +// e.g. +// Say operand = [1, 2], i.e., a 1D tensor with 2 elements. +// and dimension of shape is [2,2]. +// Specifying {1} as broadcast_dimension will generate output +// [1 , 2] +// [1 , 2] +// On the other hand, specifying {0} as broadcast_dimension +// will generate output +// [1 , 1] +// [2 , 2] +XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, + const absl::Span broadcast_dimensions); + +// Enqueues a pad operation onto the computation that pads the given value on +// the edges as well as between the elements of the input. padding_config +// specifies the padding amount for each dimension.
+XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, + const PaddingConfig& padding_config); + +// Enqueues an operation onto the computation that flattens the operand based +// on the dimension order (major/slowest-varying to minor/fastest-varying) +// given, followed by reshaping it into the shape with the given dimension +// sizes (also major to minor). Conceptually, this is a limited form of +// "shape casting". +XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, + absl::Span new_sizes); + +// Enqueues an operation onto the computation that collapses the operand, from +// first to last dimension (C order), then reshapes it to the given dimension +// sizes. Conceptually, this is a limited form of "shape casting". +XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes); + +// Wrapper for Reshape. +// Enqueues an operation to collapse the provided dimensions; e.g. an +// operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to +// {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must +// be a consecutive, in-order subsequence of the operand dimensions. +// +// Note that collapsing a single dimension does nothing: +// +// {256} collapsing {0} => {256} +// {1} collapsing {0} => {1} +// +// Collapsing multiple dimensions produces a single result dimension: +// +// {256, 2} collapsing {0,1} => {512} +// {256, 2, 3} collapsing {0,1} => {512, 3} +// +// This could potentially cause data to be moved -- it provides a more +// structured form of reshaping than an arbitrary Reshape operation. +XlaOp Collapse(const XlaOp& operand, absl::Span dimensions); + +// Enqueues a slice operation onto the computation that slices the operand +// from the start indices to the limit indices; e.g. +// +// x +// [ 0 1 2 3 ] +// y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] +// [ 8 9 a b ] +// +// Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D +// range notation. 
+// The strides parameter determines the stride over the slice +XlaOp Slice(const XlaOp& operand, absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); + +// Enqueues a slice operation in a given dimension, taking all other +// dimensions as they are; e.g. if dimno is 1 from start_index 2 to +// limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand +// for: +// +// array[:, 2:4:1, :] +XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, + int64 stride, int64 dimno); + +// Enqueues a slice operation onto the computation that slices the 'operand' +// from dynamic start indices which are passed in 'start_indices'. +// The size of the slice in each dimension is passed in 'slice_sizes', +// which specify the end point of exclusive slice intervals in each +// dimension [start, start + size). +// The shape of 'start_indices' must be rank == 1, with dimension size +// equal to the rank of the 'operand'. +// Slice index calculations are computed modulo input dimension sizes to +// prevent dynamic start indices from generating out-of-bound array accesses. +XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, + absl::Span slice_sizes); + +// Enqueues a dynamic update slice operation onto the computation, which +// updates a slice of 'operand' with 'update' at dynamic 'start_indices'. +// The shape of 'update' determines the shape of the slice of 'operand' +// which is updated. +// The indices specified in 'start_indices' specify the offset of the slice +// of 'operand' which is updated. +// +// update = {10, 11} // calculated at runtime. +// [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] +// [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] +// [7 8 9] [7 8 9 ] +// +// The shape of 'start_indices' must be rank == 1, with dimension size +// equal to the rank of the 'operand'. 
+// Slice index calculations are computed modulo update dimension sizes to +// prevent dynamic start indices from generating out-of-bound array accesses. +XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, + const XlaOp& start_indices); + +// Enqueues a concatenate instruction onto the computation. 'operands' must +// have >= 1 entry. +XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, + int64 dimension); + +// Enqueue a tracing operation onto the computation; the computation will emit +// a logging message with the operand. +void Trace(const string& tag, const XlaOp& operand); + +// Enqueues a conditional-move-like select operation onto the computation; +// predicated on pred, selects between on_true and on_false. +XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false); + +// Enqueues a tuple-creation instruction onto the computation. +XlaOp Tuple(XlaBuilder* builder, absl::Span elements); + +// Enqueues a tuple-element-get instruction onto the computation. +XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); + +// Enqueues an equal-to comparison instruction onto the computation. +XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a not-equal comparison instruction onto the computation. +XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a greater-or-equal comparison instruction onto the computation. +XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a greater-than comparison instruction onto the computation. +XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a less-than comparison instruction onto the computation. +XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a less-or-equal comparison instruction onto the computation. 
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a dot instruction onto the computation. +XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a general dot instruction onto the computation. +XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, + const DotDimensionNumbers& dimension_numbers, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a convolution instruction onto the computation, which uses the +// default convolution dimension numbers. +XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, Padding padding, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a convolution instruction onto the computation, with the caller +// provided padding configuration in the format returned by MakePadding(). +XlaOp ConvWithGeneralPadding( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a convolution instruction onto the computation, with the caller +// provided dimension numbers configuration. +XlaOp ConvWithGeneralDimensions( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + Padding padding, const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a convolution instruction onto the computation, with the caller +// provided padding configuration as well as the dimension numbers. 
+XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs, + absl::Span window_strides, + absl::Span> padding, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues a convolution instruction onto the computation, with the caller +// provided padding configuration, dilation factors and dimension numbers. +XlaOp ConvGeneralDilated( + const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1, + const PrecisionConfigProto* precision_config_proto = nullptr); + +// Enqueues an FFT instruction onto the computation, of the given type and +// with the given FFT length. +XlaOp Fft(const XlaOp& operand, FftType fft_type, + absl::Span fft_length); + +// Enqueues an infeed instruction onto the computation, which writes data of +// the given shape to the infeed buffer of the device. +XlaOp Infeed(XlaBuilder* builder, const Shape& shape, + const string& config = ""); + +// Variant of Infeed which takes a token-shaped operand and produces a +// two-element tuple containing the data value and a token-shaped value. +// Tokens are used for ordering side-effecting operations. +// TODO(b/110532604): Replace all uses of the non-token form with this variant. +XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, + const string& config = ""); + +// Enqueues an outfeed instruction onto the computation. This instruction +// generates outgoing data transfers for the given data. +// +// shape_with_layout communicates the laid out shape that we want to outfeed +// -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error +// will occur. 
+void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, + const string& outfeed_config); + +// Variant of Outfeed which takes a token-shaped operand and produces a +// token-shaped value. Tokens are used for ordering side-effecting operations. +// TODO(b/110532604): Replace all uses of the non-token form with this variant. +XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, + const string& outfeed_config); + +// Enqueues a call instruction onto the computation. +XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, + absl::Span operands); + +// Enqueues a custom call instruction onto the computation. +// During code generation, a call instruction is emitted which targets a +// symbol with the name |call_target_name|. The |operands| are passed to the +// call instruction. |shape| is the resultant shape. +XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape); + +// The following methods enqueue element-wise binary arithmetic operations +// onto the computation. The shapes of the operands have to match unless one +// of the operands is a scalar, or an explicit broadcast dimension is given +// (see g3doc for more details). + +// Enqueues a complex compose instruction onto the computation. +XlaOp Complex(const XlaOp& real, const XlaOp& imag, + absl::Span broadcast_dimensions = {}); + +// Enqueues a complex conjugate instruction onto the computation. +XlaOp Conj(const XlaOp& operand); + +// Enqueues an add instruction onto the computation. +XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a subtract instruction onto the computation. +XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a multiply instruction onto the computation. 
+XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a divide instruction onto the computation. +XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a remainder instruction onto the computation. +XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a max instruction onto the computation. +XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues a min instruction onto the computation. +XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Element-wise logical operators +XlaOp And(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +XlaOp Or(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +XlaOp Not(const XlaOp& operand); + +XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); +XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); +XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Reduces an array among the provided dimensions, given "computation" as a +// reduction operator. +XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + +// Reduces several arrays simultaneously among the provided dimensions, given +// "computation" as a reduction operator. +XlaOp Reduce(XlaBuilder* builder, absl::Span operands, + absl::Span init_values, + const XlaComputation& computation, + absl::Span dimensions_to_reduce); + +// Convenience wrapper around the above that reduces all the dimensions in the +// operand shape. 
+XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation); + +// Enqueues a windowed reduce instruction onto the computation. +XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding); + +// As ReduceWindow(), but the padding is given in the format +// returned by MakePadding(). +XlaOp ReduceWindowWithGeneralPadding( + const XlaOp& operand, const XlaOp& init_value, + const XlaComputation& computation, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding); + +// Returns the sum of the operand value within each subgroup of replicas. All +// replicas supply one input to the sum and all replicas receive the resulting +// sum for each subgroup. +XlaOp CrossReplicaSum(const XlaOp& operand, + absl::Span replica_groups = {}); + +// Enqueues an operation that does an AllReduce of the operand across cores. Here +// AllReduce means doing a reduction on the input operand across cores and then +// broadcasting the reduction result to those cores. The reduction function is +// defined by `computation`, which should be a commutative computation on +// scalars, e.g., add, min, or max. The way that AllReduce is applied is +// configured by: +// +// - `replica_groups`: each ReplicaGroup contains a list of replica ids. If +// empty, all replicas belong to one group. AllReduce will be applied within +// subgroups. For example, we have 4 replicas, then replica_groups={{0,2},{1,3}} +// means, replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1. +// +// - `channel_id`: for AllReduce nodes from different modules, if they have the +// same channel_id, they will be 'AllReduce'd. If empty, AllReduce will not be +// applied across modules. +// +// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+XlaOp CrossReplicaSum( + const XlaOp& operand, const XlaComputation& computation, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt); + +// Enqueues an operation that does an AllToAll of the operand across cores. +XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups = {}); + +// Enqueues a collective operation that sends and receives data across replicas. +// +// - `source_target_pairs`: a list of (source_replica_id, target_replica_id) +// pairs. For each pair, the operand is sent from source replica to target +// replica. Note that, 1) any two pairs should not have the same target replica +// id, and they should not have the same source replica id; 2) if a replica id +// is not a target in any pair, then the output on that replica is a tensor +// consisting of 0(s) with the same shape as the input. +XlaOp CollectivePermute( + const XlaOp& operand, + const std::vector>& source_target_pairs); + +// Enqueues an operation that scatters the `source` array to the selected +// indices of each window. +XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding, + const XlaOp& source, const XlaOp& init_value, + const XlaComputation& scatter); + +// As SelectAndScatter(), but the padding is given in the format +// returned by MakePadding(). +XlaOp SelectAndScatterWithGeneralPadding( + const XlaOp& operand, const XlaComputation& select, + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const XlaOp& source, + const XlaOp& init_value, const XlaComputation& scatter); + +// Enqueues an abs instruction onto the computation. +XlaOp Abs(const XlaOp& operand); + +// Enqueues an atan2 instruction onto the computation.
+XlaOp Atan2(const XlaOp& y, const XlaOp& x, + absl::Span broadcast_dimensions = {}); + +// Enqueues an exp instruction onto the computation. +XlaOp Exp(const XlaOp& operand); + +// Enqueues an expm1 instruction onto the computation. +XlaOp Expm1(const XlaOp& operand); + +// Enqueues a floor instruction onto the computation. +XlaOp Floor(const XlaOp& operand); + +// Enqueues a ceil instruction onto the computation. +XlaOp Ceil(const XlaOp& operand); + +// Enqueues a round instruction onto the computation, rounding to the nearest +// integer with half-way cases rounding away from zero. +XlaOp Round(const XlaOp& operand); + +// Enqueues a log instruction (natural logarithm) onto the computation. +XlaOp Log(const XlaOp& operand); + +// Enqueues a log1p instruction (log(x+1)) onto the computation. +XlaOp Log1p(const XlaOp& operand); + +// Enqueues a sign instruction onto the computation. +XlaOp Sign(const XlaOp& operand); + +// Enqueues a count leading zeros instruction onto the computation. +XlaOp Clz(const XlaOp& operand); + +// Enqueues a cosine instruction onto the computation. +XlaOp Cos(const XlaOp& operand); + +// Enqueues a sine instruction onto the computation. +XlaOp Sin(const XlaOp& operand); + +// Enqueues a tanh instruction onto the computation. +XlaOp Tanh(const XlaOp& operand); + +// Enqueues a real-part instruction onto the computation. +XlaOp Real(const XlaOp& operand); + +// Enqueues an imaginary-part instruction onto the computation. +XlaOp Imag(const XlaOp& operand); + +// Enqueues a lhs^rhs computation onto the computation. +XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, + absl::Span broadcast_dimensions = {}); + +// Enqueues an operator that tests if the operand's values are finite, i.e., +// not Inf or NaN. Defined only for floating-point types. Returns an array of +// booleans with the same shape where entries are true iff the corresponding +// entry was finite. +XlaOp IsFinite(const XlaOp& operand); + +// Enqueues an iota operation onto the computation.
+XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension); + +// Enqueues a rank-1 iota operation onto the computation. +XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size); + +// Enqueues a convert instruction onto the computation that changes the +// element type of the operand array to new_element_type. +XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type); + +// Enqueues a no-op instruction onto the computation that changes +// the element type of the operand array to new_element_type. The +// bit-widths of the source and destination element types must be +// identical. +XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type); + +// Enqueues a negate instruction onto the computation. +XlaOp Neg(const XlaOp& operand); + +// Enqueues a transpose instruction onto the computation. +XlaOp Transpose(const XlaOp& operand, absl::Span permutation); + +// Enqueues a reverse instruction onto the computation. The order of the +// elements in the given dimensions is reversed (i.e., the element at index i +// is moved to index dimension_size - 1 - i). +XlaOp Rev(const XlaOp& operand, absl::Span dimensions); + +// Enqueues a sort (in increasing order) instruction onto the computation. +// If only keys are provided: +// * If the keys are a rank-1 tensor (an array), the result is a sorted array +// of keys, in ascending order. +// * If the keys have higher rank, the keys are sorted along the provided +// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension +// value of 0 will independently sort every column, and a dimension value of 1 +// will independently sort each row. If no dimension number is provided, then +// the last dimension is chosen by default. +// +// If both keys and values are provided: +// * The keys and the values must be tensors with the same dimensions. The +// element types of the tensors may be different. 
+// * The result is a tuple that consists of a sorted tensor of keys (along the +// provided dimension, as above) as the first element, and a tensor with their +// corresponding values as the second element. +XlaOp Sort(XlaOp keys, absl::optional values = absl::nullopt, + int64 dimension = -1); + +// Enqueues a clamp instruction onto the computation. +XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); + +// Enqueues a map instruction onto the computation. +XlaOp Map(XlaBuilder* builder, absl::Span operands, + const XlaComputation& computation, absl::Span dimensions, + absl::Span static_operands = {}); + +// Enqueues a N(mu, sigma) random number generation instruction onto the +// computation. +XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape); + +// Enqueues a U(a, b) random number generation instruction onto the +// computation. Returns values in the semi-open interval [a, b). +XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); + +// Enqueues a while node onto the computation. +XlaOp While(const XlaComputation& condition, const XlaComputation& body, + const XlaOp& init); + +// Enqueues a conditional node onto the computation. +XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, + const XlaComputation& true_computation, + const XlaOp& false_operand, + const XlaComputation& false_computation); + +// Enqueues a ReducePrecision node onto the computation. +XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, + const int mantissa_bits); + +// Enqueues a Gather node onto the computation. +XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, + const GatherDimensionNumbers& dimension_numbers, + absl::Span slice_sizes); + +// Enqueues a Scatter node onto the computation. 
+XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); + +// Enqueues a Send node onto the computation for device-to-device +// communication. This operation sends the given operand to +// a Recv instruction in a different computation that shares the same channel +// handle. +void Send(const XlaOp& operand, const ChannelHandle& handle); + +// Variant of Send which takes a token-shaped operand and produces a +// token-shaped value. Tokens are used for ordering side-effecting operations. +// TODO(b/110532604): Replace all uses of the non-token form with this variant. +XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, + const ChannelHandle& handle); + +// Enqueues a Recv node onto the computation for device-to-device +// communication. The data comes from a Send instruction in a different +// computation that shares the same channel handle and its shape must be the +// same as the given shape. +XlaOp Recv(XlaBuilder* builder, const Shape& shape, + const ChannelHandle& handle); + +// Variant of Recv which takes a token-shaped operand and produces a two-element +// tuple containing the data value and a token-shaped value. Tokens are used +// for ordering side-effecting operations. +// TODO(b/110532604): Replace all uses of the non-token form with this variant. +XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + +// Enqueues a Send node which transfers data from the device to the host. The +// 'shape_with_layout' argument defines the layout of the data transferred; its +// shape must be compatible with the shape of the operand. The operand must be +// array-shaped. +// TODO(b/111544877): Support tuple shapes. 
+XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, + const Shape& shape_with_layout, const ChannelHandle& handle); + +// Enqueues a Recv node which transfers data from the host to the device. The +// given shape must contain a layout and must be an array. +// TODO(b/111544877): Support tuple shapes. +XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, + const ChannelHandle& handle); + +// Enqueues an operation (AfterAll) with no operands that produces a +// token-shaped value. Tokens are used for ordering side-effecting operations. +// This is a separate method from AfterAll to facilitate the removal of +// operand-less AfterAll instructions. +// TODO(b/110532604): Remove this function when all tokens are derived from a +// single token generated or passed into the entry computation. +XlaOp CreateToken(XlaBuilder* builder); + +// Enqueues an AfterAll instruction which produces a token-shaped value and +// takes a variadic number of token-shaped operands. The number of operands must +// be greater than zero. Used for joining tokens. +XlaOp AfterAll(XlaBuilder* builder, absl::Span tokens); + +// Normalizes operand across spatial and batch dimensions for each feature. +// +// Returns a tuple (normalized, batch_mean, batch_var) where `normalized` +// is the normalized result and batch_mean and batch_var are the mean and +// variance, respectively, across batch for the operand. +XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, float epsilon, + int64 feature_index); + +// Normalizes operand across spatial and batch dimensions for each feature. +// +// `BatchNormInference` is equivalent to calling `BatchNormTraining` without +// computing `mean` and `variance` for each batch inside the operation. It +// uses the input `mean` and `variance` instead as estimated values. The +// purpose of this op is to reduce latency in inference, hence the name +// `BatchNormInference`. 
+// +// The output has the same shape as `operand`, and contains the normalized +// values for each batch. +XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, + const XlaOp& offset, const XlaOp& mean, + const XlaOp& variance, float epsilon, + int64 feature_index); + +// Calculates the gradients of a batch norm op. +// +// The inputs `batch_mean` and `batch_var` represent the mean and variance +// across the batch. +// +// Returns a tuple of three elements: +// - grad_operand: Gradient with respect to input `operand` +// - grad_offset: Gradient with respect to input `offset` +// - grad_scale: Gradient with respect to input `scale` +XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, + const XlaOp& batch_mean, const XlaOp& batch_var, + const XlaOp& grad_output, float epsilon, + int64 feature_index); + +// Implementation details below this point. + +template +XlaOp XlaBuilder::ConstantR0(NativeT value) { + return ConstantLiteral(*LiteralUtil::CreateR0(value)); +} + +template +XlaOp XlaBuilder::ConstantR1(absl::Span values) { + return ConstantLiteral(*LiteralUtil::CreateR1(values)); +} + +template +XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) { + Literal literal(ShapeUtil::MakeShape( + primitive_util::NativeToPrimitiveType(), {length})); + literal.PopulateWithValue(value); + return ConstantLiteral(literal); +} + +inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) { + return ConstantLiteral(*LiteralUtil::CreateR1(values)); +} + +template +XlaOp XlaBuilder::ConstantR2( + std::initializer_list> values) { + return ConstantLiteral(*LiteralUtil::CreateR2(values)); +} + +template +XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array& values, + const Layout& layout) { + return ConstantLiteral( + *LiteralUtil::CreateFromArrayWithLayout(values, layout)); +} + +template +XlaOp XlaBuilder::ConstantFromArray(const Array& values) { + return ConstantLiteral(*LiteralUtil::CreateFromArray(values)); +} + +template 
+XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout( + const Array2D& values, const Layout& layout) { + return ConstantLiteral( + *LiteralUtil::CreateFromArrayWithLayout(values, layout)); +} + +template +XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D& values) { + return ConstantLiteral(*LiteralUtil::CreateR2FromArray2D(values)); +} + +template +XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout( + const Array3D& values, const Layout& layout) { + return ConstantLiteral( + *LiteralUtil::CreateR3FromArray3DWithLayout(values, layout)); +} + +template +XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D& values) { + return ConstantFromArray(values); +} + +template +XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout( + const Array4D& values, const Layout& layout) { + return ConstantFromArrayWithLayout(values, layout); +} + +template +XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D& values) { + return ConstantFromArray(values); +} + +// Free function template implementations. + +template +XlaOp ConstantR0(XlaBuilder* builder, NativeT value) { + return ConstantLiteral(builder, *LiteralUtil::CreateR0(value)); +} + +template +XlaOp ConstantR1(XlaBuilder* builder, absl::Span values) { + return ConstantLiteral(builder, *LiteralUtil::CreateR1(values)); +} + +template +XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value) { + Literal literal(ShapeUtil::MakeShape( + primitive_util::NativeToPrimitiveType(), {length})); + literal.PopulateWithValue(value); + return ConstantLiteral(builder, literal); +} + +inline XlaOp ConstantR1(XlaBuilder* builder, + const tensorflow::core::Bitmap& values) { + return ConstantLiteral(builder, *LiteralUtil::CreateR1(values)); +} + +template +XlaOp ConstantR2(XlaBuilder* builder, + std::initializer_list> values) { + return ConstantLiteral(builder, *LiteralUtil::CreateR2(values)); +} + +template +XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder, + const Array& values, + const Layout& layout) { + return ConstantLiteral( + 
builder, + *LiteralUtil::CreateFromArrayWithLayout(values, layout)); +} + +template +XlaOp ConstantFromArray(XlaBuilder* builder, const Array& values) { + return ConstantLiteral(builder, + *LiteralUtil::CreateFromArray(values)); +} + +template +XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder, + const Array2D& values, + const Layout& layout) { + return ConstantLiteral( + builder, + *LiteralUtil::CreateFromArrayWithLayout(values, layout)); +} + +template +XlaOp ConstantR2FromArray2D(XlaBuilder* builder, + const Array2D& values) { + return ConstantLiteral(builder, + *LiteralUtil::CreateR2FromArray2D(values)); +} + +template +XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder, + const Array3D& values, + const Layout& layout) { + return ConstantLiteral( + builder, + *LiteralUtil::CreateR3FromArray3DWithLayout(values, layout)); +} + +template +XlaOp ConstantR3FromArray3D(XlaBuilder* builder, + const Array3D& values) { + return ConstantFromArray(builder, values); +} + +template +XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder, + const Array4D& values, + const Layout& layout) { + return ConstantFromArrayWithLayout(builder, values, layout); +} + +template +XlaOp ConstantR4FromArray4D(XlaBuilder* builder, + const Array4D& values) { + return ConstantFromArray(builder, values); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_ diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c37ed00cd3dcc214fb0b36c0161d3c39a5bf8c8 --- /dev/null +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -0,0 +1,400 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/xla_builder.h" + +#include + +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +namespace { + +namespace op = xla::testing::opcode_matchers; + +using ::testing::HasSubstr; + +// TODO(b/74197823): Move the tests to service/. +class XlaBuilderTest : public ::testing::Test { + protected: + StatusOr> BuildHloModule(XlaBuilder* b) { + TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build()); + const HloModuleProto& proto = computation.proto(); + TF_ASSIGN_OR_RETURN(const auto& config, + HloModule::CreateModuleConfigFromProto( + proto, legacy_flags::GetDebugOptionsFromFlags())); + return HloModule::CreateFromProto(proto, config); + } + + // Overload which explicitly specifies the root instruction. 
+ StatusOr> BuildHloModule(XlaBuilder* b, + XlaOp root) { + TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root)); + const HloModuleProto& proto = computation.proto(); + TF_ASSIGN_OR_RETURN(const auto& config, + HloModule::CreateModuleConfigFromProto( + proto, legacy_flags::GetDebugOptionsFromFlags())); + return HloModule::CreateFromProto(proto, config); + } + + // Returns the name of the test currently being run. + string TestName() const { + return ::testing::UnitTest::GetInstance()->current_test_info()->name(); + } +}; + +TEST_F(XlaBuilderTest, OnePlusTwo) { + XlaBuilder b(TestName()); + Add(ConstantR0(&b, 1.0), ConstantR0(&b, 2.0)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Constant(), op::Constant())); +} + +TEST_F(XlaBuilderTest, UnaryOperatorsBuildExpectedHLO) { + auto test_unary_operator = + [&](std::function op, + ::testing::Matcher matches_pattern) { + XlaBuilder b(TestName()); + op(ConstantR0(&b, 1)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, matches_pattern); + }; + test_unary_operator([](XlaOp x) { return -x; }, op::Negate(op::Constant())); + test_unary_operator([](XlaOp x) { return ~x; }, op::Not(op::Constant())); +} + +TEST_F(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) { + auto test_binary_operator = + [&](std::function op, + ::testing::Matcher matches_pattern) { + XlaBuilder b(TestName()); + op(ConstantR0(&b, 1), ConstantR0(&b, 2)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, matches_pattern); + }; + + test_binary_operator([](XlaOp x, XlaOp y) { return x + y; }, + op::Add(op::Constant(), op::Constant())); + test_binary_operator([](XlaOp x, XlaOp y) { return x - y; }, + op::Subtract(op::Constant(), op::Constant())); + 
test_binary_operator([](XlaOp x, XlaOp y) { return x * y; }, + op::Multiply(op::Constant(), op::Constant())); + test_binary_operator([](XlaOp x, XlaOp y) { return x / y; }, + op::Divide(op::Constant(), op::Constant())); + + test_binary_operator([](XlaOp x, XlaOp y) { return x & y; }, + op::And(op::Constant(), op::Constant())); + test_binary_operator([](XlaOp x, XlaOp y) { return x | y; }, + op::Or(op::Constant(), op::Constant())); + test_binary_operator([](XlaOp x, XlaOp y) { return x ^ y; }, + op::Xor(op::Constant(), op::Constant())); + test_binary_operator([](XlaOp x, XlaOp y) { return x << y; }, + op::ShiftLeft(op::Constant(), op::Constant())); + test_binary_operator( + [](XlaOp x, XlaOp y) { return x >> y; }, + op::ShiftRightArithmetic(op::Constant(), op::Constant())); + + auto test_unsigned_binary_operator = + [&](std::function op, + ::testing::Matcher matches_pattern) { + XlaBuilder b(TestName()); + op(ConstantR0(&b, 1), ConstantR0(&b, 2)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, matches_pattern); + }; + test_unsigned_binary_operator( + [](XlaOp x, XlaOp y) { return x >> y; }, + op::ShiftRightLogical(op::Constant(), op::Constant())); +} + +TEST_F(XlaBuilderTest, ShiftRightOperatorOnNonIntegerProducesError) { + XlaBuilder b(TestName()); + ConstantR0(&b, 1) >> ConstantR0(&b, 2); + auto statusor = b.Build(); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Argument to >> operator does not have an integral type")); +} + +TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {3, 5}), "x"); + Add(x, ConstantR0(&b, 1.0)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Parameter(), op::Broadcast(op::Constant()))); +} + 
+TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { + XlaBuilder b(TestName()); + const auto& x_shape = ShapeUtil::MakeShape(S32, {2, 4, 6}); + const auto& y_shape = ShapeUtil::MakeShape(S32, {2, 4}); + auto x = Parameter(&b, 0, x_shape, "x"); + auto y = Parameter(&b, 1, y_shape, "y"); + auto add = Add(x, y, /*broadcast_dimensions=*/{0, 1}); + + TF_ASSERT_OK_AND_ASSIGN(auto add_shape, b.GetShape(add)); + EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape)); + + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Parameter(0), op::Broadcast(op::Parameter(1)))); +} + +TEST_F(XlaBuilderTest, XPlusX) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(S32, {1, 3, 5, 7}), "x"); + Add(x, x); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Parameter(0), op::Parameter(0))); +} + +TEST_F(XlaBuilderTest, ShapeInferenceError) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(U32, {2, 4, 6}), "x"); + auto y = Parameter(&b, 1, ShapeUtil::MakeShape(U32, {2, 4}), "y"); + Add(x, y); + auto statusor = BuildHloModule(&b); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("shape inference")); +} + +TEST_F(XlaBuilderTest, ParameterAlreadyRegistered) { + XlaBuilder b_call("add"); + Parameter(&b_call, 0, ShapeUtil::MakeShape(PRED, {}), "x"); + + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(PRED, {}), "x"); + auto y = Parameter(&b, 0, ShapeUtil::MakeShape(PRED, {}), "y"); + Add(x, y); + auto statusor = BuildHloModule(&b); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("parameter 0 already registered")); +} + +TEST_F(XlaBuilderTest, Call) { + XlaBuilder b_call("the_only_to_apply"); + auto p0 = Parameter(&b_call, 0, 
ShapeUtil::MakeShape(F32, {}), "p0"); + auto p1 = Parameter(&b_call, 1, ShapeUtil::MakeShape(F32, {}), "p1"); + Add(p0, p1); + TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build()); + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y"); + auto one = ConstantR0(&b, 1); + auto two = ConstantR0(&b, 2); + Add(Call(&b, call, {x, y}), Call(&b, call, {one, two})); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Call(op::Parameter(), op::Parameter()), + op::Call(op::Constant(), op::Constant()))); +} + +TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {1, 2, 3}), "x"); + auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {1, 2, 1}), "y"); + Add(x, y); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + + // Expected: + // + // x: f32[1,2,3] y: f32[1,2,1] + // | | + // | reshape: f32[1,2] + // | | + // | broadcast: f32[1,2,3] + // \ / + // add + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Parameter(0), + op::Broadcast(op::Reshape(op::Parameter(1))))); +} + +TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x"); + auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {2, 1, 4}), "y"); + Add(x, y, /*broadcast_dimensions=*/{0, 1}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + + // The binary operation has in-dim broadcast and degenerate broadcast, should + // first do the in-dim broadcast then convert the degnerate broadcast into a + // reshape and a broadcast. 
+ // + // Expected: + // + // x: f32[2,3] y: f32[2,1,4] + // | | + // broadcast: f32[2,3,4] reshape: f32[2,4] + // | | + // | broadcast: f32[2,3,4] + // \ / + // add + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Broadcast(op::Parameter(0)), + op::Broadcast(op::Reshape(op::Parameter(1))))); +} + +TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { + XlaBuilder b1("b1"); + auto p0 = Parameter(&b1, 0, ShapeUtil::MakeShape(F32, {}), "p0"); + XlaBuilder builder("main"); + auto p = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "p"); + Add(p, p0); + auto statusor = builder.Build(); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "built by builder 'b1', but is trying to use it in builder 'main'")); +} + +TEST_F(XlaBuilderTest, ReshapeDefaultOrder) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x"); + Reshape(x, /*new_sizes=*/{6, 35}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Reshape(op::Parameter())); +} + +TEST_F(XlaBuilderTest, ReshapeHasTranspose) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x"); + Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 35}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Reshape(op::Transpose(op::Parameter()))); +} + +TEST_F(XlaBuilderTest, Transpose) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); + Transpose(x, /*permutation=*/{1, 0}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Transpose(op::Parameter())); +} + +TEST_F(XlaBuilderTest, AllToAll) { + XlaBuilder 
b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); + AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, + /*split_count=*/2); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + // AllToAll is decomposed into slices -> all-to-all -> gte -> concat. + EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate); + EXPECT_EQ(root->operand(0)->operand(0)->opcode(), HloOpcode::kAllToAll); + EXPECT_TRUE( + ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {8, 8}))); +} + +TEST_F(XlaBuilderTest, CollectivePermute) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); + CollectivePermute(x, {{0, 1}, {1, 2}, {2, 3}}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kCollectivePermute); +} + +TEST_F(XlaBuilderTest, ReportError) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); + Add(b.ReportError(InvalidArgument("a test error")), x); + auto statusor = b.Build(); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test error")); +} + +TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesNonErrors) { + XlaBuilder b(TestName()); + StatusOr op(ConstantR0(&b, 1.0)); + Add(b.ReportErrorOrReturn(op), ConstantR0(&b, 2.0)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Add(op::Constant(), op::Constant())); +} + +TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) { + XlaBuilder b(TestName()); + StatusOr op(InvalidArgument("a test error")); + Add(b.ReportErrorOrReturn(op), ConstantR0(&b, 2.0)); + auto statusor = b.Build(); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test 
error")); +} + +TEST_F(XlaBuilderTest, BuildWithSpecificRoot) { + XlaBuilder b(TestName()); + XlaOp constant = ConstantR0(&b, 1.0); + Add(constant, ConstantR0(&b, 2.0)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/constant)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Constant()); +} + +TEST_F(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) { + // Specifying a particular root in Build should still include all entry + // parameters. + XlaBuilder b(TestName()); + const Shape shape = ShapeUtil::MakeShape(F32, {42, 123}); + XlaOp x = Parameter(&b, 0, shape, "x"); + XlaOp y = Parameter(&b, 1, shape, "y"); + XlaOp z = Parameter(&b, 2, shape, "z"); + Add(x, Sub(y, z)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/x)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter()); + EXPECT_EQ(module->entry_computation()->num_parameters(), 3); + EXPECT_EQ(module->entry_computation()->instruction_count(), 5); +} + +TEST_F(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) { + XlaBuilder b(TestName()); + XlaBuilder other_b(TestName()); + const Shape shape = ShapeUtil::MakeShape(F32, {42, 123}); + + Parameter(&b, 0, shape, "param"); + XlaOp other_param = Parameter(&other_b, 0, shape, "other_param"); + + Status status = b.Build(other_param).status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT( + status.error_message(), + ::testing::HasSubstr("root operation is not in this computation")); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD deleted file mode 100644 index 0d6e207971ec64515ec5e6da292910920edd101a..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ /dev/null @@ -1,79 +0,0 @@ -# Description: -# The new XLA client libraries. -# -# This is NOT YET ready to use. 
- -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) - -package_group( - name = "friends", - includes = [ - "//tensorflow/compiler/xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. -filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -load("//tensorflow:tensorflow.bzl", "tf_cc_test") - -cc_library( - name = "xla_computation", - srcs = ["xla_computation.cc"], - hdrs = ["xla_computation.h"], - deps = [ - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_proto", - ], -) - -# TODO(b/74197823): Replace computation_builder with xla_builder. -cc_library( - name = "xla_builder", - srcs = ["xla_builder.cc"], - hdrs = ["xla_builder.h"], - deps = [ - ":xla_computation", - "//tensorflow/compiler/xla:execution_options_util", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/compiler/xla/service:shape_inference", - "//tensorflow/core:lib", - ], -) - -tf_cc_test( - name = "xla_builder_test", - srcs = ["xla_builder_test.cc"], - deps = [ - ":xla_builder", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_matchers", - 
"//tensorflow/core:test", - ], -) diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc deleted file mode 100644 index ae506317c2e4862d77cb4f0628e919871ad1aeb2..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc +++ /dev/null @@ -1,1974 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/mutex.h" - -namespace xla { - -using tensorflow::strings::StrCat; - -namespace { - -int64 GetUniqueId() { - static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); - static int64 built_counter = 0; - tensorflow::mutex_lock loc(mu); - const int64 id = built_counter++; - return id; -} - -// Returns true if an instruction with the given opcode can be the root of the -// computation. 
-bool CanBeRoot(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kSend: - case HloOpcode::kSendDone: - case HloOpcode::kOutfeed: - case HloOpcode::kTrace: - return false; - default: - return true; - } -} - -} // namespace - -StatusOr XlaBuilder::GetShape(const XlaOp& op) const { - TF_RETURN_IF_ERROR(first_error_); - - TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op)); - return instr->shape(); -} - -StatusOr> XlaBuilder::GetOperandShapes( - tensorflow::gtl::ArraySlice operands) const { - std::vector operand_shapes; - for (const XlaOp& operand : operands) { - TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); - operand_shapes.push_back(shape); - } - return operand_shapes; -} - -XlaBuilder::XlaBuilder(const string& computation_name) - : name_(computation_name) {} - -XlaBuilder::~XlaBuilder() {} - -void XlaBuilder::NoteError(const Status& error) { - CHECK(!error.ok()); - if (die_immediately_on_error_) { - LOG(FATAL) << "error building computation: " << error; - } - - if (first_error_.ok()) { - first_error_ = error; - first_error_backtrace_.CreateCurrent(/*skip_count=*/1); - } -} - -XlaOp XlaBuilder::NoteErrorOrReturn( - const std::function()>& op_creator) { - if (!first_error_.ok()) { - return {}; - } - auto op = op_creator(); - if (!op.ok()) { - NoteError(op.status()); - return {}; - } - return op.ConsumeValueOrDie(); -} - -StatusOr XlaBuilder::GetProgramShape(int64* root_id) const { - TF_RETURN_IF_ERROR(first_error_); - - TF_RET_CHECK(root_id != nullptr); - - ProgramShape program_shape; - - // Not all instructions can be roots. Walk backwards from the last added - // instruction until a valid root is found. 
- int64 index = instructions_.size() - 1; - for (; index >= 0; index--) { - TF_ASSIGN_OR_RETURN(HloOpcode opcode, - StringToHloOpcode(instructions_[index].opcode())); - if (CanBeRoot(opcode)) { - break; - } - } - if (index < 0) { - return FailedPrecondition("no root instruction was found"); - } - *root_id = instructions_[index].id(); - *program_shape.mutable_result() = instructions_[index].shape(); - - // Check that the parameter numbers are continuous from 0, and add parameter - // shapes and names to the program shape. - const int64 param_count = parameter_numbers_.size(); - for (int64 i = 0; i < param_count; i++) { - program_shape.add_parameters(); - program_shape.add_parameter_names(); - } - for (const HloInstructionProto& instr : instructions_) { - // Parameter number uniqueness is guaranteed in XlaBuilder::Parameter(). So - // to verify continuity, we just need to verify that every parameter is in - // the right range. - if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter)) { - const int64 index = instr.parameter_number(); - TF_RET_CHECK(index >= 0 && index < param_count) - << "invalid parameter number: " << index; - *program_shape.mutable_parameters(index) = instr.shape(); - *program_shape.mutable_parameter_names(index) = instr.name(); - } - } - return program_shape; -} - -StatusOr XlaBuilder::GetProgramShape() const { - int64 root; - return GetProgramShape(&root); -} - -void XlaBuilder::IsConstantVisitor(const int64 op_handle, - std::set* visited, - bool* is_constant) const { - if (visited->count(op_handle) != 0 || !*is_constant) { - return; - } - - CHECK(op_handle < instructions_.size() && op_handle >= 0); - - const HloInstructionProto& instr = instructions_[op_handle]; - const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie(); - switch (opcode) { - default: - for (const int64 operand_id : instr.operand_ids()) { - IsConstantVisitor(operand_id, visited, is_constant); - } - // TODO(b/32495713): We aren't checking the called 
computations. - break; - - // Non functional ops. - case HloOpcode::kRng: - case HloOpcode::kCrossReplicaSum: - // TODO(b/33009255): Implmement constant folding for cross replica sum. - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kHostCompute: - case HloOpcode::kCall: - // TODO(b/32495713): We aren't checking the to_apply computation itself, - // so we conservatively say that computations containing the Call op - // cannot be constant. We cannot set is_functional=false in other similar - // cases since we're already relying on IsConstant to return true. - case HloOpcode::kCustomCall: - case HloOpcode::kWhile: - // TODO(b/32495713): We aren't checking the condition and body - // computations themselves. - case HloOpcode::kSend: - case HloOpcode::kRecv: - case HloOpcode::kParameter: - *is_constant = false; - break; - } - if (!*is_constant) { - VLOG(1) << "Non-constant: " << instr.name(); - } - visited->insert(op_handle); -} - -XlaComputation XlaBuilder::BuildAndNoteError() { - DCHECK(parent_builder_ != nullptr); - auto build_status = Build(); - if (!build_status.ok()) { - parent_builder_->NoteError( - AddStatus(build_status.status(), - tensorflow::strings::StrCat("error from: ", name_))); - return {}; - } - return build_status.ConsumeValueOrDie(); -} - -StatusOr XlaBuilder::Build() { - if (!first_error_.ok()) { - string backtrace; - first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); - return AppendStatus(first_error_, backtrace); - } - - HloComputationProto entry; - entry.set_id(GetUniqueId()); // Give the computation a global unique id. - entry.set_name(StrCat(name_, entry.id())); // Ensure that the name is unique. - - { - int64 root_id; - TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), - GetProgramShape(&root_id)); - entry.set_root_id(root_id); - } - - for (auto& instruction : instructions_) { - // Ensures that the instruction names are unique among the whole graph. 
- const string& new_name = - StrCat(instruction.name(), ".", entry.id(), ".", instruction.id()); - instruction.set_name(new_name); - entry.add_instructions()->Swap(&instruction); - } - - XlaComputation computation(entry.id()); - HloModuleProto* module = computation.mutable_proto(); - module->set_name(entry.name()); - module->set_id(entry.id()); - module->set_entry_computation_name(entry.name()); - module->set_entry_computation_id(entry.id()); - *module->mutable_program_shape() = entry.program_shape(); - for (auto& e : embedded_) { - module->add_computations()->Swap(&e.second); - } - module->add_computations()->Swap(&entry); - - // Clear data held by this builder. - this->instructions_.clear(); - this->embedded_.clear(); - this->parameter_numbers_.clear(); - - return std::move(computation); -} - -StatusOr XlaBuilder::InDimBroadcast( - const Shape& shape, const XlaOp& operand, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - TF_RETURN_IF_ERROR(first_error_); - - HloInstructionProto instr; - *instr.mutable_shape() = shape; - for (int64 dim : broadcast_dimensions) { - instr.add_dimensions(dim); - } - return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand}); -} - -StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, - const XlaOp& operand) { - TF_RETURN_IF_ERROR(first_error_); - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - - CHECK(ShapeUtil::IsScalar(operand_shape) || - ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)); - Shape broadcast_shape = - ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type()); - - // Do explicit broadcast for scalar. - if (ShapeUtil::IsScalar(operand_shape)) { - return InDimBroadcast(broadcast_shape, operand, {}); - } - - // Do explicit broadcast for degenerate broadcast. 
- std::vector broadcast_dimensions; - std::vector reshaped_dimensions; - for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) { - if (operand_shape.dimensions(i) == output_shape.dimensions(i)) { - broadcast_dimensions.push_back(i); - reshaped_dimensions.push_back(operand_shape.dimensions(i)); - } else { - TF_RET_CHECK(operand_shape.dimensions(i) == 1) - << "An explicit broadcast sequence requires the broadcasted " - "dimensions to be trivial; operand shape: " - << operand_shape << "; output_shape: " << output_shape; - } - } - // Eliminate the size one dimensions. - TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, - Reshape(ShapeUtil::MakeShape(operand_shape.element_type(), - reshaped_dimensions), - operand)); - // Broadcast 'reshape' up to the larger size. - return InDimBroadcast(broadcast_shape, reshaped_operand, - broadcast_dimensions); -} - -XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferUnaryOpShape(unop, operand_shape)); - return AddInstruction(std::move(instr), unop, {operand}); - }); -} - -XlaOp XlaBuilder::BinaryOp( - HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferBinaryOpShape( - binop, lhs_shape, rhs_shape, broadcast_dimensions)); - - const int64 lhs_rank = ShapeUtil::Rank(lhs_shape); - const int64 rhs_rank = ShapeUtil::Rank(rhs_shape); - - XlaOp updated_lhs = lhs; - XlaOp updated_rhs = rhs; - - if (!broadcast_dimensions.empty() && lhs_rank != rhs_rank) { - const bool should_broadcast_lhs = lhs_rank < 
rhs_rank; - XlaOp from = should_broadcast_lhs ? lhs : rhs; - const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape; - - std::vector to_size; - for (int64 size : instr.shape().dimensions()) { - to_size.push_back(size); - } - for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape); - from_dim++) { - int64 to_dim = broadcast_dimensions[from_dim]; - to_size[to_dim] = from_shape.dimensions(from_dim); - } - - const Shape& broadcasted_shape = - ShapeUtil::MakeShape(from_shape.element_type(), to_size); - TF_ASSIGN_OR_RETURN( - XlaOp broadcasted_operand, - InDimBroadcast(broadcasted_shape, from, broadcast_dimensions)); - - updated_lhs = should_broadcast_lhs ? broadcasted_operand : lhs; - updated_rhs = !should_broadcast_lhs ? broadcasted_operand : rhs; - } - - TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs)); - if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) { - TF_ASSIGN_OR_RETURN(updated_lhs, - AddBroadcastSequence(instr.shape(), updated_lhs)); - } - TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs)); - if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) { - TF_ASSIGN_OR_RETURN(updated_rhs, - AddBroadcastSequence(instr.shape(), updated_rhs)); - } - - return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs}); - }); -} - -XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, - const XlaOp& ehs) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferTernaryOpShape( - triop, lhs_shape, rhs_shape, ehs_shape)); - XlaOp updated_lhs = lhs; - XlaOp updated_rhs = rhs; - XlaOp updated_ehs = ehs; - if (!ShapeUtil::IsTuple(instr.shape())) { - if (!ShapeUtil::IsTuple(lhs_shape) 
&& - !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) { - // lhs is being implicitly broadcasted. Change to explicit. - TF_ASSIGN_OR_RETURN(updated_lhs, - AddBroadcastSequence(instr.shape(), lhs)); - } - if (!ShapeUtil::IsTuple(rhs_shape) && - !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) { - // rhs is being implicitly broadcasted. Change to explicit. - TF_ASSIGN_OR_RETURN(updated_rhs, - AddBroadcastSequence(instr.shape(), rhs)); - } - if (!ShapeUtil::IsTuple(ehs_shape) && - !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) { - // ehs is being implicitly broadcasted. Change to explicit. - TF_ASSIGN_OR_RETURN(updated_ehs, - AddBroadcastSequence(instr.shape(), ehs)); - } - } - return AddInstruction(std::move(instr), triop, - {updated_lhs, updated_rhs, updated_ehs}); - }); -} - -XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - *instr.mutable_shape() = literal.shape(); - *instr.mutable_literal() = literal.ToProto(); - return AddInstruction(std::move(instr), HloOpcode::kConstant); - }); -} - -XlaOp XlaBuilder::Call(const XlaComputation& computation, - tensorflow::gtl::ArraySlice operands) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - std::vector operand_shape_ptrs; - TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); - c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), - [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, - computation.GetProgramShape()); - 
TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferCallShape(operand_shape_ptrs, - /*to_apply=*/called_program_shape)); - - AddCalledComputation(computation, &instr); - - return AddInstruction(std::move(instr), HloOpcode::kCall, operands); - }); -} - -XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape, - const string& name) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - if (!parameter_numbers_.insert(parameter_number).second) { - return InvalidArgument("parameter %lld already registered", - parameter_number); - } - instr.set_parameter_number(parameter_number); - instr.set_name(name); - *instr.mutable_shape() = shape; - return AddInstruction(std::move(instr), HloOpcode::kParameter); - }); -} - -XlaOp XlaBuilder::Broadcast( - const XlaOp& operand, tensorflow::gtl::ArraySlice broadcast_sizes) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - const Shape& shape, - ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes)); - - // The client-level broadcast op just appends dimensions on the left (adds - // lowest numbered dimensions). The HLO broadcast instruction is more - // flexible and can add new dimensions anywhere. The instruction's - // dimensions field maps operand dimensions to dimensions in the broadcast - // output, so to append dimensions on the left the instruction's dimensions - // should just be the n highest dimension numbers of the output shape where - // n is the number of input dimensions. 
- const int64 operand_rank = ShapeUtil::Rank(operand_shape); - std::vector dimensions(operand_rank); - for (int i = 0; i < operand_rank; ++i) { - dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank; - } - return InDimBroadcast(shape, operand, dimensions); - }); -} - -StatusOr XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) { - TF_RETURN_IF_ERROR(first_error_); - - HloInstructionProto instr; - *instr.mutable_shape() = shape; - return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand}); -} - -XlaOp XlaBuilder::Slice(const XlaOp& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferSliceShape(operand_shape, start_indices, - limit_indices, strides)); - for (int i = 0; i < start_indices.size(); i++) { - auto* slice_config = instr.add_slice_dimensions(); - slice_config->set_start(start_indices[i]); - slice_config->set_limit(limit_indices[i]); - slice_config->set_stride(strides[i]); - } - - return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand}); - }); -} - -XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index, - int64 limit_index, int64 stride, int64 dimno) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); - std::vector starts(ShapeUtil::Rank(shape), 0); - std::vector limits(shape.dimensions().begin(), - shape.dimensions().end()); - std::vector strides(ShapeUtil::Rank(shape), 1); - starts[dimno] = start_index; - limits[dimno] = limit_index; - strides[dimno] = stride; - return Slice(operand, starts, limits, strides); - }); -} - -XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, - tensorflow::gtl::ArraySlice slice_sizes) 
{ - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, - GetShape(start_indices)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferDynamicSliceShape( - operand_shape, start_indices_shape, slice_sizes)); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, - {operand, start_indices}); - }); -} - -XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, - const XlaOp& start_indices) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update)); - TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape, - GetShape(start_indices)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferDynamicUpdateSliceShape( - operand_shape, update_shape, start_indices_shape)); - - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - {operand, update, start_indices}); - }); -} - -XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice operands, - int64 dimension) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - std::vector operand_shape_ptrs; - TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); - c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), - [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension)); - - instr.add_dimensions(dimension); - - return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands); - }); -} - -XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value, - const PaddingConfig& 
padding_config) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape, - GetShape(padding_value)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferPadShape(operand_shape, padding_value_shape, - padding_config)); - - *instr.mutable_padding_config() = padding_config; - - return AddInstruction(std::move(instr), HloOpcode::kPad, - {operand, padding_value}); - }); -} - -XlaOp XlaBuilder::Reshape(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& shape, - ShapeInference::InferReshapeShape( - operand_shape, dimensions, new_sizes)); - XlaOp transposed = IsIdentityPermutation(dimensions) - ? operand - : Transpose(operand, dimensions); - return Reshape(shape, transposed); - }); -} - -XlaOp XlaBuilder::Reshape(const XlaOp& operand, - tensorflow::gtl::ArraySlice new_sizes) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand)); - std::vector dimensions(shape.dimensions_size()); - std::iota(dimensions.begin(), dimensions.end(), 0); - return Reshape(operand, dimensions, new_sizes); - }); -} - -XlaOp XlaBuilder::Collapse(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions) { - return NoteErrorOrReturn([&]() -> StatusOr { - if (dimensions.size() <= 1) { - // Not collapsing anything, trivially we can return the operand versus - // enqueueing a trivial reshape. - return operand; - } - - // Out-of-order collapse is not supported. - // Checks that the collapsed dimensions are in order and consecutive. 
- for (tensorflow::gtl::ArraySlice::size_type i = 1; - i < dimensions.size(); ++i) { - if (dimensions[i] - 1 != dimensions[i - 1]) { - return InvalidArgument( - "Collapsed dimensions are not in consecutive order."); - } - } - - // Create a new sizes vector from the old shape, replacing the collapsed - // dimensions by the product of their sizes. - TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand)); - - VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape); - VLOG(3) << "dims to collapse: " - << tensorflow::str_util::Join(dimensions, ","); - - std::vector new_sizes; - for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) { - if (i <= dimensions.front() || i > dimensions.back()) { - new_sizes.push_back(original_shape.dimensions(i)); - } else { - new_sizes.back() *= original_shape.dimensions(i); - } - } - - VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",") - << "]"; - - return Reshape(operand, new_sizes); - }); -} - -void XlaBuilder::Trace(const string& tag, const XlaOp& operand) { - NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - *instr.mutable_shape() = ShapeUtil::MakeNil(); - *instr.mutable_literal() = Literal::CreateR1U8(tag)->ToProto(); - return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand}); - }); -} - -XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true, - const XlaOp& on_false) { - return TernaryOp(HloOpcode::kSelect, pred, on_true, on_false); -} - -XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice elements) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - std::vector operand_shape_ptrs; - TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements)); - c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), - [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferVariadicOpShape( - HloOpcode::kTuple, operand_shape_ptrs)); - return 
AddInstruction(std::move(instr), HloOpcode::kTuple, elements); - }); -} - -XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data)); - if (!ShapeUtil::IsTuple(tuple_shape)) { - return InvalidArgument( - "Operand to GetTupleElement() is not a tuple; got %s", - ShapeUtil::HumanString(tuple_shape).c_str()); - } - *instr.mutable_shape() = - ShapeUtil::GetTupleElementShape(tuple_shape, index); - - instr.set_tuple_index(index); - - return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement, - {tuple_data}); - }); -} - -XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - - DotDimensionNumbers 
dimension_numbers; - dimension_numbers.add_lhs_contracting_dimensions( - lhs_shape.dimensions_size() == 1 ? 0 : 1); - dimension_numbers.add_rhs_contracting_dimensions(0); - return DotGeneral(lhs, rhs, dimension_numbers); - }); -} - -XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs, - const DotDimensionNumbers& dimension_numbers) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, - dimension_numbers)); - *instr.mutable_dot_dimension_numbers() = dimension_numbers; - return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs}); - }); -} - -Status XlaBuilder::VerifyConvolution( - const Shape& lhs_shape, const Shape& rhs_shape, - const ConvolutionDimensionNumbers& dimension_numbers) const { - if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) { - return InvalidArgument( - "Convolution arguments must have same number of " - "dimensions. Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str()); - } - int num_dims = ShapeUtil::Rank(lhs_shape); - if (num_dims < 2) { - return InvalidArgument( - "Convolution expects argument arrays with >= 3 dimensions. 
" - "Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str()); - } - int num_spatial_dims = num_dims - 2; - - const auto check_spatial_dimensions = - [&](const char* const field_name, - const tensorflow::protobuf::RepeatedField& - numbers) { - if (numbers.size() != num_spatial_dims) { - return InvalidArgument("Expected %d elements for %s, but got %d.", - num_spatial_dims, field_name, numbers.size()); - } - for (int i = 0; i < numbers.size(); ++i) { - if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) { - return InvalidArgument("Convolution %s[%d] is out of bounds: %lld", - field_name, i, numbers.Get(i)); - } - } - return Status::OK(); - }; - TF_RETURN_IF_ERROR( - check_spatial_dimensions("input_spatial_dimensions", - dimension_numbers.input_spatial_dimensions())); - TF_RETURN_IF_ERROR( - check_spatial_dimensions("kernel_spatial_dimensions", - dimension_numbers.kernel_spatial_dimensions())); - return check_spatial_dimensions( - "output_spatial_dimensions", - dimension_numbers.output_spatial_dimensions()); -} - -XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - Padding padding) { - return ConvWithGeneralDimensions( - lhs, rhs, window_strides, padding, - CreateDefaultConvDimensionNumbers(window_strides.size())); -} - -XlaOp XlaBuilder::ConvWithGeneralPadding( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding) { - return ConvGeneral(lhs, rhs, window_strides, padding, - CreateDefaultConvDimensionNumbers(window_strides.size())); -} - -XlaOp XlaBuilder::ConvWithGeneralDimensions( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ConvolutionDimensionNumbers& dimension_numbers) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, 
GetShape(rhs)); - - TF_RETURN_IF_ERROR( - VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers)); - - std::vector base_area_dimensions( - dimension_numbers.input_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < base_area_dimensions.size(); - ++i) { - base_area_dimensions[i] = - lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i)); - } - - std::vector window_dimensions( - dimension_numbers.kernel_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < window_dimensions.size(); - ++i) { - window_dimensions[i] = - rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i)); - } - - return ConvGeneral(lhs, rhs, window_strides, - MakePadding(base_area_dimensions, window_dimensions, - window_strides, padding), - dimension_numbers); - }); -} - -XlaOp XlaBuilder::ConvGeneral( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ConvolutionDimensionNumbers& dimension_numbers) { - return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {}, - dimension_numbers); -} - -XlaOp XlaBuilder::ConvGeneralDilated( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, - const ConvolutionDimensionNumbers& dimension_numbers) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); - TF_RETURN_IF_ERROR( - VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers)); - - std::vector window_dimensions( - dimension_numbers.kernel_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < window_dimensions.size(); - ++i) { - window_dimensions[i] = - rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i)); - } - 
TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - lhs_dilation, rhs_dilation)); - - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(), - dimension_numbers)); - - *instr.mutable_convolution_dimension_numbers() = dimension_numbers; - - return AddInstruction(std::move(instr), HloOpcode::kConvolution, - {lhs, rhs}); - }); -} - -StatusOr XlaBuilder::MakeWindow( - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation) const { - const auto verify_size = [&](const size_t x, const char* x_name) { - if (x == 0 || x == window_dimensions.size()) { - return Status::OK(); - } else { - return InvalidArgument( - "%s", tensorflow::strings::StrCat( - "Window has different number of window dimensions than of ", - x_name, - "\nNumber of window dimensions: ", window_dimensions.size(), - "\nNumber of ", x_name, ": ", x, "\n") - .c_str()); - } - }; - TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides")); - TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries")); - TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors")); - TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors")); - - Window window; - for (size_t i = 0; i < window_dimensions.size(); i++) { - auto dim = window.add_dimensions(); - dim->set_size(window_dimensions[i]); - if (!window_strides.empty()) { - dim->set_stride(window_strides[i]); - } else { - dim->set_stride(1); - } - if (!padding.empty()) { - dim->set_padding_low(padding[i].first); - dim->set_padding_high(padding[i].second); - } else { - dim->set_padding_low(0); - dim->set_padding_high(0); - } - if (!lhs_dilation.empty()) { - dim->set_base_dilation(lhs_dilation[i]); - } else { - dim->set_base_dilation(1); - } - 
if (!rhs_dilation.empty()) { - dim->set_window_dilation(rhs_dilation[i]); - } else { - dim->set_window_dilation(1); - } - dim->set_window_reversal(false); - } - return window; -} - -XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type, - const tensorflow::gtl::ArraySlice fft_length) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferFftShape(operand_shape, fft_type, fft_length)); - - instr.set_fft_type(fft_type); - for (int64 i : fft_length) { - instr.add_fft_length(i); - } - - return AddInstruction(std::move(instr), HloOpcode::kFft, {operand}); - }); -} - -XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - if (!LayoutUtil::HasLayout(shape)) { - return InvalidArgument("Given shape to Infeed must have a layout"); - } - *instr.mutable_shape() = shape; - instr.set_infeed_config(config); - return AddInstruction(std::move(instr), HloOpcode::kInfeed); - }); -} - -void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout, - const string& outfeed_config) { - NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - *instr.mutable_shape() = ShapeUtil::MakeNil(); - - // Check and set outfeed shape. 
- if (!LayoutUtil::HasLayout(shape_with_layout)) { - return InvalidArgument("Given shape to Outfeed must have a layout"); - } - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) { - return InvalidArgument( - "Outfeed shape %s must be compatible with operand shape %s", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(), - ShapeUtil::HumanStringWithLayout(operand_shape).c_str()); - } - *instr.mutable_outfeed_shape() = shape_with_layout; - - instr.set_outfeed_config(outfeed_config); - - return AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand}); - }); -} - -XlaOp XlaBuilder::CustomCall(const string& call_target_name, - tensorflow::gtl::ArraySlice operands, - const Shape& shape) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - if (tensorflow::str_util::StartsWith(call_target_name, "$")) { - return InvalidArgument( - "Invalid custom_call_target \"%s\": Call targets that start with '$' " - "are reserved for internal use.", - call_target_name.c_str()); - } - *instr.mutable_shape() = shape; - instr.set_custom_call_target(call_target_name); - return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); - }); -} - -XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice operands, - const string& channel_name, - int64 cost_estimate_ns, const Shape& shape) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - *instr.mutable_shape() = shape; - instr.set_channel_name(channel_name); - instr.set_cost_estimate_ns(cost_estimate_ns); - return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands); - }); -} - -XlaOp XlaBuilder::Complex( - const XlaOp& real, const XlaOp& imag, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions); -} - -XlaOp XlaBuilder::Conj(const XlaOp& operand) { - return Complex(Real(operand), 
Neg(Imag(operand))); -} - -XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions); -} - -// TODO(b/65209188): Create a dedicated lowering for Xor. 
-XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return Or(And(Not(lhs), rhs, broadcast_dimensions), - And(lhs, Not(rhs), broadcast_dimensions)); -} - -XlaOp XlaBuilder::Not(const XlaOp& operand) { - return UnaryOp(HloOpcode::kNot, operand); -} - -XlaOp XlaBuilder::ShiftLeft( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::ShiftRightArithmetic( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs, - broadcast_dimensions); -} - -XlaOp XlaBuilder::ShiftRightLogical( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs, - broadcast_dimensions); -} - -XlaOp XlaBuilder::Abs(const XlaOp& operand) { - return UnaryOp(HloOpcode::kAbs, operand); -} - -XlaOp XlaBuilder::Atan2( - const XlaOp& y, const XlaOp& x, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions); -} - -XlaOp XlaBuilder::Exp(const XlaOp& operand) { - return UnaryOp(HloOpcode::kExp, operand); -} - -XlaOp XlaBuilder::Expm1(const XlaOp& operand) { - return UnaryOp(HloOpcode::kExpm1, operand); -} - -XlaOp XlaBuilder::Floor(const XlaOp& operand) { - return UnaryOp(HloOpcode::kFloor, operand); -} - -XlaOp XlaBuilder::Ceil(const XlaOp& operand) { - return UnaryOp(HloOpcode::kCeil, operand); -} - -XlaOp XlaBuilder::Round(const XlaOp& operand) { - return UnaryOp(HloOpcode::kRoundNearestAfz, operand); -} - -XlaOp XlaBuilder::Log(const XlaOp& operand) { - return UnaryOp(HloOpcode::kLog, operand); -} - -XlaOp XlaBuilder::Log1p(const XlaOp& operand) { - return UnaryOp(HloOpcode::kLog1p, operand); -} - -XlaOp XlaBuilder::Sign(const XlaOp& operand) { - return 
UnaryOp(HloOpcode::kSign, operand); -} - -XlaOp XlaBuilder::Clz(const XlaOp& operand) { - return UnaryOp(HloOpcode::kClz, operand); -} - -XlaOp XlaBuilder::Cos(const XlaOp& operand) { - return UnaryOp(HloOpcode::kCos, operand); -} - -XlaOp XlaBuilder::Sin(const XlaOp& operand) { - return UnaryOp(HloOpcode::kSin, operand); -} - -XlaOp XlaBuilder::Tanh(const XlaOp& operand) { - return UnaryOp(HloOpcode::kTanh, operand); -} - -XlaOp XlaBuilder::Real(const XlaOp& operand) { - return UnaryOp(HloOpcode::kReal, operand); -} - -XlaOp XlaBuilder::Imag(const XlaOp& operand) { - return UnaryOp(HloOpcode::kImag, operand); -} - -XlaOp XlaBuilder::IsFinite(const XlaOp& operand) { - return UnaryOp(HloOpcode::kIsFinite, operand); -} - -XlaOp XlaBuilder::Transpose(const XlaOp& operand, - tensorflow::gtl::ArraySlice permutation) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferTransposeShape(operand_shape, permutation)); - for (int64 dim : permutation) { - instr.add_dimensions(dim); - } - return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand}); - }); -} - -XlaOp XlaBuilder::Rev(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferReverseShape(operand_shape, dimensions)); - for (int64 dim : dimensions) { - instr.add_dimensions(dim); - } - return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand}); - }); -} - -XlaOp XlaBuilder::Sort(const XlaOp& operand) { - return UnaryOp(HloOpcode::kSort, operand); -} - -XlaOp XlaBuilder::SqrtF32(const XlaOp& operand) { - return BinaryOp(HloOpcode::kPower, operand, ConstantR0(0.5), - /*broadcast_dimensions=*/{}); -} - -XlaOp 
XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions); -} - -XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand, - PrimitiveType new_element_type) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConvertShape(operand_shape, new_element_type)); - return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand}); - }); -} - -XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand, - PrimitiveType new_element_type) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConvertShape(operand_shape, new_element_type)); - return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert, - {operand}); - }); -} - -XlaOp XlaBuilder::SquareF32(const XlaOp& operand) { - return BinaryOp(HloOpcode::kPower, operand, ConstantR0(2.0), - /*broadcast_dimensions=*/{}); -} - -XlaOp XlaBuilder::ReciprocalF32(const XlaOp& operand) { - return BinaryOp(HloOpcode::kPower, operand, ConstantR0(-1.0), - /*broadcast_dimensions=*/{}); -} - -XlaOp XlaBuilder::Neg(const XlaOp& operand) { - return UnaryOp(HloOpcode::kNegate, operand); -} - -XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand, - const XlaOp& max) { - return TernaryOp(HloOpcode::kClamp, min, operand, max); -} - -XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice operands, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands) { - return NoteErrorOrReturn([&]() -> StatusOr { - if (!static_operands.empty()) { - return Unimplemented("static_operands is not supported in Map"); - } - - 
HloInstructionProto instr; - - std::vector operand_shape_ptrs; - TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands)); - c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs), - [](const Shape& shape) { return &shape; }); - TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, - computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape, - dimensions)); - - AddCalledComputation(computation, &instr); - - return AddInstruction(std::move(instr), HloOpcode::kMap, operands); - }); -} - -XlaOp XlaBuilder::RngOp(RandomDistribution distribution, - tensorflow::gtl::ArraySlice parameters, - const Shape& shape) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - // Check the number of parameters per RNG distribution. - switch (distribution) { - case RandomDistribution::RNG_NORMAL: - case RandomDistribution::RNG_UNIFORM: - if (parameters.size() != 2) { - return InvalidArgument( - "RNG distribution (%s) expects 2 parameters, but got %ld", - RandomDistribution_Name(distribution).c_str(), parameters.size()); - } - break; - default: - LOG(FATAL) << "unhandled distribution " << distribution; - } - - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - *instr.mutable_shape() = shape; - - instr.set_distribution(distribution); - - return AddInstruction(std::move(instr), HloOpcode::kRng, parameters); - }); -} - -XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma, - const Shape& shape) { - return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape); -} - -XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b, - const Shape& shape) { - return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape); -} - -XlaOp XlaBuilder::While(const XlaComputation& condition, - const XlaComputation& body, const XlaOp& init) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto 
instr; - - // Infer shape. - TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape()); - TF_ASSIGN_OR_RETURN(const auto& condition_program_shape, - condition.GetProgramShape()); - TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferWhileShape(condition_program_shape, - body_program_shape, init_shape)); - // Body comes before condition computation in the vector. - AddCalledComputation(body, &instr); - AddCalledComputation(condition, &instr); - return AddInstruction(std::move(instr), HloOpcode::kWhile, {init}); - }); -} - -XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices, - const GatherDimensionNumbers& dimension_numbers, - tensorflow::gtl::ArraySlice window_bounds) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); - TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape, - GetShape(gather_indices)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferGatherShape(input_shape, gather_indices_shape, - dimension_numbers, window_bounds)); - - *instr.mutable_gather_dimension_numbers() = dimension_numbers; - for (int64 bound : window_bounds) { - instr.add_gather_window_bounds(bound); - } - - return AddInstruction(std::move(instr), HloOpcode::kGather, - {input, gather_indices}); - }); -} - -XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand, - const XlaComputation& true_computation, - const XlaOp& false_operand, - const XlaComputation& false_computation) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate)); - TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape, - GetShape(true_operand)); - TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape, - true_computation.GetProgramShape()); - 
TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape, - GetShape(false_operand)); - TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape, - false_computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferConditionalShape( - predicate_shape, true_operand_shape, false_operand_shape, - true_computation_shape, false_computation_shape)); - - // The index of true_computation must be 0 and that of false computation - // must be 1. - AddCalledComputation(true_computation, &instr); - AddCalledComputation(false_computation, &instr); - - return AddInstruction(std::move(instr), HloOpcode::kConditional, - {predicate, true_operand, false_operand}); - }); -} - -XlaOp XlaBuilder::Reduce( - const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value)); - TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape, - computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferReduceShape( - operand_shape, init_shape, dimensions_to_reduce, - called_program_shape)); - - for (int64 dim : dimensions_to_reduce) { - instr.add_dimensions(dim); - } - - AddCalledComputation(computation, &instr); - - return AddInstruction(std::move(instr), HloOpcode::kReduce, - {operand, init_value}); - }); -} - -XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - std::vector all_dimnos(ShapeUtil::Rank(operand_shape)); - std::iota(all_dimnos.begin(), all_dimnos.end(), 0); - return Reduce(operand, init_value, computation, all_dimnos); - }); -} - 
-XlaOp XlaBuilder::ReduceWindow( - const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_RETURN_IF_ERROR( - ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()), - window_dimensions, window_strides)); - - std::vector> padding_values = - MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions, - window_strides, padding); - return ReduceWindowWithGeneralPadding(operand, init_value, computation, - window_dimensions, window_strides, - padding_values); - }); -} - -XlaOp XlaBuilder::ReduceWindowWithGeneralPadding( - const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value)); - TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, - computation.GetProgramShape()); - TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferReduceWindowShape(operand_shape, init_shape, - instr.window(), to_apply_shape)); - - AddCalledComputation(computation, &instr); - return AddInstruction(std::move(instr), HloOpcode::kReduceWindow, - {operand, init_value}); - }); -} - -XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale, - const XlaOp& offset, float epsilon, - int64 feature_index) { - return NoteErrorOrReturn([&]() 
-> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); - TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferBatchNormTrainingShape( - operand_shape, scale_shape, offset_shape, feature_index)); - - instr.set_epsilon(epsilon); - instr.set_feature_index(feature_index); - - return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining, - {operand, scale, offset}); - }); -} - -XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale, - const XlaOp& offset, const XlaOp& mean, - const XlaOp& variance, float epsilon, - int64 feature_index) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); - TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset)); - TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean)); - TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferBatchNormInferenceShape( - operand_shape, scale_shape, offset_shape, - mean_shape, variance_shape, feature_index)); - - instr.set_epsilon(epsilon); - instr.set_feature_index(feature_index); - - return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference, - {operand, scale, offset, mean, variance}); - }); -} - -XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale, - const XlaOp& batch_mean, const XlaOp& batch_var, - const XlaOp& grad_output, float epsilon, - int64 feature_index) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale)); 
- TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean)); - TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var)); - TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferBatchNormGradShape( - operand_shape, scale_shape, batch_mean_shape, - batch_var_shape, grad_output_shape, feature_index)); - - instr.set_epsilon(epsilon); - instr.set_feature_index(feature_index); - - return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad, - {operand, scale, batch_mean, batch_var, grad_output}); - }); -} - -XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN( - *instr.mutable_shape(), - ShapeInference::InferCrossReplicaSumShape({&operand_shape})); - - return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum, - {operand}); - }); -} - -XlaOp XlaBuilder::SelectAndScatter( - const XlaOp& operand, const XlaComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const XlaOp& source, const XlaOp& init_value, - const XlaComputation& scatter) { - return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - return SelectAndScatterWithGeneralPadding( - operand, select, window_dimensions, window_strides, - MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions, - window_strides, padding), - source, init_value, scatter); - }); -} - -XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding( - const XlaOp& operand, const XlaComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const XlaOp& source, const XlaOp& init_value, - const 
XlaComputation& scatter) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source)); - TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value)); - TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape, - select.GetProgramShape()); - TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape, - scatter.GetProgramShape()); - TF_ASSIGN_OR_RETURN(*instr.mutable_window(), - MakeWindow(window_dimensions, window_strides, padding, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{})); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferSelectAndScatterShape( - operand_shape, select_shape, instr.window(), - source_shape, init_shape, scatter_shape)); - - AddCalledComputation(select, &instr); - AddCalledComputation(scatter, &instr); - - return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter, - {operand, source, init_value}); - }); -} - -XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits, - const int mantissa_bits) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); - TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), - ShapeInference::InferReducePrecisionShape( - operand_shape, exponent_bits, mantissa_bits)); - instr.set_exponent_bits(exponent_bits); - instr.set_mantissa_bits(mantissa_bits); - return AddInstruction(std::move(instr), HloOpcode::kReducePrecision, - {operand}); - }); -} - -void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) { - NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - // Send instruction produces a tuple of {aliased operand, U32 context}. 
- TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); - *instr.mutable_shape() = - ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}); - instr.set_channel_id(handle.handle()); - TF_ASSIGN_OR_RETURN( - XlaOp send, - AddInstruction(std::move(instr), HloOpcode::kSend, {operand})); - - HloInstructionProto send_done_instr; - *send_done_instr.mutable_shape() = ShapeUtil::MakeNil(); - send_done_instr.set_channel_id(handle.handle()); - return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone, - {send}); - }); -} - -XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) { - return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - // Recv instruction produces a tuple of {receive buffer, U32 context}. - *instr.mutable_shape() = - ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}); - instr.set_channel_id(handle.handle()); - TF_ASSIGN_OR_RETURN(XlaOp recv, - AddInstruction(std::move(instr), HloOpcode::kRecv, {})); - - HloInstructionProto recv_done_instr; - *recv_done_instr.mutable_shape() = shape; - recv_done_instr.set_channel_id(handle.handle()); - return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone, - {recv}); - }); -} - -StatusOr XlaBuilder::IsConstant(const XlaOp& operand) const { - TF_RETURN_IF_ERROR(first_error_); - - // Verify that the handle is valid. - TF_RETURN_IF_ERROR(LookUpInstruction(operand).status()); - - bool is_constant = true; - std::set visited; - IsConstantVisitor(operand.handle(), &visited, &is_constant); - return is_constant; -} - -StatusOr XlaBuilder::BuildConstantSubGraph( - const XlaOp& root_op) const { - TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op)); - if (!is_constant) { - auto op_status = LookUpInstruction(root_op); - string op_string = - op_status.ok() ? 
op_status.ValueOrDie()->name() : ""; - return InvalidArgument( - "Operand to BuildConstantSubGraph depends on a parameter.\n\n" - " op requested for constant subgraph: %s\n\n" - "This is an internal error that typically happens when the XLA user " - "(e.g. TensorFlow) is attempting to determine a value that must be a " - "compile-time constant (e.g. an array dimension) but it is not capable " - "of being evaluated at XLA compile time.\n\n" - "Please file a usability bug with the framework being used (e.g. " - "TensorFlow).", - op_string.c_str()); - } - - TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, - LookUpInstruction(root_op)); - TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode())); - if (!CanBeRoot(opcode)) { - return InvalidArgument("the operand with opcode %s cannot be root", - root->opcode().c_str()); - } - - HloComputationProto entry; - entry.set_id(GetUniqueId()); // Give the computation a global unique id. - entry.set_name(StrCat(name_, entry.id(), "_compute_constant")); - entry.set_root_id(root->id()); - ProgramShape* program_shape = entry.mutable_program_shape(); - *program_shape->mutable_result() = root->shape(); - - // We use std::set to keep the instruction ids in ascending order (which is - // also a valid denpendency order). The related ops will be added to the - // subgraph in the same order. - std::set related_ops; - tensorflow::gtl::FlatSet related_calls; // Related computations. - std::queue worklist; - worklist.push(root->id()); - related_ops.insert(root->id()); - while (!worklist.empty()) { - int64 node = worklist.front(); - worklist.pop(); - for (int64 id : instructions_[node].operand_ids()) { - if (related_ops.insert(id).second) { - worklist.push(id); - } - } - for (int64 called_id : instructions_[node].called_computation_ids()) { - related_calls.insert(called_id); - } - } - - // Add related ops to the computation. 
- for (int64 id : related_ops) { - auto* instr = entry.add_instructions(); - *instr = instructions_[id]; - // Ensures that the instruction names are unique among the graph. - const string& new_name = - StrCat(instr->name(), ".", entry.id(), ".", instr->id()); - instr->set_name(new_name); - } - - XlaComputation computation(entry.id()); - HloModuleProto* module = computation.mutable_proto(); - module->set_name(entry.name()); - module->set_id(entry.id()); - module->set_entry_computation_name(entry.name()); - module->set_entry_computation_id(entry.id()); - *module->mutable_program_shape() = *program_shape; - for (auto& e : embedded_) { - if (related_calls.find(e.second.id()) != related_calls.end()) { - *module->add_computations() = e.second; - } - } - *module->add_computations() = std::move(entry); - - return std::move(computation); -} - -std::unique_ptr XlaBuilder::CreateSubBuilder( - const string& computation_name) { - auto sub_builder = MakeUnique(computation_name); - sub_builder->parent_builder_ = this; - sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_; - return sub_builder; -} - -/* static */ ConvolutionDimensionNumbers -XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { - ConvolutionDimensionNumbers dimension_numbers; - dimension_numbers.set_input_batch_dimension(kConvBatchDimension); - dimension_numbers.set_input_feature_dimension(kConvFeatureDimension); - dimension_numbers.set_output_batch_dimension(kConvBatchDimension); - dimension_numbers.set_output_feature_dimension(kConvFeatureDimension); - dimension_numbers.set_kernel_output_feature_dimension( - kConvKernelOutputDimension); - dimension_numbers.set_kernel_input_feature_dimension( - kConvKernelInputDimension); - for (int i = 0; i < num_spatial_dims; ++i) { - dimension_numbers.add_input_spatial_dimensions(i + 2); - dimension_numbers.add_kernel_spatial_dimensions(i + 2); - dimension_numbers.add_output_spatial_dimensions(i + 2); - } - return dimension_numbers; -} - 
-/* static */ Status XlaBuilder::Validate( - const ConvolutionDimensionNumbers& dnum) { - if (dnum.input_spatial_dimensions_size() < 2) { - return FailedPrecondition("input spacial dimension < 2: %d", - dnum.input_spatial_dimensions_size()); - } - if (dnum.kernel_spatial_dimensions_size() < 2) { - return FailedPrecondition("kernel spacial dimension < 2: %d", - dnum.kernel_spatial_dimensions_size()); - } - if (dnum.output_spatial_dimensions_size() < 2) { - return FailedPrecondition("output spacial dimension < 2: %d", - dnum.output_spatial_dimensions_size()); - } - - if (std::set( - {dnum.input_batch_dimension(), dnum.input_feature_dimension(), - dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the input are not unique: (%lld, %lld, %lld, " - "%lld)", - dnum.input_batch_dimension(), dnum.input_feature_dimension(), - dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)); - } - if (std::set({dnum.kernel_output_feature_dimension(), - dnum.kernel_input_feature_dimension(), - dnum.kernel_spatial_dimensions(0), - dnum.kernel_spatial_dimensions(1)}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the weight are not unique: (%lld, %lld, %lld, " - "%lld)", - dnum.kernel_output_feature_dimension(), - dnum.kernel_input_feature_dimension(), - dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1)); - } - if (std::set({dnum.output_batch_dimension(), - dnum.output_feature_dimension(), - dnum.output_spatial_dimensions(0), - dnum.output_spatial_dimensions(1)}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the output are not unique: (%lld, %lld, %lld, " - "%lld)", - dnum.output_batch_dimension(), dnum.output_feature_dimension(), - dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1)); - } - return Status::OK(); -} - -StatusOr XlaBuilder::AddInstruction( - HloInstructionProto&& instr, HloOpcode opcode, 
- tensorflow::gtl::ArraySlice operands) { - TF_RETURN_IF_ERROR(first_error_); - - const int64 handle = instructions_.size(); - instr.set_id(handle); - instr.set_opcode(HloOpcodeString(opcode)); - if (instr.name().empty()) { - instr.set_name(StrCat(instr.opcode())); - } - for (const auto& operand : operands) { - if (operand.builder_ == nullptr) { - return InvalidArgument("invalid XlaOp with handle %lld", - operand.handle()); - } - if (operand.builder_ != this) { - return InvalidArgument("Do not add XlaOp from builder %s to builder %s", - operand.builder_->name().c_str(), - this->name().c_str()); - } - instr.add_operand_ids(operand.handle()); - } - - *instr.mutable_metadata() = metadata_; - if (sharding_) { - *instr.mutable_sharding() = *sharding_; - } - - instructions_.push_back(instr); - - XlaOp op(handle, this); - return op; -} - -void XlaBuilder::AddCalledComputation(const XlaComputation& computation, - HloInstructionProto* instr) { - instr->add_called_computation_ids(computation.proto().entry_computation_id()); - for (const HloComputationProto& e : computation.proto().computations()) { - embedded_.insert({e.id(), e}); - } -} - -StatusOr XlaBuilder::LookUpInstruction( - const XlaOp& op) const { - TF_RETURN_IF_ERROR(first_error_); - - if (op.builder_ == nullptr) { - return InvalidArgument( - "invalid XlaOp with handle %lld; the builder of this op is freed", - op.handle()); - } - if (op.builder_ != this) { - return InvalidArgument( - "XlaOp with handle %lld is built by builder '%s', but is trying to use " - "it in builder '%s'", - op.handle(), op.builder_->name().c_str(), this->name().c_str()); - } - - if (op.handle() >= instructions_.size() || op.handle() < 0) { - return InvalidArgument("no XlaOp value %lld", op.handle()); - } - return &instructions_[op.handle()]; -} - -XlaOp XlaBuilder::UnimplementedOp() { - NoteError(Unimplemented("Op not implemented")); - return {}; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h 
b/tensorflow/compiler/xla/client/xla_client/xla_builder.h deleted file mode 100644 index 2b3013a91c488782098bd81994e899eae5a1f506..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ /dev/null @@ -1,1013 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ - -#include -#include -#include - -#include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/hlo.pb.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/stacktrace.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -class XlaBuilder; - -// This represents an 
instruction that has been enqueued using the XlaBuilder. -// This is used to pass to subsequent computations that depends upon the -// instruction as an operand. -class XlaOp { - public: - XlaOp() : handle_(0), builder_(nullptr) {} - ~XlaOp() {} - - const XlaBuilder* builder() const { return builder_; } - - bool operator==(const XlaOp& rhs) const { - return handle_ == rhs.handle_ && builder_ == rhs.builder_; - } - - bool operator!=(const XlaOp& rhs) const { - return handle_ != rhs.handle_ || builder_ != rhs.builder_; - } - - friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) { - out << op.handle(); - return out; - } - - private: - XlaOp(int64 handle, XlaBuilder* builder) - : handle_(handle), builder_(builder) {} - - int64 handle() const { return handle_; } - - friend class XlaBuilder; - - int64 handle_; - XlaBuilder* builder_; // Not owned. -}; - -// A convenient interface for building up computations. -// -// Thread-compatible. -class XlaBuilder { - public: - // computation_name: name to use for the built computation. - XlaBuilder(const string& computation_name); - - XlaBuilder(const XlaBuilder&) = delete; - XlaBuilder& operator=(const XlaBuilder&) = delete; - - ~XlaBuilder(); - - // Returns the computation name. - const string& name() const { return name_; } - - // Sets OpMetadata that will be added to all instructions until cleared. - // - // OpMetadata is often applied to a series of XLA HLO instructions. As a - // result, OpMetadata is set on the Computation Builder. All subsequent - // instructions generated via this Computation Builder will have the same - // OpMetadata attached until a call to ClearOpMetadata. - void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; } - - // Clears the HloMetadata state. - void ClearOpMetadata() { metadata_.Clear(); } - - // Sets an OpSharding that will be attached to all instructions until cleared. 
- void SetSharding(const OpSharding& sharding) { sharding_ = sharding; } - - // Clears the sharding. Ops will be sharded according to the default placement - // policy. - void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; } - - // Returns the OpSharding that will be attached to all instructions. - const tensorflow::gtl::optional& sharding() const { - return sharding_; - } - - // Sets the builder to a mode where it will die immediately when an error is - // encountered, rather than producing it in a deferred fashion when Build() is - // called (which is the default). - void set_die_immediately_on_error(bool enabled) { - die_immediately_on_error_ = enabled; - } - - // Enqueues a "retrieve parameter value" instruction for a parameter that was - // passed to the computation. - XlaOp Parameter(int64 parameter_number, const Shape& shape, - const string& name); - - // Enqueues a constant with the value of the given literal onto the - // computation. - XlaOp ConstantLiteral(const LiteralSlice& literal); - - // Enqueues a constant onto the computation. Methods are templated on the - // native host type (NativeT) which corresponds to a specific XLA - // PrimitiveType as given in the following table: - // - // Native Type PrimitiveType - // ----------------------------- - // bool PRED - // int32 S32 - // int64 S64 - // uint32 U32 - // uint64 U64 - // float F32 - // double F64 - // - // Note: not all primitive types defined in xla_data.proto have a - // corresponding native type yet. 
- template - XlaOp ConstantR0(NativeT value); - template - XlaOp ConstantR1(tensorflow::gtl::ArraySlice values); - XlaOp ConstantR1(const tensorflow::core::Bitmap& values); - template - XlaOp ConstantR2( - std::initializer_list> values); - template - XlaOp ConstantFromArrayWithLayout(const Array& values, - const Layout& layout); - template - XlaOp ConstantFromArray(const Array& values); - template - XlaOp ConstantR2FromArray2DWithLayout(const Array2D& values, - const Layout& layout); - template - XlaOp ConstantR2FromArray2D(const Array2D& values); - template - XlaOp ConstantR3FromArray3DWithLayout(const Array3D& values, - const Layout& layout); - template - XlaOp ConstantR3FromArray3D(const Array3D& values); - template - XlaOp ConstantR4FromArray4DWithLayout(const Array4D& values, - const Layout& layout); - template - XlaOp ConstantR4FromArray4D(const Array4D& values); - - // Enqueues a rank one constant (vector) onto the computation. The vector has - // size 'length' and every element has the value 'value'. - template - XlaOp ConstantR1(int64 length, NativeT value); - - // Adds dimensions to an array by duplicating the data in the array. - // - // The new dimensions are inserted on the left, i.e. if - // broadcast_sizes has values {a0, ..., aN} and the operand shape - // has dimensions {b0, ..., bM} then the shape of the output has - // dimensions {a0, ..., aN, b0, ..., bM}. - // - // The new dimensions index into copies of the operand, i.e. - // - // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] - XlaOp Broadcast(const XlaOp& operand, - tensorflow::gtl::ArraySlice broadcast_sizes); - - // Enqueues a pad operation onto the computation that pads the given value on - // the edges as well as between the elements of the input. padding_config - // specifies the padding amount for each dimension. 
- XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, - const PaddingConfig& padding_config); - - // Enqueues an operation onto the computation that flattens the operand based - // on the dimension order (major/slowest-varying to minor/fastest-varying) - // given, followed by reshaping it into the shape with the given dimension - // sizes (also major to minor). Conceptually, this is a limited form of - // "shape casting". - XlaOp Reshape(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes); - - // Enqueues an operation onto the computation that collapses the operand, from - // first to last dimension (C order), then reshapes it to the given dimension - // sizes. Conceptually, this is a limited form of "shape casting". - XlaOp Reshape(const XlaOp& operand, - tensorflow::gtl::ArraySlice new_sizes); - - // Wrapper for Reshape. - // Enqueues an operation to collapse the provided dimensions; e.g. an - // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to - // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must - // be a consecutive, in-order subsequence of the operand dimensions. - // - // Note that collapsing a single dimension does nothing: - // - // {256} collapsing {0} => {256} - // {1} collapsing {0} => {1} - // - // Collapsing multiple dimensions produces a single result dimension: - // - // {256, 2} collapsing {0,1} => {512} - // {256, 2, 3} collapsing {0,1} => {512, 3} - // - // This could potentially cause data to be moved -- it provides a more - // structured form of reshaping than an arbitrary Reshape operation. - XlaOp Collapse(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions); - - // Enqueues a slice operation onto the computation that slices the operand - // from the start indices to the limit indices; e.g. 
- // - // x - // [ 0 1 2 3 ] - // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] - // [ 8 9 a b ] - // - // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D - // range notation. - // The strides parameter determines the stride over the slice - XlaOp Slice(const XlaOp& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); - - // Enqueues a slice operation in a given dimension, taking all other - // dimensions as they are; e.g. if dimno is 1 from start_index 2 to - // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand - // for: - // - // array[:, 2:4:1, :] - XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, - int64 stride, int64 dimno); - - // Enqueues a slice operation onto the computation that slices the 'operand' - // from dynamic start indices which are passed in 'start_indices'. - // The size of the slice in each dimension is passed in 'slice_sizes', - // which specify the end point of exclusive slice intervals in each - // dimension [start, start + size). - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo input dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. - XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, - tensorflow::gtl::ArraySlice slice_sizes); - - // Enqueues a dynamic update slice operation onto the computation, which - // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. - // The shape of 'update' determines the shape of the slice of 'operand' - // which is updated. - // The indices specified in 'start_indices' specify the offset of the slice - // of 'operand' which is updated. - // - // update = {10, 11} // calculated at runtime. 
- // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] - // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] - // [7 8 9] [7 8 9 ] - // - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo update dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. - XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, - const XlaOp& start_indices); - - // Enqueues a concatenate instruction onto the computation. 'operands' must - // have >= 1 entry. - XlaOp ConcatInDim(tensorflow::gtl::ArraySlice operands, - int64 dimension); - - // Enqueue a tracing operation onto the computation; the computation will emit - // a logging message with the operand. - void Trace(const string& tag, const XlaOp& operand); - - // Enqueues a conditional-move-like select operation onto the computation; - // predicated on pred, selects between on_true and on_false. - XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false); - - // Enqueues a tuple-creation instruction onto the computation. - XlaOp Tuple(tensorflow::gtl::ArraySlice elements); - - // Enqueues a tuple-element-get instruction onto the computation. - XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); - - // Enqueues an equal-to comparison instruction onto the computation. - XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a not-equal comparison instruction onto the computation. - XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a greater-or-equal comparison instruction onto the computation. - XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a greater-than comparison instruction onto the computation. 
- XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a less-than comparison instruction onto the computation. - XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a less-or-equal comparison instruction onto the computation. - XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a dot instruction onto the computation. - XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs); - - // Enqueues a general dot instruction onto the computation. - XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, - const DotDimensionNumbers& dimension_numbers); - - // Default dimension numbers used for a 2D convolution. - static constexpr int64 kConvBatchDimension = 0; - static constexpr int64 kConvFeatureDimension = 1; - static constexpr int64 kConvFirstSpatialDimension = 2; - static constexpr int64 kConvSecondSpatialDimension = 3; - static constexpr int64 kConvKernelOutputDimension = 0; - static constexpr int64 kConvKernelInputDimension = 1; - static constexpr int64 kConvKernelFirstSpatialDimension = 2; - static constexpr int64 kConvKernelSecondSpatialDimension = 3; - - // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for - // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for - // the kernel operand - // {output_feature, input_feature, height, width} = {0, 1, 2, 3}. - static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers( - int num_spatial_dims = 2); - - // Returns an error if the convolution dimension numbers have conflicts. - static Status Validate(const ConvolutionDimensionNumbers& dnum); - - // Enqueues a convolution instruction onto the computation, which uses the - // default convolution dimension numbers. 
- XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - Padding padding); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration in the format returned by MakePadding(). - XlaOp ConvWithGeneralPadding( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided dimension numbers configuration. - XlaOp ConvWithGeneralDimensions( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration as well as the dimension numbers. - XlaOp ConvGeneral( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration, dilation factors and dimension numbers. - XlaOp ConvGeneralDilated( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues an FFT instruction onto the computation, of the given type and - // with the given FFT length. - XlaOp Fft(const XlaOp& operand, FftType fft_type, - tensorflow::gtl::ArraySlice fft_length); - - // Enqueues an infeed instruction onto the computation, which writes data of - // the given shape to the infeed buffer of the device. 
- XlaOp Infeed(const Shape& shape, const string& config = ""); - - // Enqueues an outfeed instruction onto the computation. This instruction - // generates outgoing data transfers for the given data. - // - // shape_with_layout communicates the laid out shape that we want to outfeed - // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error - // will occur. - void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, - const string& outfeed_config); - - // Enqueues a call instruction onto the computation. - XlaOp Call(const XlaComputation& computation, - tensorflow::gtl::ArraySlice operands); - - // Enqueues a custom call instruction onto the computation. - // During code generation, a call instruction is emitted which targets a - // symbol with the name |call_target_name|. The |operands| are passed to the - // call instruction. |shape| is the resultant shape. - XlaOp CustomCall(const string& call_target_name, - tensorflow::gtl::ArraySlice operands, - const Shape& shape); - - // Enqueues a pseudo-op to represent host-side computation data-dependencies. - // During code generation, host send and receive operations will be generated - // to transfer |operands| to the host and a single result of |shape| back to - // the device. Host send/recv operations are emitted using |channel_name|. - // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO - // instruction scheduling. - XlaOp HostCompute(tensorflow::gtl::ArraySlice operands, - const string& channel_name, int64 cost_estimate_ns, - const Shape& shape); - - // The following methods enqueue element-wise binary arithmetic operations - // onto the computation. The shapes of the operands have to match unless one - // of the operands is a scalar, or an explicit broadcast dimension is given - // (see g3doc for more details). - - // Enqueues a complex compose instruction onto the computation. 
- XlaOp Complex(const XlaOp& real, const XlaOp& imag, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a complex conjugate instruction onto the computation. - XlaOp Conj(const XlaOp& operand); - - // Enqueues an add instruction onto the computation. - XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a subtract instruction onto the computation. - XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a multiply instruction onto the computation. - XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a divide instruction onto the computation. - XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a remainder instruction onto the computation. - XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a max instruction onto the computation. - XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a min instruction onto the computation. 
- XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Element-wise logical operators - XlaOp And(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - XlaOp Or(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - XlaOp Not(const XlaOp& operand); - - XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - XlaOp ShiftRightArithmetic( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - XlaOp ShiftRightLogical( - const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Reduces an array among the provided dimensions, given "computation" as a - // reduction operator. - XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce); - - // Convenience wrapper around the above that reduces all the dimensions in the - // operand shape. - XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation); - - // Enqueues a windowed reduce instruction onto the computation. - XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - Padding padding); - - // As ReduceWindow(), but the padding is given in the format - // returned by MakePadding(). 
- XlaOp ReduceWindowWithGeneralPadding( - const XlaOp& operand, const XlaOp& init_value, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding); - - // Returns the sum of the operand value across all replicas. All replicas - // supply one input to the sum and all replicas receive the resulting sum. - XlaOp CrossReplicaSum(const XlaOp& operand); - - // Enqueues an operation that scatters the `source` array to the selected - // indices of each window. - XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - Padding padding, const XlaOp& source, - const XlaOp& init_value, - const XlaComputation& scatter); - - // As SelectAndScatter(), but the padding is given in the format - // returned by MakePadding(). - XlaOp SelectAndScatterWithGeneralPadding( - const XlaOp& operand, const XlaComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const XlaOp& source, const XlaOp& init_value, - const XlaComputation& scatter); - - // Enqueues an abs instruction onto the computation. - XlaOp Abs(const XlaOp& operand); - - // Enqueues a atan2 instruction onto the computation. - XlaOp Atan2(const XlaOp& y, const XlaOp& x, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues an exp instruction onto the computation. - XlaOp Exp(const XlaOp& operand); - - // Enqueues an expm1 instruction onto the computation. - XlaOp Expm1(const XlaOp& operand); - - // Enqueues a floor instruction onto the computation. - XlaOp Floor(const XlaOp& operand); - - // Enqueues a ceil instruction onto the computation. 
- XlaOp Ceil(const XlaOp& operand); - - // Enqueues a round instruction onto the computation, rounding to nearest even - // with half-way cases rounding away from zero. - XlaOp Round(const XlaOp& operand); - - // Enqueues an log instruction (natural logarithm) onto the computation. - XlaOp Log(const XlaOp& operand); - - // Enqueues an log1p instruction (log(x+1)) onto the computation. - XlaOp Log1p(const XlaOp& operand); - - // Enqueues a sign instruction onto the computation. - XlaOp Sign(const XlaOp& operand); - - // Enqueues a count leading zeros instruction onto the computation. - XlaOp Clz(const XlaOp& operand); - - // Enqueues a cosine instruction onto the computation. - XlaOp Cos(const XlaOp& operand); - - // Enqueues a sine instruction onto the computation. - XlaOp Sin(const XlaOp& operand); - - // Enqueues a tanh instruction onto the computation. - XlaOp Tanh(const XlaOp& operand); - - // Enqueues a real-part instruction onto the computation. - XlaOp Real(const XlaOp& operand); - - // Enqueues an imaginary-part instruction onto the computation. - XlaOp Imag(const XlaOp& operand); - - // Enqueues a float32 sqrt instruction onto the computation. - // (float32 is specified as there is an implicit float32 0.5f constant - // exponent). - XlaOp SqrtF32(const XlaOp& operand); - - // Enqueues a float32 square instruction onto the computation. - // (float32 is specified as there is an implicit float32 2.0f constant - // exponent). - XlaOp SquareF32(const XlaOp& operand); - - // Enqueues a lhs^rhs computation onto the computation. - XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues an operator that tests if the operand's values are finite, i.e., - // not Inf or NaN. Defined only for floating-point types. Returns an array of - // booleans with the same shape where entries are true iff the corresponding - // entry was NaN. 
- XlaOp IsFinite(const XlaOp& operand); - - // Enqueues a convert instruction onto the computation that changes the - // element type of the operand array to primitive_type. - XlaOp ConvertElementType(const XlaOp& operand, - PrimitiveType new_element_type); - - // Enqueues a no-op instruction onto the computation that changes - // the element type of the operand array to primitive_type. The - // bit-widths of the source and destination element types must be - // identical. - XlaOp BitcastConvertType(const XlaOp& operand, - PrimitiveType new_element_type); - - // Enqueues a float32 reciprocal instruction onto the computation. - // (float32 is specified as there is an implicit float32 -1.0f constant - // exponent). - // - // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the - // shape of the operand. - XlaOp ReciprocalF32(const XlaOp& operand); - - // Enqueues a negate instruction onto the computation. - XlaOp Neg(const XlaOp& operand); - - // Enqueues a transpose instruction onto the computation. - XlaOp Transpose(const XlaOp& operand, - tensorflow::gtl::ArraySlice permutation); - - // Enqueues a reverse instruction onto the computation. The order of the - // elements in the given dimensions is reversed (i.e., the element at index i - // is moved to index dimension_size - 1 - i). - XlaOp Rev(const XlaOp& operand, - tensorflow::gtl::ArraySlice dimensions); - - // Enqueues a sort (as increasing order) instruction onto the computation. - XlaOp Sort(const XlaOp& operand); - - // Enqueues a clamp instruction onto the computation. - XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); - - // Enqueues a map instruction onto the computation. - XlaOp Map(tensorflow::gtl::ArraySlice operands, - const XlaComputation& computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands = {}); - - // Enqueues a N(mu, sigma) random number generation instruction onto the - // computation. 
- XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape); - - // Enqueues a U(a, b) random number generation instruction onto the - // computation. Returns values in the semi-open interval [a, b). - XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); - - // Enqueues a while node onto the computation. - XlaOp While(const XlaComputation& condition, const XlaComputation& body, - const XlaOp& init); - - // Enqueues a conditional node onto the computation. - XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, - const XlaComputation& true_computation, - const XlaOp& false_operand, - const XlaComputation& false_computation); - - // Enqueues a ReducePrecision node onto the computation. - XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, - const int mantissa_bits); - - // Enqueues a Gather node onto the computation. - XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices, - const GatherDimensionNumbers& dimension_numbers, - tensorflow::gtl::ArraySlice window_bounds); - - // Enqueues a Send node onto the computation, to send the given operand to - // a Recv instruction that shares the same channel handle. - void Send(const XlaOp& operand, const ChannelHandle& handle); - - // Enqueues a Recv node onto the computation. The data comes from a Send - // instruction that shares the same channel handle and its shape must - // be the same as the given shape. - XlaOp Recv(const Shape& shape, const ChannelHandle& handle); - - // Returns true if 'operand' is a compile-time constant. A compile-time - // constant does not depend on any parameters, or on stateful operators such - // as `RngNormal` or `Infeed`. - // - // This tests whether a computation is a compile-time constant without - // evaluating the computation. - StatusOr IsConstant(const XlaOp& operand) const; - - // Normalizes operand across spatial and batch dimensions for each feature. 
- // - // Returns a tuple (normalized, batch_mean, batch_var) where `normalized` - // is the normalized result and batch_mean and batch_var are the mean and - // variance, respectively, across batch for the operand. - XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, - const XlaOp& offset, float epsilon, - int64 feature_index); - - // Normalizes operand across spatial and batch dimensions for each feature. - // - // `BatchNormInference` is equivalent to calling `BatchNormTraining` without - // computing `mean` and `variance` for each batch inside the operation. It - // uses the input `mean` and `variance` instead as estimated values. The - // purpose of this op is to reduce latency in inference, hence the name - // `BatchNormInference`. - // - // The output has the same shape as `operand`, and contains the normalized - // values for each batch. - XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, - const XlaOp& offset, const XlaOp& mean, - const XlaOp& variance, float epsilon, - int64 feature_index); - - // Calculates the gradients of a batch norm op. - // - // The inputs `batch_mean` and `batch_var` represent the mean and variance - // across the batch. - // - // Returns a tuple of three elements: - // - grad_operand: Gradient with respect to input `operand` - // - grad_offset: Gradient with respect to input `offset` - // - grad_scale: Gradient with respect to input `scale` - XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, - const XlaOp& batch_mean, const XlaOp& batch_var, - const XlaOp& grad_output, float epsilon, - int64 feature_index); - - // Returns a new XlaBuilder whose resultant Computation is used only by this - // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error - // behavior as the parent. - std::unique_ptr CreateSubBuilder(const string& computation_name); - - // Builds the computation with the requested operations, or returns a non-ok - // status. 
Note that all ops that have been enqueued will be moved to the - // computation being returned. - StatusOr Build(); - - // Builds the computation with the requested operations, or notes an error in - // the parent XlaBuilder and returns an empty computation if building failed. - // This function is intended to be used where the returned XlaComputation is - // only used by the parent XlaBuilder and hence further operation on the - // returned XlaComputation will simply be error'ed out if an error occurred - // while building this computation. If the built computation is to be used by - // a XlaBuilder other than the parent XlaBuilder then Build() should be used - // instead. - XlaComputation BuildAndNoteError(); - - // Returns a subgraph that roots on the given root. If the root is not a - // compile-time constant (see `IsConstant`), returns an error. - // - // This will copy the needed ops/computations to the subgraph. - StatusOr BuildConstantSubGraph(const XlaOp& root_op) const; - - // Returns the first error that was encountered while building the - // computation. When an error is encountered, by default we return a vacuous - // XlaOp and inform the user of the error that occurred while - // building the computation when they make a final call to Build(). - // - // See also set_die_immediately_on_error(). - Status first_error() const { return first_error_; } - - // Returns the shape of the given op. - StatusOr GetShape(const XlaOp& op) const; - - // Returns the (inferred) result for the current computation's shape. 
- StatusOr GetProgramShape() const; - - private: - StatusOr AddInstruction( - HloInstructionProto&& instr, HloOpcode opcode, - tensorflow::gtl::ArraySlice operands = {}); - - void AddCalledComputation(const XlaComputation& computation, - HloInstructionProto* instr); - - // Notes that the error occurred by: - // * storing it internally and capturing a backtrace if it's the first error - // (this deferred value will be produced on the call to Build()) - // * dying if die_immediately_on_error_ is true - void NoteError(const Status& error); - - XlaOp NoteErrorOrReturn(const std::function()>& op_creator); - - // Helper method that creates an empty op and notes error. - XlaOp UnimplementedOp(); - - StatusOr LookUpInstruction(const XlaOp& op) const; - - // Internal helper method that does the building for an arbitrary unary op. - XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand); - - // Internal helper method that does the building for an arbitrary binary op. - // broadcast_dimensions specifies which dimensions to use for broadcasting - // when the operation is between tensors of different ranks. - XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions); - - // Internal helper method that does the building for an arbitrary ternary op. - XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, - const XlaOp& ehs); - - XlaOp RngOp(RandomDistribution distribution, - tensorflow::gtl::ArraySlice parameters, - const Shape& shape); - - StatusOr InDimBroadcast( - const Shape& shape, const XlaOp& operand, - tensorflow::gtl::ArraySlice broadcast_dimensions); - - // Internal helper method that creates a sequence of instructions that - // performs an explicit broadcast of the operand to the target shape. - StatusOr AddBroadcastSequence(const Shape& output_shape, - const XlaOp& operand); - - // Internal helper method for creating a Reshape op with the already inferred - // shape. 
- StatusOr Reshape(const Shape& shape, const XlaOp& operand); - - // Returns the (inferred) result for the program shape for the current - // computation and fills the root_id in the pointer. - StatusOr GetProgramShape(int64* root_id) const; - - // Returns shapes for the operands. - StatusOr> GetOperandShapes( - tensorflow::gtl::ArraySlice operands) const; - - // A visitor which checks whether an operation is a compile-time constant, - // meaning that it doesn't depend on any parameters, or on any stateful - // operation such as `RngNormal` or `Infeed`. The visitor walks the - // computation starting at a given operation and sets is_constant to false iff - // a parameter or stateful operation is encountered. - void IsConstantVisitor(const int64 op_handle, std::set* visited, - bool* is_constant) const; - - // Checks bounds for convolution parameters. - Status VerifyConvolution( - const Shape& lhs_shape, const Shape& rhs_shape, - const ConvolutionDimensionNumbers& dimension_numbers) const; - - // Helper function for creating a Window proto from user-supplied data. - // Returns error if the user-supplied data was invalid. - StatusOr MakeWindow( - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation) const; - - string name_; // Name to use for the built computation. - - // The first error encountered while building the computation. - // This is OK until the first error is encountered. - Status first_error_; - - // The saved stack trace from the point at which the first error occurred. - tensorflow::SavedStackTrace first_error_backtrace_; - - // The instructions of this computation. - std::vector instructions_; - - // The embedded computations used by this computation. Each computation was - // the entry computation of some XlaComputation, the key is the unique id of - // that XlaComputation. 
- std::map embedded_; - - // The unique parameter numbers. - tensorflow::gtl::FlatSet parameter_numbers_; - - // The metadata to attach to each op. This is structured as a "modal"-like - // operation, in order to simplify client code (and not sprinkle this metadata - // throughout the TensorFlow op kernel implementations). - OpMetadata metadata_; - - // Sharding for this operator. This is structured as a "model"-like operation, - // in order to simplify client code, similar to metadata_. - tensorflow::gtl::optional sharding_; - - // Mode bit that indicates whether to die when a first error is encountered. - bool die_immediately_on_error_ = false; - - XlaBuilder* parent_builder_{nullptr}; -}; - -template -XlaOp XlaBuilder::ConstantR0(NativeT value) { - return ConstantLiteral(*Literal::CreateR0(value)); -} - -template -XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice values) { - return ConstantLiteral(*Literal::CreateR1(values)); -} - -template -XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) { - Literal literal(ShapeUtil::MakeShape( - primitive_util::NativeToPrimitiveType(), {length})); - literal.PopulateWithValue(value); - return ConstantLiteral(literal); -} - -inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) { - return ConstantLiteral(*Literal::CreateR1(values)); -} - -template -XlaOp XlaBuilder::ConstantR2( - std::initializer_list> values) { - return ConstantLiteral(*Literal::CreateR2(values)); -} - -template -XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array& values, - const Layout& layout) { - return ConstantLiteral( - *Literal::CreateFromArrayWithLayout(values, layout)); -} - -template -XlaOp XlaBuilder::ConstantFromArray(const Array& values) { - return ConstantLiteral(*Literal::CreateFromArray(values)); -} - -template -XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout( - const Array2D& values, const Layout& layout) { - return ConstantLiteral( - *Literal::CreateFromArrayWithLayout(values, layout)); -} - 
-template -XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D& values) { - return ConstantLiteral(*Literal::CreateR2FromArray2D(values)); -} - -template -XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout( - const Array3D& values, const Layout& layout) { - return ConstantLiteral( - *Literal::CreateR3FromArray3DWithLayout(values, layout)); -} - -template -XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D& values) { - return ConstantFromArray(values); -} - -template -XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout( - const Array4D& values, const Layout& layout) { - return ConstantFromArrayWithLayout(values, layout); -} - -template -XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D& values) { - return ConstantFromArray(values); -} - -// RAII-style object: sets the current sharding assignment in builder on -// construction, and sets back to the previous assignment on destruction. -class XlaScopedShardingAssignment { - public: - XlaScopedShardingAssignment(xla::XlaBuilder* builder, - tensorflow::gtl::optional sharding) - : builder_(builder), prev_sharding_(builder->sharding()) { - SetSharding(sharding); - } - - XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete; - XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) = - delete; - - ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); } - - private: - void SetSharding(const tensorflow::gtl::optional& sharding) { - if (sharding.has_value()) { - builder_->SetSharding(sharding.value()); - } else { - builder_->ClearSharding(); - } - } - - xla::XlaBuilder* const builder_; - tensorflow::gtl::optional prev_sharding_; -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc deleted file mode 100644 index 2df3ea3af0d4fcfb9bc803feebd96f09042ab1f3..0000000000000000000000000000000000000000 --- 
a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc +++ /dev/null @@ -1,239 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" - -#include - -#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" - -namespace xla { - -namespace { - -namespace op = xla::testing::opcode_matchers; - -using ::testing::HasSubstr; - -// TODO(b/74197823): Move the tests to service/. -class XlaBuilderTest : public ::testing::Test { - protected: - StatusOr> BuildHloModule(XlaBuilder* b) { - TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build()); - const HloModuleProto& proto = computation.proto(); - TF_ASSIGN_OR_RETURN(const auto& config, - HloModule::CreateModuleConfigFromProto( - proto, legacy_flags::GetDebugOptionsFromFlags())); - return HloModule::CreateFromProto(proto, config); - } - - // Returns the name of the test currently being run. 
- string TestName() const { - return ::testing::UnitTest::GetInstance()->current_test_info()->name(); - } -}; - -TEST_F(XlaBuilderTest, OnePlusTwo) { - XlaBuilder b(TestName()); - b.Add(b.ConstantR0(1.0), b.ConstantR0(2.0)); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Constant(), op::Constant())); -} - -TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {3, 5}), "x"); - b.Add(x, b.ConstantR0(1.0)); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(), op::Broadcast(op::Constant()))); -} - -TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { - XlaBuilder b(TestName()); - const auto& x_shape = ShapeUtil::MakeShape(S32, {2, 4, 6}); - const auto& y_shape = ShapeUtil::MakeShape(S32, {2, 4}); - auto x = b.Parameter(0, x_shape, "x"); - auto y = b.Parameter(1, y_shape, "y"); - auto add = b.Add(x, y, /*broadcast_dimensions=*/{0, 1}); - - TF_ASSERT_OK_AND_ASSIGN(auto add_shape, b.GetShape(add)); - EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape)); - - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), op::Broadcast(op::Parameter(1)))); -} - -TEST_F(XlaBuilderTest, XPlusX) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(S32, {1, 3, 5, 7}), "x"); - b.Add(x, x); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), op::Parameter(0))); -} - -TEST_F(XlaBuilderTest, ShapeInferenceError) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(U32, {2, 4, 6}), "x"); - auto y = b.Parameter(1, 
ShapeUtil::MakeShape(U32, {2, 4}), "y"); - b.Add(x, y); - auto statusor = BuildHloModule(&b); - ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), HasSubstr("shape inference")); -} - -TEST_F(XlaBuilderTest, ParameterAlreadyRegistered) { - XlaBuilder b_call("add"); - b_call.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x"); - - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x"); - auto y = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "y"); - b.Add(x, y); - auto statusor = BuildHloModule(&b); - ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), - HasSubstr("parameter 0 already registered")); -} - -TEST_F(XlaBuilderTest, Call) { - XlaBuilder b_call("the_only_to_apply"); - auto p0 = b_call.Parameter(0, ShapeUtil::MakeShape(F32, {}), "p0"); - auto p1 = b_call.Parameter(1, ShapeUtil::MakeShape(F32, {}), "p1"); - b_call.Add(p0, p1); - TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build()); - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - auto one = b.ConstantR0(1); - auto two = b.ConstantR0(2); - b.Add(b.Call(call, {x, y}), b.Call(call, {one, two})); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Call(op::Parameter(), op::Parameter()), - op::Call(op::Constant(), op::Constant()))); -} - -TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {1, 2, 3}), "x"); - auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {1, 2, 1}), "y"); - b.Add(x, y); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - - // Expected: - // - // x: f32[1,2,3] y: f32[1,2,1] - // | | - // | reshape: f32[1,2] - // | | - // | broadcast: f32[1,2,3] - // \ / - // add - auto root = 
module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), - op::Broadcast(op::Reshape(op::Parameter(1))))); -} - -TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3}), "x"); - auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {2, 1, 4}), "y"); - b.Add(x, y, /*broadcast_dimensions=*/{0, 1}); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - - // The binary operation has in-dim broadcast and degenerate broadcast, should - // first do the in-dim broadcast then convert the degnerate broadcast into a - // reshape and a broadcast. - // - // Expected: - // - // x: f32[2,3] y: f32[2,1,4] - // | | - // broadcast: f32[2,3,4] reshape: f32[2,4] - // | | - // | broadcast: f32[2,3,4] - // \ / - // add - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Broadcast(op::Parameter(0)), - op::Broadcast(op::Reshape(op::Parameter(1))))); -} - -TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { - XlaBuilder b1("b1"); - auto p0 = b1.Parameter(0, ShapeUtil::MakeShape(F32, {}), "p0"); - XlaBuilder builder("main"); - builder.Add(p0, p0); - auto statusor = builder.Build(); - ASSERT_FALSE(statusor.ok()); - EXPECT_THAT( - statusor.status().error_message(), - HasSubstr( - "built by builder 'b1', but is trying to use it in builder 'main'")); -} - -TEST_F(XlaBuilderTest, ReshapeDefaultOrder) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x"); - b.Reshape(x, /*new_sizes=*/{6, 35}); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Parameter())); -} - -TEST_F(XlaBuilderTest, ReshapeHasTranspose) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x"); - b.Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 
35}); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Transpose(op::Parameter()))); -} - -TEST_F(XlaBuilderTest, Transpose) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); - b.Transpose(x, /*permutation=*/{1, 0}); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Transpose(op::Parameter())); -} - -// TODO(b/65209188): Create a dedicated lowering for Xor. -TEST_F(XlaBuilderTest, Xor) { - XlaBuilder b(TestName()); - auto x = b.Parameter(0, ShapeUtil::MakeShape(PRED, {}), "x"); - auto y = b.Parameter(1, ShapeUtil::MakeShape(PRED, {}), "y"); - b.Xor(x, y); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - LOG(ERROR) << module->ToString(); - EXPECT_THAT(root, - op::Or(op::And(op::Not(op::Parameter(0)), op::Parameter(1)), - op::And(op::Parameter(0), op::Not(op::Parameter(1))))); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc similarity index 88% rename from tensorflow/compiler/xla/client/xla_client/xla_computation.cc rename to tensorflow/compiler/xla/client/xla_computation.cc index 72e3935696e0c44ae3893fc8f1ceb261fa5e2646..22c9e83bb2ae9e3e205bdd480b64c703e31c6ffd 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_computation.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -32,7 +32,7 @@ StatusOr> XlaComputation::Snapshot() const { if (IsNull()) { return InvalidArgument("Computation is invalid."); } - auto session = MakeUnique(); + auto session = absl::make_unique(); *session->mutable_hlo()->mutable_hlo_module() = proto_; return std::move(session); } diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h similarity index 90% rename from tensorflow/compiler/xla/client/xla_client/xla_computation.h rename to tensorflow/compiler/xla/client/xla_computation.h index 0ffba208b1f8683fe1d26107cbfd096b856267f1..71598ef8b296a760b0ee818fce0a59aed5cfc6b4 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_ +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_ #include @@ -64,4 +64,4 @@ class XlaComputation { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_ +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_ diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h index 1a51fdee680721a4a03fa5de79a81746d92af76b..6d51126d882f87a84b054e9db599b995868824bf 100644 --- a/tensorflow/compiler/xla/device_util.h +++ b/tensorflow/compiler/xla/device_util.h @@ -21,8 +21,8 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -30,8 +30,8 @@ namespace xla { // Returns a string that represents the device in terms of platform and ordinal; // e.g. the first CUDA device will be "cuda:0" string DeviceIdentifier(se::StreamExecutor* stream_exec) { - return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":", - stream_exec->device_ordinal()); + return absl::StrCat(stream_exec->platform()->Name(), ":", + stream_exec->device_ordinal()); } } // namespace xla diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/BUILD b/tensorflow/compiler/xla/experimental/xla_sharding/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..a26b20c861846501c911253d89619591c37322b3 --- /dev/null +++ b/tensorflow/compiler/xla/experimental/xla_sharding/BUILD @@ -0,0 +1,18 @@ +# Description: +# Python API for shardings in XLA. 
+ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +py_library( + name = "xla_sharding", + srcs = ["xla_sharding.py"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto_py", + "//tensorflow/compiler/xla/python_api:types", + "//tensorflow/compiler/xla/python_api:xla_shape", + "//third_party/py/numpy", + ], +) diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py new file mode 100644 index 0000000000000000000000000000000000000000..fb135f5ceda67ce6c001de15b8f3f084ca164826 --- /dev/null +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -0,0 +1,204 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ====================================== +"""Experimental support for defining XLA shardings.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import numpy as _np # Avoids becoming a part of public Tensorflow API. + +from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.compiler.xla.python_api import xla_shape +from tensorflow.core.framework import attr_value_pb2 + + +class Sharding(object): + """A class to support adding sharding attributes to Ops. 
+ + Use the factory constructors and then call apply_to_tensor: + Sharding.replicate().apply_to_tensor(tensor) + """ + + def __init__(self, proto=None): + """Do not use this constructor; use the factory functions below.""" + self._proto = proto + + @classmethod + def replicate(cls): + """Returns a replicated sharding attribute. + + This causes an op to be computed in its entirety independently on all + cores in the XLA device. + """ + return Sharding( + proto=xla_data_pb2.OpSharding(type=xla_data_pb2.OpSharding.REPLICATED)) + + @classmethod + def assign_device(cls, core): + """Returns an AssignDevice sharding attribute. + + This causes an op to be computed in its entirety only on one core in + the XLA device. + Args: + core: The core to assign this Op to. + """ + return Sharding( + proto=xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.MAXIMAL, + tile_assignment_dimensions=[1], + tile_assignment_devices=[core])) + + @classmethod + def tile(cls, tile_shape, tile_assignment): + """Returns a Tiled sharding attribute. + + This causes an op to be partially computed on multiple cores in the + XLA device. + + Args: + tile_shape: A xla_shape.Shape describing the tile shape that each core + will compute. + The tile shape does not need to be divisible by the tile assignment. + tile_assignment: An np.ndarray describing the topology of the tiling and + which device will compute which part of the topology. + + Raises: + TypeError: tile_assignment was not of np.array type or tile_shape was + not of xla_shape.Shape type. + + TODO(jmolloy): This concept is nefarious and is not + something we really want to expose to users (especially as the + contract for tile_assignment is very strict). 
+ """ + if not isinstance(tile_assignment, _np.ndarray): + raise TypeError('Tile assignment must be of type np.ndarray') + if not isinstance(tile_shape, xla_shape.Shape): + raise TypeError('Tile shape must be of type xla_shape.Shape') + dims = list(tile_assignment.shape) + flattened_devices = tile_assignment.reshape(-1, order='C') + return Sharding( + proto=xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.OTHER, + tile_shape=tile_shape.message, + tile_assignment_dimensions=dims, + tile_assignment_devices=list(flattened_devices))) + + @classmethod + def split(cls, tensor, split_dimension, num_devices): + """Returns a Sharding that splits a tensor across a dimension. + + This creates a Tiled attribute, similar to tile(), but easier to use for the + common case of tiling a tensor N ways in one dimension. + + Args: + tensor: A tf.Tensor to split. + split_dimension: The dimension number to split. + num_devices: The number of cores to split `tensor` over. + + Raises: + ValueError: The tensor to split was smaller in the split dimension than + the number of devices to split over. 
+ """ + tensor.shape.assert_is_fully_defined() + shape = tensor.shape.as_list() + if shape[split_dimension] < num_devices: + raise ValueError('Split dimension was smaller than the required number ' + 'of splits: shape=%r, dimension=%r, num_devices=%r', + shape, split_dimension, num_devices) + + tile_shape = shape + tile_shape[split_dimension] = int( + math.ceil(tile_shape[split_dimension] / num_devices)) + tile_shape_proto = xla_data_pb2.Shape( + element_type=xla_data_pb2.F32, dimensions=tile_shape) + + tile_assignment_dims = [1] * len(shape) + tile_assignment_dims[split_dimension] = num_devices + + return Sharding( + proto=xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.OTHER, + tile_shape=tile_shape_proto, + tile_assignment_dimensions=tile_assignment_dims, + tile_assignment_devices=range(num_devices))) + + def apply_to_tensor(self, tensor): + """Applies this Sharding attribute to `tensor`.""" + if len(tensor.op.outputs) > 1: + proto = self._get_or_create_tuple_proto(tensor.op) + # We can't mutate an element of old_proto.tuple_shardings, so create + # a new proto. + tuple_shardings = list(proto.tuple_shardings) + tuple_shardings[tensor.value_index] = self._proto + proto = xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings) + else: + proto = self._proto + + attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString()) + # TODO(jmolloy): This need to be seriously revisited before declaring this + # API available for public use. 
+ # pylint: disable=protected-access + tensor.op._set_attr('_XlaSharding', attr_value) + + @property + def proto(self): + """Return the sharding protobuf of type xla_data_pb2.OpSharding.""" + return self._proto + + def _get_or_create_tuple_proto(self, op): + try: + attr = op.get_attr('_XlaSharding') + proto = xla_data_pb2.OpSharding() + proto.ParseFromString(attr) + return proto + except ValueError: + return self._create_tuple_proto(op) + + def _create_tuple_proto(self, op): + shardings = [ + xla_data_pb2.OpSharding(type=xla_data_pb2.OpSharding.REPLICATED) + for _ in op.outputs + ] + return xla_data_pb2.OpSharding( + type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=shardings) + + +# Helpers for the above factory functions that allow easy application of +# shardings, for example: +# tensor = xla_sharding.replicate(tensor) + + +def replicate(tensor): + Sharding.replicate().apply_to_tensor(tensor) + return tensor + + +def assign_device(tensor, device): + Sharding.assign_device(device).apply_to_tensor(tensor) + return tensor + + +def tile(tensor, tile_shape, tile_assignment): + Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor) + return tensor + + +def split(tensor, split_dimension, num_devices): + Sharding.split(tensor, split_dimension, num_devices).apply_to_tensor(tensor) + return tensor diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc index ffd1fb79e986f82e1c2721f0eefbf3b4c0838e41..3fadabcf5207097aa875d654320b930b1ed94ad3 100644 --- a/tensorflow/compiler/xla/index_util.cc +++ b/tensorflow/compiler/xla/index_util.cc @@ -18,16 +18,16 @@ limitations under the License. 
#include #include +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" namespace xla { /* static */ int64 IndexUtil::MultidimensionalIndexToLinearIndex( - const Shape& shape, tensorflow::gtl::ArraySlice multi_index) { + const Shape& shape, absl::Span multi_index) { DCHECK_EQ(shape.dimensions_size(), multi_index.size()); // Padding and nested layouts not supported yet. DCHECK_EQ(0, shape.layout().padded_dimensions_size()); @@ -36,7 +36,7 @@ namespace xla { DCHECK_GE(multi_index[i], 0); DCHECK_LT(multi_index[i], shape.dimensions(i)) << "indexing beyond extent in dimension " << i << ":" - << "\n\tindex: " << tensorflow::str_util::Join(multi_index, ",") + << "\n\tindex: " << absl::StrJoin(multi_index, ",") << "\n\tshape: " << ShapeUtil::HumanString(shape); } @@ -118,8 +118,8 @@ namespace xla { return multi_index; } -/* static */ bool IndexUtil::BumpIndices( - const Shape& shape, tensorflow::gtl::MutableArraySlice indices) { +/* static */ bool IndexUtil::BumpIndices(const Shape& shape, + absl::Span indices) { for (int64 dimno = indices.size() - 1; dimno >= 0; --dimno) { int64 limit = shape.dimensions(dimno); if (indices[dimno] + 1 < limit) { @@ -149,8 +149,8 @@ namespace xla { return stride; } -/* static */ bool IndexUtil::IndexInBounds( - const Shape& shape, tensorflow::gtl::ArraySlice index) { +/* static */ bool IndexUtil::IndexInBounds(const Shape& shape, + absl::Span index) { int64 rank = ShapeUtil::Rank(shape); if (rank != index.size()) { return false; @@ -163,9 +163,8 @@ namespace xla { return true; } -/* static */ int IndexUtil::CompareIndices( - tensorflow::gtl::ArraySlice lhs, - tensorflow::gtl::ArraySlice rhs) { +/* static */ int IndexUtil::CompareIndices(absl::Span lhs, + absl::Span rhs) { int64 rank = lhs.size(); CHECK_EQ(rhs.size(), rank); 
for (int64 dim = 0; dim < rank; ++dim) { diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h index 142006f2626e83d3254f2de65fc28fd5d6694e53..2979cf87dde92893ce2151cb09b46c8db8473b31 100644 --- a/tensorflow/compiler/xla/index_util.h +++ b/tensorflow/compiler/xla/index_util.h @@ -20,9 +20,9 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -35,7 +35,7 @@ class IndexUtil { // on the shape and its layout. The first index in the multi_index is // dimension 0. static int64 MultidimensionalIndexToLinearIndex( - const Shape& shape, tensorflow::gtl::ArraySlice multi_index); + const Shape& shape, absl::Span multi_index); // Converts a linear index into multidimensional index (eg {x, y, z}) based on // the shape and its layout. The first index in the returned multidimensional @@ -58,8 +58,7 @@ class IndexUtil { // // Returns true iff the indices were successfully bumped; false if we've hit // the limit where it can no longer be bumped in-bounds. - static bool BumpIndices(const Shape& shape, - tensorflow::gtl::MutableArraySlice indices); + static bool BumpIndices(const Shape& shape, absl::Span indices); // Calculates the stride size (in number of elements, not byte size) of a // given logical shape dimension (from 0 to rank-1). If available, padded @@ -71,15 +70,14 @@ class IndexUtil { // Returns true iff the given multi-index is contained in the bounds for the // shape. - static bool IndexInBounds(const Shape& shape, - tensorflow::gtl::ArraySlice index); + static bool IndexInBounds(const Shape& shape, absl::Span index); // Compares the given indices in lexicographic order. lhs[0] and rhs[0] are // compared first, and lhs[rank-1] and rhs[rank-1] last. If lhs is larger, // then -1 is returned. 
If rhs is larger, then 1 is returned. Otherwise, 0 is // returned. - static int CompareIndices(tensorflow::gtl::ArraySlice lhs, - tensorflow::gtl::ArraySlice rhs); + static int CompareIndices(absl::Span lhs, + absl::Span rhs); private: TF_DISALLOW_COPY_AND_ASSIGN(IndexUtil); diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc index 7c4efdee484d9530a69b31cbe3a0d69a8a3cffa7..93522d2ca87a7eba8d3c7533785c54e63ce507b0 100644 --- a/tensorflow/compiler/xla/index_util_test.cc +++ b/tensorflow/compiler/xla/index_util_test.cc @@ -142,13 +142,13 @@ TEST(IndexUtilTest, LinearToMultiToLinear) { TEST(IndexUtilTest, BumpIndices2x2) { auto shape = ShapeUtil::MakeShape(S32, {2, 2}); std::vector indices = {0, 0}; - EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); + EXPECT_TRUE(IndexUtil::BumpIndices(shape, absl::MakeSpan(indices))); EXPECT_THAT(indices, ::testing::ElementsAre(0, 1)); - EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); + EXPECT_TRUE(IndexUtil::BumpIndices(shape, absl::MakeSpan(indices))); EXPECT_THAT(indices, ::testing::ElementsAre(1, 0)); - EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices)); + EXPECT_TRUE(IndexUtil::BumpIndices(shape, absl::MakeSpan(indices))); EXPECT_THAT(indices, ::testing::ElementsAre(1, 1)); - EXPECT_FALSE(IndexUtil::BumpIndices(shape, &indices)); + EXPECT_FALSE(IndexUtil::BumpIndices(shape, absl::MakeSpan(indices))); } } // namespace diff --git a/tensorflow/compiler/xla/iterator_util.h b/tensorflow/compiler/xla/iterator_util.h index a8bb8c7a7e6784e555f4e9dad73ecc78c668ac42..3a3ee21e7635b9dee61f59e4e8c69eec3d420c86 100644 --- a/tensorflow/compiler/xla/iterator_util.h +++ b/tensorflow/compiler/xla/iterator_util.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_ +#ifndef TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_ #include #include @@ -95,4 +95,4 @@ UnwrappingIterator MakeUnwrappingIterator(NestedIter iter) { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_ +#endif // TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_ diff --git a/tensorflow/compiler/xla/iterator_util_test.cc b/tensorflow/compiler/xla/iterator_util_test.cc index 7bc3189507ec5233c6983eb26cfb07dc9bfadd52..ec8b66df2db0b9d8c045fbf6133f607e57c81c26 100644 --- a/tensorflow/compiler/xla/iterator_util_test.cc +++ b/tensorflow/compiler/xla/iterator_util_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/test.h" namespace xla { @@ -27,7 +27,7 @@ namespace { TEST(UnwrappingIteratorTest, Simple) { std::vector> v; for (int i = 0; i < 3; ++i) { - v.push_back(MakeUnique(i)); + v.push_back(absl::make_unique(i)); } int i = 0; for (auto iter = MakeUnwrappingIterator(v.begin()); @@ -51,7 +51,7 @@ TEST(UnwrappingIteratorTest, PostincrementOperator) { TEST(UnwrappingIteratorTest, StdFind) { std::list> l; for (int i = 0; i < 3; ++i) { - l.push_back(MakeUnique(i)); + l.push_back(absl::make_unique(i)); } EXPECT_EQ(l.begin()->get(), *std::find(MakeUnwrappingIterator(l.begin()), diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 89cafa1a7dee97cbc10d17133a143a36f0f12ee1..d310335618ded7b581e6ed632223218585bb791f 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -23,6 +23,8 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -31,8 +33,6 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" @@ -56,7 +56,7 @@ void SetDefaultLayoutToContainer( } // namespace /* static */ Layout LayoutUtil::MakeLayout( - tensorflow::gtl::ArraySlice minor_to_major) { + absl::Span minor_to_major) { Layout layout; layout.set_format(DENSE); for (int64 dimension_number : minor_to_major) { @@ -66,7 +66,7 @@ void SetDefaultLayoutToContainer( } /* static */ Layout LayoutUtil::MakeLayoutFromMajorToMinor( - tensorflow::gtl::ArraySlice major_to_minor) { + absl::Span major_to_minor) { Layout layout; layout.set_format(DENSE); for (int i = major_to_minor.size() - 1; i >= 0; i--) { @@ -98,8 +98,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } // namespace /* static */ Layout LayoutUtil::GetDefaultLayoutForShape(const Shape& shape) { + if (ShapeUtil::IsOpaque(shape) || ShapeUtil::IsToken(shape)) { + // Opaque and token types have empty layouts. + return Layout(); + } + // A Layout proto corresponds to a single array, not a tuple. 
- DCHECK(!ShapeUtil::IsTuple(shape)); + CHECK(ShapeUtil::IsArray(shape)); return CreateDefaultLayoutForRank(shape.dimensions_size()); } @@ -126,14 +131,15 @@ Layout CreateDefaultLayoutForRank(int64 rank) { SetToDefaultLayout(&element_shape); } shape->clear_layout(); - } else if (ShapeUtil::IsOpaque(*shape)) { - shape->clear_layout(); - } else { + } else if (ShapeUtil::IsArray(*shape)) { shape->mutable_layout()->set_format(DENSE); tensorflow::protobuf::RepeatedField* minor_to_major = shape->mutable_layout()->mutable_minor_to_major(); minor_to_major->Resize(shape->dimensions_size(), 0); SetDefaultLayoutToContainer(minor_to_major); + } else { + // Opaque, token types etc. have no layout. + shape->clear_layout(); } } @@ -160,18 +166,20 @@ Layout CreateDefaultLayoutForRank(int64 rank) { TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape)); } return Status::OK(); - } else if (ShapeUtil::IsOpaque(shape)) { - if (shape.has_layout()) { - return InvalidArgument("opaque should not have a layout field"); - } - return Status::OK(); - } else { - // Array shape. + } else if (ShapeUtil::IsArray(shape)) { if (!shape.has_layout()) { return InvalidArgument("shape %s does not have a layout", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } return ValidateLayoutForShape(shape.layout(), shape); + } else { + // Token, opaque, etc. shape. 
+ if (shape.has_layout()) { + return InvalidArgument( + "shape of primitive type %s should not have a layout", + PrimitiveType_Name(shape.element_type())); + } + return Status::OK(); } } @@ -181,24 +189,30 @@ Layout CreateDefaultLayoutForRank(int64 rank) { return InvalidArgument("a single Layout is not valid for tuple shapes"); } - if (ShapeUtil::IsOpaque(shape)) { + if (!ShapeUtil::IsArray(shape)) { + if (layout.minor_to_major_size() != 0 || + layout.padded_dimensions_size() != 0) { + return InvalidArgument( + "shape of primitive type %s should not have a non-trivial layout", + PrimitiveType_Name(shape.element_type())); + } return Status::OK(); } if (layout.format() == INVALID_FORMAT) { return InvalidArgument( "Layout does not have a valid format: layout {%s}, shape {%s}", - layout.ShortDebugString().c_str(), shape.ShortDebugString().c_str()); + layout.ShortDebugString(), shape.ShortDebugString()); } if (layout.format() == DENSE) { if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) { return InvalidArgument( "layout minor_to_major field contains %d elements, " - "but shape is rank %lld: {%s}; shape: %s", + "but shape is rank %d: {%s}; shape: %s", layout.minor_to_major_size(), ShapeUtil::Rank(shape), - tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(), - shape.ShortDebugString().c_str()); + absl::StrJoin(layout.minor_to_major(), ", "), + shape.ShortDebugString()); } std::vector dimensions_in_layout(ShapeUtil::Rank(shape), false); @@ -207,12 +221,12 @@ Layout CreateDefaultLayoutForRank(int64 rank) { if (dim < 0 || dim >= ShapeUtil::Rank(shape)) { return InvalidArgument( "layout minor_to_major field has out-of-bounds value: %s", - HumanString(layout).c_str()); + HumanString(layout)); } if (dimensions_in_layout[dim]) { return InvalidArgument( "layout minor_to_major field has duplicate values: {%s}", - HumanString(layout).c_str()); + HumanString(layout)); } dimensions_in_layout[dim] = true; } @@ -220,20 +234,26 @@ Layout 
CreateDefaultLayoutForRank(int64 rank) { if (layout.padded_dimensions_size() > 0) { if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) { return InvalidArgument( - "layout has %d padded dimensions, but shape is rank %lld", + "layout has %d padded dimensions, but shape is rank %d", layout.padded_dimensions_size(), ShapeUtil::Rank(shape)); } for (int i = 0; i < layout.padded_dimensions_size(); ++i) { if (layout.padded_dimensions(i) < shape.dimensions(i)) { return InvalidArgument( - "for dimension %d, dimension padding (%lld) is smaller than " - "the dimension size (%lld) of the shape", + "for dimension %d, dimension padding (%d) is smaller than " + "the dimension size (%d) of the shape", i, layout.padded_dimensions(i), shape.dimensions(i)); } } } } + if (layout.format() == SPARSE) { + if (!layout.padded_dimensions().empty()) { + return InvalidArgument("Sparse layout has padded dimensions"); + } + } + return Status::OK(); } @@ -273,11 +293,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } /* static */ bool LayoutUtil::IsPadded(const Shape& shape) { - if (ShapeUtil::IsTuple(shape) || !HasLayout(shape) || + if (!ShapeUtil::IsArray(shape) || !HasLayout(shape) || shape.layout().padded_dimensions_size() == 0) { return false; } - CHECK(IsDenseArray(shape)); + CHECK(IsDenseArray(shape)) << shape.ShortDebugString(); CHECK_EQ(shape.dimensions_size(), shape.layout().padded_dimensions_size()); for (int64 i = 0; i < shape.dimensions_size(); ++i) { if (shape.layout().padded_dimensions(i) > shape.dimensions(i)) { @@ -287,7 +307,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { return false; } -/* static */ tensorflow::gtl::ArraySlice LayoutUtil::PaddedDimensions( +/* static */ absl::Span LayoutUtil::PaddedDimensions( const Shape& shape) { CHECK(IsDenseArray(shape)); return AsInt64Slice(shape.layout().padded_dimensions()); @@ -323,7 +343,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) { // Tuple shape: all subshapes must have a layout. 
return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(), [](const Shape& s) { return HasLayout(s); }); - } else if (ShapeUtil::IsOpaque(shape)) { + } else if (!ShapeUtil::IsArray(shape)) { + // Opaque, token types etc. ignore layout. return true; } return shape.has_layout() && shape.layout().format() != INVALID_FORMAT; @@ -342,13 +363,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) { return protobuf_util::ProtobufEquals(lhs, rhs); } -/* static */ tensorflow::gtl::ArraySlice LayoutUtil::MinorToMajor( +/* static */ absl::Span LayoutUtil::MinorToMajor( const Shape& shape) { CHECK(IsDenseArray(shape)); return AsInt64Slice(shape.layout().minor_to_major()); } -/* static */ tensorflow::gtl::ArraySlice LayoutUtil::MinorToMajor( +/* static */ absl::Span LayoutUtil::MinorToMajor( const Layout& layout) { CHECK(layout.format() == DENSE); return AsInt64Slice(layout.minor_to_major()); @@ -382,12 +403,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) { /* static */ string LayoutUtil::HumanString(const Layout& layout) { if (IsSparse(layout)) { - return tensorflow::strings::StrCat("sparse{", layout.max_sparse_elements(), - "}"); + return absl::StrCat("sparse{", layout.max_sparse_elements(), "}"); } CHECK(IsDense(layout)); - return tensorflow::strings::StrCat( - "{", tensorflow::str_util::Join(layout.minor_to_major(), ","), "}"); + return absl::StrCat("{", absl::StrJoin(layout.minor_to_major(), ","), "}"); } namespace { @@ -432,12 +451,9 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) { /* static */ bool LayoutUtil::LayoutsInShapesEqual(const Shape& lhs, const Shape& rhs) { - if (ShapeUtil::IsTuple(lhs) != ShapeUtil::IsTuple(rhs)) { - return false; - } if (ShapeUtil::IsTuple(lhs)) { - if (ShapeUtil::TupleElementCount(lhs) != - ShapeUtil::TupleElementCount(rhs)) { + if (!ShapeUtil::IsTuple(rhs) || ShapeUtil::TupleElementCount(lhs) != + ShapeUtil::TupleElementCount(rhs)) { return false; } for (int i = 0; i < 
ShapeUtil::TupleElementCount(lhs); ++i) { @@ -446,14 +462,17 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) { } } return true; - } else { + } else if (ShapeUtil::IsArray(lhs)) { return ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) && LayoutUtil::Equal(lhs.layout(), rhs.layout()); + } else { + // Layouts of non-array and non-tuple shapes is ignored. + return true; } } /* static */ bool LayoutUtil::AreDimensionsConsecutive( - const Layout& layout, tensorflow::gtl::ArraySlice dims) { + const Layout& layout, absl::Span dims) { CHECK(IsDense(layout)); std::vector positions_in_layout; for (int64 dim : dims) { diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 739bbe73675c7fb855627006028eafdf703d6540..b78883c2d870043032306637730c4666665125a8 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -20,10 +20,10 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -34,11 +34,11 @@ class LayoutUtil { public: // Creates a layout with the given minor-to-major dimension order. (This is a // convenience function for protobuf construction.) - static Layout MakeLayout(tensorflow::gtl::ArraySlice minor_to_major); + static Layout MakeLayout(absl::Span minor_to_major); // Similar to MakeLayout, but take indices in reverse order. static Layout MakeLayoutFromMajorToMinor( - tensorflow::gtl::ArraySlice major_to_minor); + absl::Span major_to_minor); // Creates a sparse layout with the given maximum number of elements. (This is // a convenience function for protobuf construction.) @@ -104,8 +104,7 @@ class LayoutUtil { // Returns the padded_dimensions array for the given Shape. 
Requires that the // shape is an array and has a dense layout. - static tensorflow::gtl::ArraySlice PaddedDimensions( - const Shape& shape); + static absl::Span PaddedDimensions(const Shape& shape); // Returns the given index of the padded_dimensions array for the given Shape. // Requires that the shape is an array and has a dense layout. @@ -138,8 +137,8 @@ class LayoutUtil { // Returns the minor_to_major array for the given Shape. Requires that the // shape is an array and has a dense layout. - static tensorflow::gtl::ArraySlice MinorToMajor(const Shape& shape); - static tensorflow::gtl::ArraySlice MinorToMajor(const Layout& layout); + static absl::Span MinorToMajor(const Shape& shape); + static absl::Span MinorToMajor(const Layout& layout); // Major(0) is the most major logical dimension number, Major(1) is the // second-most-major logical dimension number and so on. @@ -196,7 +195,7 @@ class LayoutUtil { // Returns whether the given dimensions are consecutive in the given layout, // not necessarily in the order given. static bool AreDimensionsConsecutive(const Layout& layout, - tensorflow::gtl::ArraySlice dims); + absl::Span dims); // Compute a hash for `layout`. 
static size_t Hash(const Layout& layout); diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc index 4fd1d818e3e3b417eee9f6b14bb598bfb9480c6e..f25dae6ff411133c74502039f441060f1329ffd4 100644 --- a/tensorflow/compiler/xla/layout_util_test.cc +++ b/tensorflow/compiler/xla/layout_util_test.cc @@ -27,15 +27,15 @@ namespace { class LayoutUtilTest : public ::testing::Test { protected: Shape MakeShapeWithLayout(PrimitiveType element_type, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice minor_to_major) { + absl::Span dimensions, + absl::Span minor_to_major) { Shape shape = ShapeUtil::MakeShape(element_type, dimensions); *shape.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major); return shape; } Shape MakeShapeWithSparseLayout(PrimitiveType element_type, - tensorflow::gtl::ArraySlice dimensions, + absl::Span dimensions, int64 max_sparse_elements) { Shape shape = ShapeUtil::MakeShape(element_type, dimensions); *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements); @@ -218,6 +218,47 @@ TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) { "elements, but shape is rank")); } +TEST_F(LayoutUtilTest, CopyTokenLayout) { + Shape src = ShapeUtil::MakeTokenShape(); + Shape dst = ShapeUtil::MakeTokenShape(); + + // Layouts are trivially the same for token types and copying layouts should + // be a nop. + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + +TEST_F(LayoutUtilTest, CopyOpaqueLayout) { + Shape src = ShapeUtil::MakeOpaqueShape(); + Shape dst = ShapeUtil::MakeOpaqueShape(); + + // Layouts are trivially the same for opaque types and copying layouts should + // be a nop. 
+ EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + +TEST_F(LayoutUtilTest, CopyTupleLayoutWithTokenAndOpaque) { + Shape src = ShapeUtil::MakeTupleShape( + {MakeShapeWithLayout(F32, {2, 3}, {0, 1}), + MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}), + MakeShapeWithLayout(F32, {1, 2, 3}, {0, 2, 1})})}); + Shape dst = ShapeUtil::MakeTupleShape( + {MakeShapeWithLayout(F32, {2, 3}, {1, 0}), + MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}), + MakeShapeWithLayout(F32, {1, 2, 3}, {1, 2, 0})})}); + + EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + TEST_F(LayoutUtilTest, ClearLayoutTuple) { Shape shape = ShapeUtil::MakeTupleShape( {MakeShapeWithLayout(F32, {2, 3}, {1, 0}), @@ -236,6 +277,16 @@ TEST_F(LayoutUtilTest, ClearLayoutTuple) { EXPECT_FALSE(shape.tuple_shapes(2).tuple_shapes(1).has_layout()); } +TEST_F(LayoutUtilTest, ClearLayoutOpaqueAndToken) { + // Opaque and token types trivially have layouts. 
+ for (Shape shape : + {ShapeUtil::MakeOpaqueShape(), ShapeUtil::MakeTokenShape()}) { + EXPECT_TRUE(LayoutUtil::HasLayout(shape)); + LayoutUtil::ClearLayout(&shape); + EXPECT_TRUE(LayoutUtil::HasLayout(shape)); + } +} + TEST_F(LayoutUtilTest, SetToDefaultLayoutTuple) { Shape shape = ShapeUtil::MakeTupleShape( {MakeShapeWithLayout(F32, {2, 3, 4}, {1, 0, 2}), diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD index 89353448e29ec3d97275dac288e23aa8e96e31b2..3e79129aafd234e5eab05d205f2017b54057795e 100644 --- a/tensorflow/compiler/xla/legacy_flags/BUILD +++ b/tensorflow/compiler/xla/legacy_flags/BUILD @@ -26,6 +26,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -39,6 +40,7 @@ tf_cc_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings:str_format", ], ) @@ -56,6 +58,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -73,5 +76,7 @@ tf_cc_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index f42fb92359f40ec763866af094972046f6407ae1..0d3136b0cc6a3a695eacb98c16200e46a144c571 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -17,9 +17,9 @@ limitations under the License. #include // NOLINT(build/c++11): only using std::call_once, not mutex. 
#include +#include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h" #include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace legacy_flags { @@ -31,7 +31,6 @@ std::vector* flag_objects; std::once_flag flags_init; void SetDebugOptionsDefaults(DebugOptions* flags) { - flags->set_xla_enable_fast_math(true); flags->set_xla_llvm_enable_alias_scope_metadata(true); flags->set_xla_llvm_enable_noalias_metadata(true); flags->set_xla_llvm_enable_invariant_load_metadata(true); @@ -53,6 +52,11 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { // the heuristics needed to decide when to run on multiple streams. See // b/77879207. flags->set_xla_gpu_disable_multi_streaming(true); + + // TODO(jlebar): Disable fastmath once doing so is not a performance + // regression. + flags->set_xla_cpu_enable_fast_math(true); + flags->set_xla_gpu_enable_fast_math(true); } // Allocates flag_values and flag_objects; this function must not be called more @@ -83,7 +87,7 @@ void AllocateFlags() { // Custom "sub-parser" lambda for xla_disable_hlo_passes. 
auto setter_for_xla_disable_hlo_passes = [](string comma_separated_values) { std::vector disabled_passes = - tensorflow::str_util::Split(comma_separated_values, ','); + absl::StrSplit(comma_separated_values, ','); for (const auto& passname : disabled_passes) { flag_values->add_xla_disable_hlo_passes(passname); } @@ -150,10 +154,16 @@ void AllocateFlags() { flag_values->mutable_xla_generate_hlo_text_to(), "Dump all HLO modules as text into the provided directory path."), tensorflow::Flag( - "xla_enable_fast_math", - bool_setter_for(&DebugOptions::set_xla_enable_fast_math), - flag_values->xla_enable_fast_math(), - "Enable unsafe fast-math optimizations in the compiler; " + "xla_cpu_enable_fast_math", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), + flag_values->xla_cpu_enable_fast_math(), + "Enable unsafe fast-math optimizations in the CPU compiler; " + "this may produce faster code at the expense of some accuracy."), + tensorflow::Flag( + "xla_gpu_enable_fast_math", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), + flag_values->xla_cpu_enable_fast_math(), + "Enable unsafe fast-math optimizations in the GPU compiler; " "this may produce faster code at the expense of some accuracy."), tensorflow::Flag( "xla_llvm_enable_alias_scope_metadata", @@ -306,6 +316,13 @@ void AllocateFlags() { bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn), flag_values->xla_cpu_use_mkl_dnn(), "Generate calls to MKL-DNN in the CPU backend."), + tensorflow::Flag( + "xla_gpu_crash_on_verification_failures", + bool_setter_for( + &DebugOptions::set_xla_gpu_crash_on_verification_failures), + flag_values->xla_gpu_crash_on_verification_failures(), + "Crashes the program on extra verification failures, e.g. 
cuDNN " + "cross checking failures"), }); ParseFlagsFromEnv(*flag_objects); } diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h index e9cf435d83d8345e974d83f8e5340dafeba8e3b2..ee7eb019c07cf898e48886955b18710146644cac 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h @@ -17,10 +17,10 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_ #include +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace legacy_flags { @@ -30,7 +30,7 @@ template void parse_xla_backend_extra_options(T* extra_options_map, string comma_separated_values) { std::vector extra_options_parts = - tensorflow::str_util::Split(comma_separated_values, ','); + absl::StrSplit(comma_separated_values, ','); // The flag contains a comma-separated list of options; some options // have arguments following "=", some don't. @@ -59,8 +59,7 @@ void parse_xla_backend_extra_options(T* extra_options_map, inline bool parse_xla_reduce_precision_option( HloReducePrecisionOptions* options, string option_string) { // Split off "LOCATION" from remainder of string. - std::vector eq_split = - tensorflow::str_util::Split(option_string, '='); + std::vector eq_split = absl::StrSplit(option_string, '='); if (eq_split.size() != 2) { return false; } @@ -80,26 +79,25 @@ inline bool parse_xla_reduce_precision_option( } // Split off "E,M" from remainder of string. 
- std::vector colon_split = - tensorflow::str_util::Split(eq_split[1], ':'); + std::vector colon_split = absl::StrSplit(eq_split[1], ':'); if (colon_split.size() != 2) { return false; } // Split E and M, and parse. std::vector bitsizes; - if (!tensorflow::str_util::SplitAndParseAsInts(colon_split[0], ',', - &bitsizes) || - bitsizes.size() != 2) { - return false; + for (const auto& s : absl::StrSplit(colon_split[0], ',')) { + bitsizes.emplace_back(); + if (!absl::SimpleAtoi(s, &bitsizes.back())) { + return false; + } } options->set_exponent_bits(bitsizes[0]); options->set_mantissa_bits(bitsizes[1]); // Split off OPS comma-separated list from remainder of string, if the // remainder exists. - std::vector semicolon_split = - tensorflow::str_util::Split(colon_split[1], ';'); + std::vector semicolon_split = absl::StrSplit(colon_split[1], ';'); if (semicolon_split.size() > 2) { return false; } @@ -113,8 +111,7 @@ inline bool parse_xla_reduce_precision_option( options->add_opcodes_to_suffix(i); } } else { - std::vector opcodes = - tensorflow::str_util::Split(opcode_string, ','); + std::vector opcodes = absl::StrSplit(opcode_string, ','); for (const string& opcode : opcodes) { bool found = false; for (int i = 0; i < HloOpcodeCount(); i++) { @@ -132,8 +129,7 @@ inline bool parse_xla_reduce_precision_option( // Process the NAMES string, if it exists. 
if (semicolon_split.size() == 2) { - std::vector opnames = - tensorflow::str_util::Split(semicolon_split[1], ','); + std::vector opnames = absl::StrSplit(semicolon_split[1], ','); for (const string& opname : opnames) { if (opname.length() > 0) { options->add_opname_substrings_to_suffix(opname); diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc index 0ed788a9676fe9b1bd06fb3ceabf627c108a2c70..6f197aec53c7596e84437a03affa9118f22f5a1d 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" namespace xla { diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc index 7b6ae311c1099dccb8dceb2f49743c1b185cd5ab..138c0c852e2bb0527d171f25b4d96cedc5671516 100644 --- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc +++ b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/subprocess.h" #include "tensorflow/core/platform/test.h" @@ -106,8 +106,8 @@ TEST(ParseFlagsFromEnv, File) { if (tmp_dir == nullptr) { tmp_dir = kTempDir; } - string tmp_file = tensorflow::strings::Printf("%s/parse_flags_from_env.%d", - tmp_dir, getpid()); + string tmp_file = + absl::StrFormat("%s/parse_flags_from_env.%d", tmp_dir, getpid()); FILE* fp = fopen(tmp_file.c_str(), "w"); CHECK_NE(fp, nullptr) << "can't write to " << tmp_file; for (int i = 0; kTestFlagString[i] != '\0'; i++) { diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f7635bd400c6ec87e0e3a739658272e906a72fb --- /dev/null +++ b/tensorflow/compiler/xla/literal.cc @@ -0,0 +1,2138 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/literal.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/index_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/casts.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using absl::StrCat; +using absl::StrFormat; + +constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; + +// Converts between little and big endian by swapping each adjacent byte pair. +// +// Precondition: size % 2 == 0 (elements in the array are 16 bits long) +void ConvertEndianShort(string* bytes) { + CHECK_EQ(bytes->size() % 2, 0); + for (int64 i = 0; i < bytes->size(); i += 2) { + std::swap((*bytes)[i], (*bytes)[i + 1]); + } +} + +void ConvertEndianShort(char* bytes, int64 size) { + CHECK_EQ(size % 2, 0); + for (int64 i = 0; i < size; i += 2) { + std::swap(bytes[i], bytes[i + 1]); + } +} + +} // namespace + +LiteralBase::~LiteralBase() {} + +std::ostream& operator<<(std::ostream& out, const Literal& literal) { + out << literal.ToString(); + return out; +} + +MutableLiteralBase::StrideConfig::StrideConfig( + const Shape& source_shape, const Shape& dest_shape, + absl::Span dimensions) + : dimensions(dimensions), + base(dimensions.size(), 0), + step(dimensions.size(), 1) { + if (!dimensions.empty()) { + // Selects the shape with the largest minor dimension as the one upon + // which to run the tight stride loop. 
+ if (dimensions[LayoutUtil::Minor(source_shape.layout(), 0)] >= + dimensions[LayoutUtil::Minor(dest_shape.layout(), 0)]) { + minor_dimension = LayoutUtil::Minor(source_shape.layout(), 0); + dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension); + } else { + minor_dimension = LayoutUtil::Minor(dest_shape.layout(), 0); + source_stride = + IndexUtil::GetDimensionStride(source_shape, minor_dimension); + } + minor_loop_size = dimensions[minor_dimension]; + step[minor_dimension] = minor_loop_size; + } +} + +Literal::Literal(const Shape& shape) + : Literal(shape, /*allocate_arrays=*/true) {} + +void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { + if (ShapeUtil::IsTuple(shape)) { + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); + + SetPiece(subshape, &child_piece, allocate_arrays); + + piece->emplace_back(std::move(child_piece)); + } + } else if (ShapeUtil::IsArray(shape)) { + if (allocate_arrays) { + if (LayoutUtil::IsSparseArray(shape)) { + // For sparse arrays, the buffer must be of the size of the maximum + // number of sparse elements possible. + const int64 max_sparse_elements = + LayoutUtil::MaxSparseElements(shape.layout()); + piece->set_buffer( + new char[max_sparse_elements * + ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]); + piece->set_sparse_indices( + new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape))); + } else { + piece->set_buffer(new char[piece->size_bytes()]); + } + } + } else { + // If the shape is neither an array nor tuple, then it must be + // zero-sized. Otherwise, some memory needs to be allocated for it. 
+ CHECK_EQ(piece->size_bytes(), 0); + } +} + +Literal::Literal(const Shape& shape, bool allocate_arrays) + : MutableLiteralBase() { + shape_ = absl::make_unique(shape); + CHECK(LayoutUtil::HasLayout(*shape_)); + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + CHECK(&root_piece_->subshape() == shape_.get()); + + SetPiece(*shape_, root_piece_, allocate_arrays); +} + +Literal::~Literal() { + if (root_piece_ != nullptr) { + DeallocateBuffers(); + delete root_piece_; + } +} + +void Literal::DeallocateBuffers() { + root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (piece->buffer() != nullptr) { + delete[] piece->buffer(); + delete piece->sparse_indices(); + } + }); +} + +Literal::Literal(Literal&& other) : MutableLiteralBase() { + *this = std::move(other); +} + +Literal& Literal::operator=(Literal&& other) { + DCHECK(&other.root_piece_->subshape() == other.shape_.get()); + using std::swap; + swap(shape_, other.shape_); + swap(root_piece_, other.root_piece_); + DCHECK(&root_piece_->subshape() == shape_.get()); + + return *this; +} + +std::unique_ptr LiteralBase::CreateFromShape(const Shape& shape) { + auto literal = absl::make_unique(shape); + literal->root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (ShapeUtil::IsArray(piece->subshape())) { + memset(piece->untyped_data(), 0, piece->size_bytes()); + } + }); + return literal; +} + +const SparseIndexArray* LiteralBase::sparse_indices( + const ShapeIndex& shape_index) const { + return piece(shape_index).sparse_indices(); +} + +SparseIndexArray* MutableLiteralBase::sparse_indices( + const ShapeIndex& shape_index) { + return piece(shape_index).sparse_indices(); +} + +template +Status MutableLiteralBase::CopySliceFromInternal( + const LiteralBase& src_literal, absl::Span src_base, + absl::Span dest_base, absl::Span copy_size) { + TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size()); + 
TF_RET_CHECK(ShapeUtil::Rank(shape()) == dest_base.size()); + + auto linear_index = [](const Shape& shape, + absl::Span multi_index) { + return IndexUtil::MultidimensionalIndexToLinearIndex(shape, multi_index); + }; + + if (ShapeUtil::Rank(src_literal.shape()) == 0 || + ShapeUtil::Rank(shape()) == 0) { + // If any of the two shapes are scalars, we can just call the StridedCopy() + // directly, and we know we will be copying only one value. + TF_RET_CHECK(copy_size.empty()); + StridedCopy(data(), linear_index(shape(), dest_base), 0, + src_literal.data(), + linear_index(src_literal.shape(), src_base), 0, 1); + } else if (!ShapeUtil::IsZeroElementArray(shape()) && + !ShapeUtil::IsZeroElementArray(src_literal.shape())) { + // Perform copy if neither src nor dest has dimensions with zero element, + // otherwise it's a no-op. + TF_RET_CHECK(src_base.size() == dest_base.size()); + TF_RET_CHECK(src_base.size() == copy_size.size()); + + // Scan the source from minor, stepping in copy size blocks, then within + // the index enumaration functor, do a strided copy advancing source index + // by one (walking through the minor dimension), and destination index by + // proper stride size at the matching dimension. + DimensionVector src_indexes(src_base.size(), 0); + DimensionVector dest_indexes(dest_base.size(), 0); + MutableLiteralBase::StrideConfig stride_config(src_literal.shape(), shape(), + copy_size); + + auto copy_proc = [&](absl::Span indexes) { + // Map from multi-dimensional index, to source index. + std::transform(indexes.begin(), indexes.end(), src_base.begin(), + src_indexes.begin(), std::plus()); + // Map from multi-dimensional index, to destination index. 
+ std::transform(indexes.begin(), indexes.end(), dest_base.begin(), + dest_indexes.begin(), std::plus()); + + int64 src_index = linear_index(src_literal.shape(), src_indexes); + int64 dest_index = linear_index(shape(), dest_indexes); + + // `this->` is needed to workaround MSVC bug: #16882 + StridedCopy(this->data(), dest_index, stride_config.dest_stride, + src_literal.data(), src_index, + stride_config.source_stride, stride_config.minor_loop_size); + return true; + }; + + ShapeUtil::ForEachIndex(src_literal.shape(), stride_config.base, + stride_config.dimensions, stride_config.step, + copy_proc); + } + return Status::OK(); +} + +Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal, + absl::Span src_index, + absl::Span dest_index) { + DCHECK_EQ(shape().element_type(), src_literal.shape().element_type()); + const int64 src_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex( + src_literal.shape(), src_index); + const int64 dest_linear_index = + IndexUtil::MultidimensionalIndexToLinearIndex(shape(), dest_index); + const int64 primitive_size = + ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); + + char* dest_address = + static_cast(untyped_data()) + dest_linear_index * primitive_size; + const char* source_address = + static_cast(src_literal.untyped_data()) + + src_linear_index * primitive_size; + if (dest_address != source_address) { + memcpy(dest_address, source_address, primitive_size); + } + return Status::OK(); +} + +/* static */ StatusOr> +MutableLiteralBase::CreateFromProto(const LiteralProto& proto) { + if (!proto.has_shape()) { + return InvalidArgument("LiteralProto has no shape"); + } + if (!LayoutUtil::HasLayout(proto.shape())) { + return InvalidArgument("LiteralProto has no layout"); + } + + auto literal = absl::make_unique(proto.shape()); + + TF_RETURN_IF_ERROR(literal->root_piece_->ForEachMutableSubpieceWithStatus( + [&](const ShapeIndex& index, Piece* piece) { + const LiteralProto* proto_element = &proto; + for 
(int64 i : index) { + CHECK(i < proto_element->tuple_literals_size()); + proto_element = &proto_element->tuple_literals(i); + } + + if (ShapeUtil::IsTuple(piece->subshape())) { + if (proto_element->tuple_literals_size() != + ShapeUtil::TupleElementCount(piece->subshape())) { + return InvalidArgument( + "Expected %d tuple elements in LiteralProto, has %d", + ShapeUtil::TupleElementCount(piece->subshape()), + proto_element->tuple_literals_size()); + } + return Status::OK(); + } + if (piece->subshape().element_type() == TOKEN) { + return Status::OK(); + } + + CHECK(ShapeUtil::IsArray(piece->subshape())); + TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element)); + + return Status::OK(); + })); + + return std::move(literal); +} + +std::vector Literal::DecomposeTuple() { + CHECK(ShapeUtil::IsTuple(shape())); + std::vector elements; + for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) { + elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}), + /*allocate_arrays=*/false)); + Literal& element = elements.back(); + element.root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* dest_piece) { + ShapeIndex src_index = {i}; + for (int64 j : index) { + src_index.push_back(j); + } + Piece& src_piece = piece(src_index); + + // Move the respective buffer and sparse indices over to the element + // Literal. + dest_piece->set_buffer(src_piece.buffer()); + src_piece.set_buffer(nullptr); + dest_piece->set_sparse_indices(src_piece.sparse_indices()); + src_piece.set_sparse_indices(nullptr); + }); + } + // Set this literal to be nil-shaped. + *this = Literal(); + return elements; +} + +namespace { + +// Copies the elements in 'src' to 'dest'. The shape and layout of the data in +// the array slices are indicated by dest_shape and src_shape respectively. 
+template +void CopyElementsBetween(absl::Span dest, + absl::Span src, const Shape& dest_shape, + const Shape& src_shape) { + CHECK(ShapeUtil::Compatible(dest_shape, src_shape)); + if (ShapeUtil::IsZeroElementArray(dest_shape)) { + return; + } + std::vector index(ShapeUtil::Rank(dest_shape)); + do { + dest[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, index)] = + src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)]; + } while (IndexUtil::BumpIndices(dest_shape, absl::MakeSpan(index))); +} + +} // namespace + +Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { + CHECK(subshape_ != nullptr); + CHECK(src.subshape_ != nullptr); + if (ShapeUtil::Equal(subshape(), src.subshape())) { + // If the layouts are equal it's faster just to memcpy. + memcpy(buffer(), src.buffer(), src.size_bytes()); + } else { + TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape())); + std::vector origin(ShapeUtil::Rank(subshape()), 0); + switch (subshape().element_type()) { +#define COPY_ELEMENTS(XLA_T, NATIVE_T) \ + case (XLA_T): \ + CopyElementsBetween(data(), src.data(), \ + subshape(), src.subshape()); \ + break; + COPY_ELEMENTS(U8, uint8); + COPY_ELEMENTS(U16, uint16); + COPY_ELEMENTS(U32, uint32); + COPY_ELEMENTS(U64, uint64); + COPY_ELEMENTS(S8, int8); + COPY_ELEMENTS(S16, int16); + COPY_ELEMENTS(S32, int32); + COPY_ELEMENTS(S64, int64); + COPY_ELEMENTS(F16, half); + COPY_ELEMENTS(BF16, bfloat16); + COPY_ELEMENTS(F32, float); + COPY_ELEMENTS(F64, double); + COPY_ELEMENTS(C64, complex64); + COPY_ELEMENTS(PRED, bool); +#undef COPY_ELEMENTS + default: + return Unimplemented( + "Copying a Literal object with element type %s is not implemented.", + PrimitiveType_Name(subshape().element_type())); + } + } + return Status::OK(); +} + +Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal, + const ShapeIndex& dest_shape_index, + const ShapeIndex& src_shape_index) { + const Shape& dest_subshape = + ShapeUtil::GetSubshape(shape(), 
dest_shape_index); + const Shape& src_subshape = + ShapeUtil::GetSubshape(src_literal.shape(), src_shape_index); + if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) { + return InvalidArgument( + "Destination subshape incompatible with source subshape: %s vs %s", + ShapeUtil::HumanString(dest_subshape), + ShapeUtil::HumanString(src_subshape)); + } + return root_piece_->ForEachMutableSubpieceWithStatus( + [&](const ShapeIndex& index, Piece* piece) { + if (!ShapeUtil::IsArray(piece->subshape())) { + return Status::OK(); + } + + // Determine if this index is in the part of this literal that we want + // to copy over from src_literal. + bool in_subtree_to_copy = true; + for (int i = 0; i < dest_shape_index.size(); ++i) { + if (index[i] != dest_shape_index[i]) { + in_subtree_to_copy = false; + break; + } + } + if (!in_subtree_to_copy) { + return Status::OK(); + } + // Construct the index of the corresponding piece in the source literal. + ShapeIndex src_piece_index = src_shape_index; + for (int64 i = dest_shape_index.size(); i < index.size(); ++i) { + src_piece_index.push_back(index[i]); + } + TF_RETURN_IF_ERROR(piece->CopyFrom(src_literal.piece(src_piece_index))); + return Status::OK(); + }); +} + +Status Literal::MoveFrom(Literal&& src_literal, + const ShapeIndex& dest_shape_index) { + const Shape& dest_subshape = + ShapeUtil::GetSubshape(shape(), dest_shape_index); + if (!ShapeUtil::Equal(dest_subshape, src_literal.shape())) { + return InvalidArgument( + "Destination subshape not equal to source shape: %s vs %s", + ShapeUtil::HumanString(dest_subshape), + ShapeUtil::HumanString(src_literal.shape())); + } + + src_literal.root_piece_->ForEachSubpiece( + [&](const ShapeIndex& src_index, const Piece& src_piece) { + if (!ShapeUtil::IsArray(src_piece.subshape())) { + return; + } + + ShapeIndex dest_index = dest_shape_index; + for (int64 i : src_index) { + dest_index.push_back(i); + } + Piece& dest_piece = piece(dest_index); + delete[] dest_piece.buffer(); + 
dest_piece.set_buffer(src_piece.buffer()); + delete dest_piece.sparse_indices(); + dest_piece.set_sparse_indices(src_piece.sparse_indices()); + }); + + src_literal.shape_ = absl::make_unique(ShapeUtil::MakeNil()); + delete src_literal.root_piece_; + src_literal.root_piece_ = new LiteralBase::Piece(); + src_literal.root_piece_->set_subshape(src_literal.shape_.get()); + + return Status::OK(); +} + +Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal, + absl::Span src_base, + absl::Span dest_base, + absl::Span copy_size) { + TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape()); + TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape())) + << ShapeUtil::HumanString(src_literal.shape()); + TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape())); + + switch (shape().element_type()) { + case U8: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case U16: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case U32: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case U64: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case S8: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case S16: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case S32: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case S64: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case F16: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case BF16: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case F32: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case F64: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + case C64: + return CopySliceFromInternal(src_literal, 
src_base, dest_base, + copy_size); + case PRED: + return CopySliceFromInternal(src_literal, src_base, dest_base, + copy_size); + default: + break; + } + return Unimplemented( + "Copying a slice from a Literal object with element type %d is not " + "implemented.", + shape().element_type()); +} + +void MutableLiteralBase::PopulateR1(const tensorflow::core::Bitmap& values) { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(ShapeUtil::Rank(shape()), 1); + CHECK_EQ(element_count(), values.bits()); + CHECK_EQ(shape().element_type(), PRED); + for (int64 i = 0; i < static_cast(values.bits()); ++i) { + Set({i}, values.get(i)); + } +} + +std::unique_ptr LiteralBase::Relayout( + const Layout& new_layout, const ShapeIndex& shape_index) const { + // Create new shape with 'new_layout' set at the given shape index. + Shape new_shape = shape(); + Shape* subshape = ShapeUtil::GetMutableSubshape(&new_shape, shape_index); + TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(new_layout, *subshape)); + *subshape->mutable_layout() = new_layout; + auto result = absl::make_unique(new_shape); + TF_CHECK_OK(result->CopyFrom(*this)); + return result; +} + +std::unique_ptr LiteralBase::Relayout( + const Shape& shape_with_layout) const { + CHECK(ShapeUtil::Compatible(shape_with_layout, shape())) + << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout) + << " not compatible with literal shape " + << ShapeUtil::HumanString(shape()); + std::unique_ptr result = CreateFromShape(shape_with_layout); + ShapeUtil::ForEachSubshape( + result->shape(), + [this, &result](const Shape& subshape, const ShapeIndex& index) { + if (ShapeUtil::IsArray(subshape)) { + TF_CHECK_OK(result->CopyFrom(*this, + /*dest_shape_index=*/index, + /*src_shape_index=*/index)); + } + }); + return result; +} + +StatusOr> LiteralBase::Broadcast( + const Shape& result_shape, absl::Span dimensions) const { + if (!ShapeUtil::IsArray(shape())) { + return InvalidArgument("Broadcast only supports arrays."); + } + + for 
(int64 i = 0; i < dimensions.size(); i++) { + TF_RET_CHECK(shape().dimensions(i) == + result_shape.dimensions(dimensions[i])); + } + + std::unique_ptr result = absl::make_unique(result_shape); + + // scratch_source_index is temporary storage space for the computed index into + // the input literal. We put it here to avoid allocating an std::vector in + // every iteration of ShapeUtil::ForEachIndex. + std::vector scratch_source_index(shape().dimensions_size()); + + char* dest_data = static_cast(result->untyped_data()); + const char* source_data = static_cast(untyped_data()); + const int64 primitive_size = + ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); + + ShapeUtil::ForEachIndex( + result_shape, [&](absl::Span output_index) { + for (int64 i = 0; i < dimensions.size(); ++i) { + scratch_source_index[i] = output_index[dimensions[i]]; + } + int64 dest_index = IndexUtil::MultidimensionalIndexToLinearIndex( + result_shape, output_index); + int64 source_index = IndexUtil::MultidimensionalIndexToLinearIndex( + shape(), scratch_source_index); + memcpy(dest_data + primitive_size * dest_index, + source_data + primitive_size * source_index, primitive_size); + return true; + }); + + return std::move(result); +} + +StatusOr> LiteralBase::Reshape( + absl::Span dimensions) const { + if (!ShapeUtil::IsArray(shape())) { + return InvalidArgument("Reshape does not support tuples."); + } + std::unique_ptr output; + if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) { + output = + Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape()))); + } else { + output = CloneToUnique(); + } + // Because the layout is monotonic, we can simply reuse the same sequence of + // values without changing their order. 
+ *output->mutable_shape_do_not_use() = + ShapeUtil::MakeShape(shape().element_type(), dimensions); + + int64 elements_before = ShapeUtil::ElementsIn(shape()); + int64 elements_after = ShapeUtil::ElementsIn(output->shape()); + if (elements_before != elements_after) { + return InvalidArgument( + "Shapes before and after Literal::Reshape have different numbers " + "of elements: %s vs %s.", + ShapeUtil::HumanString(shape()), + ShapeUtil::HumanString(output->shape())); + } + return std::move(output); +} + +std::unique_ptr LiteralBase::Transpose( + absl::Span permutation) const { + CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose"; + CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape()))) + << "Given permutation is not a permutation of dimension numbers"; + // To transpose the array, we just permute the dimensions and layout, and + // do a straight memory copy of the raw data set. + // This is considerably faster than iterating over every array element using + // the EachCell<>() and Set<>() APIs. + std::vector inverse_permutation = InversePermutation(permutation); + Shape permuted_shape = + ShapeUtil::PermuteDimensions(inverse_permutation, shape()); + // Replace the layout with one affine to this shape, such that a + // transpose operation can be performed by leaving the flat values + // representation intact. + // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation. + // The shape with affine layout resulting from that operation will be + // F32[8,11]{0,1}, since it leaves the original most minor (the 8 sized), the + // most minor. + // + // Essentially, given MinMaj(Di) the position of the Di dimension within the + // minor to major vector, and given T(Di) the index that the original Di + // dimension has within the transposed array, a layout is affine if + // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major + // vector of the affine layout. 
+ CHECK(LayoutUtil::IsDenseArray(permuted_shape)); + Layout* layout = permuted_shape.mutable_layout(); + layout->clear_minor_to_major(); + for (auto index : LayoutUtil::MinorToMajor(shape())) { + layout->add_minor_to_major(inverse_permutation[index]); + } + auto new_literal = absl::make_unique(permuted_shape); + DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal->shape()), + ShapeUtil::ByteSizeOf(shape())); + std::memcpy(new_literal->untyped_data(), untyped_data(), size_bytes()); + return new_literal; +} + +template +std::unique_ptr LiteralBase::SliceInternal( + const Shape& result_shape, absl::Span start_indices) const { + auto result_literal = absl::make_unique(result_shape); + DimensionVector new_indices(ShapeUtil::Rank(result_shape)); + result_literal->EachCell( + [&](absl::Span indices, NativeT /*value*/) { + for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { + new_indices[i] = indices[i] + start_indices[i]; + } + NativeT value = Get(new_indices); + result_literal->Set(indices, value); + }); + return result_literal; +} + +std::unique_ptr LiteralBase::Slice( + absl::Span start_indices, + absl::Span limit_indices) const { + CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice"; + + DimensionVector result_dimensions; + for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) { + CHECK_GE(start_indices[dnum], 0); + CHECK_LE(limit_indices[dnum], shape().dimensions(dnum)) + << "dnum = " << dnum; + int64 dimension = limit_indices[dnum] - start_indices[dnum]; + CHECK_GE(dimension, 0) << "dnum = " << dnum; + result_dimensions.push_back(dimension); + } + const auto result_shape = + ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions, + LayoutUtil::MinorToMajor(shape())); + switch (result_shape.element_type()) { + case F32: + return SliceInternal(result_shape, start_indices); + case BF16: + return SliceInternal(result_shape, start_indices); + case C64: + return SliceInternal(result_shape, start_indices); + case S32: + return 
SliceInternal(result_shape, start_indices); + case U32: + return SliceInternal(result_shape, start_indices); + default: + LOG(FATAL) << "not yet implemented: " + << PrimitiveType_Name(result_shape.element_type()); + } +} + +Literal LiteralBase::Clone() const { + Literal result(shape()); + TF_CHECK_OK(result.CopyFrom(*this)); + return result; +} + +std::unique_ptr LiteralBase::CloneToUnique() const { + auto result = absl::make_unique(shape()); + TF_CHECK_OK(result->CopyFrom(*this)); + return result; +} + +string LiteralBase::GetAsString(absl::Span multi_index, + const ShapeIndex& shape_index) const { + const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); + CHECK(LayoutUtil::IsDenseArray(subshape)); + switch (subshape.element_type()) { + case PRED: + return Get(multi_index, shape_index) ? "true" : "false"; + case S8: + return StrCat(Get(multi_index, shape_index)); + case S16: + return StrCat(Get(multi_index, shape_index)); + case S32: + return StrCat(Get(multi_index, shape_index)); + case S64: + return StrCat(Get(multi_index, shape_index)); + case U8: + return StrCat(Get(multi_index, shape_index)); + case U16: + return StrCat(Get(multi_index, shape_index)); + case U32: + return StrCat(Get(multi_index, shape_index)); + case U64: + return StrCat(Get(multi_index, shape_index)); + case F16: + return StrCat(static_cast(Get(multi_index, shape_index))); + case F32: + return StrCat(Get(multi_index, shape_index)); + case BF16: + return StrCat( + static_cast(Get(multi_index, shape_index))); + case F64: + return StrCat(Get(multi_index, shape_index)); + case C64: { + complex64 c = Get(multi_index, shape_index); + return StrCat("(", c.real(), ", ", c.imag(), ")"); + } + default: + LOG(FATAL) << PrimitiveType_Name(subshape.element_type()); + } +} + +string LiteralBase::GetSparseElementAsString( + int64 sparse_element_number, const ShapeIndex& shape_index) const { + const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); + 
CHECK(LayoutUtil::IsSparseArray(subshape)); + switch (subshape.element_type()) { + case PRED: + return GetSparseElement(sparse_element_number, shape_index) + ? "true" + : "false"; + case S8: + return StrCat(GetSparseElement(sparse_element_number, shape_index)); + case S16: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case S32: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case S64: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case U8: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case U16: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case U32: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case U64: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case F16: + return StrCat(static_cast( + GetSparseElement(sparse_element_number, shape_index))); + case F32: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case BF16: + return StrCat(static_cast( + GetSparseElement(sparse_element_number, shape_index))); + case F64: + return StrCat( + GetSparseElement(sparse_element_number, shape_index)); + case C64: { + complex64 c = + GetSparseElement(sparse_element_number, shape_index); + return StrCat("(", c.real(), ", ", c.imag(), ")"); + } + default: + LOG(FATAL) << "Invalid element type for sparse arrays: " + << PrimitiveType_Name(subshape.element_type()); + } +} + +StatusOr LiteralBase::GetIntegralAsS64( + absl::Span multi_index) const { + CHECK(LayoutUtil::IsDenseArray(shape())); + switch (shape().element_type()) { + case PRED: + return Get(multi_index); + case U8: + return Get(multi_index); + case S32: + return Get(multi_index); + case S64: + return Get(multi_index); + case U32: + return Get(multi_index); + case U64: + return Get(multi_index); + default: + return FailedPrecondition("Array element type is not integral: %s", + 
PrimitiveType_Name(shape().element_type())); + } +} + +size_t LiteralBase::Hash() const { + using tensorflow::Hash64; + using tensorflow::Hash64Combine; + + size_t hash_value = ShapeUtil::Hash(shape()); + + ShapeUtil::ForEachSubshape( + shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (!ShapeUtil::IsArray(subshape)) { + return; + } + + CHECK(LayoutUtil::IsDense(subshape.layout())); + hash_value = Hash64Combine( + hash_value, Hash64(static_cast(untyped_data(index)), + size_bytes(index))); + }); + + return hash_value; +} + +Status MutableLiteralBase::SetIntegralAsS64(absl::Span multi_index, + int64 value) { + CHECK(LayoutUtil::IsDenseArray(shape())); + switch (shape().element_type()) { + case PRED: + Set(multi_index, value); + break; + case U8: + Set(multi_index, value); + break; + case S32: + Set(multi_index, value); + break; + case S64: + Set(multi_index, value); + break; + case U32: + Set(multi_index, value); + break; + case U64: + Set(multi_index, value); + break; + default: + return FailedPrecondition("Array element type is not integral: %s", + PrimitiveType_Name(shape().element_type())); + } + return Status::OK(); +} + +absl::Span LiteralBase::GetSparseIndex( + int64 sparse_element_number, const ShapeIndex& shape_index) const { + const Piece& p = piece(shape_index); + CHECK_GE(sparse_element_number, 0); + CHECK_LT(sparse_element_number, p.sparse_indices()->index_count()); + return p.sparse_indices()->At(sparse_element_number); +} + +void MutableLiteralBase::SortSparseElements(const ShapeIndex& shape_index) { + piece(shape_index).SortSparseElements(); +} + +void LiteralBase::Piece::SortSparseElements() { + switch (subshape().element_type()) { + case PRED: + SortSparseElementsInternal(); + break; + case S8: + SortSparseElementsInternal(); + break; + case U8: + SortSparseElementsInternal(); + break; + case S16: + SortSparseElementsInternal(); + break; + case U16: + SortSparseElementsInternal(); + break; + case S32: + 
SortSparseElementsInternal(); + break; + case U32: + SortSparseElementsInternal(); + break; + case S64: + SortSparseElementsInternal(); + break; + case U64: + SortSparseElementsInternal(); + break; + case F32: + SortSparseElementsInternal(); + break; + case F64: + SortSparseElementsInternal(); + break; + case C64: + SortSparseElementsInternal(); + break; + case F16: + SortSparseElementsInternal(); + break; + case BF16: + SortSparseElementsInternal(); + break; + default: + LOG(FATAL) << "Element type not valid for sparse array: " + << PrimitiveType_Name(subshape().element_type()); + } +} + +template +void LiteralBase::Piece::SortSparseElementsInternal() { + CHECK(LayoutUtil::IsSparseArray(subshape())); + int64 num_elements = sparse_indices()->index_count(); + auto values = data(); + CHECK_LE(num_elements, values.size()); + sparse_indices()->SortWithValues( + absl::Span(values.data(), num_elements)); +} + +namespace { + +void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, + bool print_layout, std::vector* pieces) { + const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); + CHECK(LayoutUtil::HasLayout(literal.shape())); + CHECK(LayoutUtil::HasLayout(subshape)); + + auto shape_to_string = [print_layout](const Shape& shape) { + if (print_layout) { + return ShapeUtil::HumanStringWithLayout(shape); + } else { + return ShapeUtil::HumanString(shape); + } + }; + + // TODO(b/32894291): refactor this code to reduce code duplication. 
+ if (ShapeUtil::IsTuple(subshape)) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" (\n"); + std::vector tuple_pieces; + for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) { + ShapeIndex element_index = shape_index; + element_index.push_back(i); + std::vector element_pieces; + ToStringHelper(literal, element_index, print_layout, &element_pieces); + tuple_pieces.push_back(absl::StrJoin(element_pieces, "")); + } + pieces->push_back(absl::StrJoin(tuple_pieces, ",\n")); + pieces->push_back("\n)"); + return; + } + + if (ShapeUtil::IsToken(subshape)) { + pieces->push_back("token"); + return; + } + + if (LayoutUtil::IsSparseArray(subshape)) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back("{"); + int64 rank = ShapeUtil::Rank(subshape); + int64 num_elements = literal.sparse_element_count(); + for (int64 i = 0; i < num_elements; ++i) { + if (i > 0) { + pieces->push_back(", "); + } + if (rank == 1) { + pieces->push_back(StrCat(literal.GetSparseIndex(i)[0])); + pieces->push_back(": "); + } else { + pieces->push_back("["); + pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", ")); + pieces->push_back("]: "); + } + pieces->push_back(literal.GetSparseElementAsString(i)); + } + pieces->push_back("}"); + return; + } + + CHECK(LayoutUtil::IsDenseArray(subshape)); + + auto element_to_string = [&](absl::Span indices) -> string { + PrimitiveType element_type = subshape.element_type(); + if (element_type == PRED) { + // We display predicates in a densely packed form. + return literal.Get(indices, shape_index) ? "1" : "0"; + } + return ((!indices.empty() && indices.back() > 0) ? 
", " : "") + + literal.GetAsString(indices, shape_index); + }; + + if (ShapeUtil::Rank(subshape) == 0) { + pieces->push_back(literal.GetAsString({}, shape_index)); + } else if (ShapeUtil::Rank(subshape) == 1) { + pieces->push_back("{"); + for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { + pieces->push_back(element_to_string({i0})); + } + pieces->push_back("}"); + } else if (ShapeUtil::Rank(subshape) == 2) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" {\n"); + for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { + pieces->push_back(" { "); + for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { + pieces->push_back(element_to_string({i0, i1})); + } + pieces->push_back(" "); + pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n"); + } + pieces->push_back("}"); + } else if (ShapeUtil::Rank(subshape) == 3) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" {\n"); + for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { + pieces->push_back(i0 > 0 ? ",\n{" : "{"); + for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { + pieces->push_back(i1 > 0 ? ",\n { " : " { "); + for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { + pieces->push_back(element_to_string({i0, i1, i2})); + } + pieces->push_back(" }"); + } + pieces->push_back(" }"); + } + pieces->push_back("\n}"); + } else if (ShapeUtil::Rank(subshape) == 4) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" {\n"); + for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { + pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); + for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { + pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); + for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { + pieces->push_back(" {"); + for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { + pieces->push_back(element_to_string({i0, i1, i2, i3})); + } + pieces->push_back(i2 == subshape.dimensions(2) - 1 ? 
"}\n" : "},\n"); + } + pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" + : " },\n"); + } + pieces->push_back(i0 == subshape.dimensions(0) - 1 ? " }\n" : " },\n"); + } + pieces->push_back("}"); + } else if (ShapeUtil::Rank(subshape) == 5) { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" {\n"); + for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { + pieces->push_back(StrFormat(" { /*i0=%d*/\n", i0)); + for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { + pieces->push_back(StrFormat(" { /*i1=%d*/\n", i1)); + for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { + pieces->push_back(StrFormat(" { /*i2=%d*/\n", i2)); + for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { + pieces->push_back(" {"); + for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) { + pieces->push_back(element_to_string({i0, i1, i2, i3, i4})); + } + pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n" + : "},\n"); + } + pieces->push_back(i2 == subshape.dimensions(2) - 1 ? " }\n" + : " },\n"); + } + pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" + : " },\n"); + } + pieces->push_back(i0 == subshape.dimensions(0) - 1 ? 
" }\n" : " },\n"); + } + pieces->push_back("}"); + } else { + pieces->push_back(shape_to_string(subshape)); + pieces->push_back(" {"); + literal.EachCellAsString( + [&](absl::Span indices, const string& value) { + pieces->push_back(" "); + pieces->push_back(value); + }); + pieces->push_back("}"); + } +} + +} // namespace + +int64 LiteralBase::sparse_element_count() const { + CHECK(LayoutUtil::IsSparseArray(shape())); + return sparse_indices()->index_count(); +} + +string LiteralBase::ToString(bool print_layout) const { + std::vector pieces; + CHECK(LayoutUtil::HasLayout(this->shape())); + ToStringHelper(*this, {}, print_layout, &pieces); + return absl::StrJoin(pieces, ""); +} + +void LiteralBase::EachCellAsString( + const std::function indices, + const string& value)>& per_cell) const { + if (ShapeUtil::IsZeroElementArray(shape())) { + return; + } + std::vector indices = IndexUtil::LinearIndexToMultidimensionalIndex( + shape(), /*linear_index=*/0); + do { + per_cell(indices, GetAsString(indices)); + } while (IndexUtil::BumpIndices(shape(), absl::MakeSpan(indices))); +} + +namespace { +template +std::unique_ptr ConvertBetweenNativeTypesWithConverter( + const LiteralBase& src_literal, const ConverterType& converter) { + CHECK(ShapeUtil::IsArray(src_literal.shape())); + auto result_literal = absl::make_unique(ShapeUtil::ChangeElementType( + src_literal.shape(), + primitive_util::NativeToPrimitiveType())); + auto src_data = src_literal.data(); + auto dest_data = result_literal->template data(); + int64 num_elements = src_literal.element_count(); + + for (int64 i = 0; i < num_elements; ++i) { + dest_data[i] = converter(src_data[i]); + } + return result_literal; +} + +template +std::unique_ptr ConvertBetweenNativeTypes( + const LiteralBase& src_literal) { + auto converter = [](NativeSrcT src) { return static_cast(src); }; + return ConvertBetweenNativeTypesWithConverter( + src_literal, converter); +} + +template +typename std::enable_if<(sizeof(NativeSrcT) == 
sizeof(NativeDestT)), + std::unique_ptr>::type +BitcastBetweenNativeTypes(const LiteralBase& src_literal) { + auto converter = [](NativeSrcT src) { + return tensorflow::bit_cast(src); + }; + return ConvertBetweenNativeTypesWithConverter( + src_literal, converter); +} + +// This template specialization is here to make the compiler happy. bit_cast has +// a static check that the types are the same size. This specialization should +// never be used because the source and destination types are checked for +// identical sizes higher up. +template +typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)), + std::unique_ptr>::type +BitcastBetweenNativeTypes(const LiteralBase& src_literal) { + LOG(FATAL) << "Invalid bitcast between types of different sizes."; +} + +template +std::unique_ptr ConvertToC64(const LiteralBase& src_literal) { + CHECK(ShapeUtil::IsArray(src_literal.shape())); + auto result_literal = absl::make_unique( + ShapeUtil::ChangeElementType(src_literal.shape(), C64)); + using NativeSrcT = + typename primitive_util::PrimitiveTypeToNative::type; + absl::Span src_data = src_literal.data(); + absl::Span dest_data = result_literal->data(); + int64 num_elements = src_literal.element_count(); + for (int64 i = 0; i < num_elements; ++i) { + dest_data[i] = complex64(static_cast(src_data[i]), 0); + } + return result_literal; +} + +template +std::unique_ptr ConvertIfTypesMatch(const LiteralBase& src_literal, + bool bitcast) { + CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); + if (bitcast) { + return BitcastBetweenNativeTypes< + typename primitive_util::PrimitiveTypeToNative< + primitive_src_type>::type, + typename primitive_util::PrimitiveTypeToNative< + primitive_dest_type>::type>(src_literal); + } else { + return ConvertBetweenNativeTypes< + typename primitive_util::PrimitiveTypeToNative< + primitive_src_type>::type, + typename primitive_util::PrimitiveTypeToNative< + primitive_dest_type>::type>(src_literal); + } +} + +template 
+StatusOr> ConvertIfDestTypeMatches( + const LiteralBase& src_literal, PrimitiveType primitive_dest_type, + bool bitcast) { + switch (primitive_dest_type) { +#define CONVERT_IF_TYPES_MATCH(type) \ + case (type): \ + return ConvertIfTypesMatch(src_literal, \ + bitcast); + CONVERT_IF_TYPES_MATCH(PRED) + CONVERT_IF_TYPES_MATCH(S8) + CONVERT_IF_TYPES_MATCH(S32) + CONVERT_IF_TYPES_MATCH(S64) + CONVERT_IF_TYPES_MATCH(U8) + CONVERT_IF_TYPES_MATCH(U32) + CONVERT_IF_TYPES_MATCH(U64) + CONVERT_IF_TYPES_MATCH(F16) + CONVERT_IF_TYPES_MATCH(F32) + CONVERT_IF_TYPES_MATCH(F64) + CONVERT_IF_TYPES_MATCH(BF16) +#undef CONVERT_IF_TYPES_MATCH + case C64: + if (!bitcast) { + return ConvertToC64(src_literal); + } + break; + // Other types are not yet supported. + default: + break; + } + return Unimplemented("Converting from type %s to type %s is not implemented.", + PrimitiveType_Name(src_literal.shape().element_type()), + PrimitiveType_Name(primitive_dest_type)); +} + +StatusOr> ConvertSwitch( + const LiteralBase& literal, PrimitiveType primitive_dest_type, + bool bitcast) { + TF_RET_CHECK(ShapeUtil::IsArray(literal.shape())); + if (literal.shape().element_type() == primitive_dest_type) { + return literal.CloneToUnique(); + } + switch (literal.shape().element_type()) { +#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ + case (type): \ + return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \ + bitcast); + CONVERT_IF_DEST_TYPE_MATCHES(PRED) + CONVERT_IF_DEST_TYPE_MATCHES(S8) + CONVERT_IF_DEST_TYPE_MATCHES(S32) + CONVERT_IF_DEST_TYPE_MATCHES(S64) + CONVERT_IF_DEST_TYPE_MATCHES(U8) + CONVERT_IF_DEST_TYPE_MATCHES(U32) + CONVERT_IF_DEST_TYPE_MATCHES(U64) + CONVERT_IF_DEST_TYPE_MATCHES(F16) + CONVERT_IF_DEST_TYPE_MATCHES(F32) + CONVERT_IF_DEST_TYPE_MATCHES(F64) + CONVERT_IF_DEST_TYPE_MATCHES(BF16) +#undef CONVERT_IF_DEST_TYPE_MATCHES + // Other types are not yet supported. + default: + return Unimplemented("%s from type %s to type %s is not implemented.", + (bitcast ? 
"Bitcast converting" : "Converting"), + PrimitiveType_Name(literal.shape().element_type()), + PrimitiveType_Name(primitive_dest_type)); + } +} + +} // namespace + +StatusOr> LiteralBase::Convert( + PrimitiveType primitive_dest_type) const { + return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false); +} + +StatusOr> LiteralBase::BitcastConvert( + PrimitiveType primitive_dest_type) const { + if (primitive_util::BitWidth(shape().element_type()) != + primitive_util::BitWidth(primitive_dest_type)) { + return InvalidArgument( + "Cannot bitcast convert from %s to %s, bit widths are different: %d != " + "%d", + PrimitiveType_Name(shape().element_type()), + PrimitiveType_Name(primitive_dest_type), + primitive_util::BitWidth(shape().element_type()), + primitive_util::BitWidth(primitive_dest_type)); + } + return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true); +} + +StatusOr> LiteralBase::ConvertToShape( + const Shape& dest_shape, bool round_f32_to_bf16) const { + if (!ShapeUtil::IsTuple(dest_shape)) { + if (round_f32_to_bf16 && shape().element_type() == F32 && + dest_shape.element_type() == BF16) { + auto converter = [](float src) { + return tensorflow::bfloat16::round_to_bfloat16(src); + }; + return ConvertBetweenNativeTypesWithConverter(*this, + converter); + } + return Convert(dest_shape.element_type()); + } + std::vector elements; + for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) { + auto element = LiteralSlice(*this, {i}); + TF_ASSIGN_OR_RETURN( + auto new_element, + element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i}))); + elements.push_back(std::move(*new_element)); + } + auto converted = absl::make_unique(); + *converted = MutableLiteralBase::MoveIntoTuple(absl::MakeSpan(elements)); + return std::move(converted); +} + +/* static */ Literal MutableLiteralBase::MoveIntoTuple( + absl::Span elements) { + std::vector element_shapes; + for (const Literal& element : elements) { + element_shapes.push_back(element.shape()); + } 
+ Literal literal(ShapeUtil::MakeTupleShape(element_shapes), + /*allocate_arrays=*/false); + for (int i = 0; i < elements.size(); ++i) { + TF_CHECK_OK( + literal.MoveFrom(std::move(elements[i]), /*dest_shape_index=*/{i})); + } + return literal; +} + +template +bool LiteralBase::Piece::EqualElementsInternal( + const LiteralBase::Piece& other, std::vector* multi_index) const { + if (multi_index->size() == ShapeUtil::Rank(subshape())) { + return (Get(*multi_index) == other.Get(*multi_index)); + } + for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) { + multi_index->push_back(i); + if (!EqualElementsInternal(other, multi_index)) { + return false; + } + multi_index->pop_back(); + } + return true; +} + +bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { + DCHECK(ShapeUtil::Compatible(subshape(), other.subshape())); + + if (ShapeUtil::Equal(subshape(), other.subshape()) && + LayoutUtil::IsDenseArray(subshape())) { + CHECK_EQ(size_bytes(), other.size_bytes()); + return memcmp(buffer(), other.buffer(), size_bytes()) == 0; + } + + std::vector multi_index; + switch (subshape().element_type()) { + case PRED: + return EqualElementsInternal(other, &multi_index); + case U8: + return EqualElementsInternal(other, &multi_index); + case S32: + return EqualElementsInternal(other, &multi_index); + case S64: + return EqualElementsInternal(other, &multi_index); + case U32: + return EqualElementsInternal(other, &multi_index); + case U64: + return EqualElementsInternal(other, &multi_index); + case F32: + return EqualElementsInternal(other, &multi_index); + case F64: + return EqualElementsInternal(other, &multi_index); + case F16: + return EqualElementsInternal(other, &multi_index); + case BF16: + return EqualElementsInternal(other, &multi_index); + case C64: + return EqualElementsInternal(other, &multi_index); + default: + LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type " + << 
PrimitiveType_Name(subshape().element_type()); + } +} + +bool LiteralBase::operator==(const LiteralBase& other) const { + if (!ShapeUtil::Compatible(shape(), other.shape())) { + return false; + } + + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } + + const Piece& other_piece = other.piece(index); + if (!piece.EqualElements(other_piece)) { + return false; + } + return true; + }); +} + +namespace { + +template +static bool AllElementsEqualValue(absl::Span data, + NativeT value) { + for (int64 i = 0; i < data.size(); ++i) { + if (data[i] != value) { + return false; + } + } + return true; +} + +} // namespace + +bool LiteralBase::IsAll(int8 value) const { + return root_piece().ForEachSubpieceWithBool([&](const ShapeIndex& index, + const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } + + auto piece_is_all = [&]() { + switch (shape().element_type()) { + case U8: + if (value >= 0) { + return AllElementsEqualValue(piece.data(), value); + } + return false; + case U32: + if (value >= 0) { + return AllElementsEqualValue(piece.data(), value); + } + return false; + case U64: + if (value >= 0) { + return AllElementsEqualValue(piece.data(), value); + } + return false; + case S8: + return AllElementsEqualValue(piece.data(), value); + case S32: + return AllElementsEqualValue(piece.data(), value); + case S64: + return AllElementsEqualValue(piece.data(), value); + case F32: + return AllElementsEqualValue(piece.data(), value); + case F64: + return AllElementsEqualValue(piece.data(), value); + case F16: + return AllElementsEqualValue(piece.data(), + static_cast(value)); + case BF16: + return AllElementsEqualValue(piece.data(), + static_cast(value)); + case PRED: + if (value == 0) { + return AllElementsEqualValue(piece.data(), false); + } + if (value == 1) { + return AllElementsEqualValue(piece.data(), true); + } + return false; + 
default: + return false; + } + return false; + }; + + if (!piece_is_all()) { + return false; + } + return true; + }); +} + +bool LiteralBase::IsAllFloat(float value) const { + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } + + auto piece_is_all = [&]() { + switch (shape().element_type()) { + case F32: + return AllElementsEqualValue(piece.data(), value); + case F64: + return AllElementsEqualValue(piece.data(), value); + case F16: + return AllElementsEqualValue(piece.data(), + static_cast(value)); + case BF16: + return AllElementsEqualValue( + piece.data(), static_cast(value)); + default: + return false; + } + }; + if (!piece_is_all()) { + return false; + } + return true; + }); +} + +bool LiteralBase::IsAllComplex(complex64 value) const { + switch (shape().element_type()) { + case C64: + return AllElementsEqualValue(root_piece().data(), + value); + default: + return false; + } +} + +bool LiteralBase::IsAllFirst() const { + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } + + // Empty shapes are not all the first element since there is no first + // element. 
+ if (ShapeUtil::IsZeroElementArray(piece.subshape())) { + return false; + } + auto piece_is_all = [&]() { + switch (piece.subshape().element_type()) { + case PRED: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 8 bit types + case S8: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U8: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 16 bit types + case BF16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case F16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 32 bit types + case F32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 64 bit types + case C64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case F64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + default: + return false; + } + }; + + if (!piece_is_all()) { + return false; + } + return true; + }); +} + +bool LiteralBase::IsR1Iota() const { + if (!ShapeUtil::IsArray(shape())) { + return false; + } + + if (ShapeUtil::Rank(shape()) != 1) { + return false; + } + + auto is_iota_at_idx = [&](const int64 idx) { + switch (shape().element_type()) { + case U8: + return Get({idx}) == idx; + case U16: + return Get({idx}) == idx; + case U32: + return Get({idx}) == idx; + 
case U64: + return Get({idx}) == idx; + case S8: + return Get({idx}) == idx; + case S16: + return Get({idx}) == idx; + case S32: + return Get({idx}) == idx; + case S64: + return Get({idx}) == idx; + case F32: + return Get({idx}) == idx; + case F64: + return Get({idx}) == idx; + case F16: + return Get({idx}) == static_cast(idx); + case BF16: + return Get({idx}) == static_cast(idx); + case C64: + return Get({idx}) == complex64(idx, 0.0f); + case PRED: + return Get({idx}) == idx; + // token, opaque, tuple, etc. are all not iota. + default: + return false; + } + }; + + const int64 elements = ShapeUtil::ElementsIn(shape()); + for (int64 idx = 0; idx < elements; ++idx) { + if (!is_iota_at_idx(idx)) { + return false; + } + } + + return true; +} + +bool LiteralBase::IsZero(absl::Span indices) const { + CHECK(ShapeUtil::IsArray(shape())); + switch (shape().element_type()) { + case U8: + return Get(indices) == 0; + case U32: + return Get(indices) == 0; + case U64: + return Get(indices) == 0; + case S8: + return Get(indices) == 0; + case S32: + return Get(indices) == 0; + case S64: + return Get(indices) == 0; + case F32: + return Get(indices) == 0.0f; + case F64: + return Get(indices) == 0.0; + case C64: + return Get(indices) == complex64(0.0f, 0.0f); + case F16: + return Get(indices) == static_cast(0.0f); + case BF16: + return Get(indices) == static_cast(0.0f); + case PRED: + return Get(indices) == false; + default: + LOG(FATAL) << "Input literal must be an array."; + } +} + +namespace { + +template +void CopyToRepeatedField(RepeatedFieldT* dest, + const absl::Span src) { + *dest = RepeatedFieldT(src.begin(), src.end()); +} + +} // namespace + +void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { + *proto->mutable_shape() = subshape(); + switch (subshape().element_type()) { + case PRED: + CopyToRepeatedField(proto->mutable_preds(), data()); + break; + case U8: + proto->set_u8s(static_cast(data().data()), + element_count()); + break; + case U32: + 
CopyToRepeatedField(proto->mutable_u32s(), data()); + break; + case U64: + CopyToRepeatedField(proto->mutable_u64s(), data()); + break; + case S32: + CopyToRepeatedField(proto->mutable_s32s(), data()); + break; + case S64: + CopyToRepeatedField(proto->mutable_s64s(), data()); + break; + case F16: + *proto->mutable_f16s() = string( + reinterpret_cast(data().data()), size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_f16s()); + } + break; + case BF16: + *proto->mutable_bf16s() = string( + reinterpret_cast(data().data()), size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_bf16s()); + } + break; + case F32: + CopyToRepeatedField(proto->mutable_f32s(), data()); + break; + case F64: + CopyToRepeatedField(proto->mutable_f64s(), data()); + break; + case C64: + for (complex64 value : data()) { + proto->add_c64s(value.real()); + proto->add_c64s(value.imag()); + } + break; + case TUPLE: + case TOKEN: + // Nothing to do but assign the shape which is done above. + return; + default: + // TODO(b/111551621): Support serializing more PrimitiveTypes. 
+ LOG(FATAL) << "Unhandled primitive type " + << PrimitiveType_Name(subshape().element_type()); + } +} + +const void* LiteralBase::Piece::untyped_data() const { + CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); + return buffer(); +} + +void* LiteralBase::Piece::untyped_data() { + CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); + return buffer(); +} + +namespace { + +template +Status CopyFromRepeatedField(absl::Span dest, + const RepeatedFieldT& src) { + if (dest.size() != src.size()) { + return InvalidArgument( + "Expected %lu elements in LiteralProto repeated field, has %d", + dest.size(), src.size()); + } + std::copy(src.begin(), src.end(), dest.begin()); + return Status::OK(); +} + +} // namespace + +Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { + // These conditions should have been checked in + // MutableLiteralBase::CreateFromProto. + TF_RET_CHECK(proto.has_shape()); + TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape())); + TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape())); + + switch (subshape().element_type()) { + case PRED: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.preds())); + break; + case U8: { + auto u8_data = data(); + TF_RET_CHECK(proto.u8s().size() == u8_data.size()); + std::copy(proto.u8s().begin(), proto.u8s().end(), u8_data.begin()); + } break; + case S32: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.s32s())); + break; + case S64: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.s64s())); + break; + case U32: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u32s())); + break; + case U64: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u64s())); + break; + case F16: { + const string& s(proto.f16s()); + TF_RET_CHECK(data().size() * sizeof(half) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; + + 
case BF16: { + const string& s(proto.bf16s()); + TF_RET_CHECK(data().size() * sizeof(bfloat16) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; + case F32: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.f32s())); + break; + case F64: + TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.f64s())); + break; + case C64: { + auto complex_data = data(); + TF_RET_CHECK(proto.c64s_size() == complex_data.size() * 2); + for (int64 i = 0; i < complex_data.size(); ++i) { + complex_data[i] = complex64{proto.c64s(i * 2), proto.c64s(i * 2 + 1)}; + } + } break; + case TUPLE: + LOG(FATAL) << "Should not be called on tuple shapes: " + << ShapeUtil::HumanString(subshape()); + break; + default: + LOG(FATAL) << "Unhandled primitive type " << subshape().element_type(); + } + return Status::OK(); +} + +LiteralProto LiteralBase::ToProto() const { + LiteralProto proto; + root_piece().ForEachSubpiece( + [&](const ShapeIndex& index, const Piece& piece) { + LiteralProto* proto_piece = &proto; + for (int64 i : index) { + while (proto_piece->tuple_literals_size() <= i) { + proto_piece->add_tuple_literals(); + } + proto_piece = proto_piece->mutable_tuple_literals(i); + } + piece.WriteToProto(proto_piece); + }); + + if (LayoutUtil::IsSparseArray(shape())) { + CopyToRepeatedField(proto.mutable_sparse_indices(), + sparse_indices()->data()); + } + + return proto; +} + +const void* LiteralBase::untyped_data(const ShapeIndex& shape_index) const { + return piece(shape_index).untyped_data(); +} + +void* MutableLiteralBase::untyped_data(const ShapeIndex& shape_index) { + return piece(shape_index).untyped_data(); +} + +int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const { + return piece(shape_index).size_bytes(); +} + +string LiteralBase::GetR1U8AsString() const { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(ShapeUtil::Rank(shape()), 1); + 
CHECK_EQ(shape().element_type(), U8); + return string(tensorflow::bit_cast(data().data()), + ShapeUtil::ElementsIn(shape())); +} + +void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape, + Piece* src_piece, + Piece* dest_piece) { + DCHECK(ShapeUtil::Equal(src_piece->subshape(), dest_piece->subshape())) + << "src_piece has shape: " + << ShapeUtil::HumanString(src_piece->subshape()) + << "dest_piece has shape: " + << ShapeUtil::HumanString(dest_piece->subshape()); + if (ShapeUtil::IsTuple(shape)) { + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); + + CopyPieceSubtree(subshape, &src_piece->child(i), &child_piece); + + dest_piece->emplace_back(std::move(child_piece)); + } + } else if (ShapeUtil::IsArray(shape)) { + dest_piece->set_buffer(src_piece->buffer()); + } else { + // If the shape is neither an array nor tuple, then it must be + // zero-sized. Otherwise, some memory needs to be allocated for it. 
+ CHECK_EQ(dest_piece->size_bytes(), 0); + } +} + +MutableLiteralBase::~MutableLiteralBase() {} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + const MutableBorrowingLiteral& literal) + : MutableLiteralBase() { + shape_ = absl::make_unique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); +} + +MutableBorrowingLiteral& MutableBorrowingLiteral::operator=( + const MutableBorrowingLiteral& literal) { + shape_ = absl::make_unique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); + + return *this; +} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + const MutableLiteralBase& literal) + : MutableLiteralBase() { + shape_ = absl::make_unique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral(MutableLiteralBase* literal) + : MutableLiteralBase() { + shape_ = absl::make_unique(literal->shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal->root_piece(), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + MutableBorrowingLiteral literal, const ShapeIndex& view_root) + : MutableLiteralBase() { + shape_ = absl::make_unique(literal.piece(view_root).subshape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.piece(view_root), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr, + const Shape& shape) + 
: MutableLiteralBase() { + shape_ = absl::make_unique(shape); + CHECK(LayoutUtil::HasLayout(*shape_)); + CHECK(!ShapeUtil::IsTuple(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_buffer(const_cast(src_buf_ptr)); + root_piece_->set_subshape(shape_.get()); +} + +MutableBorrowingLiteral::~MutableBorrowingLiteral() { + if (root_piece_ != nullptr) { + root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (piece->buffer() != nullptr) { + delete piece->sparse_indices(); + } + }); + delete root_piece_; + } +} + +LiteralSlice::LiteralSlice(const LiteralBase& literal) + : LiteralBase(), root_piece_(&literal.root_piece()) {} + +LiteralSlice::LiteralSlice(const LiteralBase& literal, + const ShapeIndex& view_root) + : LiteralBase(), root_piece_(&literal.piece(view_root)) {} + +void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) { + CHECK(ShapeUtil::IsTuple(shape)); + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); + + if (ShapeUtil::IsTuple(subshape)) { + BuildPieceSubtree(subshape, &child_piece); + } + + piece->emplace_back(std::move(child_piece)); + } +} + +BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape) + : LiteralBase(), shape_(absl::make_unique(shape)) { + CHECK(ShapeUtil::IsArray(*shape_)); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = Piece(); + root_piece_.set_buffer(const_cast(src_buf_ptr)); + root_piece_.set_subshape(shape_.get()); +} + +BorrowingLiteral::BorrowingLiteral(absl::Span src_buf_ptrs, + const Shape& shape) + : LiteralBase(), shape_(absl::make_unique(shape)) { + CHECK(ShapeUtil::IsTuple(*shape_)); + CHECK(!ShapeUtil::IsNestedTuple(*shape_)); + CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_)); + root_piece_ = Piece(); + root_piece_.set_subshape(shape_.get()); + BuildPieceSubtree(*shape_, 
&root_piece_); + + for (int i = 0; i < src_buf_ptrs.size(); ++i) { + const auto& src_shape = shape_->tuple_shapes(i); + CHECK(ShapeUtil::IsArray(src_shape)); + root_piece_.child(i).set_buffer(const_cast(src_buf_ptrs[i])); + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h new file mode 100644 index 0000000000000000000000000000000000000000..b928cb637494dec220a0912fdea96ed25cde13ef --- /dev/null +++ b/tensorflow/compiler/xla/literal.h @@ -0,0 +1,1177 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_LITERAL_H_ +#define TENSORFLOW_COMPILER_XLA_LITERAL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/array3d.h" +#include "tensorflow/compiler/xla/array4d.h" +#include "tensorflow/compiler/xla/index_util.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/sparse_index_array.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/bitmap.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Forward declare Literal and LiteralSlice class to be used by the creation +// methods in the base class. +class Literal; +class LiteralSlice; + +// Abstract base class for literals. +class LiteralBase { + public: + virtual ~LiteralBase() = 0; + + // Literals are equal if they have compatible shapes and the same data + // values. Layout is not compared. + bool operator==(const LiteralBase& other) const; + bool operator!=(const LiteralBase& other) const { return !(*this == other); } + + // Returns the shape of the literal. + const Shape& shape() const { return root_piece().subshape(); } + + // Serialize to proto. + LiteralProto ToProto() const; + + // Returns a Span of the array for this literal for the given NativeT + // (e.g., float). 
CHECKs if the subshape of the literal at the given + // ShapeIndex is not array. See primitive_util.h for the mapping from XLA type + // to native type. + template + absl::Span data(const ShapeIndex& shape_index = {}) const; + + // Returns a const pointer to the sparse index array. Returns nullptr if the + // literal is not a sparse array. + const SparseIndexArray* sparse_indices( + const ShapeIndex& shape_index = {}) const; + + // Returns a const pointer to (or size of) the underlying buffer holding the + // array at the given shape index. CHECKs if the subshape of the literal at + // the given ShapeIndex is not array. + const void* untyped_data(const ShapeIndex& shape_index = {}) const; + int64 size_bytes(const ShapeIndex& shape_index = {}) const; + + // Returns this literal's data as a string. This literal must be a rank-1 U8 + // array. + string GetR1U8AsString() const; + + // Returns a string representation of the literal value. + // Warning: this function can take minutes for multi-million element Literals. + string ToString(bool print_layout = false) const; + + // Gets an element in the literal at the given index. The multi_index is + // CHECKed against the dimension sizes. + template + NativeT Get(absl::Span multi_index, + const ShapeIndex& shape_index) const; + // Overloads of Get for array literals. CHECKs if the literal is not + // array-shaped and dense. + template + NativeT Get(absl::Span multi_index) const; + + // Returns the element value at index (0, ..., 0), however many zeroes are + // required for that index. + template + NativeT GetFirstElement() const; + + // As Get(), but determines the correct type and converts the value + // into text. + string GetAsString(absl::Span multi_index, + const ShapeIndex& shape_index = {}) const; + // As GetSparseElement(), but determines the correct type and converts the + // value into text. 
+ string GetSparseElementAsString(int64 sparse_element_number, + const ShapeIndex& shape_index = {}) const; + // As Get(), but determines the correct type and converts the value into + // int64. This literal must be an array. + StatusOr GetIntegralAsS64(absl::Span multi_index) const; + + // Returns the multi-index of the element in a sparse literal at the given + // sparse element number. The sparse element number is the position within + // the sparse array's list of (index, value) pairs, and is checked against the + // total number of (index, value) pairs in the sparse array. + absl::Span GetSparseIndex( + int64 sparse_element_number, const ShapeIndex& shape_index = {}) const; + + // Returns the value of the element in a sparse literal at the given sparse + // element number. The sparse element number is the position within the + // sparse array's list of (index, value) pairs, and is checked against the + // total number of (index, value) pairs in the sparse array. + template + NativeT GetSparseElement(int64 sparse_element_number, + const ShapeIndex& shape_index = {}) const; + + // Invokes the "per cell" callback for each element in the provided + // literal with the element's indices and a string representation of + // the element's value. + // + // This function is useful if you want a polymorphic representation + // of the tensor's elements (turning it to a string for something + // like representation in a protobuf). + // + // This literal must have a dense layout. + void EachCellAsString( + const std::function indices, + const string& value)>& per_cell) const; + template + void EachCell( + std::function indices, NativeT value)> + per_cell) const; + + // Returns whether every element in this literal is equal to value. + // + // value is an int8 because we expect this to be called with small + // compile-time constants (0, -1, etc.) and so that whatever value you pass + // can be represented exactly by floating-point types as small as 16 bits. 
+ // + // If value doesn't fit in this literal's type, returns false. Values of 1/0 + // are considered equal to true/false; other values are not considered equal + // to true. Also if this literal is not array-shaped false is returned. + bool IsAll(int8 value) const; + + // Like IsAll(const Literal&, int8), except we check whether the literal is + // equal to a particular floating-point number. + // + // If the literal is not a floating-point value, this always returns false. + // + // This casts value to the type of literal, then compares using ==. The usual + // admonishments about floating-point equality checks apply. We expect you to + // use this to check for values that can be expressed precisely as a float, + // e.g. -0.5. Also if this literal is not array-shaped false is returned. + bool IsAllFloat(float value) const; + + // Like IsAll(const Literal&, int8), except we check whether the literal is + // equal to a particular complex number. + // + // If the literal is not a complex value, this always returns false. + // + // This casts value to the type of literal, then compares using ==. The usual + // admonishments about floating-point equality checks apply. We expect you to + // use this to check for complex values that can be expressed precisely as + // float pairs e.g. (-0.5, 1.0). + // + // This literal must have a dense layout. + bool IsAllComplex(complex64 value) const; + + // Literal consists entirely of the first element of the literal. + bool IsAllFirst() const; + + // Literal consists entirely of an iota. + bool IsR1Iota() const; + + // Returns whether this literal is zero at the specified index. This literal + // must be an array with a dense layout. + bool IsZero(absl::Span indices) const; + + // Returns the count of the elements in the array at the given shape index in + // this literal. 
+ int64 element_count(const ShapeIndex& index = {}) const { + return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index)); + } + + // Returns the count of the elements in the sparse array at the given shape + // index in this literal, which will be no larger than + // LayoutUtil::MaxSparseElements(GetSubshape(shape(), index).layout()). + int64 sparse_element_count() const; + + // Compute a hash for this literal. This literal must not be a sparse tensor + // or a tuple containing a sparse tensor. + size_t Hash() const; + + // Converts this literal to the given shape. Returns an error if the + // conversion is not possible. + // + // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding + // instead of truncation; otherwise, truncation is used. + // + // TODO(b/69266521): remove the round_f32_to_bf16 flag when rounding becomes + // the default behavior. + StatusOr> ConvertToShape( + const Shape& dest_shape, bool round_f32_to_bf16 = false) const; + + // Converts this literal to another primitive type using a bitcast + // conversion. The to and from primitive types must have the same bit + // width. Returns an error if the conversion is not possible. This literal + // must be array-shaped. + StatusOr> BitcastConvert( + PrimitiveType primitive_dest_type) const; + + // Converts this literal to another primitive type. Returns an error if the + // conversion is not possible. This literal must be array-shaped. + StatusOr> Convert( + PrimitiveType primitive_dest_type) const; + + // Clones the underlying buffers into a new Literal, or new + // std::unique_ptr. + Literal Clone() const; + std::unique_ptr CloneToUnique() const; + + // TODO(b/67651157): The methods below which perform computation on Literals + // (Reshape, Slice, etc) should be moved elsewhere, and perhaps combined with + // evaluator code which operates on Literals. + // + // Creates a new value that has the equivalent value as this + // literal, but conforms to new_layout; e.g. 
a literal matrix that was in {0, + // 1} minor-to-major dimension layout can be re-layed-out as {1, 0} + // minor-to-major dimension layout and the value in the cell at any given + // logical index (i0, i1) will be the same. + // + // For tuple shaped literals, shape_index should be used to select the inner + // array that the new layout applies to. + // + // Note: this is useful when the client wants to ensure that a value placed in + // the XLA allocation tracker has a particular layout; for efficiency + // purposes or avoiding unimplemented operation/layout combinations. + std::unique_ptr Relayout(const Layout& new_layout, + const ShapeIndex& shape_index = {}) const; + + // An overload of Relayout which changes the layout of the entire shape rather + // than being limited to a single array within the shape. + std::unique_ptr Relayout(const Shape& shape_with_layout) const; + + // Creates a new literal by reshaping this literal to have the given + // dimensions. The total number of elements must not change; The + // implementation currently only supports monotonic dim0-major layouts. + // This literal must be an array. + StatusOr> Reshape( + absl::Span dimensions) const; + + // Creates a new literal by broadcasting this literal with `dimensions` to + // yield a literal of shape `result_shape`. + StatusOr> Broadcast( + const Shape& result_shape, absl::Span dimensions) const; + + // Creates a new literal by reordering the dimensions of this literal. + // The given `permutation` must be a permutation of the dimension numbers + // in the original literal, and it specifies the order of the new dimensions + // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). + // For example, a transpose call on a literal of shape [3 x 8 x 4] and + // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. + // This literal must be an array. 
+ std::unique_ptr Transpose(absl::Span permutation) const; + + // Creates a sub-array from this literal by extracting the indices + // [start_index, limit_index) of each dimension. The result literal has the + // same rank and layout as for the given literal. The number of indices in + // start_indices and limit_indices must be the rank of the literal, and the + // indices follow the order of the dimensions. + // This literal must be an array. + std::unique_ptr Slice(absl::Span start_indices, + absl::Span limit_indices) const; + + // Creates a literal with a prepended dimension with bound "times"; e.g. a + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this + // literal replicated four times. + // This literal must be an array. + template + std::unique_ptr Replicate(int64 times) const; + + // Creates a new Literal object with the shape specified as parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + // + // Note: It's an antipattern to use this method then immediately call + // MutableLiteralBase::Populate on the result (since that results in zero + // initialization, then reinitialization). Consider whether a call to + // absl::make_unique(shape), followed by the call to + // MutableLiteralBase::Populate can be used instead. + static std::unique_ptr CreateFromShape(const Shape& shape); + + protected: + // A data structure representing a subshape at a particular ShapeIndex within + // the literal. For array-shaped ShapeIndexes, this data structure holds the + // pointer to the memory allocated for the array data. + class Piece { + public: + // Returns the buffer holding the array data for this piece as an array + // slice. This piece must be array-shaped. + template + absl::Span data() const; + template + absl::Span data(); + + // Returns the buffer holding the array data for this piece as a void*. This + // piece must be array-shaped. 
+ void* untyped_data(); + const void* untyped_data() const; + + // Gets or sets an element in the array at the given index. The multi_index + // is CHECKed against the dimension sizes of the array. This piece must be + // array-shaped. + template + NativeT Get(absl::Span index) const; + template + void Set(absl::Span index, NativeT value); + + // Gets/sets the buffer holding the array data. + char* buffer() const { return buffer_; } + void set_buffer(char* buffer) { buffer_ = buffer; } + + // The array of multi-indices that provide the locations of non-zero + // elements in a sparse array. Only used if + // LayoutUtil::IsSparseArray(shape()) is true. + SparseIndexArray* sparse_indices() const { return sparse_indices_; } + void set_sparse_indices(SparseIndexArray* sparse_indices) { + sparse_indices_ = sparse_indices; + } + + // Gets or sets the subshape of this piece. This reference points to a + // subshape within the shape in the containing Literal (Literal::shape_). + const Shape& subshape() const { return *subshape_; } + void set_subshape(const Shape* subshape) { subshape_ = subshape; } + + // Returns the size in bytes of the buffer holding the array data. + int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); } + + // Returns the number of elements in this piece's array. + int64 element_count() const { + // If this is a sparse array, use the number of elements represented by + // the indices in the associated SparseIndexArray. + return LayoutUtil::IsSparseArray(subshape()) + ? sparse_indices()->index_count() + : ShapeUtil::ElementsIn(subshape()); + } + + // Returns the child piece at 'index' of this piece. + Piece& child(int64 index) { return children_[index]; } + + // Adds a child piece to this piece's children. + void emplace_back(Piece child_piece) { + children_.emplace_back(std::move(child_piece)); + } + + // Returns the size of children pieces of this piece. 
+ int64 children_size() { return children_.size(); } + + // Visitor functions that recursively traverse the piece and call the + // given function at each child piece. The function has the type: + // void (const ShapeIndex& index, const Piece& piece) + template + void ForEachSubpiece(const Fn& func) const { + ShapeIndex index; + return ForEachHelper( + [&func](const ShapeIndex& index, const Piece& piece) { + func(index, piece); + return Status::OK(); + }, + *this, &index) + .IgnoreError(); + } + // Same as above, but the function has the type: + // Status (const ShapeIndex& index, const Piece& piece) + // The first non-OK return value is returned by the function. + template + Status ForEachSubpieceWithStatus(const Fn& func) const { + ShapeIndex index; + return ForEachHelper(func, *this, &index); + } + // Same as above, but the function has the type: + // bool (const ShapeIndex& index, const Piece& piece) + // The first non-true return value is returned by the function. + template + bool ForEachSubpieceWithBool(const Fn& func) const { + ShapeIndex index; + return ForEachHelperBool(func, *this, &index); + } + // Same as above, but the function has the type: + // void (const ShapeIndex& index, Piece* piece) + template + void ForEachMutableSubpiece(const Fn& func) { + ShapeIndex index; + return ForEachMutableHelper( + [&func](const ShapeIndex& index, Piece* piece) { + func(index, piece); + return Status::OK(); + }, + const_cast(this), &index) + .IgnoreError(); + } + // Same as above, but the function has the type: + // Status (const ShapeIndex& index, Piece* piece) + // The first non-OK return value is returned by the function. + template + Status ForEachMutableSubpieceWithStatus(const Fn& func) { + ShapeIndex index; + return ForEachMutableHelper( + func, const_cast(this), &index); + } + + // Returns true if this piece and 'other' contain the same data. This piece + // and 'other' must be array-shaped and compatible. 
+ bool EqualElements(const Piece& other) const; + + // Writes the shape and data (if array-shaped) into the given proto. + void WriteToProto(LiteralProto* proto) const; + + // Copy the data from 'src' into this piece's buffer. Shapes of this piece + // and src must be compatible. + Status CopyFrom(const Piece& src); + + // Copies the data from the given proto into this piece. The shape of this + // piece must be equal (not just compatible) to the shape of the proto. + Status CopyFromProto(const LiteralProto& proto); + + // Sorts the elements in a sparse array. + void SortSparseElements(); + + private: + // Helpers for traversing the piece via ForEachSubpiece rooted at 'index'. + // The first non-OK (or non-true) value is returned by the function. + // The callable 'func' has the same signature as described above in + // ForEachSubpiece*. + template + Status ForEachHelper(const Fn& func, const Piece& piece, + ShapeIndex* index) const { + TF_RETURN_IF_ERROR(func(*index, piece)); + for (int64 i = 0; i < piece.children_.size(); ++i) { + index->push_back(i); + TF_RETURN_IF_ERROR(ForEachHelper(func, piece.children_[i], index)); + index->pop_back(); + } + return Status::OK(); + } + template + bool ForEachHelperBool(const Fn& func, const Piece& piece, + ShapeIndex* index) const { + if (!func(*index, piece)) { + return false; + } + for (int64 i = 0; i < piece.children_.size(); ++i) { + index->push_back(i); + if (!ForEachHelperBool(func, piece.children_[i], index)) { + return false; + } + index->pop_back(); + } + return true; + } + template + Status ForEachMutableHelper(const Fn& func, Piece* piece, + ShapeIndex* index) { + TF_RETURN_IF_ERROR(func(*index, piece)); + for (int64 i = 0; i < piece->children_.size(); ++i) { + index->push_back(i); + TF_RETURN_IF_ERROR( + ForEachMutableHelper(func, &piece->children_[i], index)); + index->pop_back(); + } + return Status::OK(); + } + + // Recursive helper for EqualElements. 
+ template + bool EqualElementsInternal(const Piece& other, + std::vector* multi_index) const; + + // Helper for SortSparseElements that has the element type as a template + // parameter. + template + void SortSparseElementsInternal(); + + // For array-shaped pieces, this is the buffer holding the literal data. + char* buffer_ = nullptr; + + // For sparse arrays, this is the array of indices. + SparseIndexArray* sparse_indices_ = nullptr; + + // The shape of piece. This points into the shape of the containing Literal + // (Literal::shape_). + const Shape* subshape_ = nullptr; + + // Children pieces for tuple shaped pieces. + std::vector children_ = {}; + }; // class Piece + + const Piece& piece(const ShapeIndex& shape_index) const { + Piece* piece = &const_cast(root_piece()); + for (const auto i : shape_index) { + DCHECK_GE(i, 0); + DCHECK_LT(i, piece->children_size()); + piece = &piece->child(i); + } + return *piece; + } + + // Returns the piece at the root of the shape. + virtual const Piece& root_piece() const = 0; + + // LiteralSlice and Literal must access Pieces of other Literals. + friend class MutableLiteralBase; + friend class LiteralSlice; + friend class BorrowingLiteral; + + private: + template + std::unique_ptr SliceInternal( + const Shape& result_shape, absl::Span start_indices) const; +}; + +// Abstract base class representing a mutable literal in XLA. +class MutableLiteralBase : public LiteralBase { + public: + virtual ~MutableLiteralBase() = 0; + + // Returns a Span view of the array for this literal for the + // given NativeT (e.g., float). CHECKs if the subshape of the literal at the + // given ShapeIndex is not array. See primitive_util.h for the mapping from + // XLA type to native type. + template + absl::Span data(const ShapeIndex& shape_index = {}); + // Unhide const method from parent class. + using LiteralBase::data; + + // Returns a pointer to the sparse index array. Returns nullptr if the literal + // is not a sparse array. 
+ SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {}); + + // TODO(b/67651157): Remove this accessor. Literal users should not be able to + // mutate the shape as this can produce malformed Literals. + Shape* mutable_shape_do_not_use() { return shape_.get(); } + + // Returns a pointer to the underlying buffer holding the array at the given + // shape index. CHECKs if the subshape of the literal at the given ShapeIndex + // is not array. + void* untyped_data(const ShapeIndex& shape_index = {}); + // Unhide const method from parent class. + using LiteralBase::untyped_data; + + // Populates a literal with a sparse layout with the given indices and values. + // Each index in the indices array is CHECKed against the dimensions in the + // literal's shape. If sort is true, then the indices and values will be + // sorted. If sort is false, then the indices and values are assumed to + // already be in sorted order. See CreateSparse for an example of how data + // are populated. + template + void PopulateSparse(SparseIndexArray indices, + absl::Span values, bool sort = true); + + // Copy values from 'src_literal' rooted at 'src_shape_index' into this + // literal rooted at 'dest_shape_index'. The subshape of this literal rooted + // at 'dest_shape_index' must be compatible with the subshape of 'src_literal' + // rooted at 'src_shape_index', but need not be arrays. + Status CopyFrom(const LiteralSlice& src_literal, + const ShapeIndex& dest_shape_index = {}, + const ShapeIndex& src_shape_index = {}); + + // Copies the values from src_literal, starting at src_base shape indexes, + // to this literal, starting at dest_base, where the copy size in each + // dimension is specified by copy_size. + // The src_literal and this literal must have the same primitive type, + // src_base+copy_size must fit the source literal dimensions, as well as + // dest_base+copy_size must fit the destination literal dimensions. 
+ // Note: if either src_literal or this literal contains dimensions with zero + // element, then copy_size must be 0 in these dimensions while the + // corresponding base indices being 0. + // This literal and 'src_literal' must be arrays. + Status CopySliceFrom(const LiteralSlice& src_literal, + absl::Span src_base, + absl::Span dest_base, + absl::Span copy_size); + + // Copies one element from src_literal[src_index] to (*this)[dest_index]. + Status CopyElementFrom(const LiteralSlice& src_literal, + absl::Span src_index, + absl::Span dest_index); + + // Sets an element in the literal at the given index. The multi_index is + // CHECKed against the dimension sizes. + template + void Set(absl::Span multi_index, const ShapeIndex& shape_index, + NativeT value); + // Overloads of Set for array literals. CHECKs if the literal is not + // array-shaped and dense. + template + void Set(absl::Span multi_index, NativeT value); + + // Appends the given element to the literal. If the elements are not appended + // in sorted order, then SortSparseElements should be called before calling + // other methods. This literal must have a sparse layout. + template + void AppendSparseElement(absl::Span multi_index, NativeT value, + const ShapeIndex& shape_index = {}); + + // Sorts the elements in a sparse array. + void SortSparseElements(const ShapeIndex& shape_index = {}); + + // As Set(), but truncates `value` to the literal element type before storing. + // This literal must be an array. + Status SetIntegralAsS64(absl::Span multi_index, int64 value); + + // Populate this literal with the given values. Examples: + // + // // Populate with floats. + // Array2D float_values = ... + // literal.PopulateR2FromArray2D(values); + // + // // Populate with int32s. + // literal.PopulateR2({{1, 2}, {3, 4}}); + // + // The shape and element type of this literal must match given values. For + // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2 + // array of S32. 
+ template + void PopulateR1(absl::Span values); + void PopulateR1(const tensorflow::core::Bitmap& values); + template + void PopulateR2(std::initializer_list> values); + template + void PopulateFromArray(const Array& values); + template + void PopulateR2FromArray2D(const Array2D& values); + template + void PopulateR3FromArray3D(const Array3D& values); + template + void PopulateR4FromArray4D(const Array4D& values); + + // Populates literal values by calling the generator function for every cell + // in this literal object. + // + // generator must be a callable of the type + // NativeT(absl::Span indexes) or compatible. + // + // This literal must have a dense layout. + template + Status Populate(const FnType& generator); + + // A parallel version of Populate(). This can be used if the generator is + // thread-safe and the values for the shape's different elements are + // independent. + template + Status PopulateParallel(const FnType& generator); + + // Fills this literal with the given value. + template + void PopulateWithValue(NativeT value); + + // This operation is the inverse of DecomposeTuple. The given elements are + // moved into the tuple elements of a new tuple-shaped Literal which is + // returned. Upon return, each of the Literals in 'elements' is set to a nil + // shape (empty tuple). + static Literal MoveIntoTuple(absl::Span elements); + + // Serialize from a proto. + static StatusOr> CreateFromProto( + const LiteralProto& proto); + + protected: + // Returns the piece at the given ShapeIndex. + Piece& piece(const ShapeIndex& shape_index) { + return const_cast(LiteralBase::piece(shape_index)); + } + + Piece& root_piece() const override { return *root_piece_; }; + + // Internal template helper for the Literal::CopySliceFrom(), matching its + // arguments one by one. 
+ template + Status CopySliceFromInternal(const LiteralBase& src_literal, + absl::Span src_base, + absl::Span dest_base, + absl::Span copy_size); + + // Utility structure which is used to create the optimal configuration for + // a ShapeUtil::ForEachIndex() scan across two literals. + struct StrideConfig { + StrideConfig(const Shape& source_shape, const Shape& dest_shape, + absl::Span dimensions); + + // The dimensions of the stride operation. Essentially every dimension + // will be iterated from base[i] to base[i]+dimensions[i], in step[i] + // steps. + absl::Span dimensions; + DimensionVector base; + DimensionVector step; + int64 minor_dimension = 0; + // The size of the strides for source and destination. One of the two + // (the one looping through its most minor dimension) will be 1, while + // the other will be the stride size at the dimension matching the other + // shape most minor dimension being scanned. + int64 dest_stride = 1; + int64 source_stride = 1; + // The size of the inner loop on the most minor dimension. + int64 minor_loop_size = 1; + }; + + // Literal class always owns the shape. The parent class borrows this shape. + std::unique_ptr shape_; + + Piece* root_piece_ = nullptr; + + // Implementation details shared between Populate() and PopulateParallel() + template + Status PopulateInternal(const FnType& generator, bool parallel); + + friend class LiteralBase; + friend class MutableBorrowingLiteral; +}; +std::ostream& operator<<(std::ostream& out, const Literal& literal); + +// The underlying buffer and shape is always owned by this class. +class Literal : public MutableLiteralBase { + public: + Literal() : Literal(ShapeUtil::MakeNil()) {} + + // Create a literal of the given shape. The literal is allocated sufficient + // memory to hold the shape. Memory is uninitialized. + explicit Literal(const Shape& shape); + virtual ~Literal(); + + // Literals are moveable, but not copyable. 
To copy a literal use + // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies + // of literals which can be expensive. + Literal(const Literal& other) = delete; + Literal& operator=(const Literal& other) = delete; + Literal(Literal&& other); + // 'allocate_arrays' indicates whether to allocate memory for the arrays in + // the shape. If false, buffer pointers inside of the Literal::Pieces are set + // to nullptr. + Literal(const Shape& shape, bool allocate_arrays); + Literal& operator=(Literal&& other); + + // Similar to CopyFrom, but with move semantics. The subshape of this literal + // rooted at 'dest_shape_index' must be *equal* to the shape of 'src_literal' + // (layouts and shapes must match), but need not be arrays. The memory + // allocated in this literal for the subshape at dest_shape_index is + // deallocated, and the respective buffers are replaced with those in + // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). + virtual Status MoveFrom(Literal&& src_literal, + const ShapeIndex& dest_shape_index = {}); + + // Returns a vector containing the tuple elements of this Literal as separate + // Literals. This Literal must be tuple-shaped and can be a nested tuple. The + // elements are moved into the new Literals; no data is copied. Upon return + // this Literal is set to a nil shape (empty tuple). + std::vector DecomposeTuple(); + + private: + // Deallocate the buffers held by this literal. + void DeallocateBuffers(); + + // Recursively sets the subshapes and buffers of all subpieces rooted at + // 'piece'. If 'allocate_arrays' is true, memory is allocated for the arrays in + // the shape. + void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays); +}; + +// The underlying buffer is not owned by this class and is always owned by +// others. The shape is not owned by this class and not mutable. 
+class MutableBorrowingLiteral : public MutableLiteralBase { + public: + virtual ~MutableBorrowingLiteral(); + + MutableBorrowingLiteral() : MutableLiteralBase() {} + + MutableBorrowingLiteral(const MutableBorrowingLiteral& literal); + MutableBorrowingLiteral& operator=(const MutableBorrowingLiteral& literal); + + // Implicit conversion constructors. + MutableBorrowingLiteral(const MutableLiteralBase& literal); + MutableBorrowingLiteral(MutableLiteralBase* literal); + MutableBorrowingLiteral(MutableBorrowingLiteral literal, + const ShapeIndex& view_root); + MutableBorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + + private: + // Recursively copies the subtree from the `src_piece` at the given child + // index to the `dest_piece`. For buffers only the pointers are copied, but + // not the content. + void CopyPieceSubtree(const Shape& shape, Piece* src_piece, + Piece* dest_piece); +}; + +// A read-only view of a Literal. A LiteralSlice contains pointers to shape and +// literal buffers always owned by others. +class LiteralSlice : public LiteralBase { + public: + LiteralSlice() : LiteralBase() {} + + // Implicit conversion constructors. + LiteralSlice(const LiteralBase& literal); + LiteralSlice(const LiteralBase& literal, const ShapeIndex& view_root); + + private: + const Piece& root_piece() const override { return *root_piece_; }; + + const Piece* root_piece_; // Not owned. +}; + +// A read-only Literal where the underlying buffers are never owned by this +// class. +class BorrowingLiteral : public LiteralBase { + public: + BorrowingLiteral() : LiteralBase() {} + + // 'src_buf_ptr' is not owned by this class and must outlive the + // lifetime of this class. It points to an appropirately sized buffer with + // data interpretered as indicated by 'shape'. + // This constructor is only used for array shapes. + BorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + // Similar as above, except to be used for constructing non-nested tuples. 
+ BorrowingLiteral(absl::Span src_buf_ptrs, + const Shape& shape); + // TODO(b/79707221): adding constructors for nested tuples as well. + + private: + // Recursively builds the subtree for the given piece and sets the subshapes + // of the given piece with the given shape. + void BuildPieceSubtree(const Shape& shape, Piece* piece); + + // Accessor for the root piece of this literal. + const Piece& root_piece() const override { return root_piece_; }; + Piece root_piece_; + + // Shape of this literal. Stored as unique_ptr such that the (default) move + // construction of this class would be trivially correct: the pointer to Shape + // root_piece_ stores will still point to the correct address. + std::unique_ptr shape_; +}; + +template +absl::Span LiteralBase::Piece::data() const { + CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); + CHECK_EQ(subshape().element_type(), + primitive_util::NativeToPrimitiveType()) + << "Attempting to access " + << PrimitiveType_Name(primitive_util::NativeToPrimitiveType()) + << " type, but literal element type is " + << PrimitiveType_Name(subshape().element_type()); + return absl::Span(reinterpret_cast(buffer()), + element_count()); +} + +template +absl::Span LiteralBase::Piece::data() { + CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); + CHECK_EQ(subshape().element_type(), + primitive_util::NativeToPrimitiveType()) + << "Attempting to access " + << PrimitiveType_Name(primitive_util::NativeToPrimitiveType()) + << " type, but literal element type is " + << PrimitiveType_Name(subshape().element_type()); + return absl::Span(reinterpret_cast(buffer()), + element_count()); +} + +template +NativeT LiteralBase::Piece::Get(absl::Span multi_index) const { + CHECK(LayoutUtil::IsDenseArray(subshape())); + return data()[IndexUtil::MultidimensionalIndexToLinearIndex( + subshape(), multi_index)]; +} + +template +void LiteralBase::Piece::Set(absl::Span multi_index, + NativeT value) { + 
CHECK(LayoutUtil::IsDenseArray(subshape())); + data()[IndexUtil::MultidimensionalIndexToLinearIndex( + subshape(), multi_index)] = value; +} + +template +absl::Span LiteralBase::data( + const ShapeIndex& shape_index) const { + return piece(shape_index).data(); +} + +template +absl::Span MutableLiteralBase::data(const ShapeIndex& shape_index) { + return piece(shape_index).data(); +} + +template +inline NativeT LiteralBase::Get(absl::Span multi_index, + const ShapeIndex& shape_index) const { + return piece(shape_index).Get(multi_index); +} + +template +inline NativeT LiteralBase::Get(absl::Span multi_index) const { + return root_piece().Get(multi_index); +} + +template +inline void MutableLiteralBase::Set(absl::Span multi_index, + const ShapeIndex& shape_index, + NativeT value) { + return piece(shape_index).Set(multi_index, value); +} + +template +inline void MutableLiteralBase::Set(absl::Span multi_index, + NativeT value) { + return root_piece().Set(multi_index, value); +} + +template +NativeT LiteralBase::GetFirstElement() const { + return data().at(0); +} + +template +NativeT LiteralBase::GetSparseElement(int64 sparse_element_number, + const ShapeIndex& shape_index) const { + CHECK( + LayoutUtil::IsSparseArray(ShapeUtil::GetSubshape(shape(), shape_index))); + return data(shape_index)[sparse_element_number]; +} + +template +void MutableLiteralBase::AppendSparseElement( + absl::Span multi_index, NativeT value, + const ShapeIndex& shape_index) { + Piece& p = piece(shape_index); + const Shape& subshape = p.subshape(); + CHECK(LayoutUtil::IsSparseArray(subshape)); + int64 rank = ShapeUtil::Rank(subshape); + CHECK_EQ(multi_index.size(), rank); + int64 last_element = p.sparse_indices()->index_count(); + CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout())); + p.sparse_indices()->Append(multi_index); + CHECK_LT(last_element, p.data().size()); + p.data()[last_element] = value; +} + +template +void LiteralBase::EachCell( + std::function indices, NativeT 
value)> + per_cell) const { + if (ShapeUtil::IsZeroElementArray(shape())) { + return; + } + std::vector indices(ShapeUtil::Rank(shape()), 0); + do { + per_cell(indices, Get(indices)); + } while (IndexUtil::BumpIndices(shape(), absl::MakeSpan(indices))); +} + +template +inline void MutableLiteralBase::PopulateR1(absl::Span values) { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(ShapeUtil::Rank(shape()), 1); + CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size()); + CHECK_EQ(shape().element_type(), + primitive_util::NativeToPrimitiveType()); + for (int64 i = 0; i < values.size(); ++i) { + Set({i}, values[i]); + } +} + +template +void MutableLiteralBase::PopulateR2( + std::initializer_list> values) { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(ShapeUtil::Rank(shape()), 2); + CHECK_EQ(shape().element_type(), + primitive_util::NativeToPrimitiveType()); + + const int64 dim0_size = values.size(); + const int64 dim1_size = values.begin()->size(); + CHECK_EQ(dim0_size, shape().dimensions(0)); + CHECK_EQ(dim1_size, shape().dimensions(1)); + + int64 dim0 = 0; + for (auto inner_list : values) { + int64 dim1 = 0; + for (auto value : inner_list) { + Set({dim0, dim1}, value); + ++dim1; + } + CHECK_EQ(dim1_size, dim1); + ++dim0; + } +} + +template +void MutableLiteralBase::PopulateFromArray(const Array& values) { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(shape().element_type(), + primitive_util::NativeToPrimitiveType()); + CHECK_EQ(ShapeUtil::Rank(shape()), values.num_dimensions()); + for (int dim = 0; dim < values.num_dimensions(); ++dim) { + CHECK_EQ(values.dim(dim), shape().dimensions(dim)); + } + values.Each([this](absl::Span indices, NativeT value) { + this->Set(indices, value); + }); +} + +template +void MutableLiteralBase::PopulateR2FromArray2D(const Array2D& values) { + PopulateFromArray(values); +} + +template +void MutableLiteralBase::PopulateR3FromArray3D(const Array3D& values) { + PopulateFromArray(values); +} + +template +void 
MutableLiteralBase::PopulateR4FromArray4D(const Array4D& values) { + PopulateFromArray(values); +} + +template +void MutableLiteralBase::PopulateSparse(SparseIndexArray indices, + absl::Span values, + bool sort) { + CHECK(LayoutUtil::IsSparseArray(shape())); + int rank = ShapeUtil::Rank(shape()); + CHECK_EQ(indices.rank(), rank); + int64 max_elements = LayoutUtil::MaxSparseElements(shape().layout()); + CHECK_LE(indices.max_indices(), max_elements); + int64 num_elements = values.size(); + CHECK_LE(num_elements, max_elements); + CHECK_EQ(num_elements, indices.index_count()); + auto root_data = root_piece().data(); + // Piece::data() returns a Span of size equal to the number of indices + // in the SparseIndexArray. So there is no need to adjust the size of the data + // here. It is enough to just copy the incoming values into the data buffer. + std::copy(values.begin(), values.end(), root_data.begin()); + *this->root_piece().sparse_indices() = std::move(indices); + if (sort) { + auto root_data = this->root_piece().data(); + this->root_piece().sparse_indices()->SortWithValues(root_data); + } + DCHECK(this->root_piece().sparse_indices()->Validate(shape())); +} + +template +Status MutableLiteralBase::PopulateInternal(const FnType& generator, + bool parallel) { + const Shape& this_shape = shape(); + const int64 rank = ShapeUtil::Rank(this_shape); + TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape)); + TF_RET_CHECK(this_shape.element_type() == + primitive_util::NativeToPrimitiveType()); + absl::Span literal_data = data(); + if (rank > 0) { + StrideConfig stride_config(this_shape, this_shape, + AsInt64Slice(this_shape.dimensions())); + int64 minor_dimension_size = + ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension); + + auto init_function = [&](absl::Span indexes) { + DimensionVector minor_scan_indexes(rank, 0); + const int64 index = + IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes); + std::copy(indexes.begin(), indexes.end(), 
minor_scan_indexes.begin()); + for (int64 i = 0; i < minor_dimension_size; ++i) { + minor_scan_indexes[stride_config.minor_dimension] = i; + literal_data.at(index + i) = generator(minor_scan_indexes); + } + }; + if (parallel) { + ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base, + stride_config.dimensions, + stride_config.step, init_function); + } else { + ShapeUtil::ForEachIndex( + this_shape, stride_config.base, stride_config.dimensions, + stride_config.step, + [&init_function](absl::Span indexes) { + init_function(indexes); + return true; + }); + } + } else { + // For scalars. + literal_data.at(0) = generator({}); + } + return Status::OK(); +} +template +Status MutableLiteralBase::Populate(const FnType& generator) { + return PopulateInternal(generator, /*parallel=*/false); +} + +template +Status MutableLiteralBase::PopulateParallel(const FnType& generator) { + return PopulateInternal(generator, /*parallel=*/true); +} + +template +void MutableLiteralBase::PopulateWithValue(NativeT value) { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_EQ(shape().element_type(), + primitive_util::NativeToPrimitiveType()); + for (NativeT& element : data()) { + element = value; + } +} + +template +std::unique_ptr LiteralBase::Replicate(int64 times) const { + DimensionVector bounds = {times}; + bounds.reserve(shape().dimensions_size() + 1); + for (int64 bound : shape().dimensions()) { + bounds.push_back(bound); + } + auto literal = absl::make_unique( + ShapeUtil::MakeShape(shape().element_type(), bounds)); + int64 elements = ShapeUtil::ElementsIn(literal->shape()); + if (elements == 0) { + return literal; + } + + DimensionVector output_indices(bounds.size(), 0); + absl::Span input_indices = output_indices; + input_indices.remove_prefix(1); + + bool done = false; + while (!done) { + const auto element = Get(input_indices); + literal->Set(output_indices, element); + + done = true; + for (int n = 0; n < output_indices.size(); ++n) { + ++output_indices[n]; + if 
(output_indices[n] < bounds[n]) { + done = false; + break; + } + output_indices[n] = 0; + } + } + return literal; +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_LITERAL_H_ diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc index bf9679cafec72c2e9dc5796e9058c6703239c508..3d8725ed7051cafc97987f25a96004fa876dfdd3 100644 --- a/tensorflow/compiler/xla/literal_comparison.cc +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -19,15 +19,16 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/casts.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" -using tensorflow::strings::Appendf; -using tensorflow::strings::Printf; -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; +using absl::StrAppend; +using absl::StrAppendFormat; +using absl::StrCat; namespace xla { namespace literal_comparison { @@ -37,7 +38,8 @@ namespace { // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT // -- on miscompare, a nice error message is given in the AssertionFailure. 
template -Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) { +Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs, + absl::Span multi_index) { auto ulhs = tensorflow::bit_cast(lhs); auto urhs = tensorflow::bit_cast(rhs); auto lhs_double = static_cast(lhs); @@ -45,9 +47,10 @@ Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) { if (ulhs != urhs) { return InvalidArgument( "floating values are not bitwise-equal; and equality testing " - "was requested: %s=%g=%a vs %s=%g=%a", - StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, lhs_double, - StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double); + "was requested: %s=%g=%a vs %s=%g=%a at array index %s", + StrCat(absl::Hex(ulhs)), lhs_double, lhs_double, + StrCat(absl::Hex(urhs)), rhs_double, rhs_double, + LiteralUtil::MultiIndexAsString(multi_index)); } return Status::OK(); } @@ -56,39 +59,47 @@ Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) { // bitwise helper above (this is the un-specialized fallback, to just use the // default gunit implementation). template -Status CompareEqual(NativeT lhs, NativeT rhs) { +Status CompareEqual(NativeT lhs, NativeT rhs, + absl::Span multi_index) { if (lhs == rhs) { return Status::OK(); } - return InvalidArgument("Expected equality of these values:\n %s\n %s", - StrCat(lhs).c_str(), StrCat(rhs).c_str()); + return InvalidArgument( + "first mismatch at array index %s:\n expected value: %s\n actual " + "value: %s", + LiteralUtil::MultiIndexAsString(multi_index), StrCat(lhs), StrCat(rhs)); } // Specializations for floating types that do bitwise comparisons when equality // comparison is requested. 
template <> -Status CompareEqual(bfloat16 lhs, bfloat16 rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); +Status CompareEqual(bfloat16 lhs, bfloat16 rhs, + absl::Span multi_index) { + return CompareFloatsBitwiseEqual(lhs, rhs, multi_index); } template <> -Status CompareEqual(Eigen::half lhs, Eigen::half rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); +Status CompareEqual(Eigen::half lhs, Eigen::half rhs, + absl::Span multi_index) { + return CompareFloatsBitwiseEqual(lhs, rhs, multi_index); } template <> -Status CompareEqual(float lhs, float rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); +Status CompareEqual(float lhs, float rhs, + absl::Span multi_index) { + return CompareFloatsBitwiseEqual(lhs, rhs, multi_index); } template <> -Status CompareEqual(double lhs, double rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); +Status CompareEqual(double lhs, double rhs, + absl::Span multi_index) { + return CompareFloatsBitwiseEqual(lhs, rhs, multi_index); } template <> -Status CompareEqual(complex64 lhs, complex64 rhs) { - auto res = CompareEqual(lhs.real(), rhs.real()); +Status CompareEqual(complex64 lhs, complex64 rhs, + absl::Span multi_index) { + auto res = CompareEqual(lhs.real(), rhs.real(), multi_index); if (!res.ok()) { return res; } - return CompareEqual(lhs.imag(), rhs.imag()); + return CompareEqual(lhs.imag(), rhs.imag(), multi_index); } // A recursive function which iterates through every index of expected and @@ -96,18 +107,18 @@ Status CompareEqual(complex64 lhs, complex64 rhs) { // elements are equal. 
template Status Equal(LiteralSlice expected, LiteralSlice actual, - tensorflow::gtl::MutableArraySlice multi_index, - int64 dimension) { + absl::Span multi_index, int64 dimension) { if (dimension == expected.shape().dimensions_size()) { NativeT expected_value = expected.Get(multi_index); NativeT actual_value = actual.Get(multi_index); - return CompareEqual(expected_value, actual_value); + return CompareEqual(expected_value, actual_value, multi_index); } Status result; for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { multi_index[dimension] = i; - result.Update(Equal(expected, actual, multi_index, dimension + 1)); + TF_RETURN_IF_ERROR( + Equal(expected, actual, multi_index, dimension + 1)); } return result; } @@ -151,15 +162,26 @@ bool NanMismatch(half expected, half actual, bool relaxed_nans) { static_cast(actual), relaxed_nans); } +// Returns whether the given value is infinity. +template +bool IsInf(NativeT val) { + return std::isinf(val); +} + +template <> +bool IsInf(half val) { + return std::isinf(static_cast(val)); +} + // Converts the given floating-point value to a string. template string FpValueToString(NativeT value) { - return Printf("%8.4g", static_cast(value)); + return absl::StrFormat("%8.4g", static_cast(value)); } template <> string FpValueToString(complex64 value) { - return Printf("%8.4g + %8.4fi", value.real(), value.imag()); + return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag()); } // Returns the absolute value of the given floating point value. 
This function @@ -214,13 +236,12 @@ class NearComparator { } string ToString(const Shape& shape) const { - return Printf( + return absl::StrFormat( "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", - FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), - Literal::MultiIndexAsString( + FpValueToString(actual), FpValueToString(expected), + LiteralUtil::MultiIndexAsString( IndexUtil::LinearIndexToMultidimensionalIndex(shape, - linear_index)) - .c_str(), + linear_index)), rel_error, abs_error); } }; @@ -239,17 +260,12 @@ class NearComparator { // Runs the comparison between expected and actual literals. Status Run() { - VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, ToStringTruncated(expected_)); - VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, ToStringTruncated(actual_)); - // If the shapes mismatch, we simply fail the expectation instead of // printing out data, as it's a type error rather than a value error. TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape())); if (!ShapeUtil::IsArray(expected_.shape())) { return InvalidArgument("Expected array shape; got %s.", - ShapeUtil::HumanString(expected_.shape()).c_str()); + ShapeUtil::HumanString(expected_.shape())); } mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); @@ -262,7 +278,7 @@ class NearComparator { } else if (!VLOG_IS_ON(1) && miscompare_callback_ != nullptr) { miscompare_callback_(expected_, actual_, mismatches_); } - return InvalidArgument("%s", ErrorMessage().c_str()); + return InvalidArgument("%s", ErrorMessage()); } // Insert the given absolute value into the absolute value bucket vector. The @@ -287,8 +303,7 @@ class NearComparator { } // Insert the given error into the given error bucket vector. 
- void UpdateErrorBucket( - float error, tensorflow::gtl::MutableArraySlice error_buckets) { + void UpdateErrorBucket(float error, absl::Span error_buckets) { CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size()); for (int i = 0; i < error_buckets.size(); ++i) { if (error >= kErrorBucketBounds[i]) { @@ -299,12 +314,13 @@ class NearComparator { // Compares the two given elements from the expected and actual literals at // the given literal_index and keeps track of various mismatch statistics. - void CompareValues(NativeT expected, NativeT actual, int64 linear_index) { + template + void CompareValues(T expected, T actual, int64 linear_index) { const bool is_nan_mismatch = NanMismatch(expected, actual, error_.relaxed_nans); float abs_error; float rel_error; - if (actual == expected) { + if (CompareEqual(expected, actual, {linear_index}).ok()) { abs_error = 0; rel_error = 0; } else if (is_nan_mismatch) { @@ -315,6 +331,12 @@ class NearComparator { // weak ordering requirement of std containers. abs_error = std::numeric_limits::infinity(); rel_error = std::numeric_limits::infinity(); + } else if (IsInf(expected) || IsInf(actual)) { + // If either the expected or actual value is infinity but not both, + // then both absolute and relative error are regarded as inifity. + CHECK(!CompareEqual(expected, actual, {linear_index}).ok()); + abs_error = std::numeric_limits::infinity(); + rel_error = std::numeric_limits::infinity(); } else { abs_error = FpAbsoluteValue(actual - expected); rel_error = abs_error / FpAbsoluteValue(expected); @@ -328,11 +350,11 @@ class NearComparator { // bound is exceeded and vice versa. 
if (is_abs_mismatch) { num_abs_mismatches_++; - UpdateErrorBucket(rel_error, &rel_error_buckets_); + UpdateErrorBucket(rel_error, absl::MakeSpan(rel_error_buckets_)); } if (is_rel_mismatch) { num_rel_mismatches_++; - UpdateErrorBucket(abs_error, &abs_error_buckets_); + UpdateErrorBucket(abs_error, absl::MakeSpan(abs_error_buckets_)); } UpdateAbsValueBucket(actual, is_mismatch); @@ -357,15 +379,36 @@ class NearComparator { mismatches_.data()[linear_index] = true; } + // For complex64 types, we compare real and imaginary parts individually. + void CompareValues(complex64 expected, complex64 actual, int64 linear_index) { + bool mismatch = false; + CompareValues(expected.real(), actual.real(), linear_index); + if (mismatches_.data()[linear_index] == true) { + mismatch = true; + // Delay the mismatch count increase for real part, instead increase + // mismatch by 1 for the entire complex number. + num_mismatches_--; + } + CompareValues(expected.imag(), actual.imag(), linear_index); + if (mismatches_.data()[linear_index] == true) { + mismatch = true; + // Delay the mismatch count increase for imag part, instead increase + // mismatch by 1 for the entire complex number. + num_mismatches_--; + } + if (mismatch == true) { + num_mismatches_++; + } + mismatches_.data()[linear_index] = mismatch; + } + // Compares the two literals elementwise. void CompareLiterals() { // Fast path optimization for the case were layouts match. if (LayoutUtil::Equal(actual_.shape().layout(), expected_.shape().layout())) { - tensorflow::gtl::ArraySlice expected_data = - expected_.data(); - tensorflow::gtl::ArraySlice actual_data = - actual_.data(); + absl::Span expected_data = expected_.data(); + absl::Span actual_data = actual_.data(); const int64 len = expected_data.size(); for (int64 i = 0; i < len; ++i) { CompareValues(expected_data[i], actual_data[i], i); @@ -401,23 +444,23 @@ class NearComparator { auto percent_string = [](float a, float b) { float pct = b == 0.0 ? 
0.0 : 100.0 * a / b; - return Printf("%0.4f%%", pct); + return absl::StrFormat("%0.4f%%", pct); }; - Appendf(&out, - "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " - "%g, rel bound %g\n", - num_mismatches_, - percent_string(num_mismatches_, element_count).c_str(), - ShapeUtil::HumanString(actual_.shape()).c_str(), - ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); + StrAppendFormat( + &out, + "\nMismatch count %d (%s) in shape %s (%d elements), abs bound " + "%g, rel bound %g\n", + num_mismatches_, percent_string(num_mismatches_, element_count), + ShapeUtil::HumanString(actual_.shape()), + ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); if (num_nan_mismatches_ > 0) { StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); } - Appendf(&out, "Top relative error mismatches:\n"); + StrAppendFormat(&out, "Top relative error mismatches:\n"); for (auto it = top_rel_mismatches_.rbegin(); it != top_rel_mismatches_.rend(); ++it) { - StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); + StrAppend(&out, " ", it->ToString(actual_.shape()), "\n"); } if (!detailed_message_) { @@ -429,36 +472,37 @@ class NearComparator { for (int i = 0; i < abs_value_buckets_.size(); ++i) { const int64 bucket_size = abs_value_buckets_[i].first; const int64 bucket_mismatches = abs_value_buckets_[i].second; - string mismatch_str = bucket_mismatches > 0 - ? Printf(", mismatches %lld", bucket_mismatches) - : ""; - Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", - kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], - bucket_size, percent_string(bucket_size, element_count).c_str(), - mismatch_str.c_str()); + string mismatch_str = + bucket_mismatches > 0 + ? 
absl::StrFormat(", mismatches %d", bucket_mismatches) + : ""; + StrAppendFormat(&out, " %-6g <= x < %-6g : %7d (%9s)%s\n", + kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], + bucket_size, percent_string(bucket_size, element_count), + mismatch_str); } auto print_accum_buckets = [&](const string& header, int64 total, - tensorflow::gtl::ArraySlice buckets) { + absl::Span buckets) { StrAppend(&out, header, ":\n"); - Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], - total - buckets[0], - percent_string(total - buckets[0], total).c_str()); + StrAppendFormat(&out, " < %-6g : %7d (%s)\n", kErrorBucketBounds[0], + total - buckets[0], + percent_string(total - buckets[0], total)); CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); for (int i = 0; i < kErrorBucketBounds.size(); ++i) { - Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], - buckets[i], percent_string(buckets[i], total).c_str()); + StrAppendFormat(&out, " >= %-6g : %7d (%s)\n", kErrorBucketBounds[i], + buckets[i], percent_string(buckets[i], total)); } }; - Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", - error_.abs, num_abs_mismatches_, - percent_string(num_abs_mismatches_, element_count).c_str()); + StrAppendFormat(&out, "Elements exceeding abs error bound %g: %d (%s)\n", + error_.abs, num_abs_mismatches_, + percent_string(num_abs_mismatches_, element_count)); print_accum_buckets( "Relative error breakdown of elements exceeding abs error bound", num_abs_mismatches_, rel_error_buckets_); - Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", - error_.rel, num_rel_mismatches_, - percent_string(num_rel_mismatches_, element_count).c_str()); + StrAppendFormat(&out, "Elements exceeding rel error bound %g: %d (%s)\n", + error_.rel, num_rel_mismatches_, + percent_string(num_rel_mismatches_, element_count)); print_accum_buckets( "Absolute error breakdown of elements exceeding rel error bound", num_rel_mismatches_, abs_error_buckets_); @@ -527,6 
+571,63 @@ constexpr std::array NearComparator::kAbsValueBucketBounds; template constexpr std::array NearComparator::kErrorBucketBounds; +Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) { + TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape())); + std::vector multi_index(expected.shape().dimensions_size(), 0); + auto index = absl::MakeSpan(multi_index); + Status result; + switch (expected.shape().element_type()) { + case PRED: + result = Equal(expected, actual, index, 0); + break; + case U8: + result = Equal(expected, actual, index, 0); + break; + case S32: + result = Equal(expected, actual, index, 0); + break; + case S64: + result = Equal(expected, actual, index, 0); + break; + case U32: + result = Equal(expected, actual, index, 0); + break; + case U64: + result = Equal(expected, actual, index, 0); + break; + case BF16: + result = Equal(expected, actual, index, 0); + break; + case F16: + result = Equal(expected, actual, index, 0); + break; + case F32: + result = Equal(expected, actual, index, 0); + break; + case F64: + result = Equal(expected, actual, index, 0); + break; + case C64: + result = Equal(expected, actual, index, 0); + break; + case TUPLE: { + for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { + result.Update(EqualHelper(LiteralSlice(expected, {i}), + LiteralSlice(actual, {i}))); + } + break; + } + case TOKEN: + // Tokens have no on-device representation and are trivially equal. + return Status::OK(); + default: + LOG(FATAL) << "Unsupported primitive type: " + << PrimitiveType_Name(expected.shape().element_type()); + } + + return result; +} + // Helper function for comparing two literals for nearness. Handles tuple-shapes // via recursion. shape_index is the ShapeIndex of expected (or actual) // currently being compared. 
@@ -543,17 +644,18 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, const auto actual_element = LiteralSlice(actual, {i}); ShapeIndex element_index = shape_index; element_index.push_back(i); - Status res = + Status element_result = NearHelper(expected_element, actual_element, error, detailed_message, miscompare_callback, element_index); - if (!res.ok()) { - string err_message = Printf("\nArray at shape index %s%s", - element_index.ToString().c_str(), - res.error_message().c_str()); + if (!element_result.ok()) { + element_result = InvalidArgument("Array at shape index %s, %s", + element_index.ToString(), + element_result.error_message()); if (return_status.ok()) { - return_status = res; + return_status = element_result; } else { - return_status = AppendStatus(return_status, res.error_message()); + return_status = + AppendStatus(return_status, element_result.error_message()); } } } @@ -561,10 +663,10 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, // Emit a top-level error message containing the top-level shape in case // of mismatch. int64 total_elements = RecursiveElementCount(actual.shape()); - return_status = InvalidArgument( - "\nMismatches in shape %s (%lld elements):\n%s", - ShapeUtil::HumanString(actual.shape()).c_str(), total_elements, - return_status.error_message().c_str()); + return_status = + InvalidArgument("\nMismatches in shape %s (%d elements):\n%s", + ShapeUtil::HumanString(actual.shape()), + total_elements, return_status.error_message()); } return return_status; } @@ -599,23 +701,23 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, } } - // Non-floating point literal. - return literal_comparison::Equal(expected, actual); + // Non-floating point, non-tuple literal. 
+ return EqualHelper(expected, actual); } } // namespace Status EqualShapes(const Shape& expected, const Shape& actual) { - if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) { - return InvalidArgument("tupleness-mismatch! want: %s got %s", - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + if (expected.element_type() != actual.element_type()) { + return InvalidArgument("element type mismatch, want: %s got %s", + ShapeUtil::HumanString(expected), + ShapeUtil::HumanString(actual)); } if (ShapeUtil::IsTuple(expected)) { if (ShapeUtil::TupleElementCount(expected) != ShapeUtil::TupleElementCount(actual)) { return InvalidArgument( - "want tuple element count: %lld got tuple element count: %lld", + "want tuple element count: %d got tuple element count: %d", ShapeUtil::TupleElementCount(expected), ShapeUtil::TupleElementCount(actual)); } @@ -626,17 +728,16 @@ Status EqualShapes(const Shape& expected, const Shape& actual) { return AppendStatus(result, StrCat("mismatch in tuple index", i)); } } - } else { + } else if (ShapeUtil::IsArray(expected)) { if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) { return InvalidArgument("want rank of %s got rank of %s", - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + ShapeUtil::HumanString(expected), + ShapeUtil::HumanString(actual)); } if (expected.element_type() != actual.element_type()) { - return InvalidArgument( - "mismatch in primitive type %s vs %s", - PrimitiveType_Name(expected.element_type()).c_str(), - PrimitiveType_Name(actual.element_type()).c_str()); + return InvalidArgument("mismatch in primitive type %s vs %s", + PrimitiveType_Name(expected.element_type()), + PrimitiveType_Name(actual.element_type())); } if (expected.dimensions_size() != actual.dimensions_size()) { return InvalidArgument("want dimensions_size %d got dimensions_size %d", @@ -647,88 +748,51 @@ Status EqualShapes(const Shape& expected, const Shape& actual) { 
if (expected.dimensions(i) != actual.dimensions(i)) { return InvalidArgument( "mismatch in dimension #%d expected: %s actual: %s", i, - ShapeUtil::HumanString(expected).c_str(), - ShapeUtil::HumanString(actual).c_str()); + ShapeUtil::HumanString(expected), ShapeUtil::HumanString(actual)); } } } + // Non-array, non-tuple shapes are trivially equivalent. return Status::OK(); } +namespace { + +// If result is an error, extend the error message with the expected and actual +// literals. +Status EmitLiteralsInErrorMessage(const Status& result, + const LiteralSlice& expected, + const LiteralSlice& actual) { + if (result.ok()) { + return result; + } + return InvalidArgument("%s\n\nExpected literal:\n%s\n\nActual literal:\n%s", + result.error_message(), ToStringTruncated(expected), + ToStringTruncated(actual)); +} + +} // namespace + Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) { VLOG(1) << "expected:"; XLA_VLOG_LINES(1, expected.ToString()); VLOG(1) << "actual:"; XLA_VLOG_LINES(1, actual.ToString()); - - TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape())); - std::vector multi_index(expected.shape().dimensions_size(), 0); - Status result; - switch (expected.shape().element_type()) { - case PRED: - result = Equal(expected, actual, &multi_index, 0); - break; - case U8: - result = Equal(expected, actual, &multi_index, 0); - break; - case S32: - result = Equal(expected, actual, &multi_index, 0); - break; - case S64: - result = Equal(expected, actual, &multi_index, 0); - break; - case U32: - result = Equal(expected, actual, &multi_index, 0); - break; - case U64: - result = Equal(expected, actual, &multi_index, 0); - break; - case BF16: - result = Equal(expected, actual, &multi_index, 0); - break; - case F16: - result = Equal(expected, actual, &multi_index, 0); - break; - case F32: - result = Equal(expected, actual, &multi_index, 0); - break; - case F64: - result = Equal(expected, actual, &multi_index, 0); - break; - case C64: - result = 
Equal(expected, actual, &multi_index, 0); - break; - case TUPLE: { - for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - result.Update( - Equal(LiteralSlice(expected, {i}), LiteralSlice(actual, {i}))); - } - break; - } - default: - LOG(FATAL) - << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: " - << PrimitiveType_Name(expected.shape().element_type()); - } - - if (result.ok()) { - return Status::OK(); - } - - return AppendStatus(result, - tensorflow::strings::Printf( - "\nat index: %s\nexpected: %s\nactual: %s", - Literal::MultiIndexAsString(multi_index).c_str(), - ToStringTruncated(expected).c_str(), - ToStringTruncated(actual).c_str())); + Status result = EqualHelper(expected, actual); + return EmitLiteralsInErrorMessage(result, expected, actual); } Status Near(const LiteralSlice& expected, const LiteralSlice& actual, const ErrorSpec& error, bool detailed_message, const MiscompareCallback& miscompare_callback) { - return NearHelper(expected, actual, error, detailed_message, - miscompare_callback, - /*shape_index=*/{}); + VLOG(1) << "Expected literal:"; + XLA_VLOG_LINES(1, expected.ToString()); + VLOG(1) << "Actual literal:"; + XLA_VLOG_LINES(1, actual.ToString()); + Status result = + NearHelper(expected, actual, error, detailed_message, miscompare_callback, + /*shape_index=*/{}); + return EmitLiteralsInErrorMessage(result, expected, actual); } string ToStringTruncated(const LiteralSlice& literal) { diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h index 00a13e361932e74a9a1e614d5c851d3851208852..9e5bf7c1d062ef0f25d07a80d6ded8106df5dacc 100644 --- a/tensorflow/compiler/xla/literal_comparison.h +++ b/tensorflow/compiler/xla/literal_comparison.h @@ -20,7 +20,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_ #include "tensorflow/compiler/xla/error_spec.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/core/lib/core/status.h" namespace xla { diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a64594db86af31dcc196725d4b4f2a3ad9e4746 --- /dev/null +++ b/tensorflow/compiler/xla/literal_test.cc @@ -0,0 +1,1873 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/literal.h" + +#include + +#include "absl/memory/memory.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/xla/array3d.h" +#include "tensorflow/compiler/xla/array4d.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/casts.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ::testing::ElementsAre; +using ::testing::HasSubstr; + +class LiteralUtilTest : public ::testing::Test { + protected: + LiteralUtilTest() { + Array4D arr4d({ + // clang-format off + { // i0=0 + { // i1=0 + {1, 2, 3}, // i2=0 + {4, 5, 6}, // i2=1 + {7, 8, 9}, // i2=2 + }, + { // i1=1 + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + }, + { // i0=1 + { // i1=0 + {101, 102, 103}, + {104, 105, 106}, + {107, 108, 109}, + }, + { // i1=1 + {201, 202, 203}, // i2=0 + {204, 205, 206}, // i2=1 + {207, 208, 209}, // i2=2 + }, + }, + // clang-format on + }); + + layout_r2_dim0major_ = LayoutUtil::MakeLayout({1, 0}); + layout_r2_dim0minor_ = LayoutUtil::MakeLayout({0, 1}); + layout_r3_dim0major_ = LayoutUtil::MakeLayout({2, 1, 0}); + layout_r3_dim0minor_ = LayoutUtil::MakeLayout({0, 1, 2}); + layout_r4_dim0major_ = LayoutUtil::MakeLayout({3, 2, 1, 0}); + layout_r4_dim0minor_ = LayoutUtil::MakeLayout({0, 1, 2, 3}); + + literal_r4_2x2x3x3_dim0major_ = + LiteralUtil::CreateR4FromArray4DWithLayout(arr4d, + layout_r4_dim0major_); + literal_r4_2x2x3x3_dim0minor_ = + LiteralUtil::CreateR4FromArray4DWithLayout(arr4d, + 
layout_r4_dim0minor_); + } + + Layout layout_r2_dim0major_; + Layout layout_r2_dim0minor_; + Layout layout_r3_dim0major_; + Layout layout_r3_dim0minor_; + Layout layout_r4_dim0major_; + Layout layout_r4_dim0minor_; + std::unique_ptr literal_r4_2x2x3x3_dim0major_; + std::unique_ptr literal_r4_2x2x3x3_dim0minor_; +}; + +TEST_F(LiteralUtilTest, LiteralScalarToString) { + auto true_lit = LiteralUtil::CreateR0(true); + EXPECT_EQ("true", true_lit->ToString()); + + auto false_lit = LiteralUtil::CreateR0(false); + EXPECT_EQ("false", false_lit->ToString()); + + auto u32_lit = LiteralUtil::CreateR0(42); + EXPECT_EQ("42", u32_lit->ToString()); + + auto s32_lit = LiteralUtil::CreateR0(-999); + EXPECT_EQ("-999", s32_lit->ToString()); + + auto f32_lit = LiteralUtil::CreateR0(3.14f); + EXPECT_EQ("3.14", f32_lit->ToString()); + + auto f16_lit = LiteralUtil::CreateR0(static_cast(0.5f)); + EXPECT_EQ("0.5", f16_lit->ToString()); + + auto c64_lit = LiteralUtil::CreateR0({3.14f, 2.78f}); + EXPECT_EQ("(3.14, 2.78)", c64_lit->ToString()); + + auto bf16_lit = LiteralUtil::CreateR0(static_cast(0.5f)); + EXPECT_EQ("0.5", bf16_lit->ToString()); + + // 3.14 will be rounded to 3.14062 in bfloat16 format. 
+ auto bf16_lit_truncated = + LiteralUtil::CreateR0(static_cast(3.14f)); + ASSERT_EQ("3.14062", bf16_lit_truncated->ToString()); + + auto bf16_lit_truncated2 = + LiteralUtil::CreateR0(static_cast(9.001f)); + EXPECT_EQ("9", bf16_lit_truncated2->ToString()); +} + +TEST_F(LiteralUtilTest, LiteralVectorToString) { + auto pred_vec = LiteralUtil::CreateR1({true, false, true}); + EXPECT_EQ("{101}", pred_vec->ToString()); +} + +TEST_F(LiteralUtilTest, R2ToString) { + const auto literal = LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}); + const string expected = R"(s32[3,2] { + { 1, 2 }, + { 3, 4 }, + { 5, 6 } +})"; + EXPECT_EQ(expected, literal->ToString()); +} + +TEST_F(LiteralUtilTest, R3ToString) { + const auto literal = + LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}}); + const string expected = R"(s32[3,2,1] { +{ { 1 }, + { 2 } }, +{ { 3 }, + { 4 } }, +{ { 5 }, + { 6 } } +})"; + EXPECT_EQ(expected, literal->ToString()); +} + +TEST_F(LiteralUtilTest, TupleToString) { + auto scalar = LiteralUtil::CreateR0(1.0); + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + const string expected = R"((f32[], f32[2,2]) ( +1, +f32[2,2] { + { 1, 2 }, + { 3, 4 } +} +))"; + EXPECT_EQ(expected, tuple->ToString()); +} + +TEST_F(LiteralUtilTest, CreateR3FromArray3d) { + // clang-format off + Array3D array_3d({ + {{1.0f, 2.0f}, + {3.0f, 4.0f}, + {5.0f, 6.0f}}, + {{7.0f, 8.0f}, + {9.0f, 10.0f}, + {11.0f, 12.0f}}, + }); + // clang-format on + + auto literal = LiteralUtil::CreateR3FromArray3D(array_3d); + EXPECT_THAT(literal->shape().dimensions(), ElementsAre(2, 3, 2)); + string result = literal->ToString(); + const string expected = R"(f32[2,3,2] { +{ { 1, 2 }, + { 3, 4 }, + { 5, 6 } }, +{ { 7, 8 }, + { 9, 10 }, + { 11, 12 } } +})"; + EXPECT_EQ(expected, result); +} + +TEST_F(LiteralUtilTest, CreateSparse) { + std::vector dimensions = {8, 8, 8}; + Array2D indices = { + {3, 4, 5}, + {1, 2, 3}, + {2, 
3, 4}, + {3, 5, 6}, + }; + std::vector values = {7, 8, 9, 10}; + auto literal = LiteralUtil::CreateSparse( + dimensions, SparseIndexArray(indices.n1() + 3, indices), values); + + Array2D expected_indices = { + {1, 2, 3}, + {2, 3, 4}, + {3, 4, 5}, + {3, 5, 6}, + }; + std::vector expected_values = {8, 9, 7, 10}; + + EXPECT_EQ(literal->sparse_indices()->data(), + absl::Span(expected_indices.data(), + expected_indices.num_elements())); + EXPECT_EQ(literal->data(), absl::Span(expected_values)); +} + +TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { + // clang-format off + auto literal = LiteralUtil::CreateR4Projected({ + {1, 2}, + {1001, 1002}, + {2001, 2002}, + }, /*projection_p=*/1, /*projection_z=*/2); + // clang-format on + EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2)); + string result = literal->ToString(); + const string expected = R"(f32[1,2,3,2] { + { /*i0=0*/ + { /*i1=0*/ + {1, 2}, + {1001, 1002}, + {2001, 2002} + }, + { /*i1=1*/ + {1, 2}, + {1001, 1002}, + {2001, 2002} + } + } +})"; + EXPECT_EQ(expected, result); +} + +TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { + EXPECT_THAT(literal_r4_2x2x3x3_dim0major_->shape().dimensions(), + ElementsAre(2, 2, 3, 3)); + string result = literal_r4_2x2x3x3_dim0major_->ToString(); + const string expected = R"(f32[2,2,3,3] { + { /*i0=0*/ + { /*i1=0*/ + {1, 2, 3}, + {4, 5, 6}, + {7, 8, 9} + }, + { /*i1=1*/ + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19} + } + }, + { /*i0=1*/ + { /*i1=0*/ + {101, 102, 103}, + {104, 105, 106}, + {107, 108, 109} + }, + { /*i1=1*/ + {201, 202, 203}, + {204, 205, 206}, + {207, 208, 209} + } + } +})"; + EXPECT_EQ(expected, result); +} + +TEST_F(LiteralUtilTest, EachCellR2F32) { + // clang-format off + auto literal = LiteralUtil::CreateR2({ + {3.1f, 4.2f}, + {9.3f, 12.4f}, + }); + // clang-format on + std::vector> seen; + literal->EachCellAsString( + [&seen](absl::Span indices, const string& value) { + seen.emplace_back(indices[0], indices[1], value); + }); + + 
using Elem = std::tuple; + std::vector expected = {Elem(0, 0, "3.1"), Elem(0, 1, "4.2"), + Elem(1, 0, "9.3"), Elem(1, 1, "12.4")}; + EXPECT_EQ(expected, seen); +} + +TEST_F(LiteralUtilTest, ScalarEquality) { + // Test equality with scalars. + auto f32_42 = LiteralUtil::CreateR0(42.0); + auto f32_42_clone = LiteralUtil::CreateR0(42.0); + + EXPECT_EQ(*f32_42, *f32_42); + EXPECT_EQ(*f32_42, *f32_42_clone); + + auto f32_123 = LiteralUtil::CreateR0(123.0); + EXPECT_NE(*f32_42, *f32_123); + + auto f64_42 = LiteralUtil::CreateR0(42.0); + EXPECT_NE(*f32_42, *f64_42); +} + +TEST_F(LiteralUtilTest, NonScalarEquality) { + // Test equality with nonscalars. + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto matrix_clone = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto matrix_different = + LiteralUtil::CreateR2({{4.0, 3.0}, {1.0, 2.0}}); + auto vector_literal = LiteralUtil::CreateR1({1.0, 2.0, 3.0, 4.0}); + auto scalar = LiteralUtil::CreateR0(1.0); + Literal nil(ShapeUtil::MakeNil()); + + EXPECT_EQ(*matrix, *matrix); + EXPECT_EQ(*matrix, *matrix_clone); + EXPECT_NE(*matrix, *matrix_different); + EXPECT_NE(*matrix, *vector_literal); + EXPECT_NE(*matrix, *scalar); + EXPECT_NE(*matrix, nil); + EXPECT_EQ(nil, nil); +} + +TEST_F(LiteralUtilTest, TokenEquality) { + auto token0 = LiteralUtil::CreateToken(); + auto token1 = LiteralUtil::CreateToken(); + auto scalar = LiteralUtil::CreateR0(1.0); + + EXPECT_EQ(*token0, *token1); + EXPECT_NE(*token0, *scalar); + + EXPECT_EQ(*LiteralUtil::MakeTuple({token0.get()}), + *LiteralUtil::MakeTuple({token0.get()})); + EXPECT_EQ(*LiteralUtil::MakeTuple({token0.get(), scalar.get()}), + *LiteralUtil::MakeTuple({token1.get(), scalar.get()})); + EXPECT_NE(*LiteralUtil::MakeTuple({token0.get(), scalar.get()}), + *LiteralUtil::MakeTuple({scalar.get(), token1.get()})); +} + +TEST_F(LiteralUtilTest, DifferentLayoutEquality) { + // Test equality with literals which have different layouts. 
+ auto colmajor = absl::make_unique( + ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})); + colmajor->Set({0, 0}, 1.0); + colmajor->Set({0, 1}, 2.0); + colmajor->Set({1, 0}, 3.0); + colmajor->Set({1, 1}, 4.0); + + auto rowmajor = absl::make_unique( + ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})); + rowmajor->Set({0, 0}, 1.0); + rowmajor->Set({0, 1}, 2.0); + rowmajor->Set({1, 0}, 3.0); + rowmajor->Set({1, 1}, 4.0); + + EXPECT_EQ(*rowmajor, *colmajor); +} + +TEST_F(LiteralUtilTest, TupleEquality) { + // Test equality with tuples. + auto scalar = LiteralUtil::CreateR0(1.0); + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto tuple1 = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + + // Tuple with the same elements. One element is shared with the original + // tuple, the other is a clone of the element in the original tuple. + auto scalar_clone = LiteralUtil::CreateR0(1.0); + auto tuple2 = LiteralUtil::MakeTuple({scalar_clone.get(), matrix.get()}); + EXPECT_EQ(*tuple1, *tuple2); + + // Tuple with elements reversed. + auto reversed_tuple = LiteralUtil::MakeTuple({matrix.get(), scalar.get()}); + EXPECT_NE(*tuple1, *reversed_tuple); + + // Tuple with different value. + auto scalar_42 = LiteralUtil::CreateR0(42.0); + auto different_tuple = + LiteralUtil::MakeTuple({scalar_42.get(), matrix.get()}); + EXPECT_NE(*tuple1, *different_tuple); +} + +TEST_F(LiteralUtilTest, C64Equality) { + // Test equality with tuples. + auto vector = LiteralUtil::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); + + // Tuple with the same elements. One element is shared with the original + // tuple, the other is a clone of the element in the original tuple. 
+ auto vector_clone = + LiteralUtil::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); + EXPECT_EQ(*vector, *vector_clone); + + auto vector_reversed = + LiteralUtil::CreateR1({{3.0, 4.0}, {1.0, 2.0}}); + EXPECT_NE(*vector, *vector_reversed); +} + +TEST_F(LiteralUtilTest, IsAllTuple) { + auto element1 = LiteralUtil::CreateR0(0.0); + auto element2 = LiteralUtil::CreateR2({{0.0, 0.0}, {0.0, 0.0}}); + auto tuple = LiteralUtil::MakeTuple({element1.get(), element1.get()}); + + // Tuples should always return false for IsAll. + EXPECT_FALSE(tuple->IsAll(0)); + EXPECT_FALSE(tuple->IsAll(1)); +} + +// Verifies that CreateFromShape works for tuples. +TEST_F(LiteralUtilTest, CreateFromShapeTuple) { + auto scalar = LiteralUtil::CreateR0(0.0); + auto matrix = LiteralUtil::CreateR2({{0, 0}, {0, 0}}); + auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + + auto x = Literal::CreateFromShape(tuple->shape()); + EXPECT_EQ(*tuple, *x); +} + +TEST_F(LiteralUtilTest, IsAll) { + EXPECT_TRUE(LiteralUtil::CreateR0(false)->IsAll(0)); + EXPECT_TRUE(LiteralUtil::CreateR0(true)->IsAll(1)); + EXPECT_FALSE(LiteralUtil::CreateR0(false)->IsAll(1)); + EXPECT_FALSE(LiteralUtil::CreateR0(false)->IsAll(2)); + EXPECT_FALSE(LiteralUtil::CreateR0(true)->IsAll(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(true)->IsAll(2)); + EXPECT_FALSE(LiteralUtil::CreateR0(true)->IsAll(-1)); + + // We shouldn't reinterpret int8_min as an unsigned type and then decide that + // it is equal to 255. 
+ auto int8_min = std::numeric_limits::min(); + EXPECT_FALSE(LiteralUtil::CreateR0(255)->IsAll(int8_min)); + + EXPECT_TRUE(LiteralUtil::CreateR0(42.0)->IsAll(42)); + EXPECT_FALSE(LiteralUtil::CreateR0(42.0001)->IsAll(42)); + + EXPECT_TRUE(LiteralUtil::CreateR1({100, 100, 100})->IsAll(100)); + EXPECT_FALSE(LiteralUtil::CreateR1({100, 100, 100.001})->IsAll(100)); + + EXPECT_TRUE(LiteralUtil::CreateR2({{8, 8}, {8, 8}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{8, 8}, {8, 9}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{9, 8}, {8, 8}})->IsAll(8)); + + half h8(8.0f); + half h9(9.0f); + EXPECT_TRUE(LiteralUtil::CreateR2({{h8}, {h8}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{h8}, {h9}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{h9}, {h8}})->IsAll(8)); + + bfloat16 b8(8.0f); + bfloat16 b9(9.0f); + + EXPECT_TRUE(LiteralUtil::CreateR2({{b8}, {b8}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{b8}, {b9}})->IsAll(8)); + EXPECT_FALSE(LiteralUtil::CreateR2({{b9}, {b8}})->IsAll(8)); + + // 9.001 will be truncated to 9.0 + bfloat16 b91(9.001f); + bfloat16 b90(9.00f); + EXPECT_TRUE(LiteralUtil::CreateR2({{b91}, {b90}})->IsAll(9.0)); + + complex64 c8_9 = {8, 9}; + EXPECT_FALSE(LiteralUtil::CreateR2({{c8_9}, {c8_9}})->IsAll(8)); + + auto uint64_max = std::numeric_limits::max(); + EXPECT_FALSE(LiteralUtil::CreateR2( + {{uint64_max, uint64_max}, {uint64_max, uint64_max}}) + ->IsAll(-1)); +} + +TEST_F(LiteralUtilTest, IsAllFloat) { + // IsAllFloat always returns false when the literal is not floating-point. 
+ EXPECT_FALSE(LiteralUtil::CreateR0(false)->IsAllFloat(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllFloat(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllFloat(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllFloat(0)); + + EXPECT_TRUE(LiteralUtil::CreateR0(0)->IsAllFloat(0)); + EXPECT_TRUE(LiteralUtil::CreateR0(.5)->IsAllFloat(.5)); + EXPECT_TRUE(LiteralUtil::CreateR0(-.5)->IsAllFloat(-.5)); + EXPECT_FALSE(LiteralUtil::CreateR0(-.5)->IsAllFloat(-.49)); + EXPECT_FALSE( + LiteralUtil::CreateR2({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0)); + EXPECT_TRUE(LiteralUtil::CreateR2({{.5, .5, .5}, {.5, .5, .5}}) + ->IsAllFloat(.5)); + + EXPECT_TRUE(LiteralUtil::CreateR0(0)->IsAllFloat(0)); + EXPECT_TRUE(LiteralUtil::CreateR0(.5)->IsAllFloat(.5)); + EXPECT_TRUE(LiteralUtil::CreateR0(-.5)->IsAllFloat(-.5)); + EXPECT_FALSE(LiteralUtil::CreateR0(-.5)->IsAllFloat(-.49)); + EXPECT_FALSE( + LiteralUtil::CreateR2({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0)); +} + +TEST_F(LiteralUtilTest, IsAllComplex) { + // IsAllComplex always returns false when the literal is not complex. + EXPECT_FALSE(LiteralUtil::CreateR0(false)->IsAllComplex(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllComplex(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllComplex(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllComplex(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllComplex(0)); + EXPECT_FALSE(LiteralUtil::CreateR0(0)->IsAllComplex(0)); + + complex64 c8_9 = {8, 9}; + complex64 c7_9 = {7, 9}; + EXPECT_TRUE(LiteralUtil::CreateR2({{c8_9}, {c8_9}}) + ->IsAllComplex({8.0f, 9.0f})); + EXPECT_FALSE(LiteralUtil::CreateR2({{c7_9}, {c8_9}}) + ->IsAllComplex({8.0f, 9.0f})); + EXPECT_FALSE(LiteralUtil::CreateR2({{c8_9}, {c7_9}}) + ->IsAllComplex({8.0f, 9.0f})); +} + +TEST_F(LiteralUtilTest, IsAllFirst) { + // IsAllComplex always returns false when the literal is not complex. 
+ EXPECT_FALSE(LiteralUtil::CreateR1({false, true})->IsAllFirst()); + EXPECT_TRUE(LiteralUtil::CreateR1({false, false})->IsAllFirst()); + EXPECT_FALSE(LiteralUtil::CreateR1({1, 1, 2})->IsAllFirst()); + EXPECT_TRUE(LiteralUtil::CreateR1({5, 5, 5, 5})->IsAllFirst()); + EXPECT_FALSE(LiteralUtil::CreateR1({1, 1, 2})->IsAllFirst()); + EXPECT_TRUE(LiteralUtil::CreateR1({5, 5, 5, 5})->IsAllFirst()); + EXPECT_FALSE(LiteralUtil::CreateR1({1, 1, 2})->IsAllFirst()); + EXPECT_TRUE(LiteralUtil::CreateR1({5, 5, 5, 5})->IsAllFirst()); + EXPECT_FALSE(LiteralUtil::CreateR1({1, 1, 2})->IsAllFirst()); + + complex64 c8_9 = {8, 9}; + complex64 c7_9 = {7, 9}; + EXPECT_TRUE(LiteralUtil::CreateR2({{c8_9}, {c8_9}})->IsAllFirst()); + EXPECT_FALSE( + LiteralUtil::CreateR2({{c7_9}, {c8_9}})->IsAllFirst()); +} + +TEST_F(LiteralUtilTest, IsZero) { + auto scalar_zero = LiteralUtil::CreateR0(0.0f); + auto scalar_one = LiteralUtil::CreateR0(1.0f); + EXPECT_TRUE(scalar_zero->IsZero({})); + EXPECT_FALSE(scalar_one->IsZero({})); + + auto array = LiteralUtil::CreateR2({{1, 2, 0, 3}, {1, 0, 1, 2}}); + EXPECT_FALSE(array->IsZero({0, 1})); + EXPECT_TRUE(array->IsZero({0, 2})); + EXPECT_TRUE(array->IsZero({1, 1})); + EXPECT_FALSE(array->IsZero({1, 2})); + + auto complex_zero = LiteralUtil::CreateR0(0.0f); + auto complex_nonzero = LiteralUtil::CreateR0(0.5f); + EXPECT_TRUE(complex_zero->IsZero({})); + EXPECT_FALSE(complex_nonzero->IsZero({})); +} + +template +class LiteralUtilTestTemplated : public ::testing::Test {}; + +using TestedTypes = ::testing::Types; +TYPED_TEST_CASE(LiteralUtilTestTemplated, TestedTypes); + +TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) { + // Make a non-integer for floating point types. 
+ TypeParam half = TypeParam(1) / TypeParam(2); + auto data = LiteralUtil::CreateR2({{half, 2}, {3, 4}}); + const Layout layout01 = LayoutUtil::MakeLayout({0, 1}); + const Layout layout10 = LayoutUtil::MakeLayout({1, 0}); + + auto data01 = data->Relayout(layout01); + EXPECT_TRUE(LayoutUtil::Equal(data01->shape().layout(), layout01)); + EXPECT_EQ(*data, *data01); + + auto data10 = data->Relayout(layout10); + EXPECT_TRUE(LayoutUtil::Equal(data10->shape().layout(), layout10)); + EXPECT_EQ(*data, *data10); +} + +TEST_F(LiteralUtilTest, ReshapeR0) { + auto original = LiteralUtil::CreateR0(1.7f); + auto reshape = original->Reshape(/*dimensions=*/{}).ConsumeValueOrDie(); + EXPECT_EQ(*original, *reshape); +} + +TEST_F(LiteralUtilTest, ReshapeR4) { + // clang-format off + // F32[1x3x2x4] + auto original = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0major_); + // F32[1x3x4x2] + auto expected = LiteralUtil::CreateR3WithLayout({ + {{10, 11}, {12, 13}, {14, 15}, {16, 17}}, + {{18, 19}, {20, 21}, {22, 23}, {24, 25}}, + {{26, 27}, {28, 29}, {30, 31}, {32, 33}}, + }, layout_r3_dim0major_); + // clang-format on + auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie(); + + EXPECT_EQ(*expected, *reshape); +} + +TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) { + // clang-format off + // F32[1x3x2x4] + auto original = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0minor_); + // F32[1x3x4x2] + auto expected = LiteralUtil::CreateR3WithLayout({ + {{10, 11}, {12, 13}, {14, 15}, {16, 17}}, + {{18, 19}, {20, 21}, {22, 23}, {24, 25}}, + {{26, 27}, {28, 29}, {30, 31}, {32, 33}}, + }, layout_r3_dim0major_); + // clang-format on + auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie(); + + EXPECT_EQ(*expected, *reshape); +} + 
+TEST_F(LiteralUtilTest, TransposeR0) { + auto original = LiteralUtil::CreateR0(1.7f); + auto reshape = original->Transpose(/*permutation=*/{}); + EXPECT_EQ(*original, *reshape); +} + +TEST_F(LiteralUtilTest, TransposeR4) { + // clang-format off + // F32[1x3x2x4] + auto original = LiteralUtil::CreateR4({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}); + // clang-format on + auto reshape = original->Transpose(/*permutation=*/{2, 3, 0, 1}); + + reshape->EachCell([&](absl::Span indices, float value) { + EXPECT_EQ(value, original->Get( + {indices[2], indices[3], indices[0], indices[1]})); + }); +} + +TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) { + // Tests that using Relayout on an array is equivalent to creating it in the + // target layout in the first place. + auto dim0minor_relaid_to_dim0major = + literal_r4_2x2x3x3_dim0minor_->Relayout(layout_r4_dim0major_); + EXPECT_EQ(*literal_r4_2x2x3x3_dim0major_, *dim0minor_relaid_to_dim0major); + + auto dim0major_relaid_to_dim0minor = + literal_r4_2x2x3x3_dim0major_->Relayout(layout_r4_dim0minor_); + EXPECT_EQ(*literal_r4_2x2x3x3_dim0minor_, *dim0major_relaid_to_dim0minor); +} + +TEST_F(LiteralUtilTest, TestR2LinearLayout) { + // Test expected memory layout of R2 dim0-minor (column-major) literal. + auto mat_dim0minor = LiteralUtil::CreateR2WithLayout( + {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_); + EXPECT_EQ(mat_dim0minor->element_count(), 6); + EXPECT_THAT(mat_dim0minor->data(), ElementsAre(1, 4, 2, 5, 3, 6)); + + // Test expected memory layout when using Relayout to row major. + auto relaid_mat_to_dim0major = mat_dim0minor->Relayout(layout_r2_dim0major_); + EXPECT_THAT(relaid_mat_to_dim0major->data(), + ElementsAre(1, 2, 3, 4, 5, 6)); + + // Test expected memory layout of R2 created with dim0-major (row-major). 
+ auto mat_dim0major = LiteralUtil::CreateR2WithLayout( + {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_); + EXPECT_EQ(mat_dim0major->element_count(), 6); + EXPECT_THAT(mat_dim0major->data(), ElementsAre(1, 2, 3, 4, 5, 6)); + + // Test expected memory layout when using Relayout to column major. + auto relaid_mat_to_dim0minor = mat_dim0major->Relayout(layout_r2_dim0minor_); + EXPECT_THAT(relaid_mat_to_dim0minor->data(), + ElementsAre(1, 4, 2, 5, 3, 6)); +} + +TEST_F(LiteralUtilTest, TestR3LinearLayout) { + // Test expected memory layout of R3 dim0-minor (column-major) literal. + Array3D arr3d( + // clang-format off + { + { + {1, 2, 3}, + {4, 5, 6}, + }, + { + {7, 8, 9}, + {10, 11, 12}, + }, + }); // clang-format on + auto lit_dim0minor = LiteralUtil::CreateR3FromArray3DWithLayout( + arr3d, layout_r3_dim0minor_); + + EXPECT_EQ(lit_dim0minor->element_count(), 12); + std::vector expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12}; + EXPECT_THAT(lit_dim0minor->data(), + testing::ElementsAreArray(expected_dim0minor)); + + // Test expected memory layout when using Relayout to row major. + auto relaid_lit_to_dim0major = lit_dim0minor->Relayout(layout_r3_dim0major_); + std::vector expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + EXPECT_THAT(relaid_lit_to_dim0major->data(), + testing::ElementsAreArray(expected_dim0major)); + + // Test expected memory layout of R3 created with dim0-major (row-major). + auto lit_dim0major = LiteralUtil::CreateR3FromArray3DWithLayout( + arr3d, layout_r3_dim0major_); + EXPECT_EQ(lit_dim0major->element_count(), 12); + EXPECT_THAT(lit_dim0major->data(), + testing::ElementsAreArray(expected_dim0major)); + + // Test expected memory layout when using Relayout to column major. 
+ auto relaid_lit_to_dim0minor = lit_dim0major->Relayout(layout_r3_dim0minor_); + EXPECT_THAT(relaid_lit_to_dim0minor->data(), + testing::ElementsAreArray(expected_dim0minor)); +} + +TEST_F(LiteralUtilTest, SliceR0S32) { + auto input = LiteralUtil::CreateR0(1); + auto result = input->Slice({}, {}); + EXPECT_EQ(*input, *result); +} + +TEST_F(LiteralUtilTest, SliceR1F32) { + auto input = LiteralUtil::CreateR1({1.0, 2.0, 3.0, 4.0, 5.0}); + auto result = input->Slice({3}, {4}); + auto expected = LiteralUtil::CreateR1({4.0}); + EXPECT_EQ(*expected, *result); +} + +TEST_F(LiteralUtilTest, SliceR2U32) { + auto input_3x4 = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + auto result = input_3x4->Slice({0, 2}, {2, 4}); + auto expected = LiteralUtil::CreateR2({{3, 4}, {7, 8}}); + EXPECT_EQ(*expected, *result); +} + +TEST_F(LiteralUtilTest, SliceR3U32Full) { + auto input_2x3x2 = LiteralUtil::CreateR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}); + auto result = input_2x3x2->Slice({0, 0, 0}, {2, 3, 2}); + EXPECT_EQ(*input_2x3x2, *result); +} + +TEST_F(LiteralUtilTest, PopulateR1S64) { + Literal output(ShapeUtil::MakeShape(S64, {1})); + output.PopulateR1({77}); + auto expected = LiteralUtil::CreateR1({77}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateR1U64) { + Literal output(ShapeUtil::MakeShape(U64, {2})); + output.PopulateR1({{77, 88}}); + auto expected = LiteralUtil::CreateR1({{77, 88}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateR1C64) { + Literal output(ShapeUtil::MakeShape(C64, {1})); + output.PopulateR1({{77, 88}}); + auto expected = LiteralUtil::CreateR1({{77, 88}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateR2C64) { + Literal output(ShapeUtil::MakeShape(C64, {2, 2})); + output.PopulateR2({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}}); + auto expected = + LiteralUtil::CreateR2({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}}); + EXPECT_EQ(output, *expected); +} + 
+TEST_F(LiteralUtilTest, PopulateWithValueR0BF16) { + Literal output(ShapeUtil::MakeShape(BF16, {})); + bfloat16 h(0.25f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR0(h); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR1BF16) { + Literal output(ShapeUtil::MakeShape(BF16, {3})); + bfloat16 h(0.5f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR1({h, h, h}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR2BF16) { + Literal output(ShapeUtil::MakeShape(BF16, {2, 2})); + bfloat16 h(2.0f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR2({{h, h}, {h, h}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR0F32) { + Literal output(ShapeUtil::MakeShape(F32, {})); + output.PopulateWithValue(2.5f); + auto expected = LiteralUtil::CreateR0(2.5f); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR1S64) { + Literal output(ShapeUtil::MakeShape(S64, {3})); + output.PopulateWithValue(-7); + auto expected = LiteralUtil::CreateR1({-7, -7, -7}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR2U64) { + Literal output(ShapeUtil::MakeShape(U64, {2, 2})); + output.PopulateWithValue(42); + auto expected = LiteralUtil::CreateR2({{42, 42}, {42, 42}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR2C64) { + Literal output(ShapeUtil::MakeShape(C64, {2, 2})); + output.PopulateWithValue({4, 2}); + auto expected = + LiteralUtil::CreateR2({{{4, 2}, {4, 2}}, {{4, 2}, {4, 2}}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR0F16) { + Literal output(ShapeUtil::MakeShape(F16, {})); + half h(0.25f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR0(h); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR1F16) { + Literal output(ShapeUtil::MakeShape(F16, {3})); + 
half h(0.5f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR1({h, h, h}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, PopulateWithValueR2F16) { + Literal output(ShapeUtil::MakeShape(F16, {2, 2})); + half h(2.0f); + output.PopulateWithValue(h); + auto expected = LiteralUtil::CreateR2({{h, h}, {h, h}}); + EXPECT_EQ(output, *expected); +} + +TEST_F(LiteralUtilTest, ReplicateR2U32) { + auto input = LiteralUtil::CreateR2( + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); + auto output = input->Replicate(3); + auto expected = LiteralUtil::CreateR3( + {{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, + {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}}); + EXPECT_EQ(*output, *expected); +} + +TEST_F(LiteralUtilTest, CopySliceFrom) { + const int64 dimensions[] = {17, 15, 34, 21}; + const int64 layouts[][4] = { + {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}}; + for (const auto& layout : layouts) { + Shape shape = ShapeUtil::MakeShapeWithLayout( + primitive_util::NativeToPrimitiveType(), dimensions, layout); + + auto source = Literal::CreateFromShape(shape); + const int64 zero_base[] = {0, 0, 0, 0}; + const int64 step[] = {1, 1, 1, 1}; + uint32 seqnr = 0; + auto init_proc = [&](absl::Span indexes) { + source->Set(indexes, ++seqnr); + return true; + }; + ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step, + init_proc); + + auto blank = Literal::CreateFromShape(shape); + const int64 src_base[] = {3, 1, 5, 7}; + const int64 dest_base[] = {6, 4, 12, 2}; + const int64 copy_size[] = {7, 8, 11, 9}; + TF_EXPECT_OK(blank->CopySliceFrom(*source, src_base, dest_base, copy_size)); + + std::vector source_indexes(TF_ARRAYSIZE(dimensions), 0); + std::vector blank_indexes(TF_ARRAYSIZE(dimensions), 0); + bool matched = true; + auto check_proc = [&](absl::Span indexes) { + std::copy(indexes.begin(), indexes.end(), source_indexes.begin()); + 
std::transform(source_indexes.begin(), source_indexes.end(), src_base, + source_indexes.begin(), std::plus()); + std::copy(indexes.begin(), indexes.end(), blank_indexes.begin()); + std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base, + blank_indexes.begin(), std::plus()); + auto bval = blank->Get(blank_indexes); + matched = (bval != 0 && bval == source->Get(source_indexes)); + return matched; + }; + + ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step, + check_proc); + EXPECT_TRUE(matched); + } +} + +TEST_F(LiteralUtilTest, CopyFromScalars) { + auto zero = LiteralUtil::CreateR0(0); + auto nine = LiteralUtil::CreateR0(9); + TF_EXPECT_OK(zero->CopyFrom(*nine)); + EXPECT_EQ(*zero, *nine); + + auto vect = LiteralUtil::CreateR1({3, 4, 9, 12, 5, 17, 21}); + TF_EXPECT_OK(zero->CopySliceFrom(*vect, {5}, {}, {})); + EXPECT_EQ(zero->Get({}), 17); + TF_EXPECT_OK(vect->CopySliceFrom(*zero, {}, {4}, {})); + EXPECT_EQ(vect->Get({4}), 17); +} + +TEST_F(LiteralUtilTest, CopyFromAndToZeroElement) { + const Shape empty_r1_shape = ShapeUtil::MakeShape(F32, {0}); + const auto const_nine = LiteralUtil::CreateR1({9}); + const auto const_empty = Literal::CreateFromShape(empty_r1_shape); + + { + // Source contains dimension with zero elements. + const auto empty = Literal::CreateFromShape(empty_r1_shape); + auto nine = LiteralUtil::CreateR1({9}); + + TF_EXPECT_OK(nine->CopySliceFrom(*empty, {0}, {0}, {0})); + EXPECT_EQ(*nine, *const_nine); + } + + { + // Copy 0 element to destination with zero elements. + const auto empty = Literal::CreateFromShape(empty_r1_shape); + auto nine = LiteralUtil::CreateR1({9}); + + TF_EXPECT_OK(empty->CopySliceFrom(*nine, {0}, {0}, {0})); + EXPECT_EQ(*empty, *const_empty); + } +} + +TEST_F(LiteralUtilTest, CopyFromNilShape) { + Literal nil_literal0(ShapeUtil::MakeNil()); + Literal nil_literal1(ShapeUtil::MakeNil()); + // This doesn't actually do any copying, but it should succeed. 
+ TF_ASSERT_OK(nil_literal0.CopyFrom(nil_literal1)); +} + +TEST_F(LiteralUtilTest, CopyFromArrays) { + auto scalar_42 = LiteralUtil::CreateR0(42.0); + auto scalar_123 = LiteralUtil::CreateR0(123.0); + EXPECT_NE(*scalar_42, *scalar_123); + TF_ASSERT_OK(scalar_42->CopyFrom(*scalar_123, /*dest_shape_index=*/{}, + /*src_shape_index=*/{})); + EXPECT_EQ(*scalar_42, *scalar_123); + EXPECT_EQ(scalar_42->Get({}), 123.0f); + + auto matrix_1234 = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto matrix_5678 = LiteralUtil::CreateR2({{5.0, 6.0}, {7.0, 8.0}}); + EXPECT_NE(*matrix_1234, *matrix_5678); + EXPECT_EQ(matrix_1234->Get({0, 0}), 1.0f); + TF_ASSERT_OK(matrix_1234->CopyFrom(*matrix_5678, /*dest_shape_index=*/{}, + /*src_shape_index=*/{})); + EXPECT_EQ(*matrix_1234, *matrix_5678); + EXPECT_EQ(matrix_1234->Get({0, 0}), 5.0f); +} + +TEST_F(LiteralUtilTest, CopyFromTuples) { + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + Literal nil_literal(ShapeUtil::MakeNil()); + auto nested_tuple = LiteralUtil::MakeTuple( + {matrix.get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(42).get(), + LiteralUtil::CreateR1({23.0, 44.0}).get(), &nil_literal}) + .get()}); + // Create a tuple the same shape as the inner tuple of nested_tuple but with + // different values. + auto tuple = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(-5).get(), + LiteralUtil::CreateR1({2.0, 4.0}).get(), &nil_literal}); + + EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); + EXPECT_EQ(nested_tuple->Get({}, {1, 0}), 42); + EXPECT_EQ(nested_tuple->Get({0}, {1, 1}), 23.0); + EXPECT_EQ(nested_tuple->Get({1}, {1, 1}), 44.0); + + // Overwrite the inner tuple element of nested_tuple with the contents of + // 'tuple'. + TF_ASSERT_OK(nested_tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1}, + /*src_shape_index=*/{})); + + // The matrix element should be unchanged. + EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); + + // The tuple element should have been copied from 'tuple'.
+ EXPECT_EQ(nested_tuple->Get({}, {1, 0}), -5); + EXPECT_EQ(nested_tuple->Get({0}, {1, 1}), 2.0); + EXPECT_EQ(nested_tuple->Get({1}, {1, 1}), 4.0); +} +TEST_F(LiteralUtilTest, CopyBetweenSameTuple) { + auto tuple = LiteralUtil::MakeTuple({LiteralUtil::CreateR0(-2).get(), + LiteralUtil::CreateR0(4).get()}); + + EXPECT_EQ(tuple->Get({}, {0}), -2); + EXPECT_EQ(tuple->Get({}, {1}), 4); + + // Copy from one element to the other. + TF_ASSERT_OK(tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1}, + /*src_shape_index=*/{0})); + + EXPECT_EQ(tuple->Get({}, {0}), -2); + EXPECT_EQ(tuple->Get({}, {1}), -2); +} + +TEST_F(LiteralUtilTest, CopyFromDifferentShapes) { + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto vector = LiteralUtil::CreateR1({5.0, 7.0}); + Status status = matrix->CopyFrom(*vector); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Destination subshape incompatible")); +} + +TEST_F(LiteralUtilTest, F16) { + // Verify that the internal data views are consistent and that they + // are in little endian format + // TODO - modify if we make the data format machine endianness dependent + auto m1 = Literal::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2})); + Literal* l1 = m1.get(); + const char* d1 = reinterpret_cast(l1->data().data()); + EXPECT_EQ(d1[0], 0); + EXPECT_EQ(d1[1], 0); + EXPECT_EQ(d1[2], 0); + EXPECT_EQ(d1[3], 0); + EXPECT_EQ(d1[4], 0); + EXPECT_EQ(d1[5], 0); + EXPECT_EQ(d1[6], 0); + EXPECT_EQ(d1[7], 0); + + half h1(1.0f); + half h2(2.0f); + auto m2 = LiteralUtil::CreateR2({{h1, h2}, {h2, h1}}); + Literal* l2 = m2.get(); + const char* d2 = reinterpret_cast(l2->data().data()); + EXPECT_EQ(d2[0], 0); + EXPECT_EQ(d2[1], 0x3C); + EXPECT_EQ(d2[2], 0); + EXPECT_EQ(d2[3], 0x40); + EXPECT_EQ(d2[4], 0); + EXPECT_EQ(d2[5], 0x40); + EXPECT_EQ(d2[6], 0); + EXPECT_EQ(d2[7], 0x3C); +} + +TEST_F(LiteralUtilTest, Populate) { + struct PopulateData { + std::vector dimensions; + std::vector layout; + } populate_data[] = { +
{{}, {}}, + {{0}, {0}}, + {{16}, {0}}, + {{2, 0}, {1, 0}}, + {{4, 16}, {1, 0}}, + {{21, 12}, {0, 1}}, + {{6, 11, 17}, {2, 0, 1}}, + {{6, 11, 5, 17}, {3, 2, 0, 1}}, + }; + for (const auto& data : populate_data) { + Shape shape = ShapeUtil::MakeShapeWithLayout( + primitive_util::NativeToPrimitiveType(), data.dimensions, + data.layout); + auto literal = absl::make_unique(shape); + auto generator = [&](absl::Span indexes) -> uint32 { + // Offsets from linear index just to avoid R0 literals to be initialized + // with zero. + return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(), + indexes) + + 17; + }; + TF_EXPECT_OK(literal->Populate(generator)); + + std::vector zero_base(data.dimensions.size(), 0); + std::vector step(data.dimensions.size(), 1); + bool matched = true; + auto check_function = [&](absl::Span indexes) { + auto value = literal->Get(indexes); + matched = matched && (value == generator(indexes)); + return matched; + }; + ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step, + check_function); + EXPECT_TRUE(matched); + } +} + +TEST_F(LiteralUtilTest, PopulateParallel) { + struct PopulateData { + std::vector dimensions; + std::vector layout; + } populate_data[] = { + {{}, {}}, + {{0}, {0}}, + {{16}, {0}}, + {{2, 0}, {1, 0}}, + {{4, 16}, {1, 0}}, + {{21, 12}, {0, 1}}, + {{6, 11, 17}, {2, 0, 1}}, + {{6, 11, 5, 17}, {3, 2, 0, 1}}, + }; + for (const auto& data : populate_data) { + Shape shape = ShapeUtil::MakeShapeWithLayout( + primitive_util::NativeToPrimitiveType(), data.dimensions, + data.layout); + auto literal = absl::make_unique(shape); + auto generator = [&](absl::Span indexes) -> uint32 { + // Offsets from linear index just to avoid R0 literals to be initialized + // with zero. 
+ return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(), + indexes) + + 17; + }; + TF_EXPECT_OK(literal->PopulateParallel(generator)); + + std::vector zero_base(data.dimensions.size(), 0); + std::vector step(data.dimensions.size(), 1); + bool matched = true; + auto check_function = [&](absl::Span indexes) { + auto value = literal->Get(indexes); + matched = matched && (value == generator(indexes)); + return matched; + }; + ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step, + check_function); + EXPECT_TRUE(matched); + } +} + +TEST_F(LiteralUtilTest, ConvertR4) { + // clang-format off + auto original = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0major_); + auto expected = LiteralUtil::CreateR4WithLayout({{ + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{18, 19, 20, 21}, {22, 23, 24, 25}}, + {{26, 27, 28, 29}, {30, 31, 32, 33}}, + }}, layout_r4_dim0major_); + // clang-format on + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr converted, + original->Convert(U32)); + + EXPECT_EQ(*expected, *converted); +} + +TEST_F(LiteralUtilTest, ConvertIfTypesMatch) { + // clang-format off + auto s8 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto s32 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto u32 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto s64 = LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto u64 = 
LiteralUtil::CreateR4WithLayout({{ + {{10, 0, 12, 0}, {0, 15, 0, 17}}, + {{0, 19, 0, 21}, {22, 0, 24, 0}}, + {{26, 0, 28, 0}, {0, 31, 0, 33}}, + }}, layout_r4_dim0major_); + auto pred = LiteralUtil::CreateR4WithLayout({{ + {{true, false, true, false}, {false, true, false, true}}, + {{false, true, false, true}, {true, false, true, false}}, + {{true, false, true, false}, {false, true, false, true}}, + }}, layout_r4_dim0major_); + auto int32_pred = LiteralUtil::CreateR4WithLayout({{ + {{1, 0, 1, 0}, {0, 1, 0, 1}}, + {{0, 1, 0, 1}, {1, 0, 1, 0}}, + {{1, 0, 1, 0}, {0, 1, 0, 1}}, + }}, layout_r4_dim0major_); + auto f16 = LiteralUtil::CreateR4WithLayout({{ + {{half(10.0), half(0.0), half(12.0), half(0.0)}, + {half(0.0), half(15.0), half(0.0), half(17.0)}}, + {{half(0.0), half(19.0), half(0.0), half(21.0)}, + {half(22.0), half(0.0), half(24.0), half(0.0)}}, + {{half(26.0), half(0.0), half(28.0), half(0.0)}, + {half(0.0), half(31.0), half(0.0), half(33.0)}}, + }}, layout_r4_dim0major_); + auto bf16 = LiteralUtil::CreateR4WithLayout({{ + {{bfloat16(10.0), bfloat16(0.0), bfloat16(12.0), bfloat16(0.0)}, + {bfloat16(0.0), bfloat16(15.0), bfloat16(0.0), bfloat16(17.0)}}, + {{bfloat16(0.0), bfloat16(19.0), bfloat16(0.0), bfloat16(21.0)}, + {bfloat16(22.0), bfloat16(0.0), bfloat16(24.0), bfloat16(0.0)}}, + {{bfloat16(26.0), bfloat16(0.0), bfloat16(28.0), bfloat16(0.0)}, + {bfloat16(0.0), bfloat16(31.0), bfloat16(0.0), bfloat16(33.0)}}, + }}, layout_r4_dim0major_); + auto f32 = LiteralUtil::CreateR4WithLayout({{ + {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}}, + {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}}, + {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}}, + }}, layout_r4_dim0major_); + auto f64 = LiteralUtil::CreateR4WithLayout({{ + {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}}, + {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}}, + {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}}, + }}, layout_r4_dim0major_); + auto c64 = 
LiteralUtil::CreateR4WithLayout({{ + {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}}, + {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}}, + {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}}, + }}, layout_r4_dim0major_); + // clang-format on + std::unique_ptr conv; + + conv = s8->Convert(U32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *u32); + + conv = s8->Convert(S32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *s32); + + conv = s8->Convert(U64).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *u64); + + conv = s8->Convert(S64).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *s64); + + conv = s8->Convert(PRED).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *pred); + + conv = bf16->Convert(S32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *s32); + + conv = bf16->Convert(F32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f32); + + conv = pred->Convert(S32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *int32_pred); + + conv = f32->Convert(S32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *s32); + + conv = f64->Convert(S32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *s32); + + conv = s32->Convert(F32).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f32); + + conv = f32->Convert(F16).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f16); + + conv = f64->Convert(F16).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f16); + + conv = s32->Convert(F16).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f16); + + conv = u32->Convert(F16).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *f16); + + conv = s32->Convert(C64).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *c64); + + conv = f16->Convert(C64).ConsumeValueOrDie(); + EXPECT_EQ(*conv, *c64); + + EXPECT_EQ(s32->Convert(TUPLE).status().code(), + tensorflow::error::UNIMPLEMENTED); + EXPECT_EQ(s32->Convert(S16).status().code(), + tensorflow::error::UNIMPLEMENTED); + EXPECT_EQ(s32->Convert(U16).status().code(), + tensorflow::error::UNIMPLEMENTED); + EXPECT_EQ(c64->Convert(F32).status().code(), + tensorflow::error::UNIMPLEMENTED); + EXPECT_EQ(c64->Convert(S32).status().code(), + 
tensorflow::error::UNIMPLEMENTED); +} + +TEST_F(LiteralUtilTest, BitcastConvert) { + auto original = LiteralUtil::CreateR1( + {tensorflow::bit_cast(2.5f), + tensorflow::bit_cast(-42.25f), + tensorflow::bit_cast(100.f), 0xbeef}); + auto expected = LiteralUtil::CreateR1( + {2.5f, -42.25f, 100.0f, tensorflow::bit_cast(0xbeef)}); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr converted, + original->BitcastConvert(F32)); +} + +TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) { + auto literal = LiteralUtil::CreateR0(1234); + Status status = literal->BitcastConvert(F64).status(); + EXPECT_NE(Status::OK(), status); + EXPECT_TRUE( + absl::StrContains(status.error_message(), "bit widths are different")); +} + +TEST_F(LiteralUtilTest, CopyFromProto_Bool) { + LiteralProto p; + p.mutable_shape()->set_element_type(PRED); + for (int len = 0; len < 25; ++len) { + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(len); + LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + p.clear_preds(); + for (int i = 0; i < len; ++i) { + p.add_preds((i % 2) == (len % 2)); + } + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr literal, + Literal::CreateFromProto(p)); + ASSERT_EQ(len, literal->data().size()); + int i = 0; + for (bool value : literal->data()) { + EXPECT_EQ((i % 2) == (len % 2), value); + ++i; + } + } +} + +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, ToProto_f16) { + half h1(1.0f); + half h2(2.0f); + + auto m = LiteralUtil::CreateR2({{h1, h2}, {h2, h1}}); + Literal* l = m.get(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape())); + EXPECT_EQ(4, l->data().size()); + + LiteralProto p = l->ToProto(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape())); + EXPECT_EQ(8, p.f16s().size()); + const char* d = p.f16s().data(); + EXPECT_EQ(d[0], 0); + EXPECT_EQ(d[1], 0x3C); + EXPECT_EQ(d[2], 0); + EXPECT_EQ(d[3], 0x40); + EXPECT_EQ(d[4], 0); + EXPECT_EQ(d[5], 0x40); + EXPECT_EQ(d[6], 0); + EXPECT_EQ(d[7], 0x3C); 
+} + +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, CopyFromProto_f16) { + half h1(1.0f); + half h2(2.0f); + + const char half_vals[8] = {0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C}; + LiteralProto p; + p.mutable_shape()->set_element_type(F16); + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(4); + LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + p.clear_f16s(); + p.set_f16s(half_vals, 8); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr literal, + Literal::CreateFromProto(p)); + auto r = literal->data(); + ASSERT_EQ(4, r.size()); + EXPECT_EQ(h1, r[0]); + EXPECT_EQ(h2, r[1]); + EXPECT_EQ(h2, r[2]); + EXPECT_EQ(h1, r[3]); +} + +TEST_F(LiteralUtilTest, LiteralSliceTest) { + auto scalar = LiteralUtil::CreateR0(1.0); + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + auto nested_tuple = LiteralUtil::MakeTuple({tuple.get(), scalar.get()}); + Literal nil(ShapeUtil::MakeNil()); + + EXPECT_EQ(LiteralSlice(*scalar, {}), *scalar); + EXPECT_EQ(LiteralSlice(*matrix, {}), *matrix); + EXPECT_EQ(LiteralSlice(*tuple, {}), *tuple); + EXPECT_EQ(LiteralSlice(*nested_tuple, {}), *nested_tuple); + EXPECT_EQ(LiteralSlice(nil, {}), nil); + + EXPECT_EQ(LiteralSlice(*tuple, {0}), *scalar); + EXPECT_EQ(LiteralSlice(*tuple, {1}), *matrix); + + EXPECT_EQ(LiteralSlice(*nested_tuple, {0}), *tuple); + EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 0}), *scalar); + EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 1}), *matrix); + EXPECT_EQ(LiteralSlice(*nested_tuple, {1}), *scalar); +} + +TEST_F(LiteralUtilTest, MutatingLiteralSlice) { + auto scalar = LiteralUtil::CreateR0(1.0); + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + auto nested_tuple = LiteralUtil::MakeTuple({tuple.get(), scalar.get()}); + // Verify that changing the underlying data beneath 
the view changes the + // data of the view itself. + const auto nested_tuple_view = LiteralSlice(*nested_tuple); + EXPECT_EQ( + nested_tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0, 0}), + 1.0f); + EXPECT_EQ(nested_tuple_view.Get(/*multi_index=*/{}, + /*shape_index=*/{0, 0}), + 1.0f); + nested_tuple->Set(/*multi_index=*/{}, /*shape_index=*/{0, 0}, 555.0f); + EXPECT_EQ( + nested_tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0, 0}), + 555.0f); + EXPECT_EQ(nested_tuple_view.Get(/*multi_index=*/{}, + /*shape_index=*/{0, 0}), + 555.0f); +} + +TEST_F(LiteralUtilTest, LiteralSliceOfALiteralSlice) { + auto scalar = LiteralUtil::CreateR0(1.0); + auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto tuple = LiteralUtil::MakeTuple({scalar.get(), matrix.get()}); + auto nested_tuple = LiteralUtil::MakeTuple({tuple.get(), scalar.get()}); + + const auto nested_tuple_view = LiteralSlice(*nested_tuple); + const auto tuple_view = LiteralSlice(nested_tuple_view, /*view_root=*/{0}); + const auto matrix_view = LiteralSlice(tuple_view, /*view_root=*/{1}); + EXPECT_EQ(matrix_view, + *LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}})); +} + +TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtr) { + std::vector int64_values = {1, 2, 3}; + const Shape literal_shape = ShapeUtil::MakeShape(S64, {3}); + + BorrowingLiteral literal(reinterpret_cast(int64_values.data()), + literal_shape); + + EXPECT_EQ(literal.Get({0}), 1); + EXPECT_EQ(literal.Get({1}), 2); + EXPECT_EQ(literal.Get({2}), 3); +} + +TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrs) { + std::vector one_two_three = {1, 2, 3}; + const Shape one_two_three_shape = ShapeUtil::MakeShape(S64, {3}); + + std::vector hundred = {100}; + const Shape hundred_shape = ShapeUtil::MakeShape(S64, {1}); + + std::vector src_buf_ptrs; + src_buf_ptrs.emplace_back( + reinterpret_cast(one_two_three.data())); + src_buf_ptrs.emplace_back(reinterpret_cast(hundred.data())); + auto literal_tuple = BorrowingLiteral( + 
src_buf_ptrs, + ShapeUtil::MakeTupleShape({one_two_three_shape, hundred_shape})); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{0}), + 1); + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{1}), + 100); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{1}, /*shape_index=*/{0}), + 2); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{2}, /*shape_index=*/{0}), + 3); +} + +TEST_F(LiteralUtilTest, LiteralMove) { + std::unique_ptr matrix = + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + Literal literal(std::move(*matrix)); + + EXPECT_TRUE( + ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape())); + EXPECT_EQ(literal.Get({0, 0}), 1.0); + EXPECT_EQ(literal.Get({0, 1}), 2.0); + EXPECT_EQ(literal.Get({1, 0}), 3.0); + EXPECT_EQ(literal.Get({1, 1}), 4.0); +} + +TEST_F(LiteralUtilTest, DecomposeTuple) { + Literal nil_literal(ShapeUtil::MakeNil()); + auto nested_tuple = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1, 2}, {3, 4}}).get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(42).get(), + LiteralUtil::CreateR1({23.0, 44.0}).get(), &nil_literal}) + .get(), + &nil_literal}); + + EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple->shape())); + std::vector elements = nested_tuple->DecomposeTuple(); + EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple->shape())); + + ASSERT_EQ(elements.size(), 3); + + EXPECT_TRUE(ShapeUtil::Compatible(elements[0].shape(), + ShapeUtil::MakeShape(S32, {2, 2}))); + EXPECT_EQ(elements[0].Get({0, 0}), 1); + EXPECT_EQ(elements[0].Get({0, 1}), 2); + EXPECT_EQ(elements[0].Get({1, 0}), 3); + EXPECT_EQ(elements[0].Get({1, 1}), 4); + + EXPECT_TRUE(ShapeUtil::Compatible( + elements[1].shape(), + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeShape(F64, {2}), + ShapeUtil::MakeNil()}))); + EXPECT_EQ(elements[1].Get({}, /*shape_index=*/{0}), 42); + EXPECT_EQ(elements[1].Get({0}, /*shape_index=*/{1}), 23.0); + EXPECT_EQ(elements[1].Get({1}, /*shape_index=*/{1}), 44.0); + + 
EXPECT_TRUE(ShapeUtil::Compatible(elements[2].shape(), ShapeUtil::MakeNil())); +} + +TEST_F(LiteralUtilTest, DecomposeEmptyTuple) { + Literal nil_literal(ShapeUtil::MakeNil()); + std::vector elements = nil_literal.DecomposeTuple(); + EXPECT_EQ(elements.size(), 0); +} + +TEST_F(LiteralUtilTest, MoveIntoTuple) { + std::vector elements; + elements.push_back(std::move(*LiteralUtil::CreateR0(1.0))); + elements.push_back(std::move(*LiteralUtil::CreateR1({4, 8}))); + elements.push_back(std::move(*LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(42).get(), + LiteralUtil::CreateR1({23.0, 44.0}).get()}) + + )); + + Literal literal = Literal::MoveIntoTuple(absl::MakeSpan(elements)); + ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape())); + ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 3); + + EXPECT_EQ(literal.Get({}, /*shape_index=*/{0}), 1.0); + EXPECT_EQ(literal.Get({0}, /*shape_index=*/{1}), 4); + EXPECT_EQ(literal.Get({1}, /*shape_index=*/{1}), 8); + EXPECT_EQ(literal.Get({}, /*shape_index=*/{2, 0}), 42); + EXPECT_EQ(literal.Get({0}, /*shape_index=*/{2, 1}), 23.0); + EXPECT_EQ(literal.Get({1}, /*shape_index=*/{2, 1}), 44.0); + + for (const Literal& element : elements) { + EXPECT_TRUE(ShapeUtil::IsNil(element.shape())); + } +} + +TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) { + Literal literal = Literal::MoveIntoTuple({}); + ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape())); + EXPECT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0); +} + +TEST_F(LiteralUtilTest, LiteralMoveAssignment) { + Literal literal; + EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeNil(), literal.shape())); + + std::unique_ptr matrix = + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + literal = std::move(*matrix); + + EXPECT_TRUE( + ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape())); + EXPECT_EQ(literal.Get({0, 0}), 1.0); + EXPECT_EQ(literal.Get({0, 1}), 2.0); + EXPECT_EQ(literal.Get({1, 0}), 3.0); + EXPECT_EQ(literal.Get({1, 1}), 4.0); +} + +TEST_F(LiteralUtilTest, 
LiteralSliceCopy) { + std::unique_ptr matrix = + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + const auto matrix_view = LiteralSlice(*matrix); + LiteralSlice matrix_view_copy(matrix_view); + + EXPECT_EQ(matrix_view_copy.Get({0, 0}), 1.0); + EXPECT_EQ(matrix_view_copy.Get({0, 1}), 2.0); + EXPECT_EQ(matrix_view_copy.Get({1, 0}), 3.0); + EXPECT_EQ(matrix_view_copy.Get({1, 1}), 4.0); +} + +TEST_F(LiteralUtilTest, GetSetTuple) { + auto tuple = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(42.0).get(), + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get()}); + EXPECT_EQ(tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0}), 42.0); + tuple->Set(/*multi_index=*/{}, /*shape_index=*/{0}, -5.0); + EXPECT_EQ(tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0}), -5.0); + + EXPECT_EQ(tuple->Get(/*multi_index=*/{1, 0}, /*shape_index=*/{1}), + 3.0); + tuple->Set(/*multi_index=*/{1, 0}, /*shape_index=*/{1}, -4.0); + EXPECT_EQ(tuple->Get(/*multi_index=*/{1, 0}, /*shape_index=*/{1}), + -4.0); +} + +TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) { + // Literals constructed using CreateFromShape should be zero initialized. 
+ std::unique_ptr scalar_f32 = + Literal::CreateFromShape(ShapeUtil::MakeShape(F32, {})); + EXPECT_EQ(scalar_f32->Get({}), 0.0); + EXPECT_TRUE(scalar_f32->IsAll(0)); + + std::unique_ptr vector_s32 = + Literal::CreateFromShape(ShapeUtil::MakeShape(S32, {3})); + EXPECT_EQ(vector_s32->Get({0}), 0); + EXPECT_EQ(vector_s32->Get({1}), 0); + EXPECT_EQ(vector_s32->Get({2}), 0); + EXPECT_TRUE(vector_s32->IsAll(0)); + + std::unique_ptr tuple = + Literal::CreateFromShape(ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F64, {}), ShapeUtil::MakeShape(PRED, {2}), + ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {})})); + + EXPECT_EQ(tuple->Get({}, {0}), 0.0); + EXPECT_EQ(tuple->Get({0}, {1}), false); + EXPECT_EQ(tuple->Get({1}, {1}), false); + EXPECT_EQ(tuple->Get({0, 0}, {2}), 0); + EXPECT_EQ(tuple->Get({1, 0}, {2}), 0); + EXPECT_EQ(tuple->Get({}, {3}), complex64(0.0f, 0.0f)); +} + +TEST_F(LiteralUtilTest, ProtoRoundTrip) { + // Test serializing then deserializing a Literal through a proto. 
+ auto one_f32 = LiteralUtil::CreateR0(1.0); + auto two_f32 = LiteralUtil::CreateR0(2.0); + auto vector_int8 = LiteralUtil::CreateR1({-128, 0, 2, 4, 7, 56, 127}); + auto vector_c64 = LiteralUtil::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); + auto vector_bfloat16 = LiteralUtil::CreateR1( + {bfloat16{-1.0}, bfloat16{2.0}, bfloat16{-3.0}}); + auto vector_half = + LiteralUtil::CreateR1({half{10.0}, half{20.0}, half{-30.0}}); + auto matrix_pred = + LiteralUtil::CreateR2({{true, false, true}, {false, false, true}}); + auto tuple = LiteralUtil::MakeTuple( + {one_f32.get(), vector_half.get(), matrix_pred.get(), matrix_pred.get()}); + Literal nil_literal(ShapeUtil::MakeNil()); + auto nested_tuple = LiteralUtil::MakeTuple( + {tuple.get(), vector_bfloat16.get(), tuple.get(), &nil_literal}); + + auto to_from_proto = [](const Literal& literal) -> Literal { + return std::move(*Literal::CreateFromProto(literal.ToProto()).ValueOrDie()); + }; + + EXPECT_EQ(*one_f32, to_from_proto(*one_f32)); + EXPECT_EQ(*vector_c64, to_from_proto(*vector_c64)); + EXPECT_EQ(*vector_bfloat16, to_from_proto(*vector_bfloat16)); + EXPECT_EQ(*matrix_pred, to_from_proto(*matrix_pred)); + EXPECT_EQ(*tuple, to_from_proto(*tuple)); + EXPECT_EQ(*nested_tuple, to_from_proto(*nested_tuple)); + EXPECT_EQ(nil_literal, to_from_proto(nil_literal)); + + EXPECT_NE(*one_f32, *two_f32); + EXPECT_NE(*one_f32, to_from_proto(*two_f32)); +} + +TEST_F(LiteralUtilTest, InvalidProtoNoValues) { + // Proto contains a shape, but no values. + LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected 3 elements in LiteralProto")); +} + +TEST_F(LiteralUtilTest, InvalidProtoNoShape) { + // Proto contains values, but no shape. 
+ LiteralProto proto; + proto.add_preds(false); + proto.add_preds(true); + proto.add_preds(false); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("LiteralProto has no shape")); +} + +TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { + // Proto contains values in wrong container. + LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); + proto.add_preds(false); + proto.add_preds(true); + proto.add_preds(false); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected 3 elements in LiteralProto")); +} + +TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) { + // Proto contains too few values. + LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}); + proto.add_f32s(1.0); + proto.add_f32s(2.0); + proto.add_f32s(3.0); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected 84 elements in LiteralProto")); +} + +TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) { + // Proto contains too many values. + LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}); + proto.add_s32s(42); + proto.add_s32s(-10); + proto.add_s32s(100); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected 2 elements in LiteralProto")); +} + +TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) { + // Proto shape missing layout. 
+ LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}); + LayoutUtil::ClearLayout(proto.mutable_shape()); + proto.add_preds(true); + proto.add_preds(false); + proto.add_preds(true); + proto.add_preds(false); + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("LiteralProto has no layout")); +} + +TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { + // Proto has the too few tuple elements. + LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); + LiteralProto* element0 = proto.add_tuple_literals(); + *element0->mutable_shape() = + ShapeUtil::GetTupleElementShape(proto.shape(), 0); + element0->add_preds(false); + element0->add_preds(true); + + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); +} + +TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) { + // Proto has the too many tuple elements. 
+ LiteralProto proto; + *proto.mutable_shape() = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); + LiteralProto* element0 = proto.add_tuple_literals(); + *element0->mutable_shape() = + ShapeUtil::GetTupleElementShape(proto.shape(), 0); + element0->add_preds(false); + element0->add_preds(true); + LiteralProto* element1 = proto.add_tuple_literals(); + *element1->mutable_shape() = + ShapeUtil::GetTupleElementShape(proto.shape(), 1); + element1->add_f32s(42.0); + LiteralProto* element2 = proto.add_tuple_literals(); + *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}); + element2->add_f32s(123.0); + + Status status = Literal::CreateFromProto(proto).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); +} + +TEST_F(LiteralUtilTest, SortSparseElements) { + auto literal = LiteralUtil::CreateSparse({10, 10, 10}, + SparseIndexArray(10, 3), {}); + literal->AppendSparseElement({2, 3, 4}, 2.0); + literal->AppendSparseElement({3, 4, 5}, 3.0); + literal->AppendSparseElement({1, 2, 3}, 1.0); + literal->SortSparseElements(); + EXPECT_EQ(literal->ToString(false), + "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}"); +} + +TEST_F(LiteralUtilTest, GetSparseElementAsString) { + std::vector dimensions = {10, 10, 10}; + SparseIndexArray indices(10, {{1, 2, 3}, {2, 3, 4}, {3, 4, 5}}); + + EXPECT_EQ( + LiteralUtil::CreateSparse(dimensions, indices, {true, false, true}) + ->GetSparseElementAsString(1), + "false"); + EXPECT_EQ(LiteralUtil::CreateSparse(dimensions, indices, {1, 2, 3}) + ->GetSparseElementAsString(1), + absl::StrCat(int64{2})); + EXPECT_EQ( + LiteralUtil::CreateSparse(dimensions, indices, {1.0, 2.0, 3.0}) + ->GetSparseElementAsString(1), + absl::StrCat(double{2.0})); + EXPECT_EQ(LiteralUtil::CreateSparse(dimensions, indices, + {half{1.0}, half{2.0}, half{3.0}}) + ->GetSparseElementAsString(1), + absl::StrCat(static_cast(half{2.0}))); + 
EXPECT_EQ(LiteralUtil::CreateSparse( + dimensions, indices, + std::vector{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}) + ->GetSparseElementAsString(1), + absl::StrCat("(", float{3.0}, ", ", float{4.0}, ")")); +} + +TEST_F(LiteralUtilTest, BroadcastVectorToMatrix0) { + std::unique_ptr literal = LiteralUtil::CreateR1({1, 2}); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), + /*dimensions=*/{0})); + EXPECT_EQ(*broadcasted_literal, + *LiteralUtil::CreateR2({{1, 1}, {2, 2}})); +} + +TEST_F(LiteralUtilTest, BroadcastVectorToMatrix1) { + std::unique_ptr literal = LiteralUtil::CreateR1({1, 2}); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), + /*dimensions=*/{1})); + EXPECT_EQ(*broadcasted_literal, + *LiteralUtil::CreateR2({{1, 2}, {1, 2}})); +} + +TEST_F(LiteralUtilTest, BroadcastScalarToMatrix) { + std::unique_ptr literal = LiteralUtil::CreateR0(9); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S32, {2, 2}), + /*dimensions=*/{})); + EXPECT_EQ(*broadcasted_literal, + *LiteralUtil::CreateR2({{9, 9}, {9, 9}})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 7563cc1e341be84264c8db66635b755404d183cf..613449cf10c785de55e8474c0ee35f78e8ed92b4 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -22,6 +22,9 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -30,37 +33,14 @@ limitations under the License. 
#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" -using tensorflow::strings::Printf; -using tensorflow::strings::StrCat; - namespace xla { - namespace { -constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; - -// Converts between little and big endian. -// -// Precondition: size % 2 == 0 (elements in the array are 16 bits long) -void ConvertEndianShort(string* bytes) { - CHECK_EQ(bytes->size() / 2, 0); - for (int64 i = 0; i < bytes->size(); i += 2) { - std::swap((*bytes)[i], (*bytes)[i + 1]); - } -} - -void ConvertEndianShort(char* bytes, int64 size) { - CHECK_EQ(size / 2, 0); - for (int64 i = 0; i < size; i += 2) { - std::swap(bytes[i], bytes[i + 1]); - } -} +using absl::StrCat; // Return a literal with all arrays of type FromNativeT converted to type // ToNativeT in the given literal. @@ -76,7 +56,7 @@ std::unique_ptr ConvertType(LiteralSlice literal) { primitive_util::NativeToPrimitiveType()); } }); - auto result = MakeUnique(result_shape); + auto result = absl::make_unique(result_shape); // Then copy over the data from 'literal' converting FromNativeT values to // ToNativeT values as necessary. 
@@ -103,498 +83,53 @@ std::unique_ptr ConvertType(LiteralSlice literal) { } // namespace -LiteralBase::~LiteralBase() {} - -std::ostream& operator<<(std::ostream& out, const Literal& literal) { - out << literal.ToString(); - return out; -} - -Literal::StrideConfig::StrideConfig( - const Shape& source_shape, const Shape& dest_shape, - tensorflow::gtl::ArraySlice dimensions) - : dimensions(dimensions), - base(dimensions.size(), 0), - step(dimensions.size(), 1) { - if (!dimensions.empty()) { - // Selects the shape with the largest minor dimension as the one upon - // which to run the tight stride loop. - if (dimensions[LayoutUtil::Minor(source_shape.layout(), 0)] >= - dimensions[LayoutUtil::Minor(dest_shape.layout(), 0)]) { - minor_dimension = LayoutUtil::Minor(source_shape.layout(), 0); - dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension); - } else { - minor_dimension = LayoutUtil::Minor(dest_shape.layout(), 0); - source_stride = - IndexUtil::GetDimensionStride(source_shape, minor_dimension); - } - minor_loop_size = dimensions[minor_dimension]; - step[minor_dimension] = minor_loop_size; - } -} - -Literal::Literal(const Shape& shape) - : Literal(shape, /*allocate_arrays=*/true) {} - -void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { - if (ShapeUtil::IsTuple(shape)) { - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - const Shape& subshape = shape.tuple_shapes(i); - - auto child_piece = Piece(); - child_piece.set_subshape(&subshape); - - SetPiece(subshape, &child_piece, allocate_arrays); - - piece->emplace_back(std::move(child_piece)); - } - } else { - CHECK(ShapeUtil::IsArray(shape)); - if (allocate_arrays) { - if (LayoutUtil::IsSparseArray(shape)) { - // For sparse arrays, the buffer must be of the size of the maximum - // number of sparse elements possible. 
- const int64 max_sparse_elements = - LayoutUtil::MaxSparseElements(shape.layout()); - piece->set_buffer( - new char[max_sparse_elements * - ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]); - piece->set_sparse_indices( - new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape))); - } else { - piece->set_buffer(new char[piece->size_bytes()]); - } - } - } -} - -Literal::Literal(const Shape& shape, bool allocate_arrays) - : LiteralBase(), shape_(MakeUnique(shape)) { - CHECK(LayoutUtil::HasLayout(*shape_)); - root_piece_ = new Piece(); - root_piece_->set_subshape(shape_.get()); - CHECK(&root_piece_->subshape() == shape_.get()); - - SetPiece(*shape_, root_piece_, allocate_arrays); -} - -Literal::~Literal() { - if (root_piece_ != nullptr) { - DeallocateBuffers(); - delete root_piece_; - } -} - -void Literal::DeallocateBuffers() { - root_piece_->ForEachMutableSubpiece( - [&](const ShapeIndex& index, Piece* piece) { - if (piece->buffer() != nullptr) { - delete[] piece->buffer(); - delete piece->sparse_indices(); - } - }); -} - -Literal::Literal(Literal&& other) : LiteralBase() { *this = std::move(other); } - -Literal& Literal::operator=(Literal&& other) { - DCHECK(&other.root_piece_->subshape() == other.shape_.get()); - using std::swap; - swap(shape_, other.shape_); - swap(root_piece_, other.root_piece_); - DCHECK(&root_piece_->subshape() == shape_.get()); - - return *this; -} - -std::unique_ptr LiteralBase::CreateFromShape(const Shape& shape) { - auto literal = MakeUnique(shape); - literal->root_piece_->ForEachMutableSubpiece( - [&](const ShapeIndex& index, Piece* piece) { - if (ShapeUtil::IsArray(piece->subshape())) { - memset(piece->untyped_data(), 0, piece->size_bytes()); - } - }); - return literal; -} - -const SparseIndexArray* LiteralBase::sparse_indices( - const ShapeIndex& shape_index) const { - return piece(shape_index).sparse_indices(); -} - -SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) { - return 
piece(shape_index).sparse_indices(); -} - -/* static */ std::unique_ptr Literal::CreateFromDimensions( - PrimitiveType primitive_type, - tensorflow::gtl::ArraySlice dimensions) { - return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions)); +/* static */ std::unique_ptr LiteralUtil::CreateFromDimensions( + PrimitiveType primitive_type, absl::Span dimensions) { + return Literal::CreateFromShape( + ShapeUtil::MakeShape(primitive_type, dimensions)); } -/* static */ std::unique_ptr Literal::ConvertBF16ToF32( +/* static */ std::unique_ptr LiteralUtil::ConvertBF16ToF32( const LiteralSlice& bf16_literal) { return ConvertType(bf16_literal); } -/* static */ std::unique_ptr Literal::ConvertF32ToBF16( +/* static */ std::unique_ptr LiteralUtil::ConvertF32ToBF16( const LiteralSlice& f32_literal) { return ConvertType(f32_literal); } -template -Status Literal::CopySliceFromInternal( - const LiteralBase& src_literal, tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size) { - TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size()); - TF_RET_CHECK(ShapeUtil::Rank(shape()) == dest_base.size()); - - auto linear_index = [](const Shape& shape, - tensorflow::gtl::ArraySlice multi_index) { - return IndexUtil::MultidimensionalIndexToLinearIndex(shape, multi_index); - }; - - if (ShapeUtil::Rank(src_literal.shape()) == 0 || - ShapeUtil::Rank(shape()) == 0) { - // If any of the two shapes are scalars, we can just call the StridedCopy() - // directly, and we know we will be copying only one value. - TF_RET_CHECK(copy_size.empty()); - StridedCopy(data(), linear_index(shape(), dest_base), 0, - src_literal.data(), - linear_index(src_literal.shape(), src_base), 0, 1); - } else if (!ShapeUtil::HasZeroElements(shape()) && - !ShapeUtil::HasZeroElements(src_literal.shape())) { - // Perform copy if neither src nor dest has dimensions with zero element, - // otherwise it's a no-op. 
- TF_RET_CHECK(src_base.size() == dest_base.size()); - TF_RET_CHECK(src_base.size() == copy_size.size()); - - // Scan the source from minor, stepping in copy size blocks, then within - // the index enumaration functor, do a strided copy advancing source index - // by one (walking through the minor dimension), and destination index by - // proper stride size at the matching dimension. - DimensionVector src_indexes(src_base.size(), 0); - DimensionVector dest_indexes(dest_base.size(), 0); - Literal::StrideConfig stride_config(src_literal.shape(), shape(), - copy_size); - - auto copy_proc = [&](tensorflow::gtl::ArraySlice indexes) { - // Map from multi-dimensional index, to source index. - std::transform(indexes.begin(), indexes.end(), src_base.begin(), - src_indexes.begin(), std::plus()); - // Map from multi-dimensional index, to destination index. - std::transform(indexes.begin(), indexes.end(), dest_base.begin(), - dest_indexes.begin(), std::plus()); - - int64 src_index = linear_index(src_literal.shape(), src_indexes); - int64 dest_index = linear_index(shape(), dest_indexes); - - // `this->` is needed to workaround MSVC bug: #16882 - StridedCopy(this->data(), dest_index, stride_config.dest_stride, - src_literal.data(), src_index, - stride_config.source_stride, stride_config.minor_loop_size); - return true; - }; - - ShapeUtil::ForEachIndex(src_literal.shape(), stride_config.base, - stride_config.dimensions, stride_config.step, - copy_proc); - } - return Status::OK(); -} - -Status Literal::CopyElementFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_index, - tensorflow::gtl::ArraySlice dest_index) { - DCHECK_EQ(shape().element_type(), src_literal.shape().element_type()); - const int64 src_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex( - src_literal.shape(), src_index); - const int64 dest_linear_index = - IndexUtil::MultidimensionalIndexToLinearIndex(shape(), dest_index); - const int64 primitive_size = - 
ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); - - char* dest_address = - static_cast(untyped_data()) + dest_linear_index * primitive_size; - const char* source_address = - static_cast(src_literal.untyped_data()) + - src_linear_index * primitive_size; - if (dest_address != source_address) { - memcpy(dest_address, source_address, primitive_size); - } - return Status::OK(); -} - -std::vector Literal::DecomposeTuple() { - CHECK(ShapeUtil::IsTuple(shape())); - std::vector elements; - for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) { - elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}), - /*allocate_arrays=*/false)); - Literal& element = elements.back(); - element.root_piece_->ForEachMutableSubpiece( - [&](const ShapeIndex& index, Piece* dest_piece) { - ShapeIndex src_index = {i}; - for (int64 j : index) { - src_index.push_back(j); - } - Piece& src_piece = piece(src_index); - - // Move the respective buffer and sparse indices over to the element - // Literal. - dest_piece->set_buffer(src_piece.buffer()); - src_piece.set_buffer(nullptr); - dest_piece->set_sparse_indices(src_piece.sparse_indices()); - src_piece.set_sparse_indices(nullptr); - }); - } - // Set this literal to be nil-shaped. - *this = Literal(); - return elements; -} - -/* static */ Literal Literal::MoveIntoTuple( - tensorflow::gtl::MutableArraySlice elements) { - std::vector element_shapes; - for (const Literal& element : elements) { - element_shapes.push_back(element.shape()); - } - Literal literal(ShapeUtil::MakeTupleShape(element_shapes), - /*allocate_arrays=*/false); - for (int i = 0; i < elements.size(); ++i) { - TF_CHECK_OK( - literal.MoveFrom(std::move(elements[i]), /*dest_shape_index=*/{i})); - } - return literal; -} - -namespace { - -// Copies the elements in 'src' to 'dest'. The shape and layout of the data in -// the array slices are indicated by dest_shape and src_shape respectively. 
-template -void CopyElementsBetween(tensorflow::gtl::MutableArraySlice dest, - tensorflow::gtl::ArraySlice src, - const Shape& dest_shape, const Shape& src_shape) { - CHECK(ShapeUtil::Compatible(dest_shape, src_shape)); - if (ShapeUtil::HasZeroElements(dest_shape)) { - return; - } - std::vector index(ShapeUtil::Rank(dest_shape)); - do { - dest[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, index)] = - src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)]; - } while (IndexUtil::BumpIndices(dest_shape, &index)); -} - -} // namespace - -Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { - CHECK(subshape_ != nullptr); - CHECK(src.subshape_ != nullptr); - if (ShapeUtil::Equal(subshape(), src.subshape())) { - // If the layouts are equal it's faster just to memcpy. - memcpy(buffer(), src.buffer(), src.size_bytes()); - } else { - TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape())); - std::vector origin(ShapeUtil::Rank(subshape()), 0); - switch (subshape().element_type()) { -#define COPY_ELEMENTS(XLA_T, NATIVE_T) \ - case (XLA_T): \ - CopyElementsBetween(data(), src.data(), \ - subshape(), src.subshape()); \ - break; - COPY_ELEMENTS(U8, uint8); - COPY_ELEMENTS(U16, uint16); - COPY_ELEMENTS(U32, uint32); - COPY_ELEMENTS(U64, uint64); - COPY_ELEMENTS(S8, int8); - COPY_ELEMENTS(S16, int16); - COPY_ELEMENTS(S32, int32); - COPY_ELEMENTS(S64, int64); - COPY_ELEMENTS(F16, half); - COPY_ELEMENTS(BF16, bfloat16); - COPY_ELEMENTS(F32, float); - COPY_ELEMENTS(F64, double); - COPY_ELEMENTS(C64, complex64); - COPY_ELEMENTS(PRED, bool); -#undef COPY_ELEMENTS - default: - return Unimplemented( - "Copying a Literal object with element type %s is not implemented.", - PrimitiveType_Name(subshape().element_type()).c_str()); - } - } - return Status::OK(); -} - -Status Literal::CopyFrom(const LiteralSlice& src_literal, - const ShapeIndex& dest_shape_index, - const ShapeIndex& src_shape_index) { - const Shape& dest_subshape = - 
ShapeUtil::GetSubshape(shape(), dest_shape_index); - const Shape& src_subshape = - ShapeUtil::GetSubshape(src_literal.shape(), src_shape_index); - if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) { - return InvalidArgument( - "Destination subshape incompatible with source subshape: %s vs %s", - ShapeUtil::HumanString(dest_subshape).c_str(), - ShapeUtil::HumanString(src_subshape).c_str()); - } - return root_piece_->ForEachMutableSubpieceWithStatus( - [&](const ShapeIndex& index, Piece* piece) { - if (!ShapeUtil::IsArray(piece->subshape())) { - return Status::OK(); - } - - // Determine if this index is in the part of this literal that we want - // to copy over from src_literal. - bool in_subtree_to_copy = true; - for (int i = 0; i < dest_shape_index.size(); ++i) { - if (index[i] != dest_shape_index[i]) { - in_subtree_to_copy = false; - break; - } - } - if (!in_subtree_to_copy) { - return Status::OK(); - } - // Construct the index of the corresponding piece in the source literal. - ShapeIndex src_piece_index = src_shape_index; - for (int64 i = dest_shape_index.size(); i < index.size(); ++i) { - src_piece_index.push_back(index[i]); - } - TF_RETURN_IF_ERROR(piece->CopyFrom(src_literal.piece(src_piece_index))); - return Status::OK(); - }); -} - -Status Literal::MoveFrom(Literal&& src_literal, - const ShapeIndex& dest_shape_index) { - const Shape& dest_subshape = - ShapeUtil::GetSubshape(shape(), dest_shape_index); - if (!ShapeUtil::Equal(dest_subshape, src_literal.shape())) { - return InvalidArgument( - "Destination subshape not equal to source shape: %s vs %s", - ShapeUtil::HumanString(dest_subshape).c_str(), - ShapeUtil::HumanString(src_literal.shape()).c_str()); - } - - src_literal.root_piece_->ForEachSubpiece( - [&](const ShapeIndex& src_index, const Piece& src_piece) { - if (!ShapeUtil::IsArray(src_piece.subshape())) { - return; - } - - ShapeIndex dest_index = dest_shape_index; - for (int64 i : src_index) { - dest_index.push_back(i); - } - Piece& dest_piece 
= piece(dest_index); - delete[] dest_piece.buffer(); - dest_piece.set_buffer(src_piece.buffer()); - delete dest_piece.sparse_indices(); - dest_piece.set_sparse_indices(src_piece.sparse_indices()); - }); - - src_literal.shape_ = MakeUnique(ShapeUtil::MakeNil()); - delete src_literal.root_piece_; - src_literal.root_piece_ = new LiteralBase::Piece(); - src_literal.root_piece_->set_subshape(src_literal.shape_.get()); - - return Status::OK(); -} - -Status Literal::CopySliceFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size) { - TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape()); - TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape())) - << ShapeUtil::HumanString(src_literal.shape()); - TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape())); - - switch (shape().element_type()) { - case U8: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case U16: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case U32: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case U64: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case S8: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case S16: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case S32: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case S64: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case F16: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case BF16: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case F32: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case F64: - return CopySliceFromInternal(src_literal, src_base, 
dest_base, - copy_size); - case C64: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - case PRED: - return CopySliceFromInternal(src_literal, src_base, dest_base, - copy_size); - default: - break; - } - return Unimplemented( - "Copying a slice from a Literal object with element type %d is not " - "implemented.", - shape().element_type()); +/* static */ std::unique_ptr LiteralUtil::CreateToken() { + return absl::make_unique(ShapeUtil::MakeTokenShape()); } -/* static */ Literal Literal::Zero(PrimitiveType primitive_type) { +/* static */ Literal LiteralUtil::Zero(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case U32: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case U64: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case S8: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case S32: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case S64: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case F16: - return std::move(*Literal::CreateR0(static_cast(0.0f))); + return std::move(*LiteralUtil::CreateR0(static_cast(0.0f))); case BF16: return std::move( - *Literal::CreateR0(static_cast(0.0f))); + *LiteralUtil::CreateR0(static_cast(0.0f))); case F32: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case F64: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case C64: - return std::move(*Literal::CreateR0(0)); + return std::move(*LiteralUtil::CreateR0(0)); case PRED: - return std::move(*Literal::CreateR0(false)); + return std::move(*LiteralUtil::CreateR0(false)); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; @@ -607,33 
+142,33 @@ Status Literal::CopySliceFrom(const LiteralSlice& src_literal, } } -/* static */ Literal Literal::One(PrimitiveType primitive_type) { +/* static */ Literal LiteralUtil::One(PrimitiveType primitive_type) { switch (primitive_type) { case U8: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case U32: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case U64: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case S8: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case S32: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case S64: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case F16: - return std::move(*Literal::CreateR0(static_cast(1.0f))); + return std::move(*LiteralUtil::CreateR0(static_cast(1.0f))); case BF16: return std::move( - *Literal::CreateR0(static_cast(1.0f))); + *LiteralUtil::CreateR0(static_cast(1.0f))); case F32: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case F64: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case C64: - return std::move(*Literal::CreateR0(1)); + return std::move(*LiteralUtil::CreateR0(1)); case PRED: - return std::move(*Literal::CreateR0(true)); + return std::move(*LiteralUtil::CreateR0(true)); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; @@ -646,44 +181,44 @@ Status Literal::CopySliceFrom(const LiteralSlice& src_literal, } } -/* static */ Literal Literal::MinValue(PrimitiveType primitive_type) { +/* static */ Literal LiteralUtil::MinValue(PrimitiveType primitive_type) { switch (primitive_type) { case U8: return std::move( - *Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case U32: return std::move( - 
*Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case U64: return std::move( - *Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case S8: return std::move( - *Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case S32: return std::move( - *Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case S64: return std::move( - *Literal::CreateR0(std::numeric_limits::min())); + *LiteralUtil::CreateR0(std::numeric_limits::min())); case F32: - return std::move( - *Literal::CreateR0(-std::numeric_limits::infinity())); + return std::move(*LiteralUtil::CreateR0( + -std::numeric_limits::infinity())); case F64: - return std::move( - *Literal::CreateR0(-std::numeric_limits::infinity())); + return std::move(*LiteralUtil::CreateR0( + -std::numeric_limits::infinity())); case C64: LOG(FATAL) << "C64 element type has no minimum value"; case PRED: - return std::move(*Literal::CreateR0(false)); + return std::move(*LiteralUtil::CreateR0(false)); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: - return std::move(*Literal::CreateR0( + return std::move(*LiteralUtil::CreateR0( static_cast(-std::numeric_limits::infinity()))); case BF16: - return std::move(*Literal::CreateR0( + return std::move(*LiteralUtil::CreateR0( static_cast(-std::numeric_limits::infinity()))); case TUPLE: LOG(FATAL) << "tuple element type has no minimum value"; @@ -694,42 +229,42 @@ Status Literal::CopySliceFrom(const LiteralSlice& src_literal, } } -/* static */ Literal Literal::MaxValue(PrimitiveType primitive_type) { +/* static */ Literal LiteralUtil::MaxValue(PrimitiveType primitive_type) { switch (primitive_type) { case U8: return std::move( - *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case U32: return std::move( 
- *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case U64: return std::move( - *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case S8: return std::move( - *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case S32: return std::move( - *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case S64: return std::move( - *Literal::CreateR0(std::numeric_limits::max())); + *LiteralUtil::CreateR0(std::numeric_limits::max())); case F32: - return std::move( - *Literal::CreateR0(std::numeric_limits::infinity())); + return std::move(*LiteralUtil::CreateR0( + std::numeric_limits::infinity())); case F64: - return std::move( - *Literal::CreateR0(std::numeric_limits::infinity())); + return std::move(*LiteralUtil::CreateR0( + std::numeric_limits::infinity())); case PRED: - return std::move(*Literal::CreateR0(true)); + return std::move(*LiteralUtil::CreateR0(true)); case S16: case U16: LOG(FATAL) << "u16/s16 literals not yet implemented"; case F16: - return std::move(*Literal::CreateR0( + return std::move(*LiteralUtil::CreateR0( static_cast(std::numeric_limits::infinity()))); case BF16: - return std::move(*Literal::CreateR0( + return std::move(*LiteralUtil::CreateR0( static_cast(std::numeric_limits::infinity()))); case TUPLE: LOG(FATAL) << "tuple element type has no maximum value"; @@ -740,27 +275,17 @@ Status Literal::CopySliceFrom(const LiteralSlice& src_literal, } } -/* static */ std::unique_ptr Literal::CreateR1( +/* static */ std::unique_ptr LiteralUtil::CreateR1( const tensorflow::core::Bitmap& values) { - auto literal = MakeUnique( + auto literal = absl::make_unique( ShapeUtil::MakeShape(PRED, {static_cast(values.bits())})); literal->PopulateR1(values); return literal; } -void Literal::PopulateR1(const tensorflow::core::Bitmap& values) { - 
CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(ShapeUtil::Rank(shape()), 1); - CHECK_EQ(element_count(), values.bits()); - CHECK_EQ(shape().element_type(), PRED); - for (int64 i = 0; i < static_cast(values.bits()); ++i) { - Set({i}, values.get(i)); - } -} - -/* static */ std::unique_ptr Literal::CreateR1U8( - tensorflow::StringPiece value) { - auto literal = MakeUnique( +/* static */ std::unique_ptr LiteralUtil::CreateR1U8( + absl::string_view value) { + auto literal = absl::make_unique( ShapeUtil::MakeShape(U8, {static_cast(value.size())})); for (int i = 0; i < value.size(); ++i) { literal->Set({i}, value[i]); @@ -768,119 +293,15 @@ void Literal::PopulateR1(const tensorflow::core::Bitmap& values) { return literal; } -/* static */ std::unique_ptr Literal::CreateR2F32Linspace(float from, - float to, - int64 rows, - int64 cols) { +/* static */ std::unique_ptr LiteralUtil::CreateR2F32Linspace( + float from, float to, int64 rows, int64 cols) { auto value = MakeLinspaceArray2D(from, to, rows, cols); return CreateR2FromArray2D(*value); } -std::unique_ptr LiteralBase::Relayout( - const Layout& new_layout, const ShapeIndex& shape_index) const { - // Create new shape with 'new_layout' set at the given shape index. 
- Shape new_shape = shape(); - Shape* subshape = ShapeUtil::GetMutableSubshape(&new_shape, shape_index); - TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(new_layout, *subshape)); - *subshape->mutable_layout() = new_layout; - auto result = MakeUnique(new_shape); - TF_CHECK_OK(result->CopyFrom(*this)); - return result; -} - -std::unique_ptr LiteralBase::Relayout( - const Shape& shape_with_layout) const { - CHECK(ShapeUtil::Compatible(shape_with_layout, shape())) - << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout) - << " not compatible with literal shape " - << ShapeUtil::HumanString(shape()); - std::unique_ptr result = CreateFromShape(shape_with_layout); - ShapeUtil::ForEachSubshape( - result->shape(), - [this, &result](const Shape& subshape, const ShapeIndex& index) { - if (ShapeUtil::IsArray(subshape)) { - TF_CHECK_OK(result->CopyFrom(*this, - /*dest_shape_index=*/index, - /*src_shape_index=*/index)); - } - }); - return result; -} - -StatusOr> LiteralBase::Broadcast( - const Shape& result_shape, - tensorflow::gtl::ArraySlice dimensions) const { - if (!ShapeUtil::IsArray(shape())) { - return InvalidArgument("Broadcast only supports arrays."); - } - - for (int64 i = 0; i < dimensions.size(); i++) { - TF_RET_CHECK(shape().dimensions(i) == - result_shape.dimensions(dimensions[i])); - } - - std::unique_ptr result = MakeUnique(result_shape); - - // scratch_source_index is temporary storage space for the computed index into - // the input literal. We put it here to avoid allocating an std::vector in - // every iteration of ShapeUtil::ForEachIndex. 
- std::vector scratch_source_index(shape().dimensions_size()); - - char* dest_data = static_cast(result->untyped_data()); - const char* source_data = static_cast(untyped_data()); - const int64 primitive_size = - ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); - - ShapeUtil::ForEachIndex( - result_shape, [&](tensorflow::gtl::ArraySlice output_index) { - for (int64 i = 0; i < dimensions.size(); ++i) { - scratch_source_index[i] = output_index[dimensions[i]]; - } - int64 dest_index = IndexUtil::MultidimensionalIndexToLinearIndex( - result_shape, output_index); - int64 source_index = IndexUtil::MultidimensionalIndexToLinearIndex( - shape(), scratch_source_index); - memcpy(dest_data + primitive_size * dest_index, - source_data + primitive_size * source_index, primitive_size); - return true; - }); - - return std::move(result); -} - -StatusOr> LiteralBase::Reshape( - tensorflow::gtl::ArraySlice dimensions) const { - if (!ShapeUtil::IsArray(shape())) { - return InvalidArgument("Reshape does not support tuples."); - } - std::unique_ptr output; - if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) { - output = - Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape()))); - } else { - output = CloneToUnique(); - } - // Because the layout is monotonic, we can simply reuse the same sequence of - // values without changing their order. 
- *output->mutable_shape_do_not_use() = - ShapeUtil::MakeShape(shape().element_type(), dimensions); - - int64 elements_before = ShapeUtil::ElementsIn(shape()); - int64 elements_after = ShapeUtil::ElementsIn(output->shape()); - if (elements_before != elements_after) { - return InvalidArgument( - "Shapes before and after Literal::Reshape have different numbers " - "of elements: %s vs %s.", - ShapeUtil::HumanString(shape()).c_str(), - ShapeUtil::HumanString(output->shape()).c_str()); - } - return std::move(output); -} - -/* static */ std::unique_ptr Literal::ReshapeSlice( - tensorflow::gtl::ArraySlice new_dimensions, - tensorflow::gtl::ArraySlice minor_to_major, - const LiteralSlice& literal) { +/* static */ std::unique_ptr LiteralUtil::ReshapeSlice( + absl::Span new_dimensions, + absl::Span minor_to_major, const LiteralSlice& literal) { int64 new_num_elements = 1; for (int64 i = 0; i < new_dimensions.size(); ++i) { new_num_elements *= new_dimensions[i]; @@ -888,7 +309,7 @@ StatusOr> LiteralBase::Reshape( CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements); CHECK_EQ(new_dimensions.size(), minor_to_major.size()); - auto new_literal = MakeUnique( + auto new_literal = absl::make_unique( ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions)); // Create a new shape with the given minor-to-major layout. This shape is used @@ -949,620 +370,100 @@ StatusOr> LiteralBase::Reshape( return new_literal; } -std::unique_ptr LiteralBase::Transpose( - tensorflow::gtl::ArraySlice permutation) const { - CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose"; - CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape()))) - << "Given permutation is not a permutation of dimension numbers"; - // To transpose the array, we just permute the dimensions and layout, and - // do a straight memory copy of the raw data set. - // This is considerably faster than iterating over every array element using - // the EachCell<>() and Set<>() APIs. 
- std::vector inverse_permutation = InversePermutation(permutation); - Shape permuted_shape = - ShapeUtil::PermuteDimensions(inverse_permutation, shape()); - // Replace the layout with one affine to this shape, such that a - // transpose operation can be performed by leaving the flat values - // representation intact. - // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation. - // The shape with affine layout resulting from that operation will be - // F32[8,11]{0,1}, since it leaves the original most minor (the 8 sized), the - // most minor. - // - // Essentially, given MinMaj(Di) the position of the Di dimension within the - // minor to major vector, and given T(Di) the index that the original Di - // dimension has within the transposed array, a layout is affine if - // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major - // vector of the affine layout. - CHECK(LayoutUtil::IsDenseArray(permuted_shape)); - Layout* layout = permuted_shape.mutable_layout(); - layout->clear_minor_to_major(); - for (auto index : LayoutUtil::MinorToMajor(shape())) { - layout->add_minor_to_major(inverse_permutation[index]); - } - auto new_literal = MakeUnique(permuted_shape); - DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal->shape()), - ShapeUtil::ByteSizeOf(shape())); - std::memcpy(new_literal->untyped_data(), untyped_data(), size_bytes()); - return new_literal; -} - -std::unique_ptr LiteralBase::Slice( - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) const { - CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice"; - - DimensionVector result_dimensions; - for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) { - CHECK_GE(start_indices[dnum], 0); - CHECK_LE(limit_indices[dnum], shape().dimensions(dnum)) - << "dnum = " << dnum; - int64 dimension = limit_indices[dnum] - start_indices[dnum]; - CHECK_GE(dimension, 0) << "dnum = " << dnum; - result_dimensions.push_back(dimension); - } - 
const auto result_shape = - ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions, - LayoutUtil::MinorToMajor(shape())); - - auto result_literal = MakeUnique(result_shape); - - DimensionVector new_indices(ShapeUtil::Rank(result_shape)); - switch (result_shape.element_type()) { - case F32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, float /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - float value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; - case C64: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, complex64 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - complex64 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; - case S32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, int32 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - int32 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; - case U32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, uint32 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - uint32 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; - default: - LOG(FATAL) << "not yet implemented: " - << PrimitiveType_Name(result_shape.element_type()); - } -} - -Literal LiteralBase::Clone() const { - Literal result(shape()); - TF_CHECK_OK(result.CopyFrom(*this)); - return result; -} - -std::unique_ptr LiteralBase::CloneToUnique() const { - auto result = MakeUnique(shape()); - TF_CHECK_OK(result->CopyFrom(*this)); - return result; -} - -string 
LiteralBase::GetAsString(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) const { - const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); - CHECK(LayoutUtil::IsDenseArray(subshape)); - switch (subshape.element_type()) { +/* static */ Literal LiteralUtil::GetFirstScalarLiteral( + const LiteralSlice& literal) { + CHECK(ShapeUtil::IsArray(literal.shape())); + CHECK_GT(ShapeUtil::ElementsIn(literal.shape()), 0); + switch (literal.shape().element_type()) { case PRED: - return Get(multi_index, shape_index) ? "true" : "false"; + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); + // 8 bit types. case S8: - return StrCat(Get(multi_index, shape_index)); - case S16: - return StrCat(Get(multi_index, shape_index)); - case S32: - return StrCat(Get(multi_index, shape_index)); - case S64: - return StrCat(Get(multi_index, shape_index)); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case U8: - return StrCat(Get(multi_index, shape_index)); - case U16: - return StrCat(Get(multi_index, shape_index)); - case U32: - return StrCat(Get(multi_index, shape_index)); - case U64: - return StrCat(Get(multi_index, shape_index)); - case F16: - return StrCat(static_cast(Get(multi_index, shape_index))); - case F32: - return StrCat(Get(multi_index, shape_index)); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); + // 16 bit types. 
case BF16: - return StrCat( - static_cast(Get(multi_index, shape_index))); - case F64: - return StrCat(Get(multi_index, shape_index)); - case C64: { - complex64 c = Get(multi_index, shape_index); - return StrCat("(", c.real(), ", ", c.imag(), ")"); - } - default: - LOG(FATAL) << PrimitiveType_Name(subshape.element_type()); - } -} - -string LiteralBase::GetSparseElementAsString( - int64 sparse_element_number, const ShapeIndex& shape_index) const { - const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); - CHECK(LayoutUtil::IsSparseArray(subshape)); - switch (subshape.element_type()) { - case PRED: - return GetSparseElement(sparse_element_number, shape_index) - ? "true" - : "false"; - case S8: - return StrCat(GetSparseElement(sparse_element_number, shape_index)); + return std::move(*LiteralUtil::CreateR0( + literal.GetFirstElement())); + case F16: + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case S16: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case S32: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case S64: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case U8: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case U16: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case U32: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case U64: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case F16: - return StrCat(static_cast( - GetSparseElement(sparse_element_number, shape_index))); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); + // 32 bit types. 
case F32: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case BF16: - return StrCat(static_cast( - GetSparseElement(sparse_element_number, shape_index))); - case F64: - return StrCat( - GetSparseElement(sparse_element_number, shape_index)); - case C64: { - complex64 c = - GetSparseElement(sparse_element_number, shape_index); - return StrCat("(", c.real(), ", ", c.imag(), ")"); - } - default: - LOG(FATAL) << "Invalid element type for sparse arrays: " - << PrimitiveType_Name(subshape.element_type()); - } -} - -StatusOr LiteralBase::GetIntegralAsS64( - tensorflow::gtl::ArraySlice multi_index) const { - CHECK(LayoutUtil::IsDenseArray(shape())); - switch (shape().element_type()) { - case PRED: - return Get(multi_index); - case U8: - return Get(multi_index); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case S32: - return Get(multi_index); - case S64: - return Get(multi_index); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case U32: - return Get(multi_index); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); + // 64 bit types. 
+ case C64: + return std::move(*LiteralUtil::CreateR0( + literal.GetFirstElement())); + case F64: + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); + case S64: + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); case U64: - return Get(multi_index); + return std::move( + *LiteralUtil::CreateR0(literal.GetFirstElement())); default: - return FailedPrecondition( - "Array element type is not integral: %s", - PrimitiveType_Name(shape().element_type()).c_str()); + LOG(FATAL) << "Unhandled primitive type " + << literal.shape().element_type(); } } -size_t LiteralBase::Hash() const { - using tensorflow::Hash64; - using tensorflow::Hash64Combine; - - size_t hash_value = ShapeUtil::Hash(shape()); - - ShapeUtil::ForEachSubshape( - shape(), [&](const Shape& subshape, const ShapeIndex& index) { - if (ShapeUtil::IsTuple(subshape)) { - return; - } - - CHECK(LayoutUtil::IsDense(subshape.layout())); - hash_value = Hash64Combine( - hash_value, Hash64(static_cast(untyped_data(index)), - size_bytes(index))); - }); - - return hash_value; +/* static */ std::unique_ptr LiteralUtil::MakeTuple( + absl::Span elements) { + std::vector element_shapes; + for (const auto* element : elements) { + element_shapes.push_back(element->shape()); + } + auto literal = + absl::make_unique(ShapeUtil::MakeTupleShape(element_shapes)); + for (int i = 0; i < elements.size(); ++i) { + TF_CHECK_OK(literal->CopyFrom(*elements[i], /*dest_shape_index=*/{i})); + } + return literal; } -Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, - int64 value) { - CHECK(LayoutUtil::IsDenseArray(shape())); - switch (shape().element_type()) { - case PRED: - Set(multi_index, value); - break; - case U8: - Set(multi_index, value); - break; - case S32: - Set(multi_index, value); - break; - case S64: - Set(multi_index, value); - break; - case U32: - Set(multi_index, value); - break; - case U64: - Set(multi_index, value); - break; - default: - return 
FailedPrecondition( - "Array element type is not integral: %s", - PrimitiveType_Name(shape().element_type()).c_str()); - } - return Status::OK(); -} - -tensorflow::gtl::ArraySlice LiteralBase::GetSparseIndex( - int64 sparse_element_number, const ShapeIndex& shape_index) const { - const Piece& p = piece(shape_index); - CHECK_GE(sparse_element_number, 0); - CHECK_LT(sparse_element_number, p.sparse_indices()->index_count()); - return p.sparse_indices()->At(sparse_element_number); -} - -void Literal::SortSparseElements(const ShapeIndex& shape_index) { - piece(shape_index).SortSparseElements(); -} - -Literal LiteralBase::GetFirstScalarLiteral() const { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_GT(ShapeUtil::ElementsIn(shape()), 0); - switch (shape().element_type()) { - case PRED: - return std::move(*Literal::CreateR0(GetFirstElement())); - // 8 bit types. - case S8: - return std::move(*Literal::CreateR0(GetFirstElement())); - case U8: - return std::move(*Literal::CreateR0(GetFirstElement())); - // 16 bit types. - case BF16: - return std::move( - *Literal::CreateR0(GetFirstElement())); - case F16: - return std::move(*Literal::CreateR0(GetFirstElement())); - case S16: - return std::move(*Literal::CreateR0(GetFirstElement())); - case U16: - return std::move(*Literal::CreateR0(GetFirstElement())); - // 32 bit types. - case F32: - return std::move(*Literal::CreateR0(GetFirstElement())); - case S32: - return std::move(*Literal::CreateR0(GetFirstElement())); - case U32: - return std::move(*Literal::CreateR0(GetFirstElement())); - // 64 bit types. 
- case C64: - return std::move( - *Literal::CreateR0(GetFirstElement())); - case F64: - return std::move(*Literal::CreateR0(GetFirstElement())); - case S64: - return std::move(*Literal::CreateR0(GetFirstElement())); - case U64: - return std::move(*Literal::CreateR0(GetFirstElement())); - default: - LOG(FATAL) << "Unhandled primitive type " << shape().element_type(); - } -} - -void LiteralBase::Piece::SortSparseElements() { - switch (subshape().element_type()) { - case PRED: - SortSparseElementsInternal(); - break; - case S8: - SortSparseElementsInternal(); - break; - case U8: - SortSparseElementsInternal(); - break; - case S16: - SortSparseElementsInternal(); - break; - case U16: - SortSparseElementsInternal(); - break; - case S32: - SortSparseElementsInternal(); - break; - case U32: - SortSparseElementsInternal(); - break; - case S64: - SortSparseElementsInternal(); - break; - case U64: - SortSparseElementsInternal(); - break; - case F32: - SortSparseElementsInternal(); - break; - case F64: - SortSparseElementsInternal(); - break; - case C64: - SortSparseElementsInternal(); - break; - case F16: - SortSparseElementsInternal(); - break; - case BF16: - SortSparseElementsInternal(); - break; - default: - LOG(FATAL) << "Element type not valid for sparse array: " - << PrimitiveType_Name(subshape().element_type()); - } -} - -template -void LiteralBase::Piece::SortSparseElementsInternal() { - CHECK(LayoutUtil::IsSparseArray(subshape())); - int64 num_elements = sparse_indices()->index_count(); - auto values = data(); - CHECK_LE(num_elements, values.size()); - sparse_indices()->SortWithValues( - tensorflow::gtl::MutableArraySlice(values.data(), num_elements)); -} - -namespace { - -void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, - bool print_layout, std::vector* pieces) { - const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); - CHECK(LayoutUtil::HasLayout(literal.shape())); - CHECK(LayoutUtil::HasLayout(subshape)); - - 
auto shape_to_string = [print_layout](const Shape& shape) { - if (print_layout) { - return ShapeUtil::HumanStringWithLayout(shape); - } else { - return ShapeUtil::HumanString(shape); - } - }; - - // TODO(b/32894291): refactor this code to reduce code duplication. - if (ShapeUtil::IsTuple(subshape)) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" (\n"); - std::vector tuple_pieces; - for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) { - ShapeIndex element_index = shape_index; - element_index.push_back(i); - std::vector element_pieces; - ToStringHelper(literal, element_index, print_layout, &element_pieces); - tuple_pieces.push_back(tensorflow::str_util::Join(element_pieces, "")); - } - pieces->push_back(tensorflow::str_util::Join(tuple_pieces, ",\n")); - pieces->push_back("\n)"); - return; - } - - if (LayoutUtil::IsSparseArray(subshape)) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back("{"); - int64 rank = ShapeUtil::Rank(subshape); - int64 num_elements = literal.sparse_element_count(); - for (int64 i = 0; i < num_elements; ++i) { - if (i > 0) { - pieces->push_back(", "); - } - if (rank == 1) { - pieces->push_back(StrCat(literal.GetSparseIndex(i)[0])); - pieces->push_back(": "); - } else { - pieces->push_back("["); - pieces->push_back( - tensorflow::str_util::Join(literal.GetSparseIndex(i), ", ")); - pieces->push_back("]: "); - } - pieces->push_back(literal.GetSparseElementAsString(i)); - } - pieces->push_back("}"); - return; - } - - CHECK(LayoutUtil::IsDenseArray(subshape)); - - auto element_to_string = - [&](tensorflow::gtl::ArraySlice indices) -> string { - PrimitiveType element_type = subshape.element_type(); - if (element_type == PRED) { - // We display predicates in a densely packed form. - return literal.Get(indices, shape_index) ? "1" : "0"; - } - return ((!indices.empty() && indices.back() > 0) ? 
", " : "") + - literal.GetAsString(indices, shape_index); - }; - - if (ShapeUtil::Rank(subshape) == 0) { - pieces->push_back(literal.GetAsString({}, shape_index)); - } else if (ShapeUtil::Rank(subshape) == 1) { - pieces->push_back("{"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(element_to_string({i0})); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 2) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(" { "); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(element_to_string({i0, i1})); - } - pieces->push_back(" "); - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n"); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 3) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(i0 > 0 ? ",\n{" : "{"); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(i1 > 0 ? ",\n { " : " { "); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(element_to_string({i0, i1, i2})); - } - pieces->push_back(" }"); - } - pieces->push_back(" }"); - } - pieces->push_back("\n}"); - } else if (ShapeUtil::Rank(subshape) == 4) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(Printf(" { /*i0=%lld*/\n", i0)); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(Printf(" { /*i1=%lld*/\n", i1)); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(" {"); - for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { - pieces->push_back(element_to_string({i0, i1, i2, i3})); - } - pieces->push_back(i2 == subshape.dimensions(2) - 1 ? 
"}\n" : "},\n"); - } - pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? " }\n" : " },\n"); - } - pieces->push_back("}"); - } else if (ShapeUtil::Rank(subshape) == 5) { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {\n"); - for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) { - pieces->push_back(Printf(" { /*i0=%lld*/\n", i0)); - for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) { - pieces->push_back(Printf(" { /*i1=%lld*/\n", i1)); - for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) { - pieces->push_back(Printf(" { /*i2=%lld*/\n", i2)); - for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) { - pieces->push_back(" {"); - for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) { - pieces->push_back(element_to_string({i0, i1, i2, i3, i4})); - } - pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n" - : "},\n"); - } - pieces->push_back(i2 == subshape.dimensions(2) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i1 == subshape.dimensions(1) - 1 ? " }\n" - : " },\n"); - } - pieces->push_back(i0 == subshape.dimensions(0) - 1 ? 
" }\n" : " },\n"); - } - pieces->push_back("}"); - } else { - pieces->push_back(shape_to_string(subshape)); - pieces->push_back(" {"); - literal.EachCellAsString( - [&](tensorflow::gtl::ArraySlice indices, const string& value) { - pieces->push_back(" "); - pieces->push_back(value); - }); - pieces->push_back("}"); - } -} - -} // namespace - -int64 LiteralBase::sparse_element_count() const { - CHECK(LayoutUtil::IsSparseArray(shape())); - return sparse_indices()->index_count(); -} - -string LiteralBase::ToString(bool print_layout) const { - std::vector pieces; - CHECK(LayoutUtil::HasLayout(this->shape())); - ToStringHelper(*this, {}, print_layout, &pieces); - return tensorflow::str_util::Join(pieces, ""); -} - -/* static */ std::unique_ptr Literal::MakeTuple( - tensorflow::gtl::ArraySlice elements) { - std::vector element_shapes; - for (const auto* element : elements) { - element_shapes.push_back(element->shape()); - } - auto literal = MakeUnique(ShapeUtil::MakeTupleShape(element_shapes)); - for (int i = 0; i < elements.size(); ++i) { - TF_CHECK_OK(literal->CopyFrom(*elements[i], /*dest_shape_index=*/{i})); - } - return literal; -} - -/* static */ std::unique_ptr Literal::MakeTupleFromSlices( - tensorflow::gtl::ArraySlice elements) { +/* static */ std::unique_ptr LiteralUtil::MakeTupleFromSlices( + absl::Span elements) { std::vector element_shapes; for (const auto& element : elements) { element_shapes.push_back(element.shape()); } - auto literal = MakeUnique(ShapeUtil::MakeTupleShape(element_shapes)); + auto literal = + absl::make_unique(ShapeUtil::MakeTupleShape(element_shapes)); for (int i = 0; i < elements.size(); ++i) { TF_CHECK_OK(literal->CopyFrom(elements[i], /*dest_shape_index=*/{i})); } return literal; } -/* static */ std::unique_ptr Literal::MakeTupleOwned( +/* static */ std::unique_ptr LiteralUtil::MakeTupleOwned( std::vector> elements) { std::vector element_shapes; element_shapes.reserve(elements.size()); for (const auto& element : elements) { 
element_shapes.push_back(element->shape()); } - auto literal = MakeUnique(ShapeUtil::MakeTupleShape(element_shapes)); + auto literal = + absl::make_unique(ShapeUtil::MakeTupleShape(element_shapes)); for (int64 i = 0; i < elements.size(); ++i) { TF_CHECK_OK( literal->MoveFrom(std::move(*elements[i]), /*dest_shape_index=*/{i})); @@ -1570,819 +471,9 @@ string LiteralBase::ToString(bool print_layout) const { return literal; } -void LiteralBase::EachCellAsString( - const std::function indices, - const string& value)>& per_cell) const { - if (ShapeUtil::HasZeroElements(shape())) { - return; - } - std::vector indices = IndexUtil::LinearIndexToMultidimensionalIndex( - shape(), /*linear_index=*/0); - do { - per_cell(indices, GetAsString(indices)); - } while (IndexUtil::BumpIndices(shape(), &indices)); -} - -namespace { -template -std::unique_ptr ConvertBetweenNativeTypesWithConverter( - const LiteralBase& src_literal, const ConverterType& converter) { - CHECK(ShapeUtil::IsArray(src_literal.shape())); - auto result_literal = MakeUnique(ShapeUtil::ChangeElementType( - src_literal.shape(), - primitive_util::NativeToPrimitiveType())); - auto src_data = src_literal.data(); - auto dest_data = result_literal->template data(); - int64 num_elements = src_literal.element_count(); - - for (int64 i = 0; i < num_elements; ++i) { - dest_data[i] = converter(src_data[i]); - } - return result_literal; -} - -template -std::unique_ptr ConvertBetweenNativeTypes( - const LiteralBase& src_literal) { - auto converter = [](NativeSrcT src) { return static_cast(src); }; - return ConvertBetweenNativeTypesWithConverter( - src_literal, converter); -} - -template -typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)), - std::unique_ptr>::type -BitcastBetweenNativeTypes(const LiteralBase& src_literal) { - auto converter = [](NativeSrcT src) { - return tensorflow::bit_cast(src); - }; - return ConvertBetweenNativeTypesWithConverter( - src_literal, converter); -} - -// This template 
specialization is here to make the compiler happy. bit_cast has -// a static check that the types are the same size. This specialization should -// never be used because the source and destination types are checked for -// identical sizes higher up. -template -typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)), - std::unique_ptr>::type -BitcastBetweenNativeTypes(const LiteralBase& src_literal) { - LOG(FATAL) << "Invalid bitcast between types of different sizes."; -} - -template -std::unique_ptr ConvertToC64(const LiteralBase& src_literal) { - CHECK(ShapeUtil::IsArray(src_literal.shape())); - auto result_literal = MakeUnique( - ShapeUtil::ChangeElementType(src_literal.shape(), C64)); - using NativeSrcT = - typename primitive_util::PrimitiveTypeToNative::type; - tensorflow::gtl::ArraySlice src_data = - src_literal.data(); - tensorflow::gtl::MutableArraySlice dest_data = - result_literal->data(); - int64 num_elements = src_literal.element_count(); - for (int64 i = 0; i < num_elements; ++i) { - dest_data[i] = complex64(static_cast(src_data[i]), 0); - } - return result_literal; -} - -template -std::unique_ptr ConvertIfTypesMatch(const LiteralBase& src_literal, - bool bitcast) { - CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); - if (bitcast) { - return BitcastBetweenNativeTypes< - typename primitive_util::PrimitiveTypeToNative< - primitive_src_type>::type, - typename primitive_util::PrimitiveTypeToNative< - primitive_dest_type>::type>(src_literal); - } else { - return ConvertBetweenNativeTypes< - typename primitive_util::PrimitiveTypeToNative< - primitive_src_type>::type, - typename primitive_util::PrimitiveTypeToNative< - primitive_dest_type>::type>(src_literal); - } -} - -template -StatusOr> ConvertIfDestTypeMatches( - const LiteralBase& src_literal, PrimitiveType primitive_dest_type, - bool bitcast) { - switch (primitive_dest_type) { -#define CONVERT_IF_TYPES_MATCH(type) \ - case (type): \ - return ConvertIfTypesMatch(src_literal, \ 
- bitcast); - CONVERT_IF_TYPES_MATCH(PRED) - CONVERT_IF_TYPES_MATCH(S8) - CONVERT_IF_TYPES_MATCH(S32) - CONVERT_IF_TYPES_MATCH(S64) - CONVERT_IF_TYPES_MATCH(U8) - CONVERT_IF_TYPES_MATCH(U32) - CONVERT_IF_TYPES_MATCH(U64) - CONVERT_IF_TYPES_MATCH(F16) - CONVERT_IF_TYPES_MATCH(F32) - CONVERT_IF_TYPES_MATCH(F64) - CONVERT_IF_TYPES_MATCH(BF16) -#undef CONVERT_IF_TYPES_MATCH - case C64: - if (!bitcast) { - return ConvertToC64(src_literal); - } - break; - // Other types are not yet supported. - default: - break; - } - return Unimplemented( - "Converting from type %s to type %s is not implemented.", - PrimitiveType_Name(src_literal.shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str()); -} - -StatusOr> ConvertSwitch( - const LiteralBase& literal, PrimitiveType primitive_dest_type, - bool bitcast) { - TF_RET_CHECK(ShapeUtil::IsArray(literal.shape())); - if (literal.shape().element_type() == primitive_dest_type) { - return literal.CloneToUnique(); - } - switch (literal.shape().element_type()) { -#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ - case (type): \ - return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \ - bitcast); - CONVERT_IF_DEST_TYPE_MATCHES(PRED) - CONVERT_IF_DEST_TYPE_MATCHES(S8) - CONVERT_IF_DEST_TYPE_MATCHES(S32) - CONVERT_IF_DEST_TYPE_MATCHES(S64) - CONVERT_IF_DEST_TYPE_MATCHES(U8) - CONVERT_IF_DEST_TYPE_MATCHES(U32) - CONVERT_IF_DEST_TYPE_MATCHES(U64) - CONVERT_IF_DEST_TYPE_MATCHES(F16) - CONVERT_IF_DEST_TYPE_MATCHES(F32) - CONVERT_IF_DEST_TYPE_MATCHES(F64) - CONVERT_IF_DEST_TYPE_MATCHES(BF16) -#undef CONVERT_IF_DEST_TYPE_MATCHES - // Other types are not yet supported. - default: - return Unimplemented( - "%s from type %s to type %s is not implemented.", - (bitcast ? 
"Bitcast converting" : "Converting"), - PrimitiveType_Name(literal.shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str()); - } -} - -} // namespace - -StatusOr> LiteralBase::Convert( - PrimitiveType primitive_dest_type) const { - return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false); -} - -StatusOr> LiteralBase::BitcastConvert( - PrimitiveType primitive_dest_type) const { - if (primitive_util::BitWidth(shape().element_type()) != - primitive_util::BitWidth(primitive_dest_type)) { - return InvalidArgument( - "Cannot bitcast convert from %s to %s, bit widths are different: %d != " - "%d", - PrimitiveType_Name(shape().element_type()).c_str(), - PrimitiveType_Name(primitive_dest_type).c_str(), - primitive_util::BitWidth(shape().element_type()), - primitive_util::BitWidth(primitive_dest_type)); - } - return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true); -} - -StatusOr> LiteralBase::ConvertToShape( - const Shape& dest_shape, bool round_f32_to_bf16) const { - if (!ShapeUtil::IsTuple(dest_shape)) { - if (round_f32_to_bf16 && shape().element_type() == F32 && - dest_shape.element_type() == BF16) { - auto converter = [](float src) { - return tensorflow::bfloat16::round_to_bfloat16(src); - }; - return ConvertBetweenNativeTypesWithConverter(*this, - converter); - } - return Convert(dest_shape.element_type()); - } - std::vector elements; - for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) { - auto element = LiteralSlice(*this, {i}); - TF_ASSIGN_OR_RETURN( - auto new_element, - element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i}))); - elements.push_back(std::move(*new_element)); - } - auto converted = MakeUnique(); - *converted = Literal::MoveIntoTuple(&elements); - return std::move(converted); -} - -template -bool LiteralBase::Piece::EqualElementsInternal( - const LiteralBase::Piece& other, std::vector* multi_index) const { - if (multi_index->size() == ShapeUtil::Rank(subshape())) { - return 
(Get(*multi_index) == other.Get(*multi_index)); - } - for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) { - multi_index->push_back(i); - if (!EqualElementsInternal(other, multi_index)) { - return false; - } - multi_index->pop_back(); - } - return true; -} - -bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { - DCHECK(ShapeUtil::Compatible(subshape(), other.subshape())); - - std::vector multi_index; - switch (subshape().element_type()) { - case PRED: - return EqualElementsInternal(other, &multi_index); - case U8: - return EqualElementsInternal(other, &multi_index); - case S32: - return EqualElementsInternal(other, &multi_index); - case S64: - return EqualElementsInternal(other, &multi_index); - case U32: - return EqualElementsInternal(other, &multi_index); - case U64: - return EqualElementsInternal(other, &multi_index); - case F32: - return EqualElementsInternal(other, &multi_index); - case F64: - return EqualElementsInternal(other, &multi_index); - case F16: - return EqualElementsInternal(other, &multi_index); - case BF16: - return EqualElementsInternal(other, &multi_index); - case C64: - return EqualElementsInternal(other, &multi_index); - default: - LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type " - << PrimitiveType_Name(subshape().element_type()); - } -} - -bool LiteralBase::operator==(const LiteralBase& other) const { - if (!ShapeUtil::Compatible(shape(), other.shape())) { - return false; - } - - return root_piece().ForEachSubpieceWithBool( - [&](const ShapeIndex& index, const Piece& piece) { - if (!ShapeUtil::IsArray(piece.subshape())) { - return true; - } - - const Piece& other_piece = other.piece(index); - if (!piece.EqualElements(other_piece)) { - return false; - } - return true; - }); -} - -namespace { - -template -static bool AllElementsEqualValue(tensorflow::gtl::ArraySlice data, - NativeT value) { - for (int64 i = 0; i < data.size(); ++i) { - if (data[i] != value) { - return 
false; - } - } - return true; -} - -} // namespace - -bool LiteralBase::IsAll(int8 value) const { - return root_piece().ForEachSubpieceWithBool([&](const ShapeIndex& index, - const Piece& piece) { - if (!ShapeUtil::IsArray(piece.subshape())) { - return true; - } - - auto piece_is_all = [&]() { - switch (shape().element_type()) { - case U8: - if (value >= 0) { - return AllElementsEqualValue(piece.data(), value); - } - return false; - case U32: - if (value >= 0) { - return AllElementsEqualValue(piece.data(), value); - } - return false; - case U64: - if (value >= 0) { - return AllElementsEqualValue(piece.data(), value); - } - return false; - case S8: - return AllElementsEqualValue(piece.data(), value); - case S32: - return AllElementsEqualValue(piece.data(), value); - case S64: - return AllElementsEqualValue(piece.data(), value); - case F32: - return AllElementsEqualValue(piece.data(), value); - case F64: - return AllElementsEqualValue(piece.data(), value); - case F16: - return AllElementsEqualValue(piece.data(), - static_cast(value)); - case BF16: - return AllElementsEqualValue(piece.data(), - static_cast(value)); - case PRED: - if (value == 0) { - return AllElementsEqualValue(piece.data(), false); - } - if (value == 1) { - return AllElementsEqualValue(piece.data(), true); - } - return false; - default: - return false; - } - return false; - }; - - if (!piece_is_all()) { - return false; - } - return true; - }); -} - -bool LiteralBase::IsAllFloat(float value) const { - return root_piece().ForEachSubpieceWithBool( - [&](const ShapeIndex& index, const Piece& piece) { - if (!ShapeUtil::IsArray(piece.subshape())) { - return true; - } - - auto piece_is_all = [&]() { - switch (shape().element_type()) { - case F32: - return AllElementsEqualValue(piece.data(), value); - case F64: - return AllElementsEqualValue(piece.data(), value); - case F16: - return AllElementsEqualValue(piece.data(), - static_cast(value)); - case BF16: - return AllElementsEqualValue( - piece.data(), 
static_cast(value)); - default: - return false; - } - }; - if (!piece_is_all()) { - return false; - } - return true; - }); -} - -bool LiteralBase::IsAllComplex(complex64 value) const { - switch (shape().element_type()) { - case C64: - return AllElementsEqualValue(root_piece().data(), - value); - default: - return false; - } -} - -bool LiteralBase::IsAllFirst() const { - return root_piece().ForEachSubpieceWithBool( - [&](const ShapeIndex& index, const Piece& piece) { - if (!ShapeUtil::IsArray(piece.subshape())) { - return true; - } - - // Empty shapes are not all the first element since there is no first - // element. - if (ShapeUtil::HasZeroElements(piece.subshape())) { - return false; - } - auto piece_is_all = [&]() { - switch (piece.subshape().element_type()) { - case PRED: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 8 bit types - case S8: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U8: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 16 bit types - case BF16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case F16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 32 bit types - case F32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 64 bit types - case C64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case F64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S64: { - auto data = 
piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - default: - return false; - } - }; - - if (!piece_is_all()) { - return false; - } - return true; - }); -} - -bool LiteralBase::IsZero(tensorflow::gtl::ArraySlice indices) const { - CHECK(ShapeUtil::IsArray(shape())); - switch (shape().element_type()) { - case U8: - return Get(indices) == 0; - case U32: - return Get(indices) == 0; - case U64: - return Get(indices) == 0; - case S8: - return Get(indices) == 0; - case S32: - return Get(indices) == 0; - case S64: - return Get(indices) == 0; - case F32: - return Get(indices) == 0.0f; - case F64: - return Get(indices) == 0.0; - case C64: - return Get(indices) == complex64(0.0f, 0.0f); - case F16: - return Get(indices) == static_cast(0.0f); - case BF16: - return Get(indices) == static_cast(0.0f); - case PRED: - return Get(indices) == false; - default: - LOG(FATAL) << "Input literal must be an array."; - } -} - -namespace { - -template -void CopyToRepeatedField(RepeatedFieldT* dest, - const tensorflow::gtl::ArraySlice src) { - *dest = RepeatedFieldT(src.begin(), src.end()); -} - -} // namespace - -void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { - *proto->mutable_shape() = subshape(); - switch (subshape().element_type()) { - case PRED: - CopyToRepeatedField(proto->mutable_preds(), data()); - break; - case U8: - proto->set_u8s(static_cast(data().data()), - element_count()); - break; - case U32: - CopyToRepeatedField(proto->mutable_u32s(), data()); - break; - case U64: - CopyToRepeatedField(proto->mutable_u64s(), data()); - break; - case S32: - CopyToRepeatedField(proto->mutable_s32s(), data()); - break; - case S64: - CopyToRepeatedField(proto->mutable_s64s(), data()); - break; - case F16: - *proto->mutable_f16s() = string( - reinterpret_cast(data().data()), size_bytes()); - if (!kLittleEndian) { - ConvertEndianShort(proto->mutable_f16s()); - } - 
break; - case BF16: - *proto->mutable_bf16s() = string( - reinterpret_cast(data().data()), size_bytes()); - if (!kLittleEndian) { - ConvertEndianShort(proto->mutable_bf16s()); - } - break; - case F32: - CopyToRepeatedField(proto->mutable_f32s(), data()); - break; - case F64: - CopyToRepeatedField(proto->mutable_f64s(), data()); - break; - case C64: - for (complex64 value : data()) { - proto->add_c64s(value.real()); - proto->add_c64s(value.imag()); - } - break; - case TUPLE: - // Nothing to do but assign the shape which is done above. - return; - default: - LOG(FATAL) << "Unhandled primitive type " << subshape().element_type(); - } -} - -const void* LiteralBase::Piece::untyped_data() const { - CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); - return buffer(); -} - -void* LiteralBase::Piece::untyped_data() { - CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); - return buffer(); -} - -namespace { - -template -Status CopyFromRepeatedField(tensorflow::gtl::MutableArraySlice dest, - const RepeatedFieldT& src) { - if (dest.size() != src.size()) { - return InvalidArgument( - "Expected %lu elements in LiteralProto repeated field, has %d", - dest.size(), src.size()); - } - std::copy(src.begin(), src.end(), dest.begin()); - return Status::OK(); -} - -} // namespace - -Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { - // These conditions should have been checked in Literal::CreateFromProto. 
- TF_RET_CHECK(proto.has_shape()); - TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape())); - TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape())); - - switch (subshape().element_type()) { - case PRED: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.preds())); - break; - case U8: { - auto u8_data = data(); - TF_RET_CHECK(proto.u8s().size() == u8_data.size()); - std::copy(proto.u8s().begin(), proto.u8s().end(), u8_data.begin()); - } break; - case S32: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.s32s())); - break; - case S64: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.s64s())); - break; - case U32: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u32s())); - break; - case U64: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u64s())); - break; - case F16: { - const string& s(proto.f16s()); - TF_RET_CHECK(data().size() * sizeof(half) == s.size()); - memcpy(untyped_data(), s.data(), s.size()); - if (!kLittleEndian) { - ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); - } - } break; - - case BF16: { - const string& s(proto.bf16s()); - TF_RET_CHECK(data().size() * sizeof(bfloat16) == s.size()); - memcpy(untyped_data(), s.data(), s.size()); - if (!kLittleEndian) { - ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); - } - } break; - case F32: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.f32s())); - break; - case F64: - TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.f64s())); - break; - case C64: { - auto complex_data = data(); - TF_RET_CHECK(proto.c64s_size() == complex_data.size() * 2); - for (int64 i = 0; i < complex_data.size(); ++i) { - complex_data[i] = complex64{proto.c64s(i * 2), proto.c64s(i * 2 + 1)}; - } - } break; - case TUPLE: - LOG(FATAL) << "Should not be called on tuple shapes: " - << ShapeUtil::HumanString(subshape()); - break; - default: - LOG(FATAL) << "Unhandled primitive type " << subshape().element_type(); - } - return Status::OK(); -} - -LiteralProto 
LiteralBase::ToProto() const { - LiteralProto proto; - root_piece().ForEachSubpiece( - [&](const ShapeIndex& index, const Piece& piece) { - LiteralProto* proto_piece = &proto; - for (int64 i : index) { - while (proto_piece->tuple_literals_size() <= i) { - proto_piece->add_tuple_literals(); - } - proto_piece = proto_piece->mutable_tuple_literals(i); - } - piece.WriteToProto(proto_piece); - }); - - if (LayoutUtil::IsSparseArray(shape())) { - CopyToRepeatedField(proto.mutable_sparse_indices(), - sparse_indices()->data()); - } - - return proto; -} - -/* static */ -StatusOr> Literal::CreateFromProto( - const LiteralProto& proto) { - if (!proto.has_shape()) { - return InvalidArgument("LiteralProto has no shape"); - } - if (!LayoutUtil::HasLayout(proto.shape())) { - return InvalidArgument("LiteralProto has no layout"); - } - - auto literal = MakeUnique(proto.shape()); - - TF_RETURN_IF_ERROR(literal->root_piece_->ForEachMutableSubpieceWithStatus( - [&](const ShapeIndex& index, Piece* piece) { - const LiteralProto* proto_element = &proto; - for (int64 i : index) { - CHECK(i < proto_element->tuple_literals_size()); - proto_element = &proto_element->tuple_literals(i); - } - - if (ShapeUtil::IsTuple(piece->subshape())) { - if (proto_element->tuple_literals_size() != - ShapeUtil::TupleElementCount(piece->subshape())) { - return InvalidArgument( - "Expected %lld tuple elements in LiteralProto, has %d", - ShapeUtil::TupleElementCount(piece->subshape()), - proto_element->tuple_literals_size()); - } - return Status::OK(); - } - - CHECK(ShapeUtil::IsArray(piece->subshape())); - TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element)); - - return Status::OK(); - })); - - return std::move(literal); -} - -/* static */ string Literal::MultiIndexAsString( - tensorflow::gtl::ArraySlice multi_index) { - return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}"); -} - -const void* LiteralBase::untyped_data(const ShapeIndex& shape_index) const { - return 
piece(shape_index).untyped_data(); -} - -void* Literal::untyped_data(const ShapeIndex& shape_index) { - return piece(shape_index).untyped_data(); -} - -int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const { - return piece(shape_index).size_bytes(); -} - -string LiteralBase::GetR1U8AsString() const { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(ShapeUtil::Rank(shape()), 1); - CHECK_EQ(shape().element_type(), U8); - return string(tensorflow::bit_cast(data().data()), - ShapeUtil::ElementsIn(shape())); -} - -void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) { - CHECK(ShapeUtil::IsTuple(shape)); - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - const Shape& subshape = shape.tuple_shapes(i); - - auto child_piece = Piece(); - child_piece.set_subshape(&subshape); - - if (ShapeUtil::IsTuple(subshape)) { - BuildPieceSubtree(subshape, &child_piece); - } - - piece->emplace_back(std::move(child_piece)); - } -} - -LiteralSlice::LiteralSlice(const LiteralBase& literal) - : LiteralBase(), root_piece_(&literal.root_piece()) {} - -LiteralSlice::LiteralSlice(const LiteralBase& literal, - const ShapeIndex& view_root) - : LiteralBase(), root_piece_(&literal.piece(view_root)) {} - -BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape) - : LiteralBase(), shape_(shape) { - CHECK(ShapeUtil::IsArray(shape_)); - CHECK_NE(src_buf_ptr, nullptr); - CHECK(LayoutUtil::HasLayout(shape_)); - - root_piece_ = Piece(); - root_piece_.set_buffer(const_cast(src_buf_ptr)); - root_piece_.set_subshape(&shape_); -} - -BorrowingLiteral::BorrowingLiteral( - tensorflow::gtl::ArraySlice src_buf_ptrs, const Shape& shape) - : LiteralBase(), shape_(shape) { - CHECK(ShapeUtil::IsTuple(shape_)); - CHECK(!ShapeUtil::IsNestedTuple(shape_)); - CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(shape_)); - root_piece_ = Piece(); - root_piece_.set_subshape(&shape_); - BuildPieceSubtree(shape_, &root_piece_); - - for (int i = 0; 
i < src_buf_ptrs.size(); ++i) { - const auto& src_shape = shape_.tuple_shapes(i); - CHECK(ShapeUtil::IsArray(src_shape)); - root_piece_.child(i).set_buffer(const_cast(src_buf_ptrs[i])); - } +/* static */ string LiteralUtil::MultiIndexAsString( + absl::Span multi_index) { + return StrCat("{", absl::StrJoin(multi_index, ","), "}"); } } // namespace xla diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index 2ca9060cc78bf163336a65e0d22ea1d04e4586fe..2d6084a67a3b966d054103df0f06ddb82d0d6525 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -27,13 +27,16 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/sparse_index_array.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -42,8 +45,6 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/bitmap.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" @@ -51,673 +52,12 @@ limitations under the License. namespace xla { -// Forward declare Literal and LiteralSlice class to be used by the creation -// methods in the base class. 
-class Literal; -class LiteralSlice; - -// Abstract base class for literals. -class LiteralBase { +class LiteralUtil { public: - virtual ~LiteralBase() = 0; - - // Literals are equal if they have compatible shapes and the same data - // values. Layout is not compared. - bool operator==(const LiteralBase& other) const; - bool operator!=(const LiteralBase& other) const { return !(*this == other); } - - // Returns the shape of the literal. - const Shape& shape() const { return root_piece().subshape(); } - - // Serialize to proto. - LiteralProto ToProto() const; - - // Returns an ArraySlice of the array for this literal for the given NativeT - // (e.g., float). CHECKs if the subshape of the literal at the given - // ShapeIndex is not array. See primitive_util.h for the mapping from XLA type - // to native type. - template - tensorflow::gtl::ArraySlice data( - const ShapeIndex& shape_index = {}) const; - - // Returns a const pointer to the sparse index array. Returns nullptr if the - // literal is not a sparse array. - const SparseIndexArray* sparse_indices( - const ShapeIndex& shape_index = {}) const; - - // Returns a const pointer to (or size of) the underlying buffer holding the - // array at the given shape index. CHECKs if the subshape of the literal at - // the given ShapeIndex is not array. - const void* untyped_data(const ShapeIndex& shape_index = {}) const; - int64 size_bytes(const ShapeIndex& shape_index = {}) const; - - // Returns this literal's data as a string. This literal must be a rank-1 U8 - // array. - string GetR1U8AsString() const; - - // Returns a string representation of the literal value. - // Warning: this function can take minutes for multi-million element Literals. - string ToString(bool print_layout = false) const; - - // Gets an element in the literal at the given index. The multi_index is - // CHECKed against the dimension sizes. 
- template - NativeT Get(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) const; - // Overloads of Get for array literals. CHECKs if the literal is not - // array-shaped and dense. - template - NativeT Get(tensorflow::gtl::ArraySlice multi_index) const; - - // Returns the element value at index (0, ..., 0), however many zeroes are - // required for that index. - template - NativeT GetFirstElement() const; - - // As Get(), but determines the correct type and converts the value - // into text. - string GetAsString(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index = {}) const; - // As GetSparseElement(), but determines the correct type and converts the - // value into text. - string GetSparseElementAsString(int64 sparse_element_number, - const ShapeIndex& shape_index = {}) const; - // As Get(), but determines the correct type and converts the value into - // int64. This literal must be an array. - StatusOr GetIntegralAsS64( - tensorflow::gtl::ArraySlice multi_index) const; - - // Returns the multi-index of the element in a sparse literal at the given - // sparse element number. The sparse element number is the position with in - // the sparse array's list of (index, value) pairs, and is checked against the - // total number of (index, value) pairs in the sparse array. - tensorflow::gtl::ArraySlice GetSparseIndex( - int64 sparse_element_number, const ShapeIndex& shape_index = {}) const; - - // Returns the value of the element in a sparse literal at the given sparse - // element number. The sparse element number is the position with in the - // sparse array's list of (index, value) pairs, and is checked against the - // total number of (index, value) pairs in the sparse array. 
- template - NativeT GetSparseElement(int64 sparse_element_number, - const ShapeIndex& shape_index = {}) const; - - // Invokes the "per cell" callback for each element in the provided - // literal with the element's indices and a string representation of - // the element's value. - // - // This function is useful if you want a polymorphic representation - // of the tensor's elements (turning it to a string for something - // like representation in a protobuf). - // - // This literal must have a dense layout. - void EachCellAsString( - const std::function indices, - const string& value)>& per_cell) const; - template - void EachCell(std::function indices, - NativeT value)> - per_cell) const; - - // Returns whether every element in this literal is equal to value. - // - // value is an int8 because we expect this to be called with small - // compile-time constants (0, -1, etc.) and so that whatever value you pass - // can be represented exactly by floating-point types as small as 16 bits. - // - // If value doesn't fit in this literal's type, returns false. Values of 1/0 - // are considered equal to true/false; other values are not considered equal - // to true. Also if this literal is not array-shaped false is returned. - bool IsAll(int8 value) const; - - // Like IsAll(const Literal&, int8), except we check whether the literal is - // equal to a particular floating-point number. - // - // If the literal is not a floating-point value, this always returns false. - // - // This casts value to the type of literal, then compares using ==. The usual - // admonishments about floating-point equality checks apply. We expect you to - // use this to check for values that can be expressed precisely as a float, - // e.g. -0.5. Also if this literal is not array-shaped false is returned. - bool IsAllFloat(float value) const; - - // Like IsAll(const Literal&, int8), except we check whether the literal is - // equal to a particular complex number. 
- // - // If the literal is not a complex value, this always returns false. - // - // This casts value to the type of literal, then compares using ==. The usual - // admonishments about floating-point equality checks apply. We expect you to - // use this to check for complex values that can be expressed precisely as - // float pairs e.g. (-0.5, 1.0). - // - // This literal must have a dense layout. - bool IsAllComplex(complex64 value) const; - - // Literal consists entirely of the first element of the literal. - bool IsAllFirst() const; - - // Returns whether this literal is zero at the specified index. This literal - // must be an array with a dense layout. - bool IsZero(tensorflow::gtl::ArraySlice indices) const; - - // Returns the count of the elements in the array at the given shape index in - // this literal. - int64 element_count(const ShapeIndex& index = {}) const { - return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index)); - } - - // Returns the count of the elements in the sparse array at the given shape - // index in this literal, which will be no larger than - // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()). - int64 sparse_element_count() const; - - // Compute a hash for this literal. This literal must not be a sparse tensor - // or a tuple containing a sparse tensor. - size_t Hash() const; - - // Converts this literal to the given shape. Returns an error is the - // conversion is not possible. - // - // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding - // instead of truncation; otherwise, truncation is used. - // - // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes - // the default behavior. - StatusOr> ConvertToShape( - const Shape& dest_shape, bool round_f32_to_bf16 = false) const; - - // Converts this literal to another primitive type using a bitcast - // conversion. The to and from primitive types must have the same bit - // width. 
Returns an error if the conversion is not possible. This literal - // must be array-shaped. - StatusOr> BitcastConvert( - PrimitiveType primitive_dest_type) const; - - // Converts this literal to another primitive type. Returns an error if the - // conversion is not possible. This literal must be array-shaped. - StatusOr> Convert( - PrimitiveType primitive_dest_type) const; + LiteralUtil() = delete; // Returns a literal scalar representing the first element. - Literal GetFirstScalarLiteral() const; - - // Clones the underlying buffers into a new Literal, or new - // std::unique_ptr. - Literal Clone() const; - std::unique_ptr CloneToUnique() const; - - // TODO(b/67651157): The methods below which perform computation on Literals - // (Reshape, Slice, etc) should be moved elsewhere, and perhaps combined with - // evaluator code which operates on Literals. - // - // Creates a new value that has the equivalent value as this - // literal, but conforms to new_layout; e.g. a literal matrix that was in {0, - // 1} minor-to-major dimension layout can be re-layed-out as {1, 0} - // minor-to-major dimension layout and the value in the cell at any given - // logical index (i0, i1) will be the same. - // - // For tuple shaped literals, shape_index should be used to select the inner - // array that the new layout applies to. - // - // Note: this is useful when the client wants to ensure that a value placed in - // the XLA allocation tracker has a particular layout; for efficiency - // purposes or avoiding unimplemented operation/layout combinations. - std::unique_ptr Relayout(const Layout& new_layout, - const ShapeIndex& shape_index = {}) const; - - // An overload of Relayout which changes the layout of the entire shape rather - // than being limited to a single array within the shape. - std::unique_ptr Relayout(const Shape& shape_with_layout) const; - - // Creates a new literal by reshaping this literal to have the given - // dimensions. 
The total number of elements must not change; The - // implementation currently only supports monotonic dim0-major layouts. - // This literal must be an array. - StatusOr> Reshape( - tensorflow::gtl::ArraySlice dimensions) const; - - // Creates a new literal by broadcasting this literal with `dimensions` to - // yield a literal of shape `result_shape`. - StatusOr> Broadcast( - const Shape& result_shape, - tensorflow::gtl::ArraySlice dimensions) const; - - // Creates a new literal by reordering the dimensions of this literal. - // The given `permutation` must be a permutation of the dimension numbers - // in the original literal, and it specifies the order of the new dimensions - // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). - // For example, a transpose call on a literal of shape [3 x 8 x 4] and - // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. - // This literal must be an array. - std::unique_ptr Transpose( - tensorflow::gtl::ArraySlice permutation) const; - - // Creates a sub-array from this literal by extracting the indices - // [start_index, limit_index) of each dimension. The result literal has the - // same rank and layout as for the given literal. The number of indices in - // start_indices and limit_indices must be the rank of the literal, and the - // indices follow the order of the dimensions. - // This literal must be an array. - std::unique_ptr Slice( - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) const; - - // Creates a literal with a prepended dimension with bound "times"; e.g. a - // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this - // literal replicated four times. - // This literal must be an array. - template - std::unique_ptr Replicate(int64 times) const; - - // Creates a new Literal object with the shape specified as parameter. 
- // The content of the literal values is the default value of the primitive - // type of literal itself (0 for numeric types, and false for predicates). - // - // Note: It's an antipattern to use this method then immediately call - // Literal::Populate on the result (since that results in zero initialization, - // then reinitialization. Conside if a call to MakeUnique(shape), - // followed by the call to Literal::Populate can be used instead. - static std::unique_ptr CreateFromShape(const Shape& shape); - - protected: - // A data structure representing a subshape at a particular ShapeIndex within - // the literal. For array-shaped ShapeIndexes, this data structure holds the - // pointer to the memory allocated for the array data. - class Piece { - public: - // Returns the buffer holding the array data for this piece as an array - // slice. This piece must be array-shaped. - template - tensorflow::gtl::ArraySlice data() const; - template - tensorflow::gtl::MutableArraySlice data(); - - // Returns the buffer holding the array data for this piece as a void*. This - // piece must be array-shaped. - void* untyped_data(); - const void* untyped_data() const; - - // Gets or sets an element in the array at the given index. The multi_index - // is CHECKed against the dimension sizes of the array. This piece must be - // array-shaped. - template - NativeT Get(tensorflow::gtl::ArraySlice index) const; - template - void Set(tensorflow::gtl::ArraySlice index, NativeT value); - - // Gets/sets the buffer holding the array data. - char* buffer() const { return buffer_; } - void set_buffer(char* buffer) { buffer_ = buffer; } - - // The array of multi-indices that provide the locations of non-zero - // elements in a sparse array. Only used if - // LayoutUtil::IsSparseArray(shape()) is true. 
- SparseIndexArray* sparse_indices() const { return sparse_indices_; } - void set_sparse_indices(SparseIndexArray* sparse_indices) { - sparse_indices_ = sparse_indices; - } - - // Gets or sets the subshape of this piece. This reference points to a - // subshape within the shape in the containing Literal (Literal::shape_). - const Shape& subshape() const { return *subshape_; } - void set_subshape(const Shape* subshape) { subshape_ = subshape; } - - // Returns the size in bytes of the buffer holding the array data. - int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); } - - // Returns the number of elements in this piece's array. - int64 element_count() const { - // If this is a sparse array, use the number of elements represented by - // the indices in the associated SparseIndexArray. - return LayoutUtil::IsSparseArray(subshape()) - ? sparse_indices()->index_count() - : ShapeUtil::ElementsIn(subshape()); - } - - // Returns the child piece at 'index' of this piece. - Piece& child(int64 index) { return children_[index]; } - - // Adds a child piece to this piece's children. - void emplace_back(Piece child_piece) { - children_.emplace_back(std::move(child_piece)); - } - - // Returns the size of children pieces of this piece. - int64 children_size() { return children_.size(); } - - // Visitor functions that recursively traverses the piece and calls the - // given function at each child piece. The function has the type: - // void (const ShapeIndex& index, const Piece& piece) - template - void ForEachSubpiece(const Fn& func) const { - ShapeIndex index; - return ForEachHelper( - [&func](const ShapeIndex& index, const Piece& piece) { - func(index, piece); - return Status::OK(); - }, - *this, &index) - .IgnoreError(); - } - // Same as above, but the function has the type: - // Status (const ShapeIndex& index, const Piece& piece) - // The first non-OK return value is returned by the function. 
- template - Status ForEachSubpieceWithStatus(const Fn& func) const { - ShapeIndex index; - return ForEachHelper(func, *this, &index); - } - // Same as above, but the function has the type: - // Bool (const ShapeIndex& index, const Piece& piece) - // The first non-true return value is returned by the function. - template - bool ForEachSubpieceWithBool(const Fn& func) const { - ShapeIndex index; - return ForEachHelperBool(func, *this, &index); - } - // Same as above, but the function has the type: - // Void (const ShapeIndex& index, Piece& piece) - template - void ForEachMutableSubpiece(const Fn& func) { - ShapeIndex index; - return ForEachMutableHelper( - [&func](const ShapeIndex& index, Piece* piece) { - func(index, piece); - return Status::OK(); - }, - const_cast(this), &index) - .IgnoreError(); - } - // Same as above, but the function has the type: - // Status (const ShapeIndex& index, Piece& piece) - // The first non-OK return value is returned by the function. - template - Status ForEachMutableSubpieceWithStatus(const Fn& func) { - ShapeIndex index; - return ForEachMutableHelper( - func, const_cast(this), &index); - } - - // Returns true if this piece and 'other' contain the same data. This piece - // and 'other' must be array-shaped and compatible. - bool EqualElements(const Piece& other) const; - - // Writes the shape and data (if array-shaped) into the given proto. - void WriteToProto(LiteralProto* proto) const; - - // Copy the data from 'src' into this piece's buffer. Shapes of this piece - // and src must be compatible. - Status CopyFrom(const Piece& src); - - // Copies the data from the given proto into this piece. The shape of this - // piece must be equal (not just compatible) to the shape of the proto. - Status CopyFromProto(const LiteralProto& proto); - - // Sorts the elements in a sparse array. - void SortSparseElements(); - - private: - // Helpers for traversing the piece via ForEachSubpiece rooted at 'index'. 
- // The first non-OK (or non-true) value is returned by the function. - // The callable 'func' has the same signature as described above in - // ForEachSubpiece*. - template - Status ForEachHelper(const Fn& func, const Piece& piece, - ShapeIndex* index) const { - TF_RETURN_IF_ERROR(func(*index, piece)); - for (int64 i = 0; i < piece.children_.size(); ++i) { - index->push_back(i); - TF_RETURN_IF_ERROR(ForEachHelper(func, piece.children_[i], index)); - index->pop_back(); - } - return Status::OK(); - } - template - bool ForEachHelperBool(const Fn& func, const Piece& piece, - ShapeIndex* index) const { - if (!func(*index, piece)) { - return false; - } - for (int64 i = 0; i < piece.children_.size(); ++i) { - index->push_back(i); - if (!ForEachHelperBool(func, piece.children_[i], index)) { - return false; - } - index->pop_back(); - } - return true; - } - template - Status ForEachMutableHelper(const Fn& func, Piece* piece, - ShapeIndex* index) { - TF_RETURN_IF_ERROR(func(*index, piece)); - for (int64 i = 0; i < piece->children_.size(); ++i) { - index->push_back(i); - TF_RETURN_IF_ERROR( - ForEachMutableHelper(func, &piece->children_[i], index)); - index->pop_back(); - } - return Status::OK(); - } - - // Recursive helper for EqualElements. - template - bool EqualElementsInternal(const Piece& other, - std::vector* multi_index) const; - - // Helper for SortSparseElements that has the element type as a template - // parameter. - template - void SortSparseElementsInternal(); - - // For array-shaped pieces, this is the buffer holding the literal data. - char* buffer_ = nullptr; - - // For sparse arrays, this is the array of indices. - SparseIndexArray* sparse_indices_ = nullptr; - - // The shape of piece. This points into the shape of the containing Literal - // (Literal::shape_). - const Shape* subshape_ = nullptr; - - // Children pieces for tuple shaped pieces. 
- std::vector children_ = {}; - }; // class Piece - - const Piece& piece(const ShapeIndex& shape_index) const { - Piece* piece = &const_cast(root_piece()); - for (const auto i : shape_index) { - DCHECK_GE(i, 0); - DCHECK_LT(i, piece->children_size()); - piece = &piece->child(i); - } - return *piece; - } - - // Returns the piece at the root of the shape. - virtual const Piece& root_piece() const = 0; - - // LiteralSlice and Literal must access Pieces of other Literals. - friend class Literal; - friend class LiteralSlice; - friend class BorrowingLiteral; -}; - -// Class representing literal values in XLA. -// -// The underlying buffer and shape is always owned by this class. -class Literal : public LiteralBase { - public: - Literal() : Literal(ShapeUtil::MakeNil()) {} - - // Create a literal of the given shape. The literal is allocated sufficient - // memory to hold the shape. Memory is uninitialized. - explicit Literal(const Shape& shape); - virtual ~Literal(); - - // Literals are moveable, but not copyable. To copy a literal use - // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies - // of literals which can be expensive. - Literal(const Literal& other) = delete; - Literal& operator=(const Literal& other) = delete; - Literal(Literal&& other); - // 'allocate_arrays' indicates whether to allocate memory for the arrays in - // the shape. If false, buffer pointers inside of the Literal::Pieces are set - // to nullptr. - Literal(const Shape& shape, bool allocate_arrays); - Literal& operator=(Literal&& other); - - // TODO(b/67651157): Remove this accessor. Literal users should not be able to - // mutate the shape as this can produce malformed Literals. - Shape* mutable_shape_do_not_use() { return shape_.get(); } - - // Returns a MutableArraySlice view of the array for this literal for the - // given NativeT (e.g., float). CHECKs if the subshape of the literal at the - // given ShapeIndex is not array. 
See primitive_util.h for the mapping from - // XLA type to native type. - template - tensorflow::gtl::MutableArraySlice data( - const ShapeIndex& shape_index = {}); - // Unhide const method from parent class. - using LiteralBase::data; - - // Returns a pointer to the sparse index array. Returns nullptr if the literal - // is not a sparse array. - SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {}); - - // Returns a pointer to the underlying buffer holding the array at the given - // shape index. CHECKs if the subshape of the literal at the given ShapeIndex - // is not array. - void* untyped_data(const ShapeIndex& shape_index = {}); - // Unhide const method from parent class. - using LiteralBase::untyped_data; - - // Populates a literal with a sparse layout with the given indices and values. - // Each index in the indices array is CHECKed against the dimensions in the - // literal's shape. If sort is true, then the indices and values will be - // sorted. If sort is false, then the indices and values are assumed to - // already be in sorted order. See CreateSparse for an example of how data - // are populated. - template - void PopulateSparse(SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, - bool sort = true); - - // Copy values from 'src_literal' rooted at 'src_shape_index' into this - // literal rooted at 'dest_shape_index'. The subshape of this literal rooted - // at 'dest_shape_index' must be compatible with the subshape of 'src_literal' - // rooted at 'src_shape_index', but need not be arrays. - Status CopyFrom(const LiteralSlice& src_literal, - const ShapeIndex& dest_shape_index = {}, - const ShapeIndex& src_shape_index = {}); - - // Similar to CopyFrom, but with move semantincs. The subshape of this literal - // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal' - // (layouts and shapes must match), but need not be arrays. 
The memory - // allocated in this literal for the subshape at dest_shape_index is - // deallocated, and the respective buffers are replaced with those in - // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). - Status MoveFrom(Literal&& src_literal, - const ShapeIndex& dest_shape_index = {}); - - // Copies the values from src_literal, starting at src_base shape indexes, - // to this literal, starting at dest_base, where the copy size in each - // dimension is specified by copy_size. - // The src_literal and this literal must have the same primitive type, - // src_base+copy_size must fit the source literal dimensions, as well as - // dest_base+copy_size must fit the destination literal dimensions. - // Note: if either src_literal or this literal contains dimensions with zero - // element, then copy_size must be 0 in these dimensions while the - // corresponding base indices being 0. - // This literal and 'src_literal' must be arrays. - Status CopySliceFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size); - - // Copies one element from src_literal[src_index] to (*this)[dest_index]. - Status CopyElementFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_index, - tensorflow::gtl::ArraySlice dest_index); - - // Sets an element in the literal at the given index. The multi_index is - // CHECKed against the dimension sizes. - template - void Set(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index, NativeT value); - // Overloads of Set for array literals. CHECKs if the literal is not - // array-shaped and dense. - template - void Set(tensorflow::gtl::ArraySlice multi_index, NativeT value); - - // Appends the given element to the literal. If the elements are not appended - // in sorted order, then SortSparseElements should be called before calling - // other methods. 
This literal must have a sparse layout. - template - void AppendSparseElement(tensorflow::gtl::ArraySlice multi_index, - NativeT value, const ShapeIndex& shape_index = {}); - - // Sorts the elements in a sparse array. - void SortSparseElements(const ShapeIndex& shape_index = {}); - - // As Set(), but truncates `value` to the literal element type before storing. - // This literal must be an array. - Status SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, - int64 value); - - // Populate this literal with the given values. Examples: - // - // // Populate with floats. - // Array2D float_values = ... - // literal.PopulateR2FromArray2D(values); - // - // // Populate with int32s. - // literal.PopulateR2({{1, 2}, {3, 4}}); - // - // The shape and element type of this literal must match given values. For - // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2 - // array of S32. - template - void PopulateR1(tensorflow::gtl::ArraySlice values); - void PopulateR1(const tensorflow::core::Bitmap& values); - template - void PopulateR2(std::initializer_list> values); - template - void PopulateFromArray(const Array& values); - template - void PopulateR2FromArray2D(const Array2D& values); - template - void PopulateR3FromArray3D(const Array3D& values); - template - void PopulateR4FromArray4D(const Array4D& values); - - // Populates literal values by calling the generator function for every cell - // in this literal object. - // - // generator must be a callable of the type - // NativeT(tensorflow::gtl::ArraySlice indexes) or compatible. - // - // This literal must have a dense layout. - template - Status Populate(const FnType& generator); - - // A parallel version of Populate(). This can be used if the generator is - // thread-safe and the values for the shape's different elements are - // independent. - template - Status PopulateParallel(const FnType& generator); - - // Fills this literal with the given value. 
- template - void PopulateWithValue(NativeT value); - - // Factory methods below. - // - - // Serialize from a proto. - static StatusOr> CreateFromProto( - const LiteralProto& proto); + static Literal GetFirstScalarLiteral(const LiteralSlice& literal); // Creates a new literal of a given rank. To minimize ambiguity (for users // and the compiler) these CreateR[0-2] methods should explicitly specify the @@ -731,8 +71,7 @@ class Literal : public LiteralBase { template static std::unique_ptr CreateR0(NativeT value); template - static std::unique_ptr CreateR1( - tensorflow::gtl::ArraySlice values); + static std::unique_ptr CreateR1(absl::Span values); static std::unique_ptr CreateR1( const tensorflow::core::Bitmap& values); template @@ -801,8 +140,8 @@ class Literal : public LiteralBase { // template static std::unique_ptr CreateSparse( - tensorflow::gtl::ArraySlice dimensions, SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, bool sort = true); + absl::Span dimensions, SparseIndexArray indices, + absl::Span values, bool sort = true); // Creates a scalar literal value zero of the given primitive type. static Literal Zero(PrimitiveType primitive_type); @@ -817,7 +156,7 @@ class Literal : public LiteralBase { // Creates a literal of the given shape where each element is `value`. template static std::unique_ptr CreateFullWithDescendingLayout( - tensorflow::gtl::ArraySlice dimensions, NativeT value); + absl::Span dimensions, NativeT value); // Creates a new literal from an Array type. The variants not ending with // WithLayout use the default XLA layout for the literal's linear @@ -847,7 +186,7 @@ class Literal : public LiteralBase { const Array4D& values, const Layout& layout); // Creates a new vector of U8s literal value from a string. - static std::unique_ptr CreateR1U8(tensorflow::StringPiece value); + static std::unique_ptr CreateR1U8(absl::string_view value); // Creates a linspace-populated literal with the given number of rows and // columns. 
@@ -875,15 +214,15 @@ class Literal : public LiteralBase { // Returns a tuple literal composed of given literals. Data is copied from the // given elements into the returned literal. static std::unique_ptr MakeTuple( - tensorflow::gtl::ArraySlice elements); + absl::Span elements); static std::unique_ptr MakeTupleFromSlices( - tensorflow::gtl::ArraySlice elements); + absl::Span elements); // As above, but intended to be invoked with move semantics; i.e. // // std::vector> elements = ...; - // auto result = Literal::MakeTupleOwned(std::move(elements)); + // auto result = LiteralUtil::MakeTupleOwned(std::move(elements)); // // This would have been declared as an overload, but there is ambiguity // in invocation between the above signature and this one. @@ -893,7 +232,7 @@ class Literal : public LiteralBase { // This overload lets you pass a braced list of unique_ptrs to // MakeTupleOwned: // - // Literal::MakeTupleOwned(Literal::CreateR1(...), ...). + // LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1(...), ...). // // Simply relying on the MakeTupleOwned(std::vector>) // overload doesn't work because std::initializer_list's elements are always @@ -911,26 +250,15 @@ class Literal : public LiteralBase { return MakeTupleOwned(std::move(v)); } - // Returns a vector containing the tuple elements of this Literal as separate - // Literals. This Literal must be tuple-shaped and can be a nested tuple. The - // elements are moved into the new Literals; no data is copied. Upon return - // this Literal is set to a nil shape (empty tuple) - std::vector DecomposeTuple(); - - // This operation is the inverse of DecomposeTuple. The given elements are - // moved into the tuple elements of a new tuple-shaped Literal which is - // returned. Upon return, each of the Literals in 'elements' is set to a nil - // shape (empty tuple). - static Literal MoveIntoTuple( - tensorflow::gtl::MutableArraySlice elements); + // Create a constant token literal. Token types have no value. 
+ static std::unique_ptr CreateToken(); // Creates a new Literal object with its values havings the primitive_type // type, and with dimensions defined by the dimensions parameter. // The content of the literal values is the default value of the primitive // type of literal itself (0 for numeric types, and false for predicates). static std::unique_ptr CreateFromDimensions( - PrimitiveType primitive_type, - tensorflow::gtl::ArraySlice dimensions); + PrimitiveType primitive_type, absl::Span dimensions); // If the given literal's data type is bfloat16, converts it to a float // literal; otherwise, returns a copy of it. If the literal is a tuple, @@ -949,9 +277,8 @@ class Literal : public LiteralBase { // buffer of the input literal is assumed to have the given minor_to_major // layout order. static std::unique_ptr ReshapeSlice( - tensorflow::gtl::ArraySlice new_dimensions, - tensorflow::gtl::ArraySlice minor_to_major, - const LiteralSlice& literal); + absl::Span new_dimensions, + absl::Span minor_to_major, const LiteralSlice& literal); // Creates a literal with the supplied shape, and uses the provided value // generator to populate the literal's values. @@ -961,7 +288,7 @@ class Literal : public LiteralBase { typename T = typename primitive_util::PrimitiveTypeToNative::type> static StatusOr> CreateRandomLiteral( const Shape& shape, - const std::function)>& generator); + const std::function)>& generator); // Creates a literal with the supplied shape, and initializes the literal // values using a normal distribution with given mean and stddev standard @@ -989,204 +316,23 @@ class Literal : public LiteralBase { // Returns a multi-dimensional index as a string. For example: '{7, 8}' will // be returned for a 2-dimensional index with dimension 0 index equal to 7, // dimension 1 equal to 8. - static string MultiIndexAsString( - tensorflow::gtl::ArraySlice multi_index); - - private: - // Recursively sets the subshapes and buffers of all subpieces rooted at - // 'piece'. 
If 'allocate_array' is true, memory is allocated for the arrays in - // the shape. - void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays); - - // Returns the piece at the given ShapeIndex. - Piece& piece(const ShapeIndex& shape_index) { - return const_cast(LiteralBase::piece(shape_index)); - } - - Piece& root_piece() const override { return *root_piece_; }; - - // Internal template helper for the Literal::CopySliceFrom(), matching its - // arguments one by one. - template - Status CopySliceFromInternal(const LiteralBase& src_literal, - tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size); - - // Utility structure which is used to create the optimal configuration for - // a ShapeUtil::ForEachIndex() scan across two literals. - struct StrideConfig { - StrideConfig(const Shape& source_shape, const Shape& dest_shape, - tensorflow::gtl::ArraySlice dimensions); - - // The dimensions of the stride operation. Essentially every dimension - // will be iterated from base[i] to base[i]+dimensions[i], in step[i] - // steps. - tensorflow::gtl::ArraySlice dimensions; - DimensionVector base; - DimensionVector step; - int64 minor_dimension = 0; - // The size of the strides for source and destination. One of the two - // (the one looping through its most minor dimension) will be 1, while - // the other will be the stride size at the dimension matching the other - // shape most minor dimension being scanned. - int64 dest_stride = 1; - int64 source_stride = 1; - // The size of the inner loop on the most minor dimension. - int64 minor_loop_size = 1; - }; - - // Literal class always owns the shape. The parent class borrows this shape. - std::unique_ptr shape_; - - Piece* root_piece_ = nullptr; - - // Implementation details shared between Populate() and PopulateParallel() - template - Status PopulateInternal(const FnType& generator, bool parallel); - - // Deallocate the buffers held by this literal. 
- void DeallocateBuffers(); - - friend class LiteralBase; -}; -std::ostream& operator<<(std::ostream& out, const Literal& literal); - -// A read-only view of a Literal. A LiteralSlice contains pointers to shape and -// literal buffers always owned by others. -class LiteralSlice : public LiteralBase { - public: - LiteralSlice() : LiteralBase() {} - - // Implicit conversion constructors. - LiteralSlice(const LiteralBase& literal); - LiteralSlice(const LiteralBase& literal, const ShapeIndex& view_root); - - private: - const Piece& root_piece() const override { return *root_piece_; }; - - const Piece* root_piece_; // Not owned. -}; - -// A read-only Literal where the underlying buffers are never owned by this -// class. -class BorrowingLiteral : public LiteralBase { - public: - BorrowingLiteral() : LiteralBase() {} - - // 'src_buf_ptr' is not owned by this class and must outlive the - // lifetime of this class. It points to an appropirately sized buffer with - // data interpretered as indicated by 'shape'. - // This constructor is only used for array shapes. - BorrowingLiteral(const char* src_buf_ptr, const Shape& shape); - // Similar as above, except to be used for constructing non-nested tuples. - BorrowingLiteral(tensorflow::gtl::ArraySlice src_buf_ptrs, - const Shape& shape); - // TODO(b/79707221): adding constructors for nested tuples as well. - - private: - // Recursively builds the subtree for the given piece and sets the subshapes - // of the given piece with the given shape. - void BuildPieceSubtree(const Shape& shape, Piece* piece); - - // Accessor for the root piece of this literal. - const Piece& root_piece() const override { return root_piece_; }; - Piece root_piece_; - - // Shape of this literal. 
- const Shape shape_; + static string MultiIndexAsString(absl::Span multi_index); }; -template -tensorflow::gtl::ArraySlice LiteralBase::Piece::data() const { - CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); - CHECK_EQ(subshape().element_type(), - primitive_util::NativeToPrimitiveType()) - << "Attempting to access " - << PrimitiveType_Name(primitive_util::NativeToPrimitiveType()) - << " type, but literal element type is " - << PrimitiveType_Name(subshape().element_type()); - return tensorflow::gtl::ArraySlice( - reinterpret_cast(buffer()), element_count()); -} - -template -tensorflow::gtl::MutableArraySlice LiteralBase::Piece::data() { - CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); - CHECK_EQ(subshape().element_type(), - primitive_util::NativeToPrimitiveType()) - << "Attempting to access " - << PrimitiveType_Name(primitive_util::NativeToPrimitiveType()) - << " type, but literal element type is " - << PrimitiveType_Name(subshape().element_type()); - return tensorflow::gtl::MutableArraySlice( - reinterpret_cast(buffer()), element_count()); -} - -template -NativeT LiteralBase::Piece::Get( - tensorflow::gtl::ArraySlice multi_index) const { - CHECK(LayoutUtil::IsDenseArray(subshape())); - return data()[IndexUtil::MultidimensionalIndexToLinearIndex( - subshape(), multi_index)]; -} - -template -void LiteralBase::Piece::Set(tensorflow::gtl::ArraySlice multi_index, - NativeT value) { - CHECK(LayoutUtil::IsDenseArray(subshape())); - data()[IndexUtil::MultidimensionalIndexToLinearIndex( - subshape(), multi_index)] = value; -} - -template -tensorflow::gtl::ArraySlice LiteralBase::data( - const ShapeIndex& shape_index) const { - return piece(shape_index).data(); -} - -template -tensorflow::gtl::MutableArraySlice Literal::data( - const ShapeIndex& shape_index) { - return piece(shape_index).data(); -} - -template -inline NativeT LiteralBase::Get(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) 
const { - return piece(shape_index).Get(multi_index); -} - -template -inline NativeT LiteralBase::Get( - tensorflow::gtl::ArraySlice multi_index) const { - return root_piece().Get(multi_index); -} - -template -inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index, NativeT value) { - return piece(shape_index).Set(multi_index, value); -} - -template -inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, - NativeT value) { - return root_piece().Set(multi_index, value); -} +std::ostream& operator<<(std::ostream& out, const Literal& literal); template -/* static */ std::unique_ptr Literal::CreateR0(NativeT value) { - auto literal = MakeUnique(ShapeUtil::MakeShape( +/* static */ std::unique_ptr LiteralUtil::CreateR0(NativeT value) { + auto literal = absl::make_unique(ShapeUtil::MakeShape( primitive_util::NativeToPrimitiveType(), {})); literal->Set({}, value); return literal; } template -/* static */ std::unique_ptr Literal::CreateR1( - tensorflow::gtl::ArraySlice values) { - auto literal = MakeUnique( +/* static */ std::unique_ptr LiteralUtil::CreateR1( + absl::Span values) { + auto literal = absl::make_unique( ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), {static_cast(values.size())})); literal->PopulateR1(values); @@ -1194,10 +340,10 @@ template } template -/* static */ std::unique_ptr Literal::CreateR2WithLayout( +/* static */ std::unique_ptr LiteralUtil::CreateR2WithLayout( std::initializer_list> values, const Layout& layout) { - auto literal = MakeUnique(ShapeUtil::MakeShapeWithLayout( + auto literal = absl::make_unique(ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), {static_cast(values.size()), static_cast(values.begin()->size())}, @@ -1207,13 +353,13 @@ template } template -/* static */ std::unique_ptr Literal::CreateR2( +/* static */ std::unique_ptr LiteralUtil::CreateR2( std::initializer_list> values) { return CreateR2WithLayout(values, 
LayoutUtil::GetDefaultLayoutForR2()); } template -/* static */ std::unique_ptr Literal::CreateR3WithLayout( +/* static */ std::unique_ptr LiteralUtil::CreateR3WithLayout( std::initializer_list>> values, const Layout& layout) { @@ -1238,14 +384,14 @@ template } template -/* static */ std::unique_ptr Literal::CreateR3( +/* static */ std::unique_ptr LiteralUtil::CreateR3( std::initializer_list>> values) { return CreateR3WithLayout(values, LayoutUtil::GetDefaultLayoutForR3()); } template -/* static */ std::unique_ptr Literal::CreateR4WithLayout( +/* static */ std::unique_ptr LiteralUtil::CreateR4WithLayout( std::initializer_list>>> values, @@ -1276,22 +422,23 @@ template } template -/* static */ std::unique_ptr Literal::CreateSparse( - tensorflow::gtl::ArraySlice dimensions, SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, bool sort) { +/* static */ std::unique_ptr LiteralUtil::CreateSparse( + absl::Span dimensions, SparseIndexArray indices, + absl::Span values, bool sort) { int64 num_elements = values.size(); int64 rank = dimensions.size(); CHECK_EQ(num_elements, indices.index_count()); CHECK_EQ(rank, indices.rank()); - auto literal = MakeUnique(ShapeUtil::MakeShapeWithSparseLayout( - primitive_util::NativeToPrimitiveType(), dimensions, - indices.max_indices())); + auto literal = + absl::make_unique(ShapeUtil::MakeShapeWithSparseLayout( + primitive_util::NativeToPrimitiveType(), dimensions, + indices.max_indices())); literal->PopulateSparse(indices, values, sort); return literal; } template -/* static */ std::unique_ptr Literal::CreateR4( +/* static */ std::unique_ptr LiteralUtil::CreateR4( std::initializer_list>>> values) { @@ -1299,9 +446,9 @@ template } template -/* static */ std::unique_ptr Literal::CreateFromArrayWithLayout( +/* static */ std::unique_ptr LiteralUtil::CreateFromArrayWithLayout( const Array& values, const Layout& layout) { - auto literal = MakeUnique(ShapeUtil::MakeShapeWithLayout( + auto literal = 
absl::make_unique(ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), values.dimensions(), AsInt64Slice(layout.minor_to_major()))); literal->PopulateFromArray(values); @@ -1309,38 +456,40 @@ template } template -/* static */ std::unique_ptr Literal::CreateFromArray( +/* static */ std::unique_ptr LiteralUtil::CreateFromArray( const Array& values) { return CreateFromArrayWithLayout( values, LayoutUtil::GetDefaultLayoutForRank(values.num_dimensions())); } template -/* static */ std::unique_ptr Literal::CreateR2FromArray2DWithLayout( - const Array2D& values, const Layout& layout) { +/* static */ std::unique_ptr +LiteralUtil::CreateR2FromArray2DWithLayout(const Array2D& values, + const Layout& layout) { return CreateFromArrayWithLayout(values, layout); } template -/* static */ std::unique_ptr Literal::CreateR2FromArray2D( +/* static */ std::unique_ptr LiteralUtil::CreateR2FromArray2D( const Array2D& values) { return CreateFromArray(values); } template -/* static */ std::unique_ptr Literal::CreateR3FromArray3DWithLayout( - const Array3D& values, const Layout& layout) { +/* static */ std::unique_ptr +LiteralUtil::CreateR3FromArray3DWithLayout(const Array3D& values, + const Layout& layout) { return CreateFromArrayWithLayout(values, layout); } template -/* static */ std::unique_ptr Literal::CreateR3FromArray3D( +/* static */ std::unique_ptr LiteralUtil::CreateR3FromArray3D( const Array3D& values) { return CreateFromArray(values); } template -/* static */ std::unique_ptr Literal::CreateR3Projected( +/* static */ std::unique_ptr LiteralUtil::CreateR3Projected( std::initializer_list> values, int64 projection) { int64 dim0_size = projection; @@ -1365,7 +514,7 @@ template } template -/* static */ std::unique_ptr Literal::CreateR4Projected( +/* static */ std::unique_ptr LiteralUtil::CreateR4Projected( std::initializer_list> values, int64 projection_p, int64 projection_z) { int64 dim0_size = projection_p; @@ -1393,49 +542,21 @@ template } template -/* static 
*/ std::unique_ptr Literal::CreateR4FromArray4D( +/* static */ std::unique_ptr LiteralUtil::CreateR4FromArray4D( const Array4D& values) { return CreateFromArray(values); } template -/* static */ std::unique_ptr Literal::CreateR4FromArray4DWithLayout( - const Array4D& values, const Layout& layout) { +/* static */ std::unique_ptr +LiteralUtil::CreateR4FromArray4DWithLayout(const Array4D& values, + const Layout& layout) { return CreateFromArrayWithLayout(values, layout); } -template -NativeT LiteralBase::GetFirstElement() const { - return data().at(0); -} - -template -NativeT LiteralBase::GetSparseElement(int64 sparse_element_number, - const ShapeIndex& shape_index) const { - CHECK( - LayoutUtil::IsSparseArray(ShapeUtil::GetSubshape(shape(), shape_index))); - return data(shape_index)[sparse_element_number]; -} - -template -void Literal::AppendSparseElement( - tensorflow::gtl::ArraySlice multi_index, NativeT value, - const ShapeIndex& shape_index) { - Piece& p = piece(shape_index); - const Shape& subshape = p.subshape(); - CHECK(LayoutUtil::IsSparseArray(subshape)); - int64 rank = ShapeUtil::Rank(subshape); - CHECK_EQ(multi_index.size(), rank); - int64 last_element = p.sparse_indices()->index_count(); - CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout())); - p.sparse_indices()->Append(multi_index); - CHECK_LT(last_element, p.data().size()); - p.data()[last_element] = value; -} - // Returns an identity matrix (rank 2) with the given row and column count. 
template -/* static */ std::unique_ptr Literal::MakeIdentityR2(int64 size) { +/* static */ std::unique_ptr LiteralUtil::MakeIdentityR2(int64 size) { Array2D array(size, size, 0); for (int64 i = 0; i < size; ++i) { array(i, i) = 1; @@ -1444,245 +565,43 @@ template } template -void LiteralBase::EachCell( - std::function indices, - NativeT value)> - per_cell) const { - if (ShapeUtil::HasZeroElements(shape())) { - return; - } - std::vector indices(ShapeUtil::Rank(shape()), 0); - do { - per_cell(indices, Get(indices)); - } while (IndexUtil::BumpIndices(shape(), &indices)); -} - -template -inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice values) { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(ShapeUtil::Rank(shape()), 1); - CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size()); - CHECK_EQ(shape().element_type(), - primitive_util::NativeToPrimitiveType()); - for (int64 i = 0; i < values.size(); ++i) { - Set({i}, values[i]); - } -} - -template -void Literal::PopulateR2( - std::initializer_list> values) { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(ShapeUtil::Rank(shape()), 2); - CHECK_EQ(shape().element_type(), - primitive_util::NativeToPrimitiveType()); - - const int64 dim0_size = values.size(); - const int64 dim1_size = values.begin()->size(); - CHECK_EQ(dim0_size, shape().dimensions(0)); - CHECK_EQ(dim1_size, shape().dimensions(1)); - - int64 dim0 = 0; - for (auto inner_list : values) { - int64 dim1 = 0; - for (auto value : inner_list) { - Set({dim0, dim1}, value); - ++dim1; - } - CHECK_EQ(dim1_size, dim1); - ++dim0; - } -} - -template -void Literal::PopulateFromArray(const Array& values) { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(shape().element_type(), - primitive_util::NativeToPrimitiveType()); - CHECK_EQ(ShapeUtil::Rank(shape()), values.num_dimensions()); - for (int dim = 0; dim < values.num_dimensions(); ++dim) { - CHECK_EQ(values.dim(dim), shape().dimensions(dim)); - } - values.Each([this](tensorflow::gtl::ArraySlice indices, - NativeT 
value) { this->Set(indices, value); }); -} - -template -void Literal::PopulateR2FromArray2D(const Array2D& values) { - PopulateFromArray(values); -} - -template -void Literal::PopulateR3FromArray3D(const Array3D& values) { - PopulateFromArray(values); -} - -template -void Literal::PopulateR4FromArray4D(const Array4D& values) { - PopulateFromArray(values); -} - -template -void Literal::PopulateSparse(SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, - bool sort) { - CHECK(LayoutUtil::IsSparseArray(shape())); - int rank = ShapeUtil::Rank(shape()); - CHECK_EQ(indices.rank(), rank); - int64 max_elements = LayoutUtil::MaxSparseElements(shape().layout()); - CHECK_LE(indices.max_indices(), max_elements); - int64 num_elements = values.size(); - CHECK_LE(num_elements, max_elements); - CHECK_EQ(num_elements, indices.index_count()); - auto root_data = root_piece().data(); - // Piece::data() returns an ArraySlice of size equal to the number of indices - // in the SparseIndexArray. So there is no need to adjust the size of the data - // here. It is enough to just copy the incoming values into the data buffer. 
- std::copy(values.begin(), values.end(), root_data.begin()); - *this->root_piece().sparse_indices() = std::move(indices); - if (sort) { - auto root_data = this->root_piece().data(); - this->root_piece().sparse_indices()->SortWithValues(root_data); - } - DCHECK(this->root_piece().sparse_indices()->Validate(shape())); -} - -template -Status Literal::PopulateInternal(const FnType& generator, bool parallel) { - const Shape& this_shape = shape(); - const int64 rank = ShapeUtil::Rank(this_shape); - TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape)); - TF_RET_CHECK(this_shape.element_type() == - primitive_util::NativeToPrimitiveType()); - tensorflow::gtl::MutableArraySlice literal_data = data(); - if (rank > 0) { - StrideConfig stride_config(this_shape, this_shape, - AsInt64Slice(this_shape.dimensions())); - int64 minor_dimension_size = - ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension); - - auto init_function = [&](tensorflow::gtl::ArraySlice indexes) { - DimensionVector minor_scan_indexes(rank, 0); - const int64 index = - IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes); - std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin()); - for (int64 i = 0; i < minor_dimension_size; ++i) { - minor_scan_indexes[stride_config.minor_dimension] = i; - literal_data.at(index + i) = generator(minor_scan_indexes); - } - }; - if (parallel) { - ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base, - stride_config.dimensions, - stride_config.step, init_function); - } else { - ShapeUtil::ForEachIndex( - this_shape, stride_config.base, stride_config.dimensions, - stride_config.step, - [&init_function](tensorflow::gtl::ArraySlice indexes) { - init_function(indexes); - return true; - }); - } - } else { - // For scalars. 
- literal_data.at(0) = generator({}); - } - return Status::OK(); -} -template -Status Literal::Populate(const FnType& generator) { - return PopulateInternal(generator, /*parallel=*/false); -} - -template -Status Literal::PopulateParallel(const FnType& generator) { - return PopulateInternal(generator, /*parallel=*/true); -} - -template -void Literal::PopulateWithValue(NativeT value) { - CHECK(ShapeUtil::IsArray(shape())); - CHECK_EQ(shape().element_type(), - primitive_util::NativeToPrimitiveType()); - for (NativeT& element : data()) { - element = value; - } -} - -template -/* static */ std::unique_ptr Literal::CreateFullWithDescendingLayout( - tensorflow::gtl::ArraySlice dimensions, NativeT value) { - auto literal = MakeUnique(ShapeUtil::MakeShapeWithDescendingLayout( - primitive_util::NativeToPrimitiveType(), dimensions)); - literal->PopulateWithValue(value); - return literal; -} - -template -std::unique_ptr LiteralBase::Replicate(int64 times) const { - DimensionVector bounds = {times}; - bounds.reserve(shape().dimensions_size() + 1); - for (int64 bound : shape().dimensions()) { - bounds.push_back(bound); - } +/* static */ std::unique_ptr +LiteralUtil::CreateFullWithDescendingLayout(absl::Span dimensions, + NativeT value) { auto literal = - MakeUnique(ShapeUtil::MakeShape(shape().element_type(), bounds)); - int64 elements = ShapeUtil::ElementsIn(literal->shape()); - if (elements == 0) { - return literal; - } - - DimensionVector output_indices(bounds.size(), 0); - tensorflow::gtl::ArraySlice input_indices = output_indices; - input_indices.remove_prefix(1); - - bool done = false; - while (!done) { - const auto element = Get(input_indices); - literal->Set(output_indices, element); - - done = true; - for (int n = 0; n < output_indices.size(); ++n) { - ++output_indices[n]; - if (output_indices[n] < bounds[n]) { - done = false; - break; - } - output_indices[n] = 0; - } - } + absl::make_unique(ShapeUtil::MakeShapeWithDescendingLayout( + 
primitive_util::NativeToPrimitiveType(), dimensions)); + literal->PopulateWithValue(value); return literal; } template -/* static */ StatusOr> Literal::CreateRandomLiteral( +/* static */ StatusOr> +LiteralUtil::CreateRandomLiteral( const Shape& shape, - const std::function)>& generator) { + const std::function)>& generator) { using NativeT = typename primitive_util::PrimitiveTypeToNative::type; TF_RET_CHECK(shape.element_type() == type); - auto literal = MakeUnique(shape); + auto literal = absl::make_unique(shape); TF_RETURN_IF_ERROR(literal.get()->Populate( - [&](tensorflow::gtl::ArraySlice indexes) { - return generator(indexes); - })); + [&](absl::Span indexes) { return generator(indexes); })); return std::move(literal); } template -/* static */ StatusOr> Literal::CreateRandomLiteral( - const Shape& shape, E* engine, T mean, T stddev) { +/* static */ StatusOr> +LiteralUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean, + T stddev) { using NativeT = typename primitive_util::PrimitiveTypeToNative::type; std::normal_distribution generator(mean, stddev); return CreateRandomLiteral( - shape, [&](tensorflow::gtl::ArraySlice /*indexes*/) { - return generator(*engine); - }); + shape, + [&](absl::Span /*indexes*/) { return generator(*engine); }); } template -/* static */ StatusOr> Literal::CreateRandomLiteral( - const Shape& shape, T mean, T stddev) { +/* static */ StatusOr> +LiteralUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) { std::minstd_rand0 engine; return CreateRandomLiteral(shape, &engine, mean, stddev); } diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc deleted file mode 100644 index f127cee0fdc126429ed423aace3b3b7764a05b2e..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/literal_util_test.cc +++ /dev/null @@ -1,1844 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/literal_util.h" - -#include - -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/casts.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace { - -using tensorflow::gtl::ArraySlice; -using ::testing::ElementsAre; -using ::testing::HasSubstr; - -class LiteralUtilTest : public ::testing::Test { - protected: - LiteralUtilTest() { - Array4D arr4d({ - // clang-format off - { // i0=0 - { // i1=0 - {1, 2, 3}, // i2=0 - {4, 5, 6}, // i2=1 - {7, 8, 9}, // i2=2 - }, - { // i1=1 - {11, 12, 13}, - {14, 15, 16}, - {17, 18, 19}, - }, - }, - { // i0=1 - { // i1=0 - {101, 102, 103}, - {104, 105, 106}, - {107, 108, 109}, - }, - { // i1=1 - {201, 202, 203}, // i2=0 - {204, 205, 206}, // i2=1 - {207, 208, 209}, // i2=2 - }, - }, - // clang-format on - }); - - layout_r2_dim0major_ = LayoutUtil::MakeLayout({1, 0}); - layout_r2_dim0minor_ = LayoutUtil::MakeLayout({0, 1}); - layout_r3_dim0major_ = 
LayoutUtil::MakeLayout({2, 1, 0}); - layout_r3_dim0minor_ = LayoutUtil::MakeLayout({0, 1, 2}); - layout_r4_dim0major_ = LayoutUtil::MakeLayout({3, 2, 1, 0}); - layout_r4_dim0minor_ = LayoutUtil::MakeLayout({0, 1, 2, 3}); - - literal_r4_2x2x3x3_dim0major_ = - Literal::CreateR4FromArray4DWithLayout(arr4d, - layout_r4_dim0major_); - literal_r4_2x2x3x3_dim0minor_ = - Literal::CreateR4FromArray4DWithLayout(arr4d, - layout_r4_dim0minor_); - } - - Layout layout_r2_dim0major_; - Layout layout_r2_dim0minor_; - Layout layout_r3_dim0major_; - Layout layout_r3_dim0minor_; - Layout layout_r4_dim0major_; - Layout layout_r4_dim0minor_; - std::unique_ptr literal_r4_2x2x3x3_dim0major_; - std::unique_ptr literal_r4_2x2x3x3_dim0minor_; -}; - -TEST_F(LiteralUtilTest, LiteralScalarToString) { - auto true_lit = Literal::CreateR0(true); - ASSERT_EQ("true", true_lit->ToString()); - - auto false_lit = Literal::CreateR0(false); - ASSERT_EQ("false", false_lit->ToString()); - - auto u32_lit = Literal::CreateR0(42); - ASSERT_EQ("42", u32_lit->ToString()); - - auto s32_lit = Literal::CreateR0(-999); - ASSERT_EQ("-999", s32_lit->ToString()); - - auto f32_lit = Literal::CreateR0(3.14f); - ASSERT_EQ("3.14", f32_lit->ToString()); - - auto f16_lit = Literal::CreateR0(static_cast(0.5f)); - ASSERT_EQ("0.5", f16_lit->ToString()); - - auto c64_lit = Literal::CreateR0({3.14f, 2.78f}); - ASSERT_EQ("(3.14, 2.78)", c64_lit->ToString()); - - auto bf16_lit = Literal::CreateR0(static_cast(0.5f)); - ASSERT_EQ("0.5", bf16_lit->ToString()); - - // 3.14 will be truncated to 3.125 in bfloat16 format. 
- auto bf16_lit_truncated = - Literal::CreateR0(static_cast(3.14f)); - ASSERT_EQ("3.125", bf16_lit_truncated->ToString()); - - auto bf16_lit_truncated2 = - Literal::CreateR0(static_cast(9.001f)); - ASSERT_EQ("9", bf16_lit_truncated2->ToString()); -} - -TEST_F(LiteralUtilTest, LiteralVectorToString) { - auto pred_vec = Literal::CreateR1({true, false, true}); - ASSERT_EQ("{101}", pred_vec->ToString()); -} - -TEST_F(LiteralUtilTest, R2ToString) { - const auto literal = Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}); - const string expected = R"(s32[3,2] { - { 1, 2 }, - { 3, 4 }, - { 5, 6 } -})"; - ASSERT_EQ(expected, literal->ToString()); -} - -TEST_F(LiteralUtilTest, R3ToString) { - const auto literal = Literal::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}}); - const string expected = R"(s32[3,2,1] { -{ { 1 }, - { 2 } }, -{ { 3 }, - { 4 } }, -{ { 5 }, - { 6 } } -})"; - ASSERT_EQ(expected, literal->ToString()); -} - -TEST_F(LiteralUtilTest, TupleToString) { - auto scalar = Literal::CreateR0(1.0); - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); - const string expected = R"((f32[], f32[2,2]) ( -1, -f32[2,2] { - { 1, 2 }, - { 3, 4 } -} -))"; - ASSERT_EQ(expected, tuple->ToString()); -} - -TEST_F(LiteralUtilTest, CreateR3FromArray3d) { - // clang-format off - Array3D array_3d({ - {{1.0f, 2.0f}, - {3.0f, 4.0f}, - {5.0f, 6.0f}}, - {{7.0f, 8.0f}, - {9.0f, 10.0f}, - {11.0f, 12.0f}}, - }); - // clang-format on - - auto literal = Literal::CreateR3FromArray3D(array_3d); - EXPECT_THAT(literal->shape().dimensions(), ElementsAre(2, 3, 2)); - string result = literal->ToString(); - const string expected = R"(f32[2,3,2] { -{ { 1, 2 }, - { 3, 4 }, - { 5, 6 } }, -{ { 7, 8 }, - { 9, 10 }, - { 11, 12 } } -})"; - ASSERT_EQ(expected, result); -} - -TEST_F(LiteralUtilTest, CreateSparse) { - std::vector dimensions = {8, 8, 8}; - Array2D indices = { - {3, 4, 5}, - {1, 2, 3}, - {2, 3, 4}, - {3, 5, 6}, - }; - std::vector 
values = {7, 8, 9, 10}; - auto literal = Literal::CreateSparse( - dimensions, SparseIndexArray(indices.n1() + 3, indices), values); - - Array2D expected_indices = { - {1, 2, 3}, - {2, 3, 4}, - {3, 4, 5}, - {3, 5, 6}, - }; - std::vector expected_values = {8, 9, 7, 10}; - - EXPECT_EQ(literal->sparse_indices()->data(), - ArraySlice(expected_indices.data(), - expected_indices.num_elements())); - EXPECT_EQ(literal->data(), ArraySlice(expected_values)); -} - -TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) { - // clang-format off - auto literal = Literal::CreateR4Projected({ - {1, 2}, - {1001, 1002}, - {2001, 2002}, - }, /*projection_p=*/1, /*projection_z=*/2); - // clang-format on - EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2)); - string result = literal->ToString(); - const string expected = R"(f32[1,2,3,2] { - { /*i0=0*/ - { /*i1=0*/ - {1, 2}, - {1001, 1002}, - {2001, 2002} - }, - { /*i1=1*/ - {1, 2}, - {1001, 1002}, - {2001, 2002} - } - } -})"; - ASSERT_EQ(expected, result); -} - -TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) { - EXPECT_THAT(literal_r4_2x2x3x3_dim0major_->shape().dimensions(), - ElementsAre(2, 2, 3, 3)); - string result = literal_r4_2x2x3x3_dim0major_->ToString(); - const string expected = R"(f32[2,2,3,3] { - { /*i0=0*/ - { /*i1=0*/ - {1, 2, 3}, - {4, 5, 6}, - {7, 8, 9} - }, - { /*i1=1*/ - {11, 12, 13}, - {14, 15, 16}, - {17, 18, 19} - } - }, - { /*i0=1*/ - { /*i1=0*/ - {101, 102, 103}, - {104, 105, 106}, - {107, 108, 109} - }, - { /*i1=1*/ - {201, 202, 203}, - {204, 205, 206}, - {207, 208, 209} - } - } -})"; - ASSERT_EQ(expected, result); -} - -TEST_F(LiteralUtilTest, EachCellR2F32) { - // clang-format off - auto literal = Literal::CreateR2({ - {3.1f, 4.2f}, - {9.3f, 12.4f}, - }); - // clang-format on - std::vector> seen; - literal->EachCellAsString( - [&seen](ArraySlice indices, const string& value) { - seen.emplace_back(indices[0], indices[1], value); - }); - - using Elem = std::tuple; - std::vector expected = 
{Elem(0, 0, "3.1"), Elem(0, 1, "4.2"), - Elem(1, 0, "9.3"), Elem(1, 1, "12.4")}; - EXPECT_EQ(expected, seen); -} - -TEST_F(LiteralUtilTest, ScalarEquality) { - // Test equality with scalars. - auto f32_42 = Literal::CreateR0(42.0); - auto f32_42_clone = Literal::CreateR0(42.0); - - EXPECT_EQ(*f32_42, *f32_42); - EXPECT_EQ(*f32_42, *f32_42_clone); - - auto f32_123 = Literal::CreateR0(123.0); - EXPECT_NE(*f32_42, *f32_123); - - auto f64_42 = Literal::CreateR0(42.0); - EXPECT_NE(*f32_42, *f64_42); -} - -TEST_F(LiteralUtilTest, NonScalarEquality) { - // Test equality with nonscalars. - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto matrix_clone = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto matrix_different = Literal::CreateR2({{4.0, 3.0}, {1.0, 2.0}}); - auto vector_literal = Literal::CreateR1({1.0, 2.0, 3.0, 4.0}); - auto scalar = Literal::CreateR0(1.0); - Literal nil(ShapeUtil::MakeNil()); - - EXPECT_EQ(*matrix, *matrix); - EXPECT_EQ(*matrix, *matrix_clone); - EXPECT_NE(*matrix, *matrix_different); - EXPECT_NE(*matrix, *vector_literal); - EXPECT_NE(*matrix, *scalar); - EXPECT_NE(*matrix, nil); - EXPECT_EQ(nil, nil); -} - -TEST_F(LiteralUtilTest, DifferentLayoutEquality) { - // Test equality with literals which have different layouts. - auto colmajor = - MakeUnique(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})); - colmajor->Set({0, 0}, 1.0); - colmajor->Set({0, 1}, 2.0); - colmajor->Set({1, 0}, 3.0); - colmajor->Set({1, 1}, 4.0); - - auto rowmajor = - MakeUnique(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})); - rowmajor->Set({0, 0}, 1.0); - rowmajor->Set({0, 1}, 2.0); - rowmajor->Set({1, 0}, 3.0); - rowmajor->Set({1, 1}, 4.0); - - EXPECT_EQ(*rowmajor, *colmajor); -} - -TEST_F(LiteralUtilTest, TupleEquality) { - // Test equality with tuples. 
- auto scalar = Literal::CreateR0(1.0); - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto tuple1 = Literal::MakeTuple({scalar.get(), matrix.get()}); - - // Tuple with the same elements. One element is shared with the original - // tuple, the other is a clone of the element in the original tuple. - auto scalar_clone = Literal::CreateR0(1.0); - auto tuple2 = Literal::MakeTuple({scalar_clone.get(), matrix.get()}); - EXPECT_EQ(*tuple1, *tuple2); - - // Tuple with elements reversed. - auto reversed_tuple = Literal::MakeTuple({matrix.get(), scalar.get()}); - EXPECT_NE(*tuple1, *reversed_tuple); - - // Tuple with different value. - auto scalar_42 = Literal::CreateR0(42.0); - auto different_tuple = Literal::MakeTuple({scalar_42.get(), matrix.get()}); - EXPECT_NE(*tuple1, *different_tuple); -} - -TEST_F(LiteralUtilTest, C64Equality) { - // Test equality with tuples. - auto vector = Literal::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); - - // Tuple with the same elements. One element is shared with the original - // tuple, the other is a clone of the element in the original tuple. - auto vector_clone = Literal::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); - EXPECT_EQ(*vector, *vector_clone); - - auto vector_reversed = Literal::CreateR1({{3.0, 4.0}, {1.0, 2.0}}); - EXPECT_NE(*vector, *vector_reversed); -} - -TEST_F(LiteralUtilTest, IsAllTuple) { - auto element1 = Literal::CreateR0(0.0); - auto element2 = Literal::CreateR2({{0.0, 0.0}, {0.0, 0.0}}); - auto tuple = Literal::MakeTuple({element1.get(), element1.get()}); - - // Tuples should always return false for IsAll. - EXPECT_FALSE(tuple->IsAll(0)); - EXPECT_FALSE(tuple->IsAll(1)); -} - -// Verifies that CreateFromShape works for tuples. 
-TEST_F(LiteralUtilTest, CreateFromShapeTuple) { - auto scalar = Literal::CreateR0(0.0); - auto matrix = Literal::CreateR2({{0, 0}, {0, 0}}); - auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); - - auto x = Literal::CreateFromShape(tuple->shape()); - EXPECT_EQ(*tuple, *x); -} - -TEST_F(LiteralUtilTest, IsAll) { - EXPECT_TRUE(Literal::CreateR0(false)->IsAll(0)); - EXPECT_TRUE(Literal::CreateR0(true)->IsAll(1)); - EXPECT_FALSE(Literal::CreateR0(false)->IsAll(1)); - EXPECT_FALSE(Literal::CreateR0(false)->IsAll(2)); - EXPECT_FALSE(Literal::CreateR0(true)->IsAll(0)); - EXPECT_FALSE(Literal::CreateR0(true)->IsAll(2)); - EXPECT_FALSE(Literal::CreateR0(true)->IsAll(-1)); - - // We shouldn't reinterpret int8_min as an unsigned type and then decide that - // it is equal to 255. - auto int8_min = std::numeric_limits::min(); - EXPECT_FALSE(Literal::CreateR0(255)->IsAll(int8_min)); - - EXPECT_TRUE(Literal::CreateR0(42.0)->IsAll(42)); - EXPECT_FALSE(Literal::CreateR0(42.0001)->IsAll(42)); - - EXPECT_TRUE(Literal::CreateR1({100, 100, 100})->IsAll(100)); - EXPECT_FALSE(Literal::CreateR1({100, 100, 100.001})->IsAll(100)); - - EXPECT_TRUE(Literal::CreateR2({{8, 8}, {8, 8}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{8, 8}, {8, 9}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{9, 8}, {8, 8}})->IsAll(8)); - - half h8(8.0f); - half h9(9.0f); - EXPECT_TRUE(Literal::CreateR2({{h8}, {h8}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{h8}, {h9}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{h9}, {h8}})->IsAll(8)); - - bfloat16 b8(8.0f); - bfloat16 b9(9.0f); - - EXPECT_TRUE(Literal::CreateR2({{b8}, {b8}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{b8}, {b9}})->IsAll(8)); - EXPECT_FALSE(Literal::CreateR2({{b9}, {b8}})->IsAll(8)); - - // 9.001 will be truncated to 9.0 - bfloat16 b91(9.001f); - bfloat16 b90(9.00f); - EXPECT_TRUE(Literal::CreateR2({{b91}, {b90}})->IsAll(9.0)); - - complex64 c8_9 = {8, 9}; - EXPECT_FALSE(Literal::CreateR2({{c8_9}, {c8_9}})->IsAll(8)); 
- - auto uint64_max = std::numeric_limits::max(); - EXPECT_FALSE(Literal::CreateR2( - {{uint64_max, uint64_max}, {uint64_max, uint64_max}}) - ->IsAll(-1)); -} - -TEST_F(LiteralUtilTest, IsAllFloat) { - // IsAllFloat always returns false when the literal is not floating-point. - EXPECT_FALSE(Literal::CreateR0(false)->IsAllFloat(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllFloat(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllFloat(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllFloat(0)); - - EXPECT_TRUE(Literal::CreateR0(0)->IsAllFloat(0)); - EXPECT_TRUE(Literal::CreateR0(.5)->IsAllFloat(.5)); - EXPECT_TRUE(Literal::CreateR0(-.5)->IsAllFloat(-.5)); - EXPECT_FALSE(Literal::CreateR0(-.5)->IsAllFloat(-.49)); - EXPECT_FALSE( - Literal::CreateR2({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0)); - EXPECT_TRUE( - Literal::CreateR2({{.5, .5, .5}, {.5, .5, .5}})->IsAllFloat(.5)); - - EXPECT_TRUE(Literal::CreateR0(0)->IsAllFloat(0)); - EXPECT_TRUE(Literal::CreateR0(.5)->IsAllFloat(.5)); - EXPECT_TRUE(Literal::CreateR0(-.5)->IsAllFloat(-.5)); - EXPECT_FALSE(Literal::CreateR0(-.5)->IsAllFloat(-.49)); - EXPECT_FALSE( - Literal::CreateR2({{0, 0, 0}, {0, .1, 0}})->IsAllFloat(0)); -} - -TEST_F(LiteralUtilTest, IsAllComplex) { - // IsAllComplex always returns false when the literal is not complex. 
- EXPECT_FALSE(Literal::CreateR0(false)->IsAllComplex(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllComplex(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllComplex(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllComplex(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllComplex(0)); - EXPECT_FALSE(Literal::CreateR0(0)->IsAllComplex(0)); - - complex64 c8_9 = {8, 9}; - complex64 c7_9 = {7, 9}; - EXPECT_TRUE(Literal::CreateR2({{c8_9}, {c8_9}}) - ->IsAllComplex({8.0f, 9.0f})); - EXPECT_FALSE(Literal::CreateR2({{c7_9}, {c8_9}}) - ->IsAllComplex({8.0f, 9.0f})); - EXPECT_FALSE(Literal::CreateR2({{c8_9}, {c7_9}}) - ->IsAllComplex({8.0f, 9.0f})); -} - -TEST_F(LiteralUtilTest, IsAllFirst) { - // IsAllComplex always returns false when the literal is not complex. - EXPECT_FALSE(Literal::CreateR1({false, true})->IsAllFirst()); - EXPECT_TRUE(Literal::CreateR1({false, false})->IsAllFirst()); - EXPECT_FALSE(Literal::CreateR1({1, 1, 2})->IsAllFirst()); - EXPECT_TRUE(Literal::CreateR1({5, 5, 5, 5})->IsAllFirst()); - EXPECT_FALSE(Literal::CreateR1({1, 1, 2})->IsAllFirst()); - EXPECT_TRUE(Literal::CreateR1({5, 5, 5, 5})->IsAllFirst()); - EXPECT_FALSE(Literal::CreateR1({1, 1, 2})->IsAllFirst()); - EXPECT_TRUE(Literal::CreateR1({5, 5, 5, 5})->IsAllFirst()); - EXPECT_FALSE(Literal::CreateR1({1, 1, 2})->IsAllFirst()); - - complex64 c8_9 = {8, 9}; - complex64 c7_9 = {7, 9}; - EXPECT_TRUE(Literal::CreateR2({{c8_9}, {c8_9}})->IsAllFirst()); - EXPECT_FALSE(Literal::CreateR2({{c7_9}, {c8_9}})->IsAllFirst()); -} - -TEST_F(LiteralUtilTest, IsZero) { - auto scalar_zero = Literal::CreateR0(0.0f); - auto scalar_one = Literal::CreateR0(1.0f); - EXPECT_TRUE(scalar_zero->IsZero({})); - EXPECT_FALSE(scalar_one->IsZero({})); - - auto array = Literal::CreateR2({{1, 2, 0, 3}, {1, 0, 1, 2}}); - EXPECT_FALSE(array->IsZero({0, 1})); - EXPECT_TRUE(array->IsZero({0, 2})); - EXPECT_TRUE(array->IsZero({1, 1})); - EXPECT_FALSE(array->IsZero({1, 2})); - - auto complex_zero = Literal::CreateR0(0.0f); - auto 
complex_nonzero = Literal::CreateR0(0.5f); - EXPECT_TRUE(complex_zero->IsZero({})); - EXPECT_FALSE(complex_nonzero->IsZero({})); -} - -template -class LiteralUtilTestTemplated : public ::testing::Test {}; - -using TestedTypes = ::testing::Types; -TYPED_TEST_CASE(LiteralUtilTestTemplated, TestedTypes); - -TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) { - // Make a non-integer for floating point types. - TypeParam half = TypeParam(1) / TypeParam(2); - auto data = Literal::CreateR2({{half, 2}, {3, 4}}); - const Layout layout01 = LayoutUtil::MakeLayout({0, 1}); - const Layout layout10 = LayoutUtil::MakeLayout({1, 0}); - - auto data01 = data->Relayout(layout01); - EXPECT_TRUE(LayoutUtil::Equal(data01->shape().layout(), layout01)); - EXPECT_EQ(*data, *data01); - - auto data10 = data->Relayout(layout10); - EXPECT_TRUE(LayoutUtil::Equal(data10->shape().layout(), layout10)); - EXPECT_EQ(*data, *data10); -} - -TEST_F(LiteralUtilTest, ReshapeR0) { - auto original = Literal::CreateR0(1.7f); - auto reshape = original->Reshape(/*dimensions=*/{}).ConsumeValueOrDie(); - EXPECT_EQ(*original, *reshape); -} - -TEST_F(LiteralUtilTest, ReshapeR4) { - // clang-format off - // F32[1x3x2x4] - auto original = Literal::CreateR4WithLayout({{ - {{10, 11, 12, 13}, {14, 15, 16, 17}}, - {{18, 19, 20, 21}, {22, 23, 24, 25}}, - {{26, 27, 28, 29}, {30, 31, 32, 33}}, - }}, layout_r4_dim0major_); - // F32[1x3x4x2] - auto expected = Literal::CreateR3WithLayout({ - {{10, 11}, {12, 13}, {14, 15}, {16, 17}}, - {{18, 19}, {20, 21}, {22, 23}, {24, 25}}, - {{26, 27}, {28, 29}, {30, 31}, {32, 33}}, - }, layout_r3_dim0major_); - // clang-format on - auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie(); - - EXPECT_EQ(*expected, *reshape); -} - -TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) { - // clang-format off - // F32[1x3x2x4] - auto original = Literal::CreateR4WithLayout({{ - {{10, 11, 12, 13}, {14, 15, 16, 17}}, - {{18, 19, 20, 21}, {22, 23, 24, 25}}, - {{26, 27, 28, 29}, {30, 31, 32, 33}}, 
- }}, layout_r4_dim0minor_); - // F32[1x3x4x2] - auto expected = Literal::CreateR3WithLayout({ - {{10, 11}, {12, 13}, {14, 15}, {16, 17}}, - {{18, 19}, {20, 21}, {22, 23}, {24, 25}}, - {{26, 27}, {28, 29}, {30, 31}, {32, 33}}, - }, layout_r3_dim0major_); - // clang-format on - auto reshape = original->Reshape({3, 4, 2}).ConsumeValueOrDie(); - - EXPECT_EQ(*expected, *reshape); -} - -TEST_F(LiteralUtilTest, TransposeR0) { - auto original = Literal::CreateR0(1.7f); - auto reshape = original->Transpose(/*permutation=*/{}); - EXPECT_EQ(*original, *reshape); -} - -TEST_F(LiteralUtilTest, TransposeR4) { - // clang-format off - // F32[1x3x2x4] - auto original = Literal::CreateR4({{ - {{10, 11, 12, 13}, {14, 15, 16, 17}}, - {{18, 19, 20, 21}, {22, 23, 24, 25}}, - {{26, 27, 28, 29}, {30, 31, 32, 33}}, - }}); - // clang-format on - auto reshape = original->Transpose(/*permutation=*/{2, 3, 0, 1}); - - reshape->EachCell([&](ArraySlice indices, float value) { - EXPECT_EQ(value, original->Get( - {indices[2], indices[3], indices[0], indices[1]})); - }); -} - -TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) { - // Tests that using Relayout on an array is equivalent to creating it in the - // target layout in the first place. - auto dim0minor_relaid_to_dim0major = - literal_r4_2x2x3x3_dim0minor_->Relayout(layout_r4_dim0major_); - EXPECT_EQ(*literal_r4_2x2x3x3_dim0major_, *dim0minor_relaid_to_dim0major); - - auto dim0major_relaid_to_dim0minor = - literal_r4_2x2x3x3_dim0major_->Relayout(layout_r4_dim0minor_); - EXPECT_EQ(*literal_r4_2x2x3x3_dim0minor_, *dim0major_relaid_to_dim0minor); -} - -TEST_F(LiteralUtilTest, TestR2LinearLayout) { - // Test expected memory layout of R2 dim0-minor (column-major) literal. 
- auto mat_dim0minor = Literal::CreateR2WithLayout( - {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_); - EXPECT_EQ(mat_dim0minor->element_count(), 6); - EXPECT_THAT(mat_dim0minor->data(), ElementsAre(1, 4, 2, 5, 3, 6)); - - // Test expected memory layout when using Relayout to row major. - auto relaid_mat_to_dim0major = mat_dim0minor->Relayout(layout_r2_dim0major_); - EXPECT_THAT(relaid_mat_to_dim0major->data(), - ElementsAre(1, 2, 3, 4, 5, 6)); - - // Test expected memory layout of R2 created with dim0-major (row-major). - auto mat_dim0major = Literal::CreateR2WithLayout( - {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_); - EXPECT_EQ(mat_dim0major->element_count(), 6); - EXPECT_THAT(mat_dim0major->data(), ElementsAre(1, 2, 3, 4, 5, 6)); - - // Test expected memory layout when using Relayout to column major. - auto relaid_mat_to_dim0minor = mat_dim0major->Relayout(layout_r2_dim0minor_); - EXPECT_THAT(relaid_mat_to_dim0minor->data(), - ElementsAre(1, 4, 2, 5, 3, 6)); -} - -TEST_F(LiteralUtilTest, TestR3LinearLayout) { - // Test expected memory layout of R3 dim0-minor (column-major) literal. - Array3D arr3d( - // clang-format off - { - { - {1, 2, 3}, - {4, 5, 6}, - }, - { - {7, 8, 9}, - {10, 11, 12}, - }, - }); // clang-format on - auto lit_dim0minor = - Literal::CreateR3FromArray3DWithLayout(arr3d, layout_r3_dim0minor_); - - EXPECT_EQ(lit_dim0minor->element_count(), 12); - std::vector expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12}; - EXPECT_THAT(lit_dim0minor->data(), - testing::ElementsAreArray(expected_dim0minor)); - - // Test expected memory layout when using Relayout to row major. - auto relaid_lit_to_dim0major = lit_dim0minor->Relayout(layout_r3_dim0major_); - std::vector expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - EXPECT_THAT(relaid_lit_to_dim0major->data(), - testing::ElementsAreArray(expected_dim0major)); - - // Test expected memory layout of R3 created with dim0-major (row-major). 
- auto lit_dim0major = - Literal::CreateR3FromArray3DWithLayout(arr3d, layout_r3_dim0major_); - EXPECT_EQ(lit_dim0major->element_count(), 12); - EXPECT_THAT(lit_dim0major->data(), - testing::ElementsAreArray(expected_dim0major)); - - // Test expected memory layout when using Relayout to column major. - auto relaid_lit_to_dim0minor = lit_dim0major->Relayout(layout_r3_dim0minor_); - EXPECT_THAT(relaid_lit_to_dim0minor->data(), - testing::ElementsAreArray(expected_dim0minor)); -} - -TEST_F(LiteralUtilTest, SliceR0S32) { - auto input = Literal::CreateR0(1); - auto result = input->Slice({}, {}); - EXPECT_EQ(*input, *result); -} - -TEST_F(LiteralUtilTest, SliceR1F32) { - auto input = Literal::CreateR1({1.0, 2.0, 3.0, 4.0, 5.0}); - auto result = input->Slice({3}, {4}); - auto expected = Literal::CreateR1({4.0}); - EXPECT_EQ(*expected, *result); -} - -TEST_F(LiteralUtilTest, SliceR2U32) { - auto input_3x4 = - Literal::CreateR2({{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); - auto result = input_3x4->Slice({0, 2}, {2, 4}); - auto expected = Literal::CreateR2({{3, 4}, {7, 8}}); - EXPECT_EQ(*expected, *result); -} - -TEST_F(LiteralUtilTest, SliceR3U32Full) { - auto input_2x3x2 = Literal::CreateR3( - {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}); - auto result = input_2x3x2->Slice({0, 0, 0}, {2, 3, 2}); - EXPECT_EQ(*input_2x3x2, *result); -} - -TEST_F(LiteralUtilTest, PopulateR1S64) { - Literal output(ShapeUtil::MakeShape(S64, {1})); - output.PopulateR1({77}); - auto expected = Literal::CreateR1({77}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateR1U64) { - Literal output(ShapeUtil::MakeShape(U64, {2})); - output.PopulateR1({{77, 88}}); - auto expected = Literal::CreateR1({{77, 88}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateR1C64) { - Literal output(ShapeUtil::MakeShape(C64, {1})); - output.PopulateR1({{77, 88}}); - auto expected = Literal::CreateR1({{77, 88}}); - EXPECT_EQ(output, *expected); -} - 
-TEST_F(LiteralUtilTest, PopulateR2C64) { - Literal output(ShapeUtil::MakeShape(C64, {2, 2})); - output.PopulateR2({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}}); - auto expected = - Literal::CreateR2({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR0BF16) { - Literal output(ShapeUtil::MakeShape(BF16, {})); - bfloat16 h(0.25f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR0(h); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR1BF16) { - Literal output(ShapeUtil::MakeShape(BF16, {3})); - bfloat16 h(0.5f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR1({h, h, h}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR2BF16) { - Literal output(ShapeUtil::MakeShape(BF16, {2, 2})); - bfloat16 h(2.0f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR2({{h, h}, {h, h}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR0F32) { - Literal output(ShapeUtil::MakeShape(F32, {})); - output.PopulateWithValue(2.5f); - auto expected = Literal::CreateR0(2.5f); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR1S64) { - Literal output(ShapeUtil::MakeShape(S64, {3})); - output.PopulateWithValue(-7); - auto expected = Literal::CreateR1({-7, -7, -7}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR2U64) { - Literal output(ShapeUtil::MakeShape(U64, {2, 2})); - output.PopulateWithValue(42); - auto expected = Literal::CreateR2({{42, 42}, {42, 42}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR2C64) { - Literal output(ShapeUtil::MakeShape(C64, {2, 2})); - output.PopulateWithValue({4, 2}); - auto expected = - Literal::CreateR2({{{4, 2}, {4, 2}}, {{4, 2}, {4, 2}}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR0F16) { - Literal 
output(ShapeUtil::MakeShape(F16, {})); - half h(0.25f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR0(h); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR1F16) { - Literal output(ShapeUtil::MakeShape(F16, {3})); - half h(0.5f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR1({h, h, h}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, PopulateWithValueR2F16) { - Literal output(ShapeUtil::MakeShape(F16, {2, 2})); - half h(2.0f); - output.PopulateWithValue(h); - auto expected = Literal::CreateR2({{h, h}, {h, h}}); - EXPECT_EQ(output, *expected); -} - -TEST_F(LiteralUtilTest, ReplicateR2U32) { - auto input = - Literal::CreateR2({{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}); - auto output = input->Replicate(3); - auto expected = Literal::CreateR3( - {{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, - {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, - {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}}); - EXPECT_EQ(*output, *expected); -} - -TEST_F(LiteralUtilTest, CopySliceFrom) { - const int64 dimensions[] = {17, 15, 34, 21}; - const int64 layouts[][4] = { - {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}}; - for (const auto& layout : layouts) { - Shape shape = ShapeUtil::MakeShapeWithLayout( - primitive_util::NativeToPrimitiveType(), dimensions, layout); - - auto source = Literal::CreateFromShape(shape); - const int64 zero_base[] = {0, 0, 0, 0}; - const int64 step[] = {1, 1, 1, 1}; - uint32 seqnr = 0; - auto init_proc = [&](ArraySlice indexes) { - source->Set(indexes, ++seqnr); - return true; - }; - ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step, - init_proc); - - auto blank = Literal::CreateFromShape(shape); - const int64 src_base[] = {3, 1, 5, 7}; - const int64 dest_base[] = {6, 4, 12, 2}; - const int64 copy_size[] = {7, 8, 11, 9}; - TF_EXPECT_OK(blank->CopySliceFrom(*source, src_base, dest_base, copy_size)); - - std::vector 
source_indexes(TF_ARRAYSIZE(dimensions), 0); - std::vector blank_indexes(TF_ARRAYSIZE(dimensions), 0); - bool matched = true; - auto check_proc = [&](ArraySlice indexes) { - std::copy(indexes.begin(), indexes.end(), source_indexes.begin()); - std::transform(source_indexes.begin(), source_indexes.end(), src_base, - source_indexes.begin(), std::plus()); - std::copy(indexes.begin(), indexes.end(), blank_indexes.begin()); - std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base, - blank_indexes.begin(), std::plus()); - auto bval = blank->Get(blank_indexes); - matched = (bval != 0 && bval == source->Get(source_indexes)); - return matched; - }; - - ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step, - check_proc); - EXPECT_TRUE(matched); - } -} - -TEST_F(LiteralUtilTest, CopyFromScalars) { - auto zero = Literal::CreateR0(0); - auto nine = Literal::CreateR0(9); - TF_EXPECT_OK(zero->CopyFrom(*nine)); - EXPECT_EQ(*zero, *nine); - - auto vect = Literal::CreateR1({3, 4, 9, 12, 5, 17, 21}); - TF_EXPECT_OK(zero->CopySliceFrom(*vect, {5}, {}, {})); - EXPECT_EQ(zero->Get({}), 17); - TF_EXPECT_OK(vect->CopySliceFrom(*zero, {}, {4}, {})); - EXPECT_EQ(vect->Get({4}), 17); -} - -TEST_F(LiteralUtilTest, CopyFromAndToZeroElement) { - const Shape empty_r1_shape = ShapeUtil::MakeShape(F32, {0}); - const auto const_nine = Literal::CreateR1({9}); - const auto const_empty = Literal::CreateFromShape(empty_r1_shape); - - { - // Source contains dimension with zero elements. - const auto empty = Literal::CreateFromShape(empty_r1_shape); - auto nine = Literal::CreateR1({9}); - - TF_EXPECT_OK(nine->CopySliceFrom(*empty, {0}, {0}, {0})); - EXPECT_EQ(*nine, *const_nine); - } - - { - // Copy 0 element to destination with zero elements. 
- const auto empty = Literal::CreateFromShape(empty_r1_shape); - auto nine = Literal::CreateR1({9}); - - TF_EXPECT_OK(empty->CopySliceFrom(*nine, {0}, {0}, {0})); - EXPECT_EQ(*empty, *const_empty); - } -} - -TEST_F(LiteralUtilTest, CopyFromNilShape) { - Literal nil_literal0(ShapeUtil::MakeNil()); - Literal nil_literal1(ShapeUtil::MakeNil()); - // This doesn't actually do any copying, but it should succeed. - TF_ASSERT_OK(nil_literal0.CopyFrom(nil_literal1)); -} - -TEST_F(LiteralUtilTest, CopyFromArrays) { - auto scalar_42 = Literal::CreateR0(42.0); - auto scalar_123 = Literal::CreateR0(123.0); - EXPECT_NE(*scalar_42, *scalar_123); - TF_ASSERT_OK(scalar_42->CopyFrom(*scalar_123, /*dest_shape_index=*/{}, - /*src_shape_index=*/{})); - EXPECT_EQ(*scalar_42, *scalar_123); - EXPECT_EQ(scalar_42->Get({}), 123.0f); - - auto matrix_1234 = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto matrix_5678 = Literal::CreateR2({{5.0, 6.0}, {7.0, 8.0}}); - EXPECT_NE(*matrix_1234, *matrix_5678); - EXPECT_EQ(matrix_1234->Get({0, 0}), 1.0f); - TF_ASSERT_OK(matrix_1234->CopyFrom(*matrix_5678, /*dest_shape_index=*/{}, - /*src_shape_index=*/{})); - EXPECT_EQ(*matrix_1234, *matrix_5678); - EXPECT_EQ(matrix_1234->Get({0, 0}), 5.0f); -} - -TEST_F(LiteralUtilTest, CopyFromTuples) { - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - Literal nil_literal(ShapeUtil::MakeNil()); - auto nested_tuple = Literal::MakeTuple( - {matrix.get(), - Literal::MakeTuple({Literal::CreateR0(42).get(), - Literal::CreateR1({23.0, 44.0}).get(), - &nil_literal}) - .get()}); - // Create a tuple the same shape as the inner tuple of nested_tuple but with - // different values.. 
- auto tuple = Literal::MakeTuple({Literal::CreateR0(-5).get(), - Literal::CreateR1({2.0, 4.0}).get(), - &nil_literal}); - - EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); - EXPECT_EQ(nested_tuple->Get({}, {1, 0}), 42); - EXPECT_EQ(nested_tuple->Get({0}, {1, 1}), 23.0); - EXPECT_EQ(nested_tuple->Get({1}, {1, 1}), 44.0); - - // Overwrite the inner tuple element of nested_tuple with the contents of - // 'tuple'. - TF_ASSERT_OK(nested_tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1}, - /*src_shape_index=*/{})); - - // The matrix element should be unchanged. - EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); - - // The tuple element should have been copied from 'tuple'. - EXPECT_EQ(nested_tuple->Get({}, {1, 0}), -5); - EXPECT_EQ(nested_tuple->Get({0}, {1, 1}), 2.0); - EXPECT_EQ(nested_tuple->Get({1}, {1, 1}), 4.0); -} -TEST_F(LiteralUtilTest, CopyBetweenSameTuple) { - auto tuple = Literal::MakeTuple( - {Literal::CreateR0(-2).get(), Literal::CreateR0(4).get()}); - - EXPECT_EQ(tuple->Get({}, {0}), -2); - EXPECT_EQ(tuple->Get({}, {1}), 4); - - // Copy from one element to the other. 
- TF_ASSERT_OK(tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1}, - /*src_shape_index=*/{0})); - - EXPECT_EQ(tuple->Get({}, {0}), -2); - EXPECT_EQ(tuple->Get({}, {1}), -2); -} - -TEST_F(LiteralUtilTest, CopyFromDifferentShapes) { - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto vector = Literal::CreateR1({5.0, 7.0}); - Status status = matrix->CopyFrom(*vector); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), - HasSubstr("Destination subshape incompatible")); -} - -TEST_F(LiteralUtilTest, F16) { - // Verify that the internal data views are consistent and that they - // are in little endian format - // TODO - modify if we make the data format machine endianess dependent - auto m1 = Literal::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2})); - Literal* l1 = m1.get(); - const char* d1 = reinterpret_cast(l1->data().data()); - EXPECT_EQ(d1[0], 0); - EXPECT_EQ(d1[1], 0); - EXPECT_EQ(d1[2], 0); - EXPECT_EQ(d1[3], 0); - EXPECT_EQ(d1[4], 0); - EXPECT_EQ(d1[5], 0); - EXPECT_EQ(d1[6], 0); - EXPECT_EQ(d1[7], 0); - - half h1(1.0f); - half h2(2.0f); - auto m2 = Literal::CreateR2({{h1, h2}, {h2, h1}}); - Literal* l2 = m2.get(); - const char* d2 = reinterpret_cast(l2->data().data()); - EXPECT_EQ(d2[0], 0); - EXPECT_EQ(d2[1], 0x3C); - EXPECT_EQ(d2[2], 0); - EXPECT_EQ(d2[3], 0x40); - EXPECT_EQ(d2[4], 0); - EXPECT_EQ(d2[5], 0x40); - EXPECT_EQ(d2[6], 0); - EXPECT_EQ(d2[7], 0x3C); -} - -TEST_F(LiteralUtilTest, Populate) { - struct PopulateData { - std::vector dimensions; - std::vector layout; - } populate_data[] = { - {{}, {}}, - {{0}, {0}}, - {{16}, {0}}, - {{2, 0}, {1, 0}}, - {{4, 16}, {1, 0}}, - {{21, 12}, {0, 1}}, - {{6, 11, 17}, {2, 0, 1}}, - {{6, 11, 5, 17}, {3, 2, 0, 1}}, - }; - for (const auto& data : populate_data) { - Shape shape = ShapeUtil::MakeShapeWithLayout( - primitive_util::NativeToPrimitiveType(), data.dimensions, - data.layout); - auto literal = MakeUnique(shape); - auto generator = [&](ArraySlice indexes) -> uint32 { - // Offsets 
from linear index just to avoid R0 literals to be initialized - // with zero. - return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(), - indexes) + - 17; - }; - TF_EXPECT_OK(literal->Populate(generator)); - - std::vector zero_base(data.dimensions.size(), 0); - std::vector step(data.dimensions.size(), 1); - bool matched = true; - auto check_function = [&](ArraySlice indexes) { - auto value = literal->Get(indexes); - matched = matched && (value == generator(indexes)); - return matched; - }; - ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step, - check_function); - EXPECT_TRUE(matched); - } -} - -TEST_F(LiteralUtilTest, PopulateParallel) { - struct PopulateData { - std::vector dimensions; - std::vector layout; - } populate_data[] = { - {{}, {}}, - {{0}, {0}}, - {{16}, {0}}, - {{2, 0}, {1, 0}}, - {{4, 16}, {1, 0}}, - {{21, 12}, {0, 1}}, - {{6, 11, 17}, {2, 0, 1}}, - {{6, 11, 5, 17}, {3, 2, 0, 1}}, - }; - for (const auto& data : populate_data) { - Shape shape = ShapeUtil::MakeShapeWithLayout( - primitive_util::NativeToPrimitiveType(), data.dimensions, - data.layout); - auto literal = MakeUnique(shape); - auto generator = [&](ArraySlice indexes) -> uint32 { - // Offsets from linear index just to avoid R0 literals to be initialized - // with zero. 
- return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(), - indexes) + - 17; - }; - TF_EXPECT_OK(literal->PopulateParallel(generator)); - - std::vector zero_base(data.dimensions.size(), 0); - std::vector step(data.dimensions.size(), 1); - bool matched = true; - auto check_function = [&](ArraySlice indexes) { - auto value = literal->Get(indexes); - matched = matched && (value == generator(indexes)); - return matched; - }; - ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step, - check_function); - EXPECT_TRUE(matched); - } -} - -TEST_F(LiteralUtilTest, ConvertR4) { - // clang-format off - auto original = Literal::CreateR4WithLayout({{ - {{10, 11, 12, 13}, {14, 15, 16, 17}}, - {{18, 19, 20, 21}, {22, 23, 24, 25}}, - {{26, 27, 28, 29}, {30, 31, 32, 33}}, - }}, layout_r4_dim0major_); - auto expected = Literal::CreateR4WithLayout({{ - {{10, 11, 12, 13}, {14, 15, 16, 17}}, - {{18, 19, 20, 21}, {22, 23, 24, 25}}, - {{26, 27, 28, 29}, {30, 31, 32, 33}}, - }}, layout_r4_dim0major_); - // clang-format on - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr converted, - original->Convert(U32)); - - EXPECT_EQ(*expected, *converted); -} - -TEST_F(LiteralUtilTest, ConvertIfTypesMatch) { - // clang-format off - auto s8 = Literal::CreateR4WithLayout({{ - {{10, 0, 12, 0}, {0, 15, 0, 17}}, - {{0, 19, 0, 21}, {22, 0, 24, 0}}, - {{26, 0, 28, 0}, {0, 31, 0, 33}}, - }}, layout_r4_dim0major_); - auto s32 = Literal::CreateR4WithLayout({{ - {{10, 0, 12, 0}, {0, 15, 0, 17}}, - {{0, 19, 0, 21}, {22, 0, 24, 0}}, - {{26, 0, 28, 0}, {0, 31, 0, 33}}, - }}, layout_r4_dim0major_); - auto u32 = Literal::CreateR4WithLayout({{ - {{10, 0, 12, 0}, {0, 15, 0, 17}}, - {{0, 19, 0, 21}, {22, 0, 24, 0}}, - {{26, 0, 28, 0}, {0, 31, 0, 33}}, - }}, layout_r4_dim0major_); - auto s64 = Literal::CreateR4WithLayout({{ - {{10, 0, 12, 0}, {0, 15, 0, 17}}, - {{0, 19, 0, 21}, {22, 0, 24, 0}}, - {{26, 0, 28, 0}, {0, 31, 0, 33}}, - }}, layout_r4_dim0major_); - auto u64 = 
Literal::CreateR4WithLayout({{ - {{10, 0, 12, 0}, {0, 15, 0, 17}}, - {{0, 19, 0, 21}, {22, 0, 24, 0}}, - {{26, 0, 28, 0}, {0, 31, 0, 33}}, - }}, layout_r4_dim0major_); - auto pred = Literal::CreateR4WithLayout({{ - {{true, false, true, false}, {false, true, false, true}}, - {{false, true, false, true}, {true, false, true, false}}, - {{true, false, true, false}, {false, true, false, true}}, - }}, layout_r4_dim0major_); - auto int32_pred = Literal::CreateR4WithLayout({{ - {{1, 0, 1, 0}, {0, 1, 0, 1}}, - {{0, 1, 0, 1}, {1, 0, 1, 0}}, - {{1, 0, 1, 0}, {0, 1, 0, 1}}, - }}, layout_r4_dim0major_); - auto f16 = Literal::CreateR4WithLayout({{ - {{half(10.0), half(0.0), half(12.0), half(0.0)}, - {half(0.0), half(15.0), half(0.0), half(17.0)}}, - {{half(0.0), half(19.0), half(0.0), half(21.0)}, - {half(22.0), half(0.0), half(24.0), half(0.0)}}, - {{half(26.0), half(0.0), half(28.0), half(0.0)}, - {half(0.0), half(31.0), half(0.0), half(33.0)}}, - }}, layout_r4_dim0major_); - auto bf16 = Literal::CreateR4WithLayout({{ - {{bfloat16(10.0), bfloat16(0.0), bfloat16(12.0), bfloat16(0.0)}, - {bfloat16(0.0), bfloat16(15.0), bfloat16(0.0), bfloat16(17.0)}}, - {{bfloat16(0.0), bfloat16(19.0), bfloat16(0.0), bfloat16(21.0)}, - {bfloat16(22.0), bfloat16(0.0), bfloat16(24.0), bfloat16(0.0)}}, - {{bfloat16(26.0), bfloat16(0.0), bfloat16(28.0), bfloat16(0.0)}, - {bfloat16(0.0), bfloat16(31.0), bfloat16(0.0), bfloat16(33.0)}}, - }}, layout_r4_dim0major_); - auto f32 = Literal::CreateR4WithLayout({{ - {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}}, - {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}}, - {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}}, - }}, layout_r4_dim0major_); - auto f64 = Literal::CreateR4WithLayout({{ - {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}}, - {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}}, - {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}}, - }}, layout_r4_dim0major_); - auto c64 = Literal::CreateR4WithLayout({{ - {{10.0f, 0.0f, 
12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}}, - {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}}, - {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}}, - }}, layout_r4_dim0major_); - // clang-format on - std::unique_ptr conv; - - conv = s8->Convert(U32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *u32); - - conv = s8->Convert(S32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *s32); - - conv = s8->Convert(U64).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *u64); - - conv = s8->Convert(S64).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *s64); - - conv = s8->Convert(PRED).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *pred); - - conv = bf16->Convert(S32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *s32); - - conv = bf16->Convert(F32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f32); - - conv = pred->Convert(S32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *int32_pred); - - conv = f32->Convert(S32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *s32); - - conv = f64->Convert(S32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *s32); - - conv = s32->Convert(F32).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f32); - - conv = f32->Convert(F16).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f16); - - conv = f64->Convert(F16).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f16); - - conv = s32->Convert(F16).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f16); - - conv = u32->Convert(F16).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *f16); - - conv = s32->Convert(C64).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *c64); - - conv = f16->Convert(C64).ConsumeValueOrDie(); - EXPECT_EQ(*conv, *c64); - - EXPECT_EQ(s32->Convert(TUPLE).status().code(), - tensorflow::error::UNIMPLEMENTED); - EXPECT_EQ(s32->Convert(S16).status().code(), - tensorflow::error::UNIMPLEMENTED); - EXPECT_EQ(s32->Convert(U16).status().code(), - tensorflow::error::UNIMPLEMENTED); - EXPECT_EQ(c64->Convert(F32).status().code(), - tensorflow::error::UNIMPLEMENTED); - EXPECT_EQ(c64->Convert(S32).status().code(), - tensorflow::error::UNIMPLEMENTED); -} - -TEST_F(LiteralUtilTest, 
BitcastConvert) { - auto original = - Literal::CreateR1({tensorflow::bit_cast(2.5f), - tensorflow::bit_cast(-42.25f), - tensorflow::bit_cast(100.f), 0xbeef}); - auto expected = Literal::CreateR1( - {2.5f, -42.25f, 100.0f, tensorflow::bit_cast(0xbeef)}); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr converted, - original->BitcastConvert(F32)); -} - -TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) { - auto literal = Literal::CreateR0(1234); - Status status = literal->BitcastConvert(F64).status(); - EXPECT_NE(Status::OK(), status); - EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(), - "bit widths are different")); -} - -TEST_F(LiteralUtilTest, CopyFromProto_Bool) { - LiteralProto p; - p.mutable_shape()->set_element_type(PRED); - for (int len = 0; len < 25; ++len) { - p.mutable_shape()->clear_dimensions(); - p.mutable_shape()->add_dimensions(len); - LayoutUtil::SetToDefaultLayout(p.mutable_shape()); - p.clear_preds(); - for (int i = 0; i < len; ++i) { - p.add_preds((i % 2) == (len % 2)); - } - - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr literal, - Literal::CreateFromProto(p)); - ASSERT_EQ(len, literal->data().size()); - int i = 0; - for (bool value : literal->data()) { - EXPECT_EQ((i % 2) == (len % 2), value); - ++i; - } - } -} - -// Note that f16 is currently stored in a byte array in little endian byte order -TEST_F(LiteralUtilTest, ToProto_f16) { - half h1(1.0f); - half h2(2.0f); - - auto m = Literal::CreateR2({{h1, h2}, {h2, h1}}); - Literal* l = m.get(); - EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape())); - EXPECT_EQ(4, l->data().size()); - - LiteralProto p = l->ToProto(); - EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape())); - EXPECT_EQ(8, p.f16s().size()); - const char* d = p.f16s().data(); - EXPECT_EQ(d[0], 0); - EXPECT_EQ(d[1], 0x3C); - EXPECT_EQ(d[2], 0); - EXPECT_EQ(d[3], 0x40); - EXPECT_EQ(d[4], 0); - EXPECT_EQ(d[5], 0x40); - EXPECT_EQ(d[6], 0); - EXPECT_EQ(d[7], 0x3C); -} - -// Note that f16 is currently stored in a byte array in little 
endian byte order -TEST_F(LiteralUtilTest, CopyFromProto_f16) { - half h1(1.0f); - half h2(2.0f); - - const char half_vals[8] = {0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C}; - LiteralProto p; - p.mutable_shape()->set_element_type(F16); - p.mutable_shape()->clear_dimensions(); - p.mutable_shape()->add_dimensions(4); - LayoutUtil::SetToDefaultLayout(p.mutable_shape()); - p.clear_f16s(); - p.set_f16s(half_vals, 8); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr literal, - Literal::CreateFromProto(p)); - auto r = literal->data(); - ASSERT_EQ(4, r.size()); - ASSERT_EQ(h1, r[0]); - ASSERT_EQ(h2, r[1]); - ASSERT_EQ(h2, r[2]); - ASSERT_EQ(h1, r[3]); -} - -TEST_F(LiteralUtilTest, LiteralSliceTest) { - auto scalar = Literal::CreateR0(1.0); - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); - auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); - Literal nil(ShapeUtil::MakeNil()); - - EXPECT_EQ(LiteralSlice(*scalar, {}), *scalar); - EXPECT_EQ(LiteralSlice(*matrix, {}), *matrix); - EXPECT_EQ(LiteralSlice(*tuple, {}), *tuple); - EXPECT_EQ(LiteralSlice(*nested_tuple, {}), *nested_tuple); - EXPECT_EQ(LiteralSlice(nil, {}), nil); - - EXPECT_EQ(LiteralSlice(*tuple, {0}), *scalar); - EXPECT_EQ(LiteralSlice(*tuple, {1}), *matrix); - - EXPECT_EQ(LiteralSlice(*nested_tuple, {0}), *tuple); - EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 0}), *scalar); - EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 1}), *matrix); - EXPECT_EQ(LiteralSlice(*nested_tuple, {1}), *scalar); -} - -TEST_F(LiteralUtilTest, MutatingLiteralSlice) { - auto scalar = Literal::CreateR0(1.0); - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); - auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); - // Verify that changing the underlying data beneath the view changes the - // data of the view itself. 
- const auto nested_tuple_view = LiteralSlice(*nested_tuple); - EXPECT_EQ( - nested_tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0, 0}), - 1.0f); - EXPECT_EQ(nested_tuple_view.Get(/*multi_index=*/{}, - /*shape_index=*/{0, 0}), - 1.0f); - nested_tuple->Set(/*multi_index=*/{}, /*shape_index=*/{0, 0}, 555.0f); - EXPECT_EQ( - nested_tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0, 0}), - 555.0f); - EXPECT_EQ(nested_tuple_view.Get(/*multi_index=*/{}, - /*shape_index=*/{0, 0}), - 555.0f); -} - -TEST_F(LiteralUtilTest, LiteralSliceOfALiteralSlice) { - auto scalar = Literal::CreateR0(1.0); - auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); - auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); - - const auto nested_tuple_view = LiteralSlice(*nested_tuple); - const auto tuple_view = LiteralSlice(nested_tuple_view, /*view_root=*/{0}); - const auto matrix_view = LiteralSlice(tuple_view, /*view_root=*/{1}); - EXPECT_EQ(matrix_view, *Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}})); -} - -TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtrTest) { - std::vector int64_values = {1, 2, 3}; - const Shape literal_shape = ShapeUtil::MakeShape(S64, {3}); - - BorrowingLiteral literal(reinterpret_cast(int64_values.data()), - literal_shape); - - EXPECT_EQ(literal.Get({0}), 1); - EXPECT_EQ(literal.Get({1}), 2); - EXPECT_EQ(literal.Get({2}), 3); -} - -TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrsTest) { - std::vector one_two_three = {1, 2, 3}; - const Shape one_two_three_shape = ShapeUtil::MakeShape(S64, {3}); - - std::vector hundred = {100}; - const Shape hundred_shape = ShapeUtil::MakeShape(S64, {1}); - - std::vector src_buf_ptrs; - src_buf_ptrs.emplace_back( - reinterpret_cast(one_two_three.data())); - src_buf_ptrs.emplace_back(reinterpret_cast(hundred.data())); - auto literal_tuple = BorrowingLiteral( - src_buf_ptrs, - ShapeUtil::MakeTupleShape({one_two_three_shape, 
hundred_shape})); - - EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{0}), - 1); - EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{1}), - 100); - - EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{1}, /*shape_index=*/{0}), - 2); - - EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{2}, /*shape_index=*/{0}), - 3); -} - -TEST_F(LiteralUtilTest, LiteralMove) { - std::unique_ptr matrix = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - Literal literal(std::move(*matrix)); - - EXPECT_TRUE( - ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape())); - EXPECT_EQ(literal.Get({0, 0}), 1.0); - EXPECT_EQ(literal.Get({0, 1}), 2.0); - EXPECT_EQ(literal.Get({1, 0}), 3.0); - EXPECT_EQ(literal.Get({1, 1}), 4.0); -} - -TEST_F(LiteralUtilTest, DecomposeTuple) { - Literal nil_literal(ShapeUtil::MakeNil()); - auto nested_tuple = Literal::MakeTuple( - {Literal::CreateR2({{1, 2}, {3, 4}}).get(), - Literal::MakeTuple({Literal::CreateR0(42).get(), - Literal::CreateR1({23.0, 44.0}).get(), - &nil_literal}) - .get(), - &nil_literal}); - - EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple->shape())); - std::vector elements = nested_tuple->DecomposeTuple(); - EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple->shape())); - - ASSERT_EQ(elements.size(), 3); - - EXPECT_TRUE(ShapeUtil::Compatible(elements[0].shape(), - ShapeUtil::MakeShape(S32, {2, 2}))); - EXPECT_EQ(elements[0].Get({0, 0}), 1); - EXPECT_EQ(elements[0].Get({0, 1}), 2); - EXPECT_EQ(elements[0].Get({1, 0}), 3); - EXPECT_EQ(elements[0].Get({1, 1}), 4); - - EXPECT_TRUE(ShapeUtil::Compatible( - elements[1].shape(), - ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), - ShapeUtil::MakeShape(F64, {2}), - ShapeUtil::MakeNil()}))); - EXPECT_EQ(elements[1].Get({}, /*shape_index=*/{0}), 42); - EXPECT_EQ(elements[1].Get({0}, /*shape_index=*/{1}), 23.0); - EXPECT_EQ(elements[1].Get({1}, /*shape_index=*/{1}), 44.0); - - EXPECT_TRUE(ShapeUtil::Compatible(elements[2].shape(), ShapeUtil::MakeNil())); -} - 
-TEST_F(LiteralUtilTest, DecomposeEmptyTuple) { - Literal nil_literal(ShapeUtil::MakeNil()); - std::vector elements = nil_literal.DecomposeTuple(); - EXPECT_EQ(elements.size(), 0); -} - -TEST_F(LiteralUtilTest, MoveIntoTuple) { - std::vector elements; - elements.push_back(std::move(*Literal::CreateR0(1.0))); - elements.push_back(std::move(*Literal::CreateR1({4, 8}))); - elements.push_back(std::move( - *Literal::MakeTuple({Literal::CreateR0(42).get(), - Literal::CreateR1({23.0, 44.0}).get()}) - - )); - - Literal literal = Literal::MoveIntoTuple(&elements); - ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape())); - ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 3); - - EXPECT_EQ(literal.Get({}, /*shape_index=*/{0}), 1.0); - EXPECT_EQ(literal.Get({0}, /*shape_index=*/{1}), 4); - EXPECT_EQ(literal.Get({1}, /*shape_index=*/{1}), 8); - EXPECT_EQ(literal.Get({}, /*shape_index=*/{2, 0}), 42); - EXPECT_EQ(literal.Get({0}, /*shape_index=*/{2, 1}), 23.0); - EXPECT_EQ(literal.Get({1}, /*shape_index=*/{2, 1}), 44.0); - - for (const Literal& element : elements) { - EXPECT_TRUE(ShapeUtil::IsNil(element.shape())); - } -} - -TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) { - Literal literal = Literal::MoveIntoTuple({}); - ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape())); - ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0); -} - -TEST_F(LiteralUtilTest, LiteralMoveAssignment) { - Literal literal; - EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeNil(), literal.shape())); - - std::unique_ptr matrix = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - literal = std::move(*matrix); - - EXPECT_TRUE( - ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape())); - EXPECT_EQ(literal.Get({0, 0}), 1.0); - EXPECT_EQ(literal.Get({0, 1}), 2.0); - EXPECT_EQ(literal.Get({1, 0}), 3.0); - EXPECT_EQ(literal.Get({1, 1}), 4.0); -} - -TEST_F(LiteralUtilTest, LiteralSliceCopy) { - std::unique_ptr matrix = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - const auto matrix_view = 
LiteralSlice(*matrix); - LiteralSlice matrix_view_copy(matrix_view); - - EXPECT_EQ(matrix_view_copy.Get({0, 0}), 1.0); - EXPECT_EQ(matrix_view_copy.Get({0, 1}), 2.0); - EXPECT_EQ(matrix_view_copy.Get({1, 0}), 3.0); - EXPECT_EQ(matrix_view_copy.Get({1, 1}), 4.0); -} - -TEST_F(LiteralUtilTest, GetSetTuple) { - auto tuple = Literal::MakeTuple( - {Literal::CreateR0(42.0).get(), - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get()}); - EXPECT_EQ(tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0}), 42.0); - tuple->Set(/*multi_index=*/{}, /*shape_index=*/{0}, -5.0); - EXPECT_EQ(tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0}), -5.0); - - EXPECT_EQ(tuple->Get(/*multi_index=*/{1, 0}, /*shape_index=*/{1}), - 3.0); - tuple->Set(/*multi_index=*/{1, 0}, /*shape_index=*/{1}, -4.0); - EXPECT_EQ(tuple->Get(/*multi_index=*/{1, 0}, /*shape_index=*/{1}), - -4.0); -} - -TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) { - // Literals constructed using CreateFromShape should be zero initialized. - std::unique_ptr scalar_f32 = - Literal::CreateFromShape(ShapeUtil::MakeShape(F32, {})); - EXPECT_EQ(scalar_f32->Get({}), 0.0); - EXPECT_TRUE(scalar_f32->IsAll(0)); - - std::unique_ptr vector_s32 = - Literal::CreateFromShape(ShapeUtil::MakeShape(S32, {3})); - EXPECT_EQ(vector_s32->Get({0}), 0); - EXPECT_EQ(vector_s32->Get({1}), 0); - EXPECT_EQ(vector_s32->Get({2}), 0); - EXPECT_TRUE(vector_s32->IsAll(0)); - - std::unique_ptr tuple = - Literal::CreateFromShape(ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(F64, {}), ShapeUtil::MakeShape(PRED, {2}), - ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {})})); - - EXPECT_EQ(tuple->Get({}, {0}), 0.0); - EXPECT_EQ(tuple->Get({0}, {1}), false); - EXPECT_EQ(tuple->Get({1}, {1}), false); - EXPECT_EQ(tuple->Get({0, 0}, {2}), 0); - EXPECT_EQ(tuple->Get({1, 0}, {2}), 0); - EXPECT_EQ(tuple->Get({}, {3}), complex64(0.0f, 0.0f)); -} - -TEST_F(LiteralUtilTest, ProtoRoundTrip) { - // Test serializing then deserializing a Literal through 
a proto. - auto one_f32 = Literal::CreateR0(1.0); - auto two_f32 = Literal::CreateR0(2.0); - auto vector_int8 = Literal::CreateR1({-128, 0, 2, 4, 7, 56, 127}); - auto vector_c64 = Literal::CreateR1({{1.0, 2.0}, {3.0, 4.0}}); - auto vector_bfloat16 = Literal::CreateR1( - {bfloat16{-1.0}, bfloat16{2.0}, bfloat16{-3.0}}); - auto vector_half = - Literal::CreateR1({half{10.0}, half{20.0}, half{-30.0}}); - auto matrix_pred = - Literal::CreateR2({{true, false, true}, {false, false, true}}); - auto tuple = Literal::MakeTuple( - {one_f32.get(), vector_half.get(), matrix_pred.get(), matrix_pred.get()}); - Literal nil_literal(ShapeUtil::MakeNil()); - auto nested_tuple = Literal::MakeTuple( - {tuple.get(), vector_bfloat16.get(), tuple.get(), &nil_literal}); - - auto to_from_proto = [](const Literal& literal) -> Literal { - return std::move(*Literal::CreateFromProto(literal.ToProto()).ValueOrDie()); - }; - - EXPECT_EQ(*one_f32, to_from_proto(*one_f32)); - EXPECT_EQ(*vector_c64, to_from_proto(*vector_c64)); - EXPECT_EQ(*vector_bfloat16, to_from_proto(*vector_bfloat16)); - EXPECT_EQ(*matrix_pred, to_from_proto(*matrix_pred)); - EXPECT_EQ(*tuple, to_from_proto(*tuple)); - EXPECT_EQ(*nested_tuple, to_from_proto(*nested_tuple)); - EXPECT_EQ(nil_literal, to_from_proto(nil_literal)); - - EXPECT_NE(*one_f32, *two_f32); - EXPECT_NE(*one_f32, to_from_proto(*two_f32)); -} - -TEST_F(LiteralUtilTest, InvalidProtoNoValues) { - // Proto contains a shape, but no values. - LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), - HasSubstr("Expected 3 elements in LiteralProto")); -} - -TEST_F(LiteralUtilTest, InvalidProtoNoShape) { - // Proto contains values, but no shape. 
- LiteralProto proto; - proto.add_preds(false); - proto.add_preds(true); - proto.add_preds(false); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no shape")); -} - -TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) { - // Proto contains values in wrong container. - LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}); - proto.add_preds(false); - proto.add_preds(true); - proto.add_preds(false); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), - HasSubstr("Expected 3 elements in LiteralProto")); -} - -TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) { - // Proto contains too few values. - LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}); - proto.add_f32s(1.0); - proto.add_f32s(2.0); - proto.add_f32s(3.0); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), - HasSubstr("Expected 84 elements in LiteralProto")); -} - -TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) { - // Proto contains too many values. - LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}); - proto.add_s32s(42); - proto.add_s32s(-10); - proto.add_s32s(100); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), - HasSubstr("Expected 2 elements in LiteralProto")); -} - -TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) { - // Proto shape missing layout. 
- LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}); - LayoutUtil::ClearLayout(proto.mutable_shape()); - proto.add_preds(true); - proto.add_preds(false); - proto.add_preds(true); - proto.add_preds(false); - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no layout")); -} - -TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) { - // Proto has the too few tuple elements. - LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); - LiteralProto* element0 = proto.add_tuple_literals(); - *element0->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 0); - element0->add_preds(false); - element0->add_preds(true); - - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); -} - -TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) { - // Proto has the too many tuple elements. 
- LiteralProto proto; - *proto.mutable_shape() = ShapeUtil::MakeTupleShape( - {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})}); - LiteralProto* element0 = proto.add_tuple_literals(); - *element0->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 0); - element0->add_preds(false); - element0->add_preds(true); - LiteralProto* element1 = proto.add_tuple_literals(); - *element1->mutable_shape() = - ShapeUtil::GetTupleElementShape(proto.shape(), 1); - element1->add_f32s(42.0); - LiteralProto* element2 = proto.add_tuple_literals(); - *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}); - element2->add_f32s(123.0); - - Status status = Literal::CreateFromProto(proto).status(); - ASSERT_FALSE(status.ok()); - ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements")); -} - -TEST_F(LiteralUtilTest, SortSparseElements) { - auto literal = - Literal::CreateSparse({10, 10, 10}, SparseIndexArray(10, 3), {}); - literal->AppendSparseElement({2, 3, 4}, 2.0); - literal->AppendSparseElement({3, 4, 5}, 3.0); - literal->AppendSparseElement({1, 2, 3}, 1.0); - literal->SortSparseElements(); - ASSERT_EQ(literal->ToString(false), - "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}"); -} - -TEST_F(LiteralUtilTest, GetSparseElementAsString) { - std::vector dimensions = {10, 10, 10}; - SparseIndexArray indices(10, {{1, 2, 3}, {2, 3, 4}, {3, 4, 5}}); - - ASSERT_EQ( - Literal::CreateSparse(dimensions, indices, {true, false, true}) - ->GetSparseElementAsString(1), - "false"); - ASSERT_EQ(Literal::CreateSparse(dimensions, indices, {1, 2, 3}) - ->GetSparseElementAsString(1), - tensorflow::strings::StrCat(int64{2})); - ASSERT_EQ(Literal::CreateSparse(dimensions, indices, {1.0, 2.0, 3.0}) - ->GetSparseElementAsString(1), - tensorflow::strings::StrCat(double{2.0})); - ASSERT_EQ(Literal::CreateSparse(dimensions, indices, - {half{1.0}, half{2.0}, half{3.0}}) - ->GetSparseElementAsString(1), - 
tensorflow::strings::StrCat(static_cast(half{2.0}))); - ASSERT_EQ( - Literal::CreateSparse( - dimensions, indices, - std::vector{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}) - ->GetSparseElementAsString(1), - tensorflow::strings::StrCat("(", float{3.0}, ", ", float{4.0}, ")")); -} - -TEST_F(LiteralUtilTest, BroadcastVectorToMatrix0) { - std::unique_ptr literal = Literal::CreateR1({1, 2}); - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr broadcasted_literal, - literal->Broadcast( - /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), - /*dimensions=*/{0})); - EXPECT_EQ(*broadcasted_literal, *Literal::CreateR2({{1, 1}, {2, 2}})); -} - -TEST_F(LiteralUtilTest, BroadcastVectorToMatrix1) { - std::unique_ptr literal = Literal::CreateR1({1, 2}); - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr broadcasted_literal, - literal->Broadcast( - /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), - /*dimensions=*/{1})); - EXPECT_EQ(*broadcasted_literal, *Literal::CreateR2({{1, 2}, {1, 2}})); -} - -TEST_F(LiteralUtilTest, BroadcastScalarToMatrix) { - std::unique_ptr literal = Literal::CreateR0(9); - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr broadcasted_literal, - literal->Broadcast( - /*result_shape=*/ShapeUtil::MakeShape(S32, {2, 2}), - /*dimensions=*/{})); - EXPECT_EQ(*broadcasted_literal, *Literal::CreateR2({{9, 9}, {9, 9}})); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h index 3c74e070da529b7f1431e01fbaf31932f582db44..fcff48b6b18ba115a67f3141a9aea4ca461be55d 100644 --- a/tensorflow/compiler/xla/map_util.h +++ b/tensorflow/compiler/xla/map_util.h @@ -60,7 +60,7 @@ MaybeFind(const Collection& collection, if (it == collection.end()) { std::ostringstream os; os << key; - return NotFound("key not found: %s", os.str().c_str()); + return NotFound("key not found: %s", os.str()); } return {it->second}; } diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc index 
fed0e58e66a04df2ff9554cb0dd0053b7c669803..4eab4fa4290c270697c00be20840cf4e85459183 100644 --- a/tensorflow/compiler/xla/metric_table_report.cc +++ b/tensorflow/compiler/xla/metric_table_report.cc @@ -18,7 +18,8 @@ limitations under the License. #include #include -#include "tensorflow/core/lib/strings/stringprintf.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -84,7 +85,7 @@ void MetricTableReport::WriteReportToInfoLog(double expected_metric_sum) { if (end_of_line == string::npos) { end_of_line = report.size(); } - tensorflow::StringPiece line(report.data() + pos, end_of_line - pos); + absl::string_view line(report.data() + pos, end_of_line - pos); // TODO(b/34779244): Figure out how to do this without the verbose log-line // prefix. The usual way didn't compile on open source. @@ -134,8 +135,7 @@ void MetricTableReport::AppendHeader() { void MetricTableReport::AppendCategoryTable() { const std::vector categories = MakeCategories(&entries_); - AppendLine("********** categories table **********"); - AppendLine("The left hand side numbers are ", metric_name_, "."); + AppendLine("********** categories table for ", metric_name_, " **********"); AppendLine(); double metric_sum = UnaccountedMetric(); @@ -153,8 +153,8 @@ void MetricTableReport::AppendCategoryTable() { if (text.empty()) { text = "[no category]"; } - tensorflow::strings::StrAppend(&text, " (", category.entries.size(), " ", - entry_name_, ")"); + absl::StrAppend(&text, " (", category.entries.size(), " ", entry_name_, + ")"); AppendTableRow(text, category.metric_sum, metric_sum); // Show the top entries in the category. @@ -178,15 +178,15 @@ void MetricTableReport::AppendCategoryTable() { } const int64 remaining_categories = categories.size() - categories_shown; if (remaining_categories > 0) { - AppendTableRow(tensorflow::strings::StrCat("... 
(", remaining_categories, - " more categories)"), - expected_metric_sum_ - metric_sum, expected_metric_sum_); + AppendTableRow( + absl::StrCat("... (", remaining_categories, " more categories)"), + expected_metric_sum_ - metric_sum, expected_metric_sum_); } } void MetricTableReport::AppendEntryTable() { - AppendLine("********** ", entry_name_, " table **********"); - AppendLine("The left hand side numbers are ", metric_name_, "."); + AppendLine("********** ", entry_name_, " table for ", metric_name_, + " **********"); AppendLine(); double metric_sum = UnaccountedMetric(); @@ -207,9 +207,9 @@ void MetricTableReport::AppendEntryTable() { } const int64 remaining_entries = entries_.size() - entries_shown; if (remaining_entries > 0) { - AppendTableRow(tensorflow::strings::StrCat("... (", remaining_entries, - " more ", entry_name_, ")"), - expected_metric_sum_ - metric_sum, expected_metric_sum_); + AppendTableRow( + absl::StrCat("... (", remaining_entries, " more ", entry_name_, ")"), + expected_metric_sum_ - metric_sum, expected_metric_sum_); } } @@ -242,10 +242,10 @@ double MetricTableReport::UnaccountedMetric() { string MetricTableReport::MetricString(double metric) { // Round to integer and stringify. - string s1 = tensorflow::strings::StrCat(std::llround(metric)); + string s1 = absl::StrCat(std::llround(metric)); // Code below commafies the string, e.g. "1234" becomes "1,234". - tensorflow::StringPiece sp1(s1); + absl::string_view sp1(s1); string output; // Copy leading non-digit characters unconditionally. // This picks up the leading sign. 
@@ -264,8 +264,7 @@ string MetricTableReport::MetricString(double metric) { } string MetricTableReport::MetricPercent(double metric) { - return tensorflow::strings::Printf("%5.2f%%", - metric / expected_metric_sum_ * 100.0); + return absl::StrFormat("%5.2f%%", metric / expected_metric_sum_ * 100.0); } } // namespace xla diff --git a/tensorflow/compiler/xla/metric_table_report.h b/tensorflow/compiler/xla/metric_table_report.h index 818fb1d3fe0b8bbe1a8eba363ff6445e2f3df9d2..062d8ed99b213535ad39d840aaaf10a6fe0da84c 100644 --- a/tensorflow/compiler/xla/metric_table_report.h +++ b/tensorflow/compiler/xla/metric_table_report.h @@ -18,9 +18,8 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { @@ -108,7 +107,7 @@ class MetricTableReport { // Append all parameters to the report. template void AppendLine(Args... args) { - tensorflow::strings::StrAppend(&report_, std::forward(args)..., "\n"); + absl::StrAppend(&report_, std::forward(args)..., "\n"); } // Represents a set of entries with the same category_text. diff --git a/tensorflow/compiler/xla/overflow_util.h b/tensorflow/compiler/xla/overflow_util.h new file mode 100644 index 0000000000000000000000000000000000000000..8657d3a4bfa992b9ca0619f24923fd4542eed894 --- /dev/null +++ b/tensorflow/compiler/xla/overflow_util.h @@ -0,0 +1,50 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_OVERFLOW_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_OVERFLOW_UTIL_H_ + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Multiply two nonnegative int64's, returning negative for overflow +inline int64 MultiplyWithoutOverflow(const int64 x, const int64 y) { + // Multiply in uint64 rather than int64 since signed overflow is undefined. + // Negative values will wrap around to large unsigned values in the casts + // (see section 4.7 [conv.integral] of the C++14 standard). + const uint64 ux = x; + const uint64 uy = y; + const uint64 uxy = ux * uy; + + // Check if we overflow uint64, using a cheap check if both inputs are small + if (TF_PREDICT_FALSE((ux | uy) >> 32 != 0)) { + // Ensure nonnegativity. Note that negative numbers will appear "large" + // to the unsigned comparisons above. + CHECK(x >= 0 && y >= 0); + + // Otherwise, detect overflow using a division + if (ux != 0 && uxy / ux != uy) return -1; + } + + // Cast back to signed. Any negative value will signal an error. + return static_cast(uxy); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_OVERFLOW_UTIL_H_ diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc index 857aae0a7982a57bb3057a6f267f5f033a0fdde4..f9473d372bb15058d7413e2ac8a303dd34322180 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.cc +++ b/tensorflow/compiler/xla/packed_literal_reader.cc @@ -19,15 +19,15 @@ limitations under the License. 
#include #include +#include "absl/base/casts.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -54,17 +54,17 @@ StatusOr> PackedLiteralReader::Read( if (shape.element_type() != F32) { return Unimplemented( "not yet implemented element type for packed literal reading: %s", - PrimitiveType_Name(shape.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } - auto result = MakeUnique(literal_shape); + auto result = absl::make_unique(literal_shape); result->PopulateWithValue(std::numeric_limits::quiet_NaN()); int64 elements = ShapeUtil::ElementsIn(shape); - tensorflow::gtl::ArraySlice field = result->data(); - char* data = tensorflow::bit_cast(field.data()); + absl::Span field = result->data(); + char* data = absl::bit_cast(field.data()); uint64 bytes = elements * sizeof(float); - tensorflow::StringPiece sp; + absl::string_view sp; auto s = file_->Read(offset_, bytes, &sp, data); offset_ += sp.size(); if (!s.ok()) { @@ -85,7 +85,7 @@ bool PackedLiteralReader::IsExhausted() const { // Try to read a single byte from offset_. If we can't, we've // exhausted the data. 
char single_byte[1]; - tensorflow::StringPiece sp; + absl::string_view sp; auto s = file_->Read(offset_, sizeof(single_byte), &sp, single_byte); return !s.ok(); } diff --git a/tensorflow/compiler/xla/packed_literal_reader.h b/tensorflow/compiler/xla/packed_literal_reader.h index 45a9fe012784d3e4168e7549240dec962aa1a17a..98dccaa9a246520bf60217b96d67a13a24c34b4a 100644 --- a/tensorflow/compiler/xla/packed_literal_reader.h +++ b/tensorflow/compiler/xla/packed_literal_reader.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc index 143c9a2366be5786b7ef2148580caeb97d67d2d8..b16147e3be71771269d8b7a18528bef3a8c72d99 100644 --- a/tensorflow/compiler/xla/primitive_util.cc +++ b/tensorflow/compiler/xla/primitive_util.cc @@ -85,5 +85,10 @@ PrimitiveType ComplexComponentType(PrimitiveType complex_type) { } } +bool IsArrayType(PrimitiveType primitive_type) { + return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE && + primitive_type != OPAQUE && primitive_type != TOKEN; +} + } // namespace primitive_util } // namespace xla diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index b26a10ade63a5dad3bf8f9f3a2a33c3c5e67bdb2..889e9a1ceca675689406d255d348c82c398563aa 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -133,6 +133,9 @@ bool IsUnsignedIntegralType(PrimitiveType type); bool IsIntegralType(PrimitiveType type); +// Returns true if values of the given primitive type are held in array shapes. +bool IsArrayType(PrimitiveType primitive_type); + // Returns the number of bits in the representation for a given type. 
int BitWidth(PrimitiveType type); diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h deleted file mode 100644 index bfcdfc62f9541ab09b94a48d5121e16bad4d43cd..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/ptr_util.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ -#define TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ - -// As this was moved to tensorflow/core/util, provide indirections here to -// maintain current functionality of the library. 
- -#include - -#include -#include -#include - -#include "tensorflow/core/util/ptr_util.h" - -namespace xla { -using tensorflow::MakeUnique; -using tensorflow::WrapUnique; -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 83834c1ff65ea2f9989fe08279c29056d9070adb..f0d84646b9f01ad3ad209073f13b7b3ec21635d1 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -33,11 +33,15 @@ cc_library( srcs = ["numpy_bridge.cc"], hdrs = ["numpy_bridge.h"], deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/python:numpy_lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -52,12 +56,14 @@ cc_library( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", - "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:framework_lite", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -70,7 +76,7 @@ tf_py_wrap_cc( deps = [ ":local_computation_builder", ":numpy_bridge", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:cpu_plugin", diff --git 
a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index f808990cadeab5fd2c4857920ee1daaac7262edd..cd6e20b69366c064e20c6e0a7d1aebe6229690d8 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -14,13 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/python/local_computation_builder.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/executable_run_options.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/thread_annotations.h" namespace xla { - namespace swig { // TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of @@ -97,6 +98,36 @@ const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const { return &shaped_buffer_; } +ShapedBuffer LocalShapedBuffer::Release() { return shaped_buffer_.release(); } + +LocalShapedBufferTuple::LocalShapedBufferTuple( + std::vector elements) + : elements_(std::move(elements)) { + for (auto* element : elements_) { + DCHECK(element != nullptr); + } +} + +LocalShapedBufferTuple::~LocalShapedBufferTuple() { + for (LocalShapedBuffer* element : elements_) { + if (element != nullptr) { + delete element; + } + } +} + +StatusOr LocalShapedBufferTuple::Release(int i) { + LocalShapedBuffer* element = elements_[i]; + if (element == nullptr) { + return InvalidArgument("Attempted to release already-released element %d.", + i); + } + elements_[i] = nullptr; + return element; +} + +int LocalShapedBufferTuple::size() const { return elements_.size(); } + static StatusOr ToBuffer(LocalClient* client, int device_ordinal, const Literal& arg) { @@ -106,8 +137,7 
@@ static StatusOr ToBuffer(LocalClient* client, /* static */ StatusOr LocalShapedBuffer::FromLiteral( - const Literal& argument, - const tensorflow::gtl::optional& shape_with_layout) { + const Literal& argument, const absl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); StatusOr buf = [&] { if (shape_with_layout) { @@ -132,7 +162,7 @@ CompiledLocalComputation::CompiledLocalComputation( StatusOr> CompiledLocalComputation::Execute( const std::vector& arguments, - const std::vector>& shapes_with_layout) { + const std::vector>& shapes_with_layout) { LocalClient* client = GetOrCreateLocalClient(); VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas."; @@ -145,73 +175,73 @@ StatusOr> CompiledLocalComputation::Execute( GetReplicaCount()); for (int replica = 0; replica < GetReplicaCount(); ++replica) { - pool.Schedule([this, client, replica, &arguments, &shapes_with_layout, - &results] { - StatusOr device_ordinal_status = - client->ReplicaNumberToDeviceOrdinal(replica); - if (!device_ordinal_status.ok()) { - results[replica] = device_ordinal_status.status(); - return; - } - const int device_ordinal = device_ordinal_status.ValueOrDie(); - VLOG(3) << "Replica " << replica - << " mapped to device ordinal for execution: " - << device_ordinal; - - // Transfer arguments in - std::vector scoped_buffers; - scoped_buffers.reserve(arguments.size()); - for (int i = 0; i < arguments.size(); ++i) { - const Literal& argument = arguments[i]; - const tensorflow::gtl::optional& shape_with_layout = - shapes_with_layout[i]; - - StatusOr pushed; - if (shape_with_layout) { - std::unique_ptr relaid = - argument.Relayout(shape_with_layout.value()); - pushed = ToBuffer(client, device_ordinal, *relaid); - } else { - pushed = ToBuffer(client, device_ordinal, argument); - } - if (!pushed.ok()) { - results[replica] = pushed.status(); - return; - } - - scoped_buffers.push_back(std::move(pushed).ValueOrDie()); - } - - // Execute - std::vector 
argument_buffers; - argument_buffers.reserve(scoped_buffers.size()); - for (auto& buffer : scoped_buffers) { - argument_buffers.push_back(&buffer); - } - - DeviceAssignment device_assignment = - client->backend() - .computation_placer() - ->AssignDevices(GetReplicaCount(), /*computation_count=*/1) - .ConsumeValueOrDie(); - - ExecutableRunOptions options; - options.set_device_ordinal(device_ordinal); - options.set_allocator(client->backend().memory_allocator()); - options.set_intra_op_thread_pool( - client->backend().eigen_intra_op_thread_pool_device()); - options.set_device_assignment(&device_assignment); - StatusOr result_buffer_status = - executable_->Run(argument_buffers, options); - if (!result_buffer_status.ok()) { - results[replica] = result_buffer_status.status(); - return; - } - - // Transfer result out - results[replica] = client->ShapedBufferToLiteral( - std::move(result_buffer_status).ValueOrDie()); - }); + pool.Schedule( + [this, client, replica, &arguments, &shapes_with_layout, &results] { + StatusOr device_ordinal_status = + client->ReplicaNumberToDeviceOrdinal(replica); + if (!device_ordinal_status.ok()) { + results[replica] = device_ordinal_status.status(); + return; + } + const int device_ordinal = device_ordinal_status.ValueOrDie(); + VLOG(3) << "Replica " << replica + << " mapped to device ordinal for execution: " + << device_ordinal; + + // Transfer arguments in + std::vector scoped_buffers; + scoped_buffers.reserve(arguments.size()); + for (int i = 0; i < arguments.size(); ++i) { + const Literal& argument = arguments[i]; + const absl::optional& shape_with_layout = + shapes_with_layout[i]; + + StatusOr pushed; + if (shape_with_layout) { + std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + pushed = ToBuffer(client, device_ordinal, *relaid); + } else { + pushed = ToBuffer(client, device_ordinal, argument); + } + if (!pushed.ok()) { + results[replica] = pushed.status(); + return; + } + + 
scoped_buffers.push_back(std::move(pushed).ValueOrDie()); + } + + // Execute + std::vector argument_buffers; + argument_buffers.reserve(scoped_buffers.size()); + for (auto& buffer : scoped_buffers) { + argument_buffers.push_back(&buffer); + } + + DeviceAssignment device_assignment = + client->backend() + .computation_placer() + ->AssignDevices(GetReplicaCount(), /*computation_count=*/1) + .ConsumeValueOrDie(); + + ExecutableRunOptions options; + options.set_device_ordinal(device_ordinal); + options.set_allocator(client->backend().memory_allocator()); + options.set_intra_op_thread_pool( + client->backend().eigen_intra_op_thread_pool_device()); + options.set_device_assignment(&device_assignment); + StatusOr result_buffer_status = + executable_->Run(argument_buffers, options); + if (!result_buffer_status.ok()) { + results[replica] = result_buffer_status.status(); + return; + } + + // Transfer result out + results[replica] = client->ShapedBufferToLiteral( + std::move(result_buffer_status).ValueOrDie()); + }); } } @@ -221,7 +251,7 @@ StatusOr> CompiledLocalComputation::Execute( return InternalError( "Failed running replica %d (other replicas may have failed as well): " "%s.", - replica, statusor.status().ToString().c_str()); + replica, statusor.status().ToString()); } } @@ -229,7 +259,7 @@ StatusOr> CompiledLocalComputation::Execute( } LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( - tensorflow::gtl::ArraySlice argument_handles) { + absl::Span argument_handles) { LocalClient* client = GetOrCreateLocalClient(); std::vector argument_buffers; @@ -312,14 +342,11 @@ StatusOr LocalComputationBuilder::Build() { LocalOp LocalComputationBuilder::Parameter(int64 parameter_number, const Shape& shape, const string& name) { - return builder_.Parameter(parameter_number, shape, name); + return xla::Parameter(&builder_, parameter_number, shape, name); } -std::unique_ptr LocalComputationBuilder::GetShape( - const LocalOp& operand) { - auto result = MakeUnique(); 
- *result = builder_.GetShape(operand.op()).ValueOrDie(); - return result; +StatusOr LocalComputationBuilder::GetShape(const LocalOp& operand) { + return builder_.GetShape(operand.op()); } StatusOr LocalComputationBuilder::GetReturnValueShape() { @@ -328,196 +355,185 @@ StatusOr LocalComputationBuilder::GetReturnValueShape() { } LocalOp LocalComputationBuilder::Infeed(const Shape& shape) { - return builder_.Infeed(shape); + return xla::Infeed(&builder_, shape); } void LocalComputationBuilder::Outfeed(const LocalOp& operand, const Shape& shape, const string& outfeed_config) { - builder_.Outfeed(operand.op(), shape, outfeed_config); + xla::Outfeed(operand.op(), shape, outfeed_config); } LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) { - return builder_.ConstantLiteral(literal); + return xla::ConstantLiteral(&builder_, literal); } LocalOp LocalComputationBuilder::Broadcast( - const LocalOp& operand, - tensorflow::gtl::ArraySlice broadcast_sizes) { - return builder_.Broadcast(operand.op(), broadcast_sizes); + const LocalOp& operand, absl::Span broadcast_sizes) { + return xla::Broadcast(operand.op(), broadcast_sizes); } LocalOp LocalComputationBuilder::Pad(const LocalOp& operand, const LocalOp& padding_value, const PaddingConfig& padding_config) { - return builder_.Pad(operand.op(), padding_value.op(), padding_config); + return xla::Pad(operand.op(), padding_value.op(), padding_config); } -LocalOp LocalComputationBuilder::Reshape( - const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes) { - return builder_.Reshape(operand.op(), dimensions, new_sizes); +LocalOp LocalComputationBuilder::Reshape(const LocalOp& operand, + absl::Span dimensions, + absl::Span new_sizes) { + return xla::Reshape(operand.op(), dimensions, new_sizes); } -LocalOp LocalComputationBuilder::Collapse( - const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions) { - return builder_.Collapse(operand.op(), dimensions); 
+LocalOp LocalComputationBuilder::Collapse(const LocalOp& operand, + absl::Span dimensions) { + return xla::Collapse(operand.op(), dimensions); } LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) { - return builder_.CrossReplicaSum(operand.op()); + return xla::CrossReplicaSum(operand.op()); } -LocalOp LocalComputationBuilder::Slice( - const LocalOp& operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides) { - return builder_.Slice(operand.op(), start_indices, limit_indices, strides); +LocalOp LocalComputationBuilder::Slice(const LocalOp& operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) { + return xla::Slice(operand.op(), start_indices, limit_indices, strides); } LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno) { - return builder_.SliceInDim(operand.op(), start_index, limit_index, stride, - dimno); + return xla::SliceInDim(operand.op(), start_index, limit_index, stride, dimno); } LocalOp LocalComputationBuilder::DynamicSlice( const LocalOp& operand, const LocalOp& start_indices, - tensorflow::gtl::ArraySlice slice_sizes) { - return builder_.DynamicSlice(operand.op(), start_indices.op(), slice_sizes); + absl::Span slice_sizes) { + return xla::DynamicSlice(operand.op(), start_indices.op(), slice_sizes); } LocalOp LocalComputationBuilder::DynamicUpdateSlice( const LocalOp& operand, const LocalOp& update, const LocalOp& start_indices) { - return builder_.DynamicUpdateSlice(operand.op(), update.op(), - start_indices.op()); + return xla::DynamicUpdateSlice(operand.op(), update.op(), start_indices.op()); } -LocalOp LocalComputationBuilder::ConcatInDim( - tensorflow::gtl::ArraySlice operands, int64 dimension) { +LocalOp LocalComputationBuilder::ConcatInDim(absl::Span operands, + int64 dimension) { std::vector xla_ops; xla_ops.reserve(operands.size()); 
for (const auto& op : operands) { xla_ops.push_back(op.op()); } - return builder_.ConcatInDim(xla_ops, dimension); + return xla::ConcatInDim(&builder_, xla_ops, dimension); } LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding( const LocalOp& operand, const LocalComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const LocalOp& source, const LocalOp& init_value, - const LocalComputation& scatter) { - return builder_.SelectAndScatterWithGeneralPadding( + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding, const LocalOp& source, + const LocalOp& init_value, const LocalComputation& scatter) { + return xla::SelectAndScatterWithGeneralPadding( operand.op(), select.computation(), window_dimensions, window_strides, padding, source.op(), init_value.op(), scatter.computation()); } -LocalOp LocalComputationBuilder::Tuple( - tensorflow::gtl::ArraySlice elements) { +LocalOp LocalComputationBuilder::Tuple(absl::Span elements) { std::vector xla_ops; xla_ops.reserve(elements.size()); for (const auto& op : elements) { xla_ops.push_back(op.op()); } - return builder_.Tuple(xla_ops); + return xla::Tuple(&builder_, xla_ops); } LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data, int64 index) { - return builder_.GetTupleElement(tuple_data.op(), index); + return xla::GetTupleElement(tuple_data.op(), index); } LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) { - return builder_.Dot(lhs.op(), rhs.op()); + return xla::Dot(lhs.op(), rhs.op()); } LocalOp LocalComputationBuilder::DotGeneral( const LocalOp& lhs, const LocalOp& rhs, const DotDimensionNumbers& dimension_numbers) { - return builder_.DotGeneral(lhs.op(), rhs.op(), dimension_numbers); + return xla::DotGeneral(lhs.op(), rhs.op(), dimension_numbers); } LocalOp LocalComputationBuilder::ConvGeneralDilated( const LocalOp& lhs, const 
LocalOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, const ConvolutionDimensionNumbers& dimension_numbers) { - return builder_.ConvGeneralDilated(lhs.op(), rhs.op(), window_strides, - padding, lhs_dilation, rhs_dilation, - dimension_numbers); + return xla::ConvGeneralDilated(lhs.op(), rhs.op(), window_strides, padding, + lhs_dilation, rhs_dilation, dimension_numbers); } LocalOp LocalComputationBuilder::ConvertElementType( const LocalOp& operand, PrimitiveType new_element_type) { - return builder_.ConvertElementType(operand.op(), new_element_type); + return xla::ConvertElementType(operand.op(), new_element_type); } -LocalOp LocalComputationBuilder::Call( - const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice operands) { +LocalOp LocalComputationBuilder::BitcastConvertType( + const LocalOp& operand, PrimitiveType new_element_type) { + return xla::BitcastConvertType(operand.op(), new_element_type); +} + +LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation, + absl::Span operands) { std::vector xla_ops; xla_ops.reserve(operands.size()); for (const auto& op : operands) { xla_ops.push_back(op.op()); } - return builder_.Call(local_computation.computation(), xla_ops); + return xla::Call(&builder_, local_computation.computation(), xla_ops); } LocalOp LocalComputationBuilder::Transpose( - const LocalOp& operand, tensorflow::gtl::ArraySlice permutation) { - return builder_.Transpose(operand.op(), permutation); + const LocalOp& operand, absl::Span permutation) { + return xla::Transpose(operand.op(), permutation); } -LocalOp LocalComputationBuilder::Rev( - const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions) { - return builder_.Rev(operand.op(), dimensions); +LocalOp 
LocalComputationBuilder::Rev(const LocalOp& operand, + absl::Span dimensions) { + return xla::Rev(operand.op(), dimensions); } -LocalOp LocalComputationBuilder::Map( - tensorflow::gtl::ArraySlice operands, - const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands) { +LocalOp LocalComputationBuilder::Map(absl::Span operands, + const LocalComputation& local_computation, + absl::Span dimensions) { std::vector xla_ops; xla_ops.reserve(operands.size()); for (const auto& op : operands) { xla_ops.push_back(op.op()); } - std::vector static_xla_ops; - static_xla_ops.reserve(static_operands.size()); - for (const auto& op : static_operands) { - static_xla_ops.push_back(op.op()); - } - - return builder_.Map(xla_ops, local_computation.computation(), dimensions, - static_xla_ops); + return xla::Map(&builder_, xla_ops, local_computation.computation(), + dimensions); } LocalOp LocalComputationBuilder::Reduce( const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce) { - return builder_.Reduce(operand.op(), init_value.op(), - local_computation.computation(), dimensions_to_reduce); + absl::Span dimensions_to_reduce) { + return xla::Reduce(operand.op(), init_value.op(), + local_computation.computation(), dimensions_to_reduce); } LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding( const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding) { - return builder_.ReduceWindowWithGeneralPadding( + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span> padding) { + return xla::ReduceWindowWithGeneralPadding( operand.op(), init_value.op(), local_computation.computation(), window_dimensions, window_strides, padding); } @@ -525,33 
+541,43 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding( LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu, const LocalOp& sigma, const Shape& shape) { - return builder_.RngNormal(mu.op(), sigma.op(), shape); + return xla::RngNormal(mu.op(), sigma.op(), shape); } LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape) { - return builder_.RngUniform(a.op(), b.op(), shape); + return xla::RngUniform(a.op(), b.op(), shape); } LocalOp LocalComputationBuilder::While(const LocalComputation& condition, const LocalComputation& body, const LocalOp& init) { - return builder_.While(condition.computation(), body.computation(), init.op()); + return xla::While(condition.computation(), body.computation(), init.op()); } LocalOp LocalComputationBuilder::Conditional( const LocalOp& predicate, const LocalOp& true_operand, const LocalComputation& true_computation, const LocalOp& false_operand, const LocalComputation& false_computation) { - return builder_.Conditional( - predicate.op(), true_operand.op(), true_computation.computation(), - false_operand.op(), false_computation.computation()); + return xla::Conditional(predicate.op(), true_operand.op(), + true_computation.computation(), false_operand.op(), + false_computation.computation()); } StatusOr LocalComputationBuilder::IsConstant(const LocalOp& operand) { return builder_.IsConstant(operand.op()); } +LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) { + return xla::Sort(operand.op(), absl::nullopt, dimension); +} + +LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys, + const LocalOp& values, + int64 dimension) { + return xla::Sort(keys.op(), values.op(), dimension); +} + StatusOr LocalComputationBuilder::BuildConstantSubGraph( const LocalOp& operand) { TF_ASSIGN_OR_RETURN(XlaComputation computation, @@ -561,16 +587,16 @@ StatusOr LocalComputationBuilder::BuildConstantSubGraph( #define _FORWARD(method_name, return_sig, 
args_sig, args) \ return_sig LocalComputationBuilder::method_name args_sig { \ - return builder_.method_name args; \ + return xla::method_name args; \ } #define _FORWARD_UNOP(method_name) \ _FORWARD(method_name, LocalOp, (const LocalOp& operand), (operand.op())) -#define _FORWARD_BINOP(method_name) \ - _FORWARD(method_name, LocalOp, \ - (const LocalOp& lhs, const LocalOp& rhs, \ - tensorflow::gtl::ArraySlice broadcast_dimensions), \ +#define _FORWARD_BINOP(method_name) \ + _FORWARD(method_name, LocalOp, \ + (const LocalOp& lhs, const LocalOp& rhs, \ + absl::Span broadcast_dimensions), \ (lhs.op(), rhs.op(), broadcast_dimensions)) #define _FORWARD_TRIOP(method_name) \ @@ -595,24 +621,49 @@ _FORWARD_BINOP(Max) _FORWARD_BINOP(Min) _FORWARD_BINOP(And) _FORWARD_BINOP(Or) +_FORWARD_BINOP(Xor) +_FORWARD_BINOP(ShiftLeft) +_FORWARD_BINOP(ShiftRightArithmetic) +_FORWARD_BINOP(ShiftRightLogical) +_FORWARD_BINOP(Atan2) +_FORWARD_BINOP(Pow) +_FORWARD_BINOP(Complex) _FORWARD_UNOP(Not) _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) +_FORWARD_UNOP(Expm1) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) _FORWARD_UNOP(Round) _FORWARD_UNOP(Log) +_FORWARD_UNOP(Log1p) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) _FORWARD_UNOP(Sin) _FORWARD_UNOP(Tanh) -_FORWARD_UNOP(SqrtF32) -_FORWARD_UNOP(SquareF32) -_FORWARD_BINOP(Pow) _FORWARD_UNOP(IsFinite) -_FORWARD_UNOP(ReciprocalF32) _FORWARD_UNOP(Neg) -_FORWARD_UNOP(Sort) +_FORWARD_UNOP(Sqrt) +_FORWARD_UNOP(Rsqrt) +_FORWARD_UNOP(Square) +_FORWARD_UNOP(Reciprocal) +_FORWARD_UNOP(Erfc) +_FORWARD_UNOP(Erf) +_FORWARD_UNOP(ErfInv) +_FORWARD_UNOP(Lgamma) +_FORWARD_UNOP(Digamma) +_FORWARD_UNOP(Acos) +_FORWARD_UNOP(Asin) +_FORWARD_UNOP(Atan) +_FORWARD_UNOP(Tan) +_FORWARD_UNOP(Acosh) +_FORWARD_UNOP(Asinh) +_FORWARD_UNOP(Atanh) +_FORWARD_UNOP(Cosh) +_FORWARD_UNOP(Sinh) +_FORWARD_UNOP(Real) +_FORWARD_UNOP(Imag) +_FORWARD_UNOP(Conj) #undef _FORWARD #undef _FORWARD_UNOP @@ -631,6 +682,53 @@ void DeleteLocalComputation(LocalComputation* computation) { delete computation; } -} // 
namespace swig +StatusOr DestructureLocalShapedBufferTuple( + LocalShapedBuffer* local_shaped_buffer) { + if (!ShapeUtil::IsTuple( + local_shaped_buffer->shaped_buffer()->on_device_shape())) { + return InvalidArgument( + "Attemped to destructure a LocalShapedBuffer that did not have a tuple " + "shape; shape: %s", + ShapeUtil::HumanString( + local_shaped_buffer->shaped_buffer()->on_device_shape())); + } + DeviceMemoryAllocator* allocator = + local_shaped_buffer->shaped_buffer()->memory_allocator(); + ShapedBuffer tuple_buffer = local_shaped_buffer->Release(); + + // Extract some metadata we use to construct scoped buffers. + const se::Platform* platform = tuple_buffer.platform(); + int device_ordinal = tuple_buffer.device_ordinal(); + + ShapeTree& shape_tree = tuple_buffer.buffers(); + const Shape& tuple_shape = tuple_buffer.on_device_shape(); + std::vector results; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) { + // Create a shaped buffer for this destructured tuple element. + const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i}); + VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape; + ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal); + + ShapeUtil::ForEachSubshape( + subshape, [&](const Shape& s, const ShapeIndex& index) { + ShapeIndex original(index); + original.push_front(i); + se::DeviceMemoryBase* device_memory = + shape_tree.mutable_element(original); + shaped_buffer.set_buffer(*device_memory, index); + *device_memory = se::DeviceMemoryBase(); + }); + + VLOG(3) << "Completed tuple element: " << i; + results.push_back(new LocalShapedBuffer( + ScopedShapedBuffer(std::move(shaped_buffer), allocator))); + } + // Deallocate the root buffer. 
+ se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer(); + TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer)); + return new LocalShapedBufferTuple(std::move(results)); +} + +} // namespace swig } // namespace xla diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 9ac13b65231c932f152c1e79eb8e576cc6331fbd..78b3c598b97294d2ba4deb72ec9c1251ef68b7cf 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -16,17 +16,16 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_ #define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_ +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { - namespace swig { // Initializes the number of replicas that XLA will be initialized with (when @@ -61,18 +60,49 @@ StatusOr > TransferFromOutfeedLocalReplica( class LocalShapedBuffer { public: static StatusOr FromLiteral( - const Literal& argument, - const tensorflow::gtl::optional& shape_with_layout); + const Literal& argument, const absl::optional& shape_with_layout); LocalShapedBuffer(ScopedShapedBuffer shaped_buffer); const ScopedShapedBuffer* shaped_buffer() const; StatusOr > ToLiteral() const; + // Transfers ownership of the encapsulated ShapedBuffer 
to the caller, + // analogous to std::unique_ptr::release(). + ShapedBuffer Release(); + private: ScopedShapedBuffer shaped_buffer_; }; +// Result of a tuple destructuring operation on a LocalShapedBuffer -- this +// appears to be a simpler mechanism for the time being than an alternative like +// using SWIG to transform std::vectors into Python lists of SWIG objects +// directly. +class LocalShapedBufferTuple { + public: + // Note: any LocalShapedBuffer elements that are not Release()'d will be + // deallocated in the destructor. + explicit LocalShapedBufferTuple(std::vector elements); + + ~LocalShapedBufferTuple(); + + // Releases the ith element to the caller. Further attempts to release the ith + // element will return an invalid argument error. + StatusOr Release(int i); + + // Returns the number of elements in the destructured tuple. + int size() const; + + private: + std::vector elements_; +}; + +// Destructures a tuple-valued LocalShapedBuffer into its constitutent elements +// in LocalShapedBufferTuple form. +StatusOr DestructureLocalShapedBufferTuple( + LocalShapedBuffer* local_shaped_buffer); + // Wraps a LocalExecutable produced by compiling a // LocalComputation. The Execute method forwards to that of the // underlying LocalExecutable, and additionally handles tranferring @@ -89,10 +119,10 @@ class CompiledLocalComputation { // shapes_with_layout. StatusOr > Execute( const std::vector& arguments, - const std::vector >& shapes_with_layout); + const std::vector >& shapes_with_layout); LocalShapedBuffer* ExecuteWithShapedBuffers( - tensorflow::gtl::ArraySlice argument_handles); + absl::Span argument_handles); private: std::unique_ptr executable_; @@ -156,7 +186,7 @@ class LocalComputationBuilder { LocalOp Parameter(int64 parameter_number, const Shape& shape, const string& name); - std::unique_ptr GetShape(const LocalOp& operand); + StatusOr GetShape(const LocalOp& operand); // Returns the shape of the current return value for the computation. 
StatusOr GetReturnValueShape(); @@ -169,46 +199,41 @@ class LocalComputationBuilder { LocalOp ConstantLiteral(const Literal& literal); LocalOp Broadcast(const LocalOp& operand, - tensorflow::gtl::ArraySlice broadcast_sizes); + absl::Span broadcast_sizes); LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value, const PaddingConfig& padding_config); - LocalOp Reshape(const LocalOp& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes); + LocalOp Reshape(const LocalOp& operand, absl::Span dimensions, + absl::Span new_sizes); - LocalOp Collapse(const LocalOp& operand, - tensorflow::gtl::ArraySlice dimensions); + LocalOp Collapse(const LocalOp& operand, absl::Span dimensions); LocalOp CrossReplicaSum(const LocalOp& operand); - LocalOp Slice(const LocalOp& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); + LocalOp Slice(const LocalOp& operand, absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); LocalOp SliceInDim(const LocalOp& operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); LocalOp DynamicSlice(const LocalOp& operand, const LocalOp& start_indices, - tensorflow::gtl::ArraySlice slice_sizes); + absl::Span slice_sizes); LocalOp DynamicUpdateSlice(const LocalOp& operand, const LocalOp& update, const LocalOp& start_indices); - LocalOp ConcatInDim(tensorflow::gtl::ArraySlice operands, - int64 dimension); + LocalOp ConcatInDim(absl::Span operands, int64 dimension); LocalOp SelectAndScatterWithGeneralPadding( const LocalOp& operand, const LocalComputation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice > padding, - const LocalOp& source, const LocalOp& init_value, - const LocalComputation& scatter); + absl::Span window_dimensions, + absl::Span window_strides, + absl::Span > padding, const LocalOp& source, + 
const LocalOp& init_value, const LocalComputation& scatter); - LocalOp Tuple(tensorflow::gtl::ArraySlice elements); + LocalOp Tuple(absl::Span elements); LocalOp GetTupleElement(const LocalOp& tuple_data, int64 index); @@ -219,39 +244,40 @@ class LocalComputationBuilder { LocalOp ConvGeneralDilated( const LocalOp& lhs, const LocalOp& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice > padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, + absl::Span window_strides, + absl::Span > padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, const ConvolutionDimensionNumbers& dimension_numbers); LocalOp ConvertElementType(const LocalOp& operand, PrimitiveType new_element_type); + LocalOp BitcastConvertType(const LocalOp& operand, + PrimitiveType new_element_type); + LocalOp Call(const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice operands); + absl::Span operands); LocalOp Transpose(const LocalOp& operand, - tensorflow::gtl::ArraySlice permutation); + absl::Span permutation); - LocalOp Rev(const LocalOp& operand, - tensorflow::gtl::ArraySlice dimensions); + LocalOp Rev(const LocalOp& operand, absl::Span dimensions); - LocalOp Map(tensorflow::gtl::ArraySlice operands, + LocalOp Map(absl::Span operands, const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands); + absl::Span dimensions); LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce); + absl::Span dimensions_to_reduce); LocalOp ReduceWindowWithGeneralPadding( const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice > padding); + absl::Span window_dimensions, + absl::Span 
window_strides, + absl::Span > padding); LocalOp RngNormal(const LocalOp& mu, const LocalOp& sigma, const Shape& shape); @@ -268,6 +294,11 @@ class LocalComputationBuilder { StatusOr IsConstant(const LocalOp& operand); + LocalOp Sort(const LocalOp& operand, int64 dimension); + + LocalOp SortKeyVal(const LocalOp& keys, const LocalOp& values, + int64 dimension); + StatusOr BuildConstantSubGraph(const LocalOp& operand); #define _FORWARD(method_name, return_sig, args_sig) \ @@ -279,7 +310,7 @@ class LocalComputationBuilder { #define _FORWARD_BINOP(method_name) \ _FORWARD(method_name, LocalOp, \ (const LocalOp& lhs, const LocalOp& rhs, \ - tensorflow::gtl::ArraySlice broadcast_dimensions)) + absl::Span broadcast_dimensions)) #define _FORWARD_TRIOP(method_name) \ _FORWARD(method_name, LocalOp, \ @@ -302,24 +333,49 @@ class LocalComputationBuilder { _FORWARD_BINOP(Min) _FORWARD_BINOP(And) _FORWARD_BINOP(Or) + _FORWARD_BINOP(Xor) + _FORWARD_BINOP(ShiftLeft) + _FORWARD_BINOP(ShiftRightArithmetic) + _FORWARD_BINOP(ShiftRightLogical) + _FORWARD_BINOP(Atan2) + _FORWARD_BINOP(Pow) + _FORWARD_BINOP(Complex) _FORWARD_UNOP(Not) _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) + _FORWARD_UNOP(Expm1) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) _FORWARD_UNOP(Round) _FORWARD_UNOP(Log) + _FORWARD_UNOP(Log1p) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) _FORWARD_UNOP(Sin) _FORWARD_UNOP(Tanh) - _FORWARD_UNOP(SqrtF32) - _FORWARD_UNOP(SquareF32) - _FORWARD_BINOP(Pow) _FORWARD_UNOP(IsFinite) - _FORWARD_UNOP(ReciprocalF32) _FORWARD_UNOP(Neg) - _FORWARD_UNOP(Sort) + _FORWARD_UNOP(Sqrt) + _FORWARD_UNOP(Rsqrt) + _FORWARD_UNOP(Square) + _FORWARD_UNOP(Reciprocal) + _FORWARD_UNOP(Erfc) + _FORWARD_UNOP(Erf) + _FORWARD_UNOP(ErfInv) + _FORWARD_UNOP(Lgamma) + _FORWARD_UNOP(Digamma) + _FORWARD_UNOP(Acos) + _FORWARD_UNOP(Asin) + _FORWARD_UNOP(Atan) + _FORWARD_UNOP(Tan) + _FORWARD_UNOP(Acosh) + _FORWARD_UNOP(Asinh) + _FORWARD_UNOP(Atanh) + _FORWARD_UNOP(Cosh) + _FORWARD_UNOP(Sinh) + _FORWARD_UNOP(Real) + _FORWARD_UNOP(Imag) 
+ _FORWARD_UNOP(Conj) #undef _FORWARD #undef _FORWARD_UNOP @@ -336,7 +392,6 @@ void DeleteCompiledLocalComputation(CompiledLocalComputation* computation); void DeleteLocalComputation(LocalComputation* computation); } // namespace swig - } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_ diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 51412ca4744728c5f9b26e47cbe2dde29de548c3..76c09512d82006af35e2508ce8e60f23a4c056c3 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -22,15 +22,15 @@ limitations under the License. // // C++ Python // -------------------------------------+--------------------------------------- -// ArraySlice <- sequence of int -// ArraySlice <- sequence of LocalOp +// Span <- sequence of int +// Span <- sequence of LocalOp // Literal <-> (nested tuple of) numpy ndarray // std::vector <- sequence of (nested tuple of) ndarray // Shape -> pair holding (dtype, dimensions) // <- object duck-typed as xla_client.Shape // std::vector <- sequence of xla_client.Shape objects // PrimitiveType <- int -// ArraySlice> <- sequence of int pairs +// Span> <- sequence of int pairs // PaddingConfig proto <- corresponding Python proto // ConvolutionDimensionNumbers proto <- corresponding Python proto // DotDimensionNumbers proto <- corresponding Python proto @@ -109,10 +109,12 @@ limitations under the License. 
// Must be included first #include "tensorflow/python/lib/core/numpy.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "third_party/absl/strings/str_cat.h" +#include "third_party/absl/strings/str_format.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" +#include "third_party/absl/types/span.h" #include "tensorflow/compiler/xla/python/numpy_bridge.h" #include "tensorflow/compiler/xla/python/local_computation_builder.h" @@ -154,8 +156,8 @@ bool HandleStringAttribute(PyObject* o, return true; // The attribute is None, which we consider ok. } if (!PyString_Check(attr)) { - string message = tensorflow::strings::Printf("%s must be a string or none; got %s", - attr_name, numpy::PyObjectCppRepr(attr).c_str()); + string message = absl::StrFormat("%s must be a string or none; got %s", + attr_name, numpy::PyObjectCppRepr(attr)); PyErr_SetString(PyExc_TypeError, message.c_str()); Py_DECREF(attr); return false; // Type error, not ok. 
@@ -200,6 +202,20 @@ tensorflow::ImportNumpy(); } } +%typemap(out) StatusOr { + if ($1.ok()) { + auto* value = $1.ValueOrDie(); + { + auto* $1 = value; + $typemap(out, xla::swig::LocalShapedBufferTuple*) + } + } else { + PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str()); + SWIG_fail; + } +} + + %typemap(out) StatusOr< std::unique_ptr > { if ($1.ok()) { std::unique_ptr value = $1.ConsumeValueOrDie(); @@ -251,9 +267,9 @@ tensorflow::ImportNumpy(); $result = Py_None; } -// ArraySlice +// Span -%typemap(in) tensorflow::gtl::ArraySlice +%typemap(in) absl::Span (std::vector temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); @@ -283,9 +299,9 @@ tensorflow::ImportNumpy(); $1 = temps; } -// ArraySlice +// Span -%typemap(in) tensorflow::gtl::ArraySlice( +%typemap(in) absl::Span( std::vector temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); @@ -307,7 +323,7 @@ tensorflow::ImportNumpy(); // LocalShapedBuffer* -%typemap(in) tensorflow::gtl::ArraySlice +%typemap(in) absl::Span (std::vector temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); @@ -395,10 +411,10 @@ tensorflow::ImportNumpy(); $1 = &temp; } -%typemap(in) const tensorflow::gtl::optional& ( - tensorflow::gtl::optional temp) { +%typemap(in) const absl::optional& ( + absl::optional temp) { if ($input == Py_None) { - temp = tensorflow::gtl::nullopt; + temp = absl::nullopt; $1 = &temp; } else { StatusOr statusor = numpy::XlaShapeFromPyShape($input); @@ -434,8 +450,8 @@ tensorflow::ImportNumpy(); $1 = &temps; } -%typemap(in) const std::vector >& ( - std::vector > temps) { +%typemap(in) const std::vector >& ( + std::vector > temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); SWIG_fail; @@ -444,7 +460,7 @@ tensorflow::ImportNumpy(); for (int i = 0; i < size; ++i) { PyObject* o = 
PySequence_GetItem($input, i); if (o == Py_None) { - temps.push_back(tensorflow::gtl::nullopt); + temps.push_back(absl::nullopt); } else { StatusOr statusor = numpy::XlaShapeFromPyShape(o); Py_DECREF(o); @@ -480,9 +496,9 @@ tensorflow::ImportNumpy(); $1 = static_cast(value); } -// ArraySlice> +// Span> -%typemap(in) tensorflow::gtl::ArraySlice > +%typemap(in) absl::Span > (std::vector > temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); @@ -851,6 +867,11 @@ tensorflow::ImportNumpy(); })) { return nullptr; } + if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) { + build_options.set_dump_unoptimized_hlo_proto_to(std::move(s)); + })) { + return nullptr; + } if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) { build_options.set_dump_per_pass_hlo_proto_to(std::move(s)); })) { @@ -877,7 +898,7 @@ tensorflow::ImportNumpy(); if (o != Py_None) { StatusOr statusor = numpy::XlaShapeFromPyShape(o); if (!statusor.ok()) { - PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str()); + PyErr_SetString(PyExc_TypeError, absl::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str()); Py_DECREF(o); SWIG_fail; } @@ -900,6 +921,9 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalShapedBuffer; %unignore xla::swig::LocalShapedBuffer::FromLiteral; %unignore xla::swig::LocalShapedBuffer::ToLiteral; +%unignore xla::swig::LocalShapedBufferTuple; +%unignore xla::swig::LocalShapedBufferTuple::Release; +%unignore xla::swig::LocalShapedBufferTuple::size; %unignore xla::swig::CompiledLocalComputation; %unignore xla::swig::CompiledLocalComputation::Execute; %unignore xla::swig::CompiledLocalComputation::ExecuteWithShapedBuffers; @@ -935,6 +959,7 @@ tensorflow::ImportNumpy(); 
%unignore xla::swig::LocalComputationBuilder::Tuple; %unignore xla::swig::LocalComputationBuilder::GetTupleElement; %unignore xla::swig::LocalComputationBuilder::ConvertElementType; +%unignore xla::swig::LocalComputationBuilder::BitcastConvertType; %unignore xla::swig::LocalComputationBuilder::Call; %unignore xla::swig::LocalComputationBuilder::Transpose; %unignore xla::swig::LocalComputationBuilder::Rev; @@ -966,24 +991,52 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Min; %unignore xla::swig::LocalComputationBuilder::And; %unignore xla::swig::LocalComputationBuilder::Or; +%unignore xla::swig::LocalComputationBuilder::Xor; +%unignore xla::swig::LocalComputationBuilder::ShiftLeft; +%unignore xla::swig::LocalComputationBuilder::ShiftRightArithmetic; +%unignore xla::swig::LocalComputationBuilder::ShiftRightLogical; %unignore xla::swig::LocalComputationBuilder::Not; %unignore xla::swig::LocalComputationBuilder::Abs; %unignore xla::swig::LocalComputationBuilder::Exp; +%unignore xla::swig::LocalComputationBuilder::Expm1; %unignore xla::swig::LocalComputationBuilder::Floor; %unignore xla::swig::LocalComputationBuilder::Ceil; %unignore xla::swig::LocalComputationBuilder::Round; %unignore xla::swig::LocalComputationBuilder::Log; +%unignore xla::swig::LocalComputationBuilder::Log1p; %unignore xla::swig::LocalComputationBuilder::Sign; %unignore xla::swig::LocalComputationBuilder::Cos; %unignore xla::swig::LocalComputationBuilder::Sin; %unignore xla::swig::LocalComputationBuilder::Tanh; -%unignore xla::swig::LocalComputationBuilder::SqrtF32; -%unignore xla::swig::LocalComputationBuilder::SquareF32; -%unignore xla::swig::LocalComputationBuilder::Pow; +%unignore xla::swig::LocalComputationBuilder::Atan2; %unignore xla::swig::LocalComputationBuilder::IsFinite; -%unignore xla::swig::LocalComputationBuilder::ReciprocalF32; +%unignore xla::swig::LocalComputationBuilder::Pow; %unignore xla::swig::LocalComputationBuilder::Neg; %unignore 
xla::swig::LocalComputationBuilder::Sort; +%unignore xla::swig::LocalComputationBuilder::SortKeyVal; +%unignore xla::swig::LocalComputationBuilder::Sqrt; +%unignore xla::swig::LocalComputationBuilder::Rsqrt; +%unignore xla::swig::LocalComputationBuilder::Square; +%unignore xla::swig::LocalComputationBuilder::Reciprocal; +%unignore xla::swig::LocalComputationBuilder::Erfc; +%unignore xla::swig::LocalComputationBuilder::Erf; +%unignore xla::swig::LocalComputationBuilder::ErfInv; +%unignore xla::swig::LocalComputationBuilder::Lgamma; +%unignore xla::swig::LocalComputationBuilder::Digamma; +%unignore xla::swig::LocalComputationBuilder::Acos; +%unignore xla::swig::LocalComputationBuilder::Asin; +%unignore xla::swig::LocalComputationBuilder::Atan; +%unignore xla::swig::LocalComputationBuilder::Tan; +%unignore xla::swig::LocalComputationBuilder::Acosh; +%unignore xla::swig::LocalComputationBuilder::Asinh; +%unignore xla::swig::LocalComputationBuilder::Atanh; +%unignore xla::swig::LocalComputationBuilder::Cosh; +%unignore xla::swig::LocalComputationBuilder::Sinh; +%unignore xla::swig::LocalComputationBuilder::Real; +%unignore xla::swig::LocalComputationBuilder::Imag; +%unignore xla::swig::LocalComputationBuilder::Conj; +%unignore xla::swig::LocalComputationBuilder::Complex; +%unignore xla::swig::DestructureLocalShapedBufferTuple; %unignore xla::swig::DeleteLocalShapedBuffer; %unignore xla::swig::DeleteLocalComputation; %unignore xla::swig::DeleteCompiledLocalComputation; diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index 68648a3a176363de69a56ecb8070f82862874e94..fc6511bef566cb6f4e0d4e52972954de0792e959 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -14,6 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/python/numpy_bridge.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" @@ -49,6 +52,8 @@ int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) { return NPY_FLOAT32; case F64: return NPY_FLOAT64; + case C64: + return NPY_COMPLEX64; case TUPLE: return NPY_OBJECT; default: @@ -82,6 +87,8 @@ PrimitiveType NumpyTypeToPrimitiveType(int np_type) { return F32; case NPY_FLOAT64: return F64; + case NPY_COMPLEX64: + return C64; case NPY_OBJECT: return TUPLE; default: @@ -103,6 +110,7 @@ bool NumpyTypeIsValid(int np_type) { case NPY_FLOAT16: case NPY_FLOAT32: case NPY_FLOAT64: + case NPY_COMPLEX64: case NPY_OBJECT: return true; default: @@ -143,9 +151,7 @@ static int NumpyTypenum(PyObject* o) { // // NOTE: this is an internal helper for conversion to a C++, and so decrefs r. static string ExtractStringAndDecref(PyObject* r) { - auto error = [r] { - return tensorflow::strings::Printf("", r); - }; + auto error = [r] { return absl::StrFormat("", r); }; if (r == nullptr) { return error(); } @@ -185,8 +191,8 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { PyObject* result = PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); if (result == nullptr) { - return error(tensorflow::strings::StrCat( - "Failed to call method of shape object:", method)); + return error( + absl::StrCat("Failed to call method of shape object:", method)); } return result; }; @@ -275,15 +281,15 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { // Helper that retrieves the member with attr_name, stringifies it if is not // None, and returns it as a C++ string. 
-static tensorflow::gtl::optional GetAttrAsString( - PyObject* o, const string& attr_name) { +static absl::optional GetAttrAsString(PyObject* o, + const string& attr_name) { if (!PyObject_HasAttrString(o, attr_name.c_str())) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str()); if (attr == Py_None) { Py_DECREF(attr); - return tensorflow::gtl::nullopt; + return absl::nullopt; } string result = PyObjectCppStr(attr); Py_DECREF(attr); @@ -292,48 +298,46 @@ static tensorflow::gtl::optional GetAttrAsString( // Helper that retrieves the member with attr_name, checks that it is an integer // if it is not None, and returns it as an int32 value. -static tensorflow::gtl::optional GetAttrAsInt32( - PyObject* o, const string& attr_name) { +static absl::optional GetAttrAsInt32(PyObject* o, + const string& attr_name) { if (!PyObject_HasAttrString(o, attr_name.c_str())) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str()); if (attr == Py_None) { Py_DECREF(attr); - return tensorflow::gtl::nullopt; + return absl::nullopt; } if (!CheckPyIntOrLong(attr)) { Py_DECREF(attr); - return tensorflow::gtl::nullopt; + return absl::nullopt; } long value = PyIntOrPyLongToLong(attr); // NOLINT Py_DECREF(attr); if (value == -1 && PyErr_Occurred() != nullptr) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } if (static_cast(value) != value) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } return value; } StatusOr OpMetadataFromPyObject(PyObject* o) { OpMetadata result; - tensorflow::gtl::optional op_type = GetAttrAsString(o, "op_type"); + absl::optional op_type = GetAttrAsString(o, "op_type"); if (op_type.has_value()) { result.set_op_type(op_type.value()); } - tensorflow::gtl::optional op_name = GetAttrAsString(o, "op_name"); + absl::optional op_name = GetAttrAsString(o, "op_name"); if (op_name.has_value()) { 
result.set_op_name(op_name.value()); } - tensorflow::gtl::optional source_file = - GetAttrAsString(o, "source_file"); + absl::optional source_file = GetAttrAsString(o, "source_file"); if (source_file.has_value()) { result.set_source_file(source_file.value()); } - tensorflow::gtl::optional source_line = - GetAttrAsInt32(o, "source_line"); + absl::optional source_line = GetAttrAsInt32(o, "source_line"); if (source_line.has_value()) { result.set_source_line(source_line.value()); } @@ -374,7 +378,7 @@ StatusOr> XlaLiteralFromPyObject(PyObject* o) { TF_ASSIGN_OR_RETURN(auto literal, XlaLiteralFromPyObject(element)); elements.push_back(std::move(literal)); } - return Literal::MakeTupleOwned(std::move(elements)); + return LiteralUtil::MakeTupleOwned(std::move(elements)); } else if (PyArray_Check(o)) { PyArrayObject* py_array = reinterpret_cast(o); int rank = PyArray_NDIM(py_array); @@ -383,7 +387,7 @@ StatusOr> XlaLiteralFromPyObject(PyObject* o) { dimensions[i] = PyArray_DIM(py_array, i); } int np_type = PyArray_TYPE(py_array); - auto literal = Literal::CreateFromDimensions( + auto literal = LiteralUtil::CreateFromDimensions( NumpyTypeToPrimitiveType(np_type), dimensions); TF_RETURN_IF_ERROR( CopyNumpyArrayToLiteral(np_type, py_array, literal.get())); @@ -424,6 +428,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array, case NPY_FLOAT64: CopyNumpyArrayToLiteral(py_array, literal); break; + case NPY_COMPLEX64: + CopyNumpyArrayToLiteral(py_array, literal); + break; default: return InvalidArgument( "No XLA literal container for Numpy type number: %d", np_type); @@ -461,6 +468,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal, case NPY_FLOAT64: CopyLiteralToNumpyArray(literal, py_array); break; + case NPY_COMPLEX64: + CopyLiteralToNumpyArray(literal, py_array); + break; default: LOG(FATAL) << "No XLA literal container for Numpy type" << np_type; } diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h 
b/tensorflow/compiler/xla/python/numpy_bridge.h index 64f0aae0f9790f0199ac6cb931a5c9f6dc356f4c..8cae1751853f3cd18033ecf6edca40bf99c6d917 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.h +++ b/tensorflow/compiler/xla/python/numpy_bridge.h @@ -25,9 +25,9 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/python/lib/core/numpy.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 50b548afa5f26af8b2961edfe120d2cb0af860d9..fa4366ff0789a3d05c26479a746a18dfcf7e902b 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -89,20 +89,39 @@ _UNARY_OPS = [ 'Not', 'Abs', 'Exp', + 'Expm1', 'Floor', 'Round', 'Ceil', 'Log', + 'Log1p', 'Sign', 'Cos', 'Sin', 'Tanh', - 'SqrtF32', - 'SquareF32', 'IsFinite', - 'ReciprocalF32', + 'Sqrt', + 'Rsqrt', + 'Square', + 'Reciprocal', 'Neg', - 'Sort', + 'Erf', + 'Erfc', + 'ErfInv', + 'Lgamma', + 'Digamma', + 'Acos', + 'Asin', + 'Atan', + 'Tan', + 'Acosh', + 'Asinh', + 'Atanh', + 'Cosh', + 'Sinh', + 'Real', + 'Imag', + 'Conj', ] _BINARY_OPS = [ @@ -121,7 +140,13 @@ _BINARY_OPS = [ 'Min', 'And', 'Or', + 'Xor', 'Pow', + 'ShiftLeft', + 'ShiftRightArithmetic', + 'ShiftRightLogical', + 'Atan2', + 'Complex', ] @@ -184,6 +209,14 @@ class LocalBuffer(object): self._delete(self.c_local_shaped_buffer) self.c_local_shaped_buffer = None + def destructure(self): + assert self.c_local_shaped_buffer is not None + result = c_api.DestructureLocalShapedBufferTuple(self.c_local_shaped_buffer) + self.c_local_shaped_buffer = None + size = result.size() + destructured = tuple(LocalBuffer(result.Release(i)) for i in xrange(size)) + return destructured + def is_deleted(self): return 
self.c_local_shaped_buffer is None @@ -247,9 +280,12 @@ class Shape(object): self._dimensions == other._dimensions and self._minor_to_major == other._minor_to_major) + def __ne__(self, other): + return not self == other + def __repr__(self): return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' - '_is_tuple={!r}), _minor_to_major={!r}').format( + '_is_tuple={!r}, _minor_to_major={!r})').format( self._dtype, self._dimensions, self._is_tuple, self._minor_to_major) @@ -353,6 +389,7 @@ class CompileOptions(object): def __init__(self): self.generate_hlo_graph = None self.dump_optimized_hlo_proto_to = None + self.dump_unoptimized_hlo_proto_to = None self.dump_per_pass_hlo_proto_to = None self.hlo_profile = False @@ -446,14 +483,16 @@ class LocalComputation(object): if self.is_compiled: raise ValueError('Attempt to compile a compiled local XLA computation.') + result_shape = _wrap_shape(self.c_local_computation.GetReturnValueShape()) + if layout_fn: argument_shapes = [ shape.map_leaves(layout_fn) for shape in argument_shapes ] - result_shape = _wrap_shape(self.c_local_computation.GetReturnValueShape()) result_shape = result_shape.map_leaves(layout_fn) - compile_options = compile_options or CompileOptions() - compile_options.result_shape = result_shape + + compile_options = compile_options or CompileOptions() + compile_options.result_shape = result_shape return LocalComputation( self.c_local_computation.Compile(argument_shapes, compile_options), is_compiled=True) @@ -685,6 +724,18 @@ class ComputationBuilder(object): """ return self._client.ConvertElementType(operand, new_element_type) + def BitcastConvertType(self, operand, new_element_type): + """Enqueues a bitcast type conversion operation onto the computation. + + Args: + operand: the operand to convert. + new_element_type: the target primitive type. + + Returns: + A LocalOp representing the added conversion op. 
+ """ + return self._client.BitcastConvertType(operand, new_element_type) + def GetShape(self, operand): return _wrap_shape(self._client.GetShape(operand)) @@ -894,20 +945,19 @@ class ComputationBuilder(object): """ return self._client.Call(computation_to_apply.c_local_computation, operands) - def Map(self, operands, computation_to_apply, dimensions, static_operands=()): + def Map(self, operands, computation_to_apply, dimensions): """Enqueues a map operation onto the computation. Args: operands: an iterable of LocalOp. computation_to_apply: a Computation object. dimensions: dimensions over which to apply map the function. - static_operands: auxiliary arguments passed to the applied computation. Returns: A LocalOp representing the added Map op. """ return self._client.Map(operands, computation_to_apply.c_local_computation, - dimensions, static_operands) + dimensions) def Reduce(self, operand, init_value, computation_to_apply, dimensions): """Enqueues a reduction operation onto the computation. @@ -1112,6 +1162,69 @@ class ComputationBuilder(object): dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd)) return dimension_numbers + def ConvGeneralDilated(self, lhs, rhs, window_strides, padding, lhs_dilation, + rhs_dilation, dimension_numbers): + """Enqueues a ConvGeneralDilated operation onto the computation. + + Args: + lhs: LocalOp for the rank N+2 array of inputs. + rhs: LocalOp for the rank N+2 array of kernel weights. + window_strides: length-N array-like of integer kernel strides. + padding: length-N array-like of pairs of integers of (low, high) padding. + lhs_dilation: length-N array-like of integer dilation factors. + rhs_dilation: length-N array-like of integer dilation factors. 
+ dimension_numbers: either an xla_data_pb2.ConvolutionDimensionNumbers or a + triple (lhs_spec, rhs_spec, out_spec) where each element is a string of + length N+2 identifying by position (1) batch dimensions in lhs, rhs, and + the output with the character 'N', (2) feature dimensions in lhs and the + output with the character 'C', (3) input and output feature dimensions + in rhs with the characters 'I' and 'O' respectively, and (4) spatial + dimension correspondences between lhs, rhs, and the output using any + distinct characters. For example, to indicate dimension numbers + consistent with the Conv operation with two spatial dimensions, one + could use ('NCHW', 'OIHW', 'NCHW'). As another example, to indicate + dimension numbers consistent with the TensorFlow Conv2D operation, one + could use ('NHWC', 'HWIO', 'NHWC'). When using the latter form of + convolution dimension specification, window strides are associated with + spatial dimension character labels according to the order in which the + labels appear in the rhs_spec string, so that window_strides[0] is + matched with the dimension corresponding to the first character + appearing in rhs_spec that is not 'I' or 'O'. + + Returns: a LocalOp representing the ConvGeneralDilated operation. 
+ """ + if not isinstance(dimension_numbers, + xla_data_pb2.ConvolutionDimensionNumbers): + lhs_spec, rhs_spec, out_spec = dimension_numbers + dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers() + + dimension_numbers.input_batch_dimension = lhs_spec.index('N') + dimension_numbers.input_feature_dimension = lhs_spec.index('C') + dimension_numbers.output_batch_dimension = out_spec.index('N') + dimension_numbers.output_feature_dimension = out_spec.index('C') + dimension_numbers.kernel_output_feature_dimension = rhs_spec.index('O') + dimension_numbers.kernel_input_feature_dimension = rhs_spec.index('I') + + dimension_numbers.kernel_spatial_dimensions.extend( + i for i, c in enumerate(rhs_spec) if c not in {'I', 'O'}) + dimension_numbers.input_spatial_dimensions.extend( + sorted((i for i, c in enumerate(lhs_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(lhs_spec[i]))) + dimension_numbers.output_spatial_dimensions.extend( + sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(out_spec[i]))) + return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding, + lhs_dilation, rhs_dilation, + dimension_numbers) + + def Sort(self, operand, dimension=-1): + """Enqueues a sort operation onto the computation.""" + return self._client.Sort(operand, dimension) + + def SortKeyVal(self, keys, values, dimension=-1): + """Enqueues a key-value sort operation onto the computation.""" + return self._client.SortKeyVal(keys, values, dimension) + def _forward_methods_to_local_builder(): """Forward remaining ComputationBuilder methods to the C API. 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index e3d393bcccb69a91ea3e45f8177bd65dc12fd89f..fd98e19457f61aade947aa354d2e415148d127f6 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -157,6 +157,13 @@ class ComputationsWithConstantsTest(LocalComputationTest): c.Constant(NumpyArrayBool([True, True, False, False]))) self._ExecuteAndCompareExact(c, expected=[True, True, True, False]) + def testBooleanXor(self): + c = self._NewComputation() + c.Xor( + c.Constant(NumpyArrayBool([True, False, True, False])), + c.Constant(NumpyArrayBool([True, True, False, False]))) + self._ExecuteAndCompareExact(c, expected=[False, True, True, False]) + def testSum2DF32(self): c = self._NewComputation() c.Add( @@ -164,6 +171,24 @@ class ComputationsWithConstantsTest(LocalComputationTest): c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]]))) self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]]) + def testShiftLeft(self): + c = self._NewComputation() + c.ShiftLeft(c.Constant(NumpyArrayS32([3])), + c.Constant(NumpyArrayS32([2]))) + self._ExecuteAndCompareClose(c, expected=[12]) + + def testShiftRightArithmetic(self): + c = self._NewComputation() + c.ShiftRightArithmetic(c.Constant(NumpyArrayS32([-2])), + c.Constant(NumpyArrayS32([1]))) + self._ExecuteAndCompareClose(c, expected=[-1]) + + def testShiftRightLogical(self): + c = self._NewComputation() + c.ShiftRightLogical(c.Constant(NumpyArrayS32([-1])), + c.Constant(NumpyArrayS32([1]))) + self._ExecuteAndCompareClose(c, expected=[2**31 - 1]) + def testGetProto(self): c = self._NewComputation() c.Add( @@ -365,6 +390,55 @@ class LocalBufferTest(LocalComputationTest): with self.assertRaises(ValueError): compiled_c.ExecuteWithLocalBuffers([arg_buffer]) + def testDestructureTupleEmpty(self): + t = () + local_buffer = xla_client.LocalBuffer.from_pyval(t) + pieces = local_buffer.destructure() + 
self.assertTrue(local_buffer.is_deleted()) + self.assertEqual(len(pieces), 0) + + def testDestructureTupleOneArrayElement(self): + t = (np.array([1, 2, 3, 4], dtype=np.int32),) + local_buffer = xla_client.LocalBuffer.from_pyval(t) + pieces = local_buffer.destructure() + self.assertTrue(local_buffer.is_deleted()) + self.assertEqual(len(pieces), 1) + array = pieces[0] + got = array.to_py() + want = NumpyArrayS32([1, 2, 3, 4]) + np.testing.assert_equal(want, got) + + def testDestructureTupleTwoArrayElementDifferentType(self): + t = (np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), + np.array([2, 3, 4, 5], dtype=np.int32)) + local_buffer = xla_client.LocalBuffer.from_pyval(t) + pieces = local_buffer.destructure() + self.assertTrue(local_buffer.is_deleted()) + self.assertEqual(len(pieces), 2) + array0, array1 = pieces + got = array0.to_py() + want = NumpyArrayF32([1.0, 2.0, 3.0, 4.0]) + np.testing.assert_equal(want, got) + got = array1.to_py() + want = NumpyArrayS32([2, 3, 4, 5]) + np.testing.assert_equal(want, got) + + def testDestructureTupleNested(self): + t = ((NumpyArrayF32([1.0, 2.0]), NumpyArrayS32([3, 4])), NumpyArrayS32([5])) + local_buffer = xla_client.LocalBuffer.from_pyval(t) + pieces = local_buffer.destructure() + self.assertTrue(local_buffer.is_deleted()) + self.assertEqual(len(pieces), 2) + tuple0, array1 = pieces + got = array1.to_py() + want = NumpyArrayS32([5]) + np.testing.assert_equal(want, got) + got = tuple0.to_py() + self.assertEqual(type(got), tuple) + self.assertEqual(len(got), 2) + np.testing.assert_equal(NumpyArrayF32([1.0, 2.0]), got[0]) + np.testing.assert_equal(NumpyArrayS32([3, 4]), got[1]) + class SingleOpTest(LocalComputationTest): """Tests for single ops. 
@@ -415,6 +489,34 @@ class SingleOpTest(LocalComputationTest): for src_dtype, dst_dtype in itertools.product(xla_types, xla_types): _ConvertAndTest(x, src_dtype, dst_dtype) + def testBitcastConvertType(self): + xla_x32_types = { + np.int32: xla_client.xla_data_pb2.S32, + np.float32: xla_client.xla_data_pb2.F32, + } + + xla_x64_types = { + np.int64: xla_client.xla_data_pb2.S64, + np.float64: xla_client.xla_data_pb2.F64, + } + + def _ConvertAndTest(template, src_dtype, dst_dtype, dst_etype): + c = self._NewComputation() + x = c.Constant(np.array(template, dtype=src_dtype)) + c.BitcastConvertType(x, dst_etype) + + result = c.Build().Compile().Execute() + expected = np.array(template, src_dtype).view(dst_dtype) + + self.assertEqual(result.shape, expected.shape) + self.assertEqual(result.dtype, expected.dtype) + np.testing.assert_equal(result, expected) + + x = [0, 1, 0, 0, 1] + for xla_types in [xla_x32_types, xla_x64_types]: + for src_dtype, dst_dtype in itertools.product(xla_types, xla_types): + _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype]) + def testCrossReplicaSumOneReplica(self): samples = [ NumpyArrayF32(42.0), @@ -519,6 +621,46 @@ class SingleOpTest(LocalComputationTest): [40., 50., 0.]]]]) self._ExecuteAndCompareClose(c, expected=result) + def testConvGeneralDilatedF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + dimension_numbers = ("NCHW", "OIHW", "NCHW") + c.ConvGeneralDilated(c.Constant(lhs), c.Constant(rhs), + strides, pads, lhs_dilation, rhs_dilation, + dimension_numbers) + result = np.array([[[[0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.]]]]) + self._ExecuteAndCompareClose(c, expected=result) + + def testConvGeneralDilatedPermutedF32(self): + c = self._NewComputation() + a = lambda *dims: 
np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + + dimension_numbers = ("NHWC", "OIHW", "CWNH") + c.ConvGeneralDilated(c.Constant(np.transpose(lhs, (0, 2, 3, 1))), + c.Constant(rhs), + strides, pads, lhs_dilation, rhs_dilation, + dimension_numbers) + result = np.array([[[[0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.]]]]) + self._ExecuteAndCompareClose(c, expected=np.transpose(result, (1, 3, 0, 2))) + def testBooleanNot(self): c = self._NewComputation() arr = NumpyArrayBool([True, False, True]) @@ -531,6 +673,12 @@ class SingleOpTest(LocalComputationTest): c.Exp(c.Constant(arr)) self._ExecuteAndCompareClose(c, expected=np.exp(arr)) + def testExpm1(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + c.Expm1(c.Constant(arr)) + self._ExecuteAndCompareClose(c, expected=np.expm1(arr)) + def testRound(self): c = self._NewComputation() arr = NumpyArrayF32([3.3, 12.1]) @@ -543,6 +691,12 @@ class SingleOpTest(LocalComputationTest): c.Log(c.Constant(arr)) self._ExecuteAndCompareClose(c, expected=np.log(arr)) + def testLog1p(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + c.Log1p(c.Constant(arr)) + self._ExecuteAndCompareClose(c, expected=np.log1p(arr)) + def testNeg(self): c = self._NewComputation() arr = NumpyArrayF32([3.3, 12.1]) @@ -1067,14 +1221,6 @@ class EmbeddedComputationsTest(LocalComputationTest): self._CreateBinaryDivF64Computation(), [0]) self._ExecuteAndCompareClose(c, expected=[0.2, 0.4, 0.75, 1.0]) - def DISABLED_testMapWithStaticOperands(self): - c = self._NewComputation() - factor = c.ConstantF32Scalar(3.0) - c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))], - self._CreateMulF32ByParamComputation(), [0], - static_operands=[factor]) - self._ExecuteAndCompareClose(c, expected=[3.0, 6.0, 9.0, 12.0]) - def testSelectAndScatterF32(self): c = 
self._NewComputation() c.SelectAndScatter(c.Constant(NumpyArrayF32([[1., 2., 6.], [4., 5., 3.]])), diff --git a/tensorflow/compiler/xla/python_api/BUILD b/tensorflow/compiler/xla/python_api/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..d790c4db6c466a2bf4d2cf30365749fb901f74a0 --- /dev/null +++ b/tensorflow/compiler/xla/python_api/BUILD @@ -0,0 +1,38 @@ +# Description: +# Python API for XLA. + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +py_library( + name = "types", + srcs = ["types.py"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto_py", + "//tensorflow/python:dtypes", + "//tensorflow/python:platform", + "//third_party/py/numpy", + ], +) + +py_library( + name = "xla_shape", + srcs = ["xla_shape.py"], + visibility = ["//visibility:public"], + deps = [ + ":types", + "//tensorflow/compiler/xla:xla_data_proto_py", + ], +) + +py_library( + name = "xla_literal", + srcs = ["xla_literal.py"], + visibility = ["//visibility:public"], + deps = [ + ":types", + ":xla_shape", + "//tensorflow/compiler/xla:xla_data_proto_py", + ], +) diff --git a/tensorflow/compiler/xla/python_api/types.py b/tensorflow/compiler/xla/python_api/types.py new file mode 100644 index 0000000000000000000000000000000000000000..57dfce3971b829d2a3052d347e5d2d322db0c841 --- /dev/null +++ b/tensorflow/compiler/xla/python_api/types.py @@ -0,0 +1,131 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ====================================== +"""Utilities for XLA-specific Python types.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +import numpy as _np # Avoids becoming a part of public Tensorflow API. + +from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.python.framework import dtypes + +# Records correspondence between a XLA primitive type and Python/Numpy types. +# +# primitive_type: value of type xla_data_pb2.PrimitiveType +# numpy_dtype: corresponding Numpy "dtype" (like np.float32) +# literal_field_name: name of the field in the LiteralProto message elements +# of this type go into. +# literal_field_type: type of the field named 'literal_field_name'. +# +# TODO(eliben): figure out how to avoid knowing the extra Python type and the +# astype cast when writing into Literals. +TypeConversionRecord = collections.namedtuple('TypeConversionRecord', [ + 'primitive_type', 'numpy_dtype', 'literal_field_name', 'literal_field_type' +]) + +# Maps from XLA primitive types to TypeConversionRecord. 
+MAP_XLA_TYPE_TO_RECORD = { + xla_data_pb2.BF16: + TypeConversionRecord( + primitive_type=xla_data_pb2.BF16, + numpy_dtype=dtypes.bfloat16.as_numpy_dtype, + literal_field_name='bf16s', + literal_field_type=float), + xla_data_pb2.F16: + TypeConversionRecord( + primitive_type=xla_data_pb2.F16, + numpy_dtype=_np.float16, + literal_field_name='f16s', + literal_field_type=float), + xla_data_pb2.F32: + TypeConversionRecord( + primitive_type=xla_data_pb2.F32, + numpy_dtype=_np.float32, + literal_field_name='f32s', + literal_field_type=float), + xla_data_pb2.F64: + TypeConversionRecord( + primitive_type=xla_data_pb2.F64, + numpy_dtype=_np.float64, + literal_field_name='f64s', + literal_field_type=float), + xla_data_pb2.S8: + TypeConversionRecord( + primitive_type=xla_data_pb2.S8, + numpy_dtype=_np.int8, + literal_field_name='s8s', + literal_field_type=int), + xla_data_pb2.S16: + TypeConversionRecord( + primitive_type=xla_data_pb2.S16, + numpy_dtype=_np.int16, + literal_field_name='s16s', + literal_field_type=int), + xla_data_pb2.S32: + TypeConversionRecord( + primitive_type=xla_data_pb2.S32, + numpy_dtype=_np.int32, + literal_field_name='s32s', + literal_field_type=int), + xla_data_pb2.S64: + TypeConversionRecord( + primitive_type=xla_data_pb2.S64, + numpy_dtype=_np.int64, + literal_field_name='s64s', + literal_field_type=int), + xla_data_pb2.U8: + TypeConversionRecord( + primitive_type=xla_data_pb2.U8, + numpy_dtype=_np.uint8, + literal_field_name='s8s', + literal_field_type=int), + xla_data_pb2.U16: + TypeConversionRecord( + primitive_type=xla_data_pb2.U16, + numpy_dtype=_np.uint16, + literal_field_name='s16s', + literal_field_type=int), + xla_data_pb2.U32: + TypeConversionRecord( + primitive_type=xla_data_pb2.U32, + numpy_dtype=_np.uint32, + literal_field_name='s32s', + literal_field_type=int), + xla_data_pb2.U64: + TypeConversionRecord( + primitive_type=xla_data_pb2.U64, + numpy_dtype=_np.uint64, + literal_field_name='s64s', + literal_field_type=int), + 
xla_data_pb2.PRED: + TypeConversionRecord( + primitive_type=xla_data_pb2.PRED, + numpy_dtype=_np.bool, + literal_field_name='preds', + literal_field_type=bool) +} + +# Maps from Numpy dtypes to TypeConversionRecord. +# Note the conversion on the key. Numpy has a known issue wherein dtype hashing +# doesn't work as expected (https://github.com/numpy/numpy/issues/7242). Thus, +# when keying by dtype in this dict, we use the string form of dtypes. +MAP_DTYPE_TO_RECORD = { + str(_np.dtype(record.numpy_dtype)): record + for record in MAP_XLA_TYPE_TO_RECORD.values() +} diff --git a/tensorflow/compiler/xla/python_api/xla_literal.py b/tensorflow/compiler/xla/python_api/xla_literal.py new file mode 100644 index 0000000000000000000000000000000000000000..757e41a78ad2b57d2ef6e1f3055160be22c7b3ed --- /dev/null +++ b/tensorflow/compiler/xla/python_api/xla_literal.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ====================================== +"""XLA LiteralProto utilities.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as _np # Avoids becoming a part of public Tensorflow API. 
+ +from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.compiler.xla.python_api import types +from tensorflow.compiler.xla.python_api import xla_shape + + +def ConvertLiteralToNumpyArray(literal): + """Converts a XLA literal to a Numpy array.""" + element_type = literal.shape.element_type + if element_type == xla_data_pb2.TUPLE: + return tuple( + ConvertLiteralToNumpyArray(subliteral) + for subliteral in literal.tuple_literals) + + type_record = types.MAP_XLA_TYPE_TO_RECORD[element_type] + if not literal.shape.dimensions: + return _np.array( + getattr(literal, type_record.literal_field_name)[0], + type_record.numpy_dtype) + else: + # Infer the proper Numpy order from the LiteralProto's layout. The repeated + # field representing the array's content in the Literal is linearized. + # Reading is done in two steps: + # + # 1. Read the array as 1D from the LiteralProto repeated field. + # 2. Reshape the array to its proper shape, using the right order depending + # on the LiteralProto's layout. 
+ layout_order = literal.shape.layout.minor_to_major + numpy_shape = tuple(literal.shape.dimensions) + if layout_order == range(len(literal.shape.dimensions)): + numpy_reshaper = lambda arr: arr.reshape(numpy_shape, order='F') + elif layout_order == range(len(literal.shape.dimensions) - 1, -1, -1): + numpy_reshaper = lambda arr: arr.reshape(numpy_shape, order='C') + else: + raise NotImplementedError('Unsupported layout: {0}'.format(layout_order)) + ndarray = _np.array( + getattr(literal, type_record.literal_field_name), + copy=False, + dtype=type_record.numpy_dtype) + return numpy_reshaper(ndarray) + + +def _ConvertNumpyArrayToLiteral(ndarray): + """Converts a Numpy array to a XLA literal.""" + type_record = types.MAP_DTYPE_TO_RECORD[str(ndarray.dtype)] + literal = xla_data_pb2.LiteralProto() + literal.shape.CopyFrom(xla_shape.CreateShapeFromNumpy(ndarray).message) + + if ndarray.ndim == 0: + getattr(literal, type_record.literal_field_name).append( + _np.asscalar(ndarray.astype(type_record.literal_field_type))) + else: + # Ndarrays with boolean dtypes need special type conversion with protobufs + if ndarray.dtype in {_np.bool_, _np.dtype('bool')}: + for element in _np.nditer(ndarray): + getattr(literal, type_record.literal_field_name).append( + type_record.literal_field_type(element)) + else: + ndarray_flat = ndarray.ravel(order='A') + getattr(literal, type_record.literal_field_name).extend(ndarray_flat) + return literal + + +def ConvertNumpyArrayToLiteral(value): + """Converts a Numpy array or a nested tuple thereof to an XLA literal.""" + if isinstance(value, tuple): + literal = xla_data_pb2.LiteralProto() + literal.shape.CopyFrom(xla_shape.CreateShapeFromNumpy(value).message) + for component in value: + component_literal = literal.tuple_literals.add() + component_literal.CopyFrom(ConvertNumpyArrayToLiteral(component)) + return literal + else: + return _ConvertNumpyArrayToLiteral(value) diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py 
b/tensorflow/compiler/xla/python_api/xla_shape.py new file mode 100644 index 0000000000000000000000000000000000000000..f158f6b2410352432445f669155aff0af5526abf --- /dev/null +++ b/tensorflow/compiler/xla/python_api/xla_shape.py @@ -0,0 +1,155 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ====================================== +"""XLA Shape utilities.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as _np # Avoids becoming a part of public Tensorflow API. + +from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.compiler.xla.python_api import types + + +class Shape(object): + """Wraps a xla_data_pb2.Shape message with a convenient Python type. + + Provides direct access to the underlying xla_data_pb2.Shape message in the + message attribute, along with accessor wrappers to the message's fields. + Avoid direct access to .message unless interacting directly with protobuf APIs + like CopyFrom. In other words, prefer hauling the shape around in a Shape, and + only access .message when strictly required by the protobuf API. + """ + + def __init__(self, element_type, dimensions, layout=None): + """Creates a new XLA Shape. + + Args: + element_type: element type from xla_data_pb2. + dimensions: sequence of dimensions sizes (integers), or sequence + of Shapes in the case of a tuple, i.e. 
when element_type is + TUPLE. + layout: optional minor_to_major sequence for layout. If not given, the + default major-to-minor layout is used. + + Raises: + ValueError: if element_type is TUPLE but dimensions are not Shape objects. + """ + self.message = xla_data_pb2.Shape() + self.message.element_type = element_type + if element_type == xla_data_pb2.TUPLE: + if not all(isinstance(subshape, Shape) for subshape in dimensions): + raise ValueError( + 'XLA tuple requires sequence of Shape objects as dimensions') + self._tuple_shapes = tuple(dimensions) + for component_shape in self._tuple_shapes: + component_message = self.message.tuple_shapes.add() + component_message.CopyFrom(component_shape.message) + else: + self.message.dimensions.extend(dimensions) + if layout is None: + layout = list(reversed(range(len(dimensions)))) + self.message.layout.format = xla_data_pb2.DENSE + self.message.layout.minor_to_major.extend(layout) + + def element_type(self): + return self.message.element_type + + def is_tuple(self): + return self.element_type() == xla_data_pb2.TUPLE + + def dimensions(self): + if self.is_tuple(): + raise ValueError('Tuple shape has no dimensions. Try tuple_shapes()?') + return self.message.dimensions + + def tuple_shapes(self): + """If this is a tuple, returns its sequence of constituent Shape objects. + + Returns: + Tuple sub-shapes. + + Raises: + ValueError: if this is not a tuple. + """ + if not self.is_tuple(): + raise ValueError('tuple_shapes() called on a non-tuple shape') + return self._tuple_shapes + + def layout(self): + return self.message.layout + + @staticmethod + def from_pyval(pyval): + return CreateShapeFromNumpy(pyval) + + +def _CreateShapeFromNumpy(ndarray): # pylint: disable=invalid-name + """Create a Shape from a given Numpy array. + + Args: + ndarray: Numpy array. + + Returns: + A Shape object. 
+ """ + element_type = types.MAP_DTYPE_TO_RECORD[str(ndarray.dtype)].primitive_type + dimensions = ndarray.shape + + # Set the shape's layout based on the ordering of ndarray. + # Numpy arrays come in two orders: Fortran (column-major) and C (row-major). + if _np.isfortran(ndarray): + # Column-major layout. This corresponds to a "dimension order is + # minor-to-major" layout in XLA. + layout = range(ndarray.ndim) + else: + # Row-major layout. This corresponds to a "dimension order is + # major-to-minor" layout int XLA. + layout = list(reversed(xrange(ndarray.ndim))) + + return Shape(element_type, dimensions, layout) + + +def CreateShapeFromNumpy(value): # pylint: disable=invalid-name + """Create a Shape from a Numpy array or a nested tuple structure thereof. + + Args: + value: Numpy array or (possibly nested) tuple structure that bottoms out in + Numpy arrays. + + Returns: + A Shape object. + """ + if isinstance(value, tuple): + return Shape( + xla_data_pb2.TUPLE, + [CreateShapeFromNumpy(component) for component in value]) + else: + return _CreateShapeFromNumpy(value) + + +def CreateShapeFromDtypeAndTuple(dtype, shape_tuple): # pylint: disable=invalid-name + """Create a shape from a Numpy dtype and a sequence of nonnegative integers. + + Args: + dtype: a numpy dtype, e.g. np.dtype('int32'). + shape_tuple: a sequence of nonnegative integers. + + Returns: + A Shape object. + """ + element_type = types.MAP_DTYPE_TO_RECORD[str(dtype)].primitive_type + return Shape(element_type, shape_tuple) diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index c289c84cff743871a7126cb932d6cda823ceb696..a4854f593f0a579e3461b35033620e762593c6a6 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -18,7 +18,9 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -42,7 +44,7 @@ std::unique_ptr> MatmulArray2DImpl( int m = lhs.height(); int n = rhs.width(); int k = lhs.width(); - auto result = MakeUnique>(m, n); + auto result = absl::make_unique>(m, n); // Because Eigen is a header-oriented library, make sure that the Eigen code // is the same as the code used by the CPU backend (otherwise the linker will // randomly pick *some* definition). @@ -76,7 +78,8 @@ std::unique_ptr> MatmulArray2DImpl( /* static */ std::unique_ptr> ReferenceUtil::Array2DF32ToF64( const Array2D& input) { - auto result = MakeUnique>(input.height(), input.width()); + auto result = + absl::make_unique>(input.height(), input.width()); for (int64 rowno = 0; rowno < input.height(); ++rowno) { for (int64 colno = 0; colno < input.height(); ++colno) { (*result)(rowno, colno) = input(rowno, colno); @@ -105,17 +108,15 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated( // array by adding a fourth dummy dimension of size 1 without stride, padding // and dilation. 
Array4D a4dlhs(lhs.n1(), lhs.n2(), lhs.n3(), 1); - a4dlhs.Each( - [&](tensorflow::gtl::ArraySlice indices, float* value_ptr) { - CHECK_EQ(indices[3], 0); - *value_ptr = lhs.operator()(indices[0], indices[1], indices[2]); - }); + a4dlhs.Each([&](absl::Span indices, float* value_ptr) { + CHECK_EQ(indices[3], 0); + *value_ptr = lhs.operator()(indices[0], indices[1], indices[2]); + }); Array4D a4drhs(rhs.n1(), rhs.n2(), rhs.n3(), 1); - a4drhs.Each( - [&](tensorflow::gtl::ArraySlice indices, float* value_ptr) { - CHECK_EQ(indices[3], 0); - *value_ptr = rhs.operator()(indices[0], indices[1], indices[2]); - }); + a4drhs.Each([&](absl::Span indices, float* value_ptr) { + CHECK_EQ(indices[3], 0); + *value_ptr = rhs.operator()(indices[0], indices[1], indices[2]); + }); // Add a second dummy spatial dimensions. ConvolutionDimensionNumbers dnums2d = dnums; dnums2d.add_input_spatial_dimensions(3); @@ -125,13 +126,12 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated( a4dlhs, a4drhs, {kernel_stride, 1}, padding, {lhs_dilation, 1}, {rhs_dilation, 1}, dnums2d); - auto convr3 = MakeUnique>(convr4->planes(), convr4->depth(), - convr4->height()); - convr4->Each( - [&](tensorflow::gtl::ArraySlice indices, float* value_ptr) { - CHECK_EQ(indices[3], 0); - convr3->operator()(indices[0], indices[1], indices[2]) = *value_ptr; - }); + auto convr3 = absl::make_unique>( + convr4->planes(), convr4->depth(), convr4->height()); + convr4->Each([&](absl::Span indices, float* value_ptr) { + CHECK_EQ(indices[3], 0); + convr3->operator()(indices[0], indices[1], indices[2]) = *value_ptr; + }); return convr3; } @@ -186,11 +186,11 @@ ReferenceUtil::SeparableConvArray4D(const Array4D& input, /* static */ std::unique_ptr> ReferenceUtil::ReduceWindow1DGeneric( - const tensorflow::gtl::ArraySlice& operand, float init, + const absl::Span& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const 
tensorflow::gtl::ArraySlice>& padding) { + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding) { std::vector dim_lengths{static_cast(operand.size())}; std::vector window_counts(window.size(), 0); std::vector pad_low(window.size(), 0); @@ -200,7 +200,7 @@ ReferenceUtil::ReduceWindow1DGeneric( window_util::StridedBound(padded_width, window[i], stride[i]); pad_low[i] = padding[i].first; } - auto result = MakeUnique>(window_counts[0]); + auto result = absl::make_unique>(window_counts[0]); // Do a full 1D reduce window. for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { @@ -218,10 +218,11 @@ ReferenceUtil::ReduceWindow1DGeneric( } /* static */ std::unique_ptr> -ReferenceUtil::ReduceWindow1DAdd( - const tensorflow::gtl::ArraySlice& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding) { +ReferenceUtil::ReduceWindow1DAdd(const absl::Span& operand, + float init, + const absl::Span& window, + const absl::Span& stride, + Padding padding) { const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; }; std::vector dim_lengths{static_cast(operand.size())}; return ReduceWindow1DGeneric( @@ -233,9 +234,9 @@ ReferenceUtil::ReduceWindow1DAdd( ReferenceUtil::ReduceWindow2DGeneric( const Array2D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const tensorflow::gtl::ArraySlice>& padding) { + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding) { std::vector dim_lengths{operand.height(), operand.width()}; std::vector window_counts(window.size(), 0); @@ -246,7 +247,8 @@ ReferenceUtil::ReduceWindow2DGeneric( window_util::StridedBound(padded_width, window[i], stride[i]); pad_low[i] = padding[i].first; } - auto result = MakeUnique>(window_counts[0], window_counts[1]); + auto result = + absl::make_unique>(window_counts[0], window_counts[1]); // 
Do a full 2D reduce window. for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { @@ -272,8 +274,8 @@ ReferenceUtil::ReduceWindow2DGeneric( /* static */ std::unique_ptr> ReferenceUtil::ReduceWindow2DAdd( const Array2D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding) { + const absl::Span& window, + const absl::Span& stride, Padding padding) { const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; }; std::vector dim_lengths{operand.height(), operand.width()}; return ReduceWindow2DGeneric( @@ -283,8 +285,8 @@ ReferenceUtil::ReduceWindow2DGeneric( /* static */ std::unique_ptr> ReferenceUtil::ReduceWindow3DAdd( const Array3D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding) { + const absl::Span& window, + const absl::Span& stride, Padding padding) { std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3()}; auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding); @@ -295,8 +297,8 @@ ReferenceUtil::ReduceWindow2DGeneric( WindowCount(dim_lengths[i], window[i], stride[i], padding); pad_low[i] = padding_both[i].first; } - auto result = MakeUnique>(window_counts[0], window_counts[1], - window_counts[2]); + auto result = absl::make_unique>( + window_counts[0], window_counts[1], window_counts[2]); for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { for (int64 i1 = 0; i1 < window_counts[1]; ++i1) { @@ -330,8 +332,8 @@ ReferenceUtil::ReduceWindow2DGeneric( ReferenceUtil::ReduceWindow4DGeneric( const Array4D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding) { + const absl::Span& window, + const absl::Span& stride, Padding padding) { std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), operand.n4()}; return ReduceWindow4DGeneric( @@ -343,9 +345,9 @@ 
ReferenceUtil::ReduceWindow4DGeneric( ReferenceUtil::ReduceWindow4DGeneric( const Array4D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const tensorflow::gtl::ArraySlice>& padding) { + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding) { std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), operand.n4()}; @@ -357,8 +359,8 @@ ReferenceUtil::ReduceWindow4DGeneric( window_util::StridedBound(padded_width, window[i], stride[i]); pad_low[i] = padding[i].first; } - auto result = MakeUnique>(window_counts[0], window_counts[1], - window_counts[2], window_counts[3]); + auto result = absl::make_unique>( + window_counts[0], window_counts[1], window_counts[2], window_counts[3]); // Do a full 4D reduce window. for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { for (int64 i1 = 0; i1 < window_counts[1]; ++i1) { @@ -398,8 +400,8 @@ ReferenceUtil::ReduceWindow4DGeneric( /* static */ std::unique_ptr> ReferenceUtil::ReduceWindow4DAdd( const Array4D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding) { + const absl::Span& window, + const absl::Span& stride, Padding padding) { const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; }; return ReduceWindow4DGeneric(operand, init, add_reduce, window, stride, padding); @@ -420,13 +422,15 @@ ReferenceUtil::ReduceWindow4DGeneric( } /* static */ std::unique_ptr> -ReferenceUtil::SelectAndScatter4DGePlus( - const Array4D& operand, const Array4D& source, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, bool same_padding) { +ReferenceUtil::SelectAndScatter4DGePlus(const Array4D& operand, + const Array4D& source, + float init, + const absl::Span& window, + const absl::Span& stride, + bool same_padding) { Padding padding = same_padding ? 
Padding::kSame : Padding::kValid; - auto result = MakeUnique>(operand.n1(), operand.n2(), - operand.n3(), operand.n4()); + auto result = absl::make_unique>(operand.n1(), operand.n2(), + operand.n3(), operand.n4()); std::vector dim_lengths{operand.n1(), operand.n2(), operand.n3(), operand.n4()}; auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding); @@ -510,8 +514,8 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( std::pair lhs_dilation, std::pair rhs_dilation, ConvolutionDimensionNumbers dnums) { HloComputation::Builder b("ConvArray4DGeneralDimensionDilated"); - auto lhs_literal = Literal::CreateR4FromArray4D(lhs); - auto rhs_literal = Literal::CreateR4FromArray4D(rhs); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(lhs); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(rhs); std::array ordered_kernel_strides; std::array ordered_input_dimensions; @@ -582,12 +586,12 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4); auto result = - MakeUnique>(result_literal->shape().dimensions(0), - result_literal->shape().dimensions(1), - result_literal->shape().dimensions(2), - result_literal->shape().dimensions(3)); + absl::make_unique>(result_literal->shape().dimensions(0), + result_literal->shape().dimensions(1), + result_literal->shape().dimensions(2), + result_literal->shape().dimensions(3)); - result->Each([&](tensorflow::gtl::ArraySlice indices, float* value) { + result->Each([&](absl::Span indices, float* value) { *value = result_literal->Get(indices); }); @@ -600,7 +604,7 @@ ReferenceUtil::ReduceToColArray2D( const std::function& reduce_function) { int64 rows = matrix.height(); int64 cols = matrix.width(); - auto result = MakeUnique>(); + auto result = absl::make_unique>(); for (int64 i = 0; i < rows; ++i) { float acc = init; for (int64 j = 0; j < cols; ++j) { @@ -617,7 +621,7 @@ ReferenceUtil::ReduceToRowArray2D( const std::function& reduce_function) { int64 rows = 
matrix.height(); int64 cols = matrix.width(); - auto result = MakeUnique>(); + auto result = absl::make_unique>(); for (int64 i = 0; i < cols; ++i) { float acc = init; for (int64 j = 0; j < rows; ++j) { @@ -629,8 +633,7 @@ ReferenceUtil::ReduceToRowArray2D( } /*static*/ std::vector ReferenceUtil::Reduce4DTo1D( - const Array4D& array, float init, - tensorflow::gtl::ArraySlice dims, + const Array4D& array, float init, absl::Span dims, const std::function& reduce_function) { std::vector result; CHECK_EQ(dims.size(), 3); @@ -673,8 +676,8 @@ ReferenceUtil::ReduceToRowArray2D( /* static */ std::unique_ptr> ReferenceUtil::Broadcast1DTo4D( const std::vector& array, const std::vector& bounds, int64 broadcast_from_dim) { - auto result = - MakeUnique>(bounds[0], bounds[1], bounds[2], bounds[3]); + auto result = absl::make_unique>(bounds[0], bounds[1], + bounds[2], bounds[3]); for (int64 i = 0; i < result->n1(); ++i) { for (int64 j = 0; j < result->n2(); ++j) { for (int64 k = 0; k < result->n3(); ++k) { @@ -703,13 +706,12 @@ ReferenceUtil::ReduceToRowArray2D( } /* static */ std::unique_ptr> ReferenceUtil::Reduce3DTo2D( - const Array3D& array, float init, - tensorflow::gtl::ArraySlice dims, + const Array3D& array, float init, absl::Span dims, const std::function& reduce_function) { CHECK_EQ(dims.size(), 1); int64 rows = dims[0] == 0 ? array.n2() : array.n1(); int64 cols = dims[0] == 2 ? 
array.n2() : array.n3(); - auto result = MakeUnique>(rows, cols); + auto result = absl::make_unique>(rows, cols); result->Fill(init); for (int i0 = 0; i0 < array.n1(); ++i0) { for (int i1 = 0; i1 < array.n2(); ++i1) { @@ -729,7 +731,7 @@ ReferenceUtil::ReduceToRowArray2D( const std::function& map_function) { int64 rows = matrix.height(); int64 cols = matrix.width(); - auto result = MakeUnique>(rows, cols); + auto result = absl::make_unique>(rows, cols); for (int64 i = 0; i < rows; ++i) { for (int64 j = 0; j < cols; ++j) { (*result)(i, j) = map_function(matrix(i, j)); @@ -745,7 +747,7 @@ ReferenceUtil::ReduceToRowArray2D( CHECK_EQ(lhs.width(), rhs.width()); int64 rows = lhs.height(); int64 cols = rhs.width(); - auto result = MakeUnique>(rows, cols); + auto result = absl::make_unique>(rows, cols); for (int64 i = 0; i < rows; ++i) { for (int64 j = 0; j < cols; ++j) { (*result)(i, j) = map_function(lhs(i, j), rhs(i, j)); @@ -759,7 +761,7 @@ ReferenceUtil::ReduceToRowArray2D( const std::function& map_function) { int64 rows = matrix.height(); int64 cols = matrix.width(); - auto result = MakeUnique>(rows, cols); + auto result = absl::make_unique>(rows, cols); for (int64 i = 0; i < rows; ++i) { for (int64 j = 0; j < cols; ++j) { (*result)(i, j) = map_function(matrix(i, j), i, j); diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h index 8fa6961d197dce519cf151283b8bc0836a4615c0..9ce098029dbc35f6b4bab2efd77bee2b7e1a6255 100644 --- a/tensorflow/compiler/xla/reference_util.h +++ b/tensorflow/compiler/xla/reference_util.h @@ -22,14 +22,14 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -42,7 +42,8 @@ class ReferenceUtil { template static std::unique_ptr> TransposeArray2D( const Array2D& operand) { - auto result = MakeUnique>(operand.width(), operand.height()); + auto result = + absl::make_unique>(operand.width(), operand.height()); for (int64 w = 0; w < operand.width(); ++w) { for (int64 h = 0; h < operand.height(); ++h) { (*result)(w, h) = operand(h, w); @@ -143,8 +144,7 @@ class ReferenceUtil { // Returns the result of reducing the 4D array to a vector, reducing away // the dimensions specified in dims. static std::vector Reduce4DTo1D( - const Array4D& array, float init, - tensorflow::gtl::ArraySlice dims, + const Array4D& array, float init, absl::Span dims, const std::function& reduce_function); // Broadcast 1D dimension to 4D, from the dimension `broadcast_from_dim`. @@ -155,8 +155,7 @@ class ReferenceUtil { // Returns the result of reducing the 3D array to a 2D array, reducing away // the dimensions specified in dims. static std::unique_ptr> Reduce3DTo2D( - const Array3D& array, float init, - tensorflow::gtl::ArraySlice dims, + const Array3D& array, float init, absl::Span dims, const std::function& reduce_function); // Applies map_function to each element in the input (2D array) and returns @@ -178,47 +177,47 @@ class ReferenceUtil { // Windowed reductions with Add as the function to apply. 
static std::unique_ptr> ReduceWindow1DAdd( - const tensorflow::gtl::ArraySlice& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding); + const absl::Span& operand, float init, + const absl::Span& window, + const absl::Span& stride, Padding padding); static std::unique_ptr> ReduceWindow2DAdd( const Array2D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding); + const absl::Span& window, + const absl::Span& stride, Padding padding); static std::unique_ptr> ReduceWindow3DAdd( const Array3D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding); + const absl::Span& window, + const absl::Span& stride, Padding padding); static std::unique_ptr> ReduceWindow4DAdd( const Array4D& operand, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding); + const absl::Span& window, + const absl::Span& stride, Padding padding); // Windowed reductions with a generic reduce function. 
static std::unique_ptr> ReduceWindow1DGeneric( - const tensorflow::gtl::ArraySlice& operand, float init, + const absl::Span& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const tensorflow::gtl::ArraySlice>& padding); + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding); static std::unique_ptr> ReduceWindow2DGeneric( const Array2D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const tensorflow::gtl::ArraySlice>& padding); + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding); static std::unique_ptr> ReduceWindow4DGeneric( const Array4D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, Padding padding); + const absl::Span& window, + const absl::Span& stride, Padding padding); // With arbitrary padding. static std::unique_ptr> ReduceWindow4DGeneric( const Array4D& operand, float init, const std::function& reduce_func, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, - const tensorflow::gtl::ArraySlice>& padding); + const absl::Span& window, + const absl::Span& stride, + const absl::Span>& padding); // Batch normalize data. static std::unique_ptr> BatchNorm4D( @@ -231,8 +230,8 @@ class ReferenceUtil { // TODO(b/74533103) Switch tests to evaluator and remove this implementation. static std::unique_ptr> SelectAndScatter4DGePlus( const Array4D& operand, const Array4D& source, float init, - const tensorflow::gtl::ArraySlice& window, - const tensorflow::gtl::ArraySlice& stride, bool same_padding); + const absl::Span& window, + const absl::Span& stride, bool same_padding); // Concatenates the lhs and rhs arrays along the concatenate_dimension. // E.g. 
if concatenate_dimension is 0, the "n1"/height dimension is @@ -242,7 +241,7 @@ class ReferenceUtil { const Array2D& rhs, int concatenate_dimension) { CHECK(0 <= concatenate_dimension && concatenate_dimension < 2); - auto result = MakeUnique>( + auto result = absl::make_unique>( concatenate_dimension == 0 ? lhs.n1() + rhs.n1() : lhs.n1(), concatenate_dimension == 1 ? lhs.n2() + rhs.n2() : lhs.n2()); for (int64 i0 = 0; i0 < result->n1(); ++i0) { @@ -276,7 +275,8 @@ class ReferenceUtil { out_dims[i] = lhs_dims[i] + rhs_dims[i]; } } - auto result = MakeUnique>(out_dims[0], out_dims[1], out_dims[2]); + auto result = + absl::make_unique>(out_dims[0], out_dims[1], out_dims[2]); for (int64 i0 = 0; i0 < result->n1(); ++i0) { for (int64 i1 = 0; i1 < result->n2(); ++i1) { for (int64 i2 = 0; i2 < result->n3(); ++i2) { @@ -310,8 +310,8 @@ class ReferenceUtil { out_dims[i] = lhs_dims[i] + rhs_dims[i]; } } - auto result = MakeUnique>(out_dims[0], out_dims[1], out_dims[2], - out_dims[3]); + auto result = absl::make_unique>(out_dims[0], out_dims[1], + out_dims[2], out_dims[3]); for (int64 i0 = 0; i0 < result->n1(); ++i0) { for (int64 i1 = 0; i1 < result->n2(); ++i1) { for (int64 i2 = 0; i2 < result->n3(); ++i2) { @@ -332,8 +332,8 @@ class ReferenceUtil { // Slices with index clamping template - static std::vector ClampSlice1D( - const tensorflow::gtl::ArraySlice& input, int64 start, int64 size) { + static std::vector ClampSlice1D(const absl::Span& input, + int64 start, int64 size) { start = std::min(std::max(0, start), input.size() - size); std::vector result; for (int64 i = 0; i < size; ++i) { @@ -355,9 +355,9 @@ class ReferenceUtil { CHECK_LE(limits[1], input.n2()); CHECK_GE(strides[0], 1); CHECK_GE(strides[1], 1); - auto result = - MakeUnique>(CeilOfRatio(limits[0] - starts[0], strides[0]), - CeilOfRatio(limits[1] - starts[1], strides[1])); + auto result = absl::make_unique>( + CeilOfRatio(limits[0] - starts[0], strides[0]), + CeilOfRatio(limits[1] - starts[1], strides[1])); 
for (int64 i0 = 0; i0 < result->n1(); ++i0) { for (int64 i1 = 0; i1 < result->n2(); ++i1) { (*result)(i0, i1) = @@ -381,10 +381,10 @@ class ReferenceUtil { CHECK_GE(strides[0], 1); CHECK_GE(strides[1], 1); CHECK_GE(strides[2], 1); - auto result = - MakeUnique>(CeilOfRatio(limits[0] - starts[0], strides[0]), - CeilOfRatio(limits[1] - starts[1], strides[1]), - CeilOfRatio(limits[2] - starts[2], strides[2])); + auto result = absl::make_unique>( + CeilOfRatio(limits[0] - starts[0], strides[0]), + CeilOfRatio(limits[1] - starts[1], strides[1]), + CeilOfRatio(limits[2] - starts[2], strides[2])); for (int64 i0 = 0; i0 < result->n1(); ++i0) { for (int64 i1 = 0; i1 < result->n2(); ++i1) { @@ -415,11 +415,11 @@ class ReferenceUtil { CHECK_GE(strides[1], 1); CHECK_GE(strides[2], 1); CHECK_GE(strides[3], 1); - auto result = - MakeUnique>(CeilOfRatio(limits[0] - starts[0], strides[0]), - CeilOfRatio(limits[1] - starts[1], strides[1]), - CeilOfRatio(limits[2] - starts[2], strides[2]), - CeilOfRatio(limits[3] - starts[3], strides[3])); + auto result = absl::make_unique>( + CeilOfRatio(limits[0] - starts[0], strides[0]), + CeilOfRatio(limits[1] - starts[1], strides[1]), + CeilOfRatio(limits[2] - starts[2], strides[2]), + CeilOfRatio(limits[3] - starts[3], strides[3])); for (int64 i0 = 0; i0 < result->n1(); ++i0) { for (int64 i1 = 0; i1 < result->n2(); ++i1) { for (int64 i2 = 0; i2 < result->n3(); ++i2) { @@ -460,8 +460,8 @@ class ReferenceUtil { template static std::unique_ptr> MapWithIndexArray4D( const Array4D& input, F&& map_function) { - auto result = MakeUnique>(input.planes(), input.depth(), - input.height(), input.width()); + auto result = absl::make_unique>( + input.planes(), input.depth(), input.height(), input.width()); for (int64 plane = 0; plane < input.planes(); ++plane) { for (int64 depth = 0; depth < input.depth(); ++depth) { for (int64 height = 0; height < input.height(); ++height) { @@ -495,8 +495,8 @@ class ReferenceUtil { template static std::unique_ptr> 
MapWithIndexArray4D( const Array4D& lhs, const Array4D& rhs, F&& map_function) { - auto result = MakeUnique>(lhs.planes(), lhs.depth(), - lhs.height(), lhs.width()); + auto result = absl::make_unique>(lhs.planes(), lhs.depth(), + lhs.height(), lhs.width()); for (int64 plane = 0; plane < lhs.planes(); ++plane) { for (int64 depth = 0; depth < lhs.depth(); ++depth) { for (int64 height = 0; height < lhs.height(); ++height) { @@ -530,7 +530,7 @@ class ReferenceUtil { int64 out1 = in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1; - auto result = MakeUnique>(out0, out1); + auto result = absl::make_unique>(out0, out1); result->Fill(pad); int64 o0 = low_padding0; for (int64 i0 = 0; i0 < in0; ++i0) { @@ -631,7 +631,7 @@ class ReferenceUtil { Array4D result(output_bounds[0], output_bounds[1], output_bounds[2], output_bounds[3]); result.Each( - [&](tensorflow::gtl::ArraySlice indices, NativeT* value) { + [&](absl::Span indices, NativeT* value) { for (int i = 0; i < 4; ++i) { bool in_low_padding = indices[i] < pad_low[i]; bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i]; @@ -669,7 +669,7 @@ class ReferenceUtil { static std::unique_ptr> ApplyElementwise2D( F&& f, const Array2D& array1, const Array2D&... arrays) { AssertSameSize2D(array1, arrays...); - auto result = MakeUnique>(array1.n1(), array1.n2()); + auto result = absl::make_unique>(array1.n1(), array1.n2()); for (int64 i = 0; i < array1.n1(); ++i) { for (int64 j = 0; j < array1.n2(); ++j) { (*result)(i, j) = f(array1(i, j), arrays(i, j)...); diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc index 9da9bc60a2025e63b57a3be9ed360d150f88d73c..3ec0192148492c2516bf1c14fd4b960b08014388 100644 --- a/tensorflow/compiler/xla/reference_util_test.cc +++ b/tensorflow/compiler/xla/reference_util_test.cc @@ -18,12 +18,12 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -36,7 +36,7 @@ namespace { class ReferenceUtilTest : public ::testing::Test { protected: ReferenceUtilTest() { - matrix_ = MakeUnique>(rows_, cols_); + matrix_ = absl::make_unique>(rows_, cols_); // [1.f 2.f 3.f] // [4.f 5.f 6.f] for (int64 i = 0; i < rows_; ++i) { @@ -53,7 +53,7 @@ class ReferenceUtilTest : public ::testing::Test { TEST_F(ReferenceUtilTest, TransposeArray2D) { auto result = ReferenceUtil::TransposeArray2D(*matrix_); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}}, *actual_literal, ErrorSpec(0.0001)); } @@ -65,7 +65,7 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) { {11.f, 12.f}, }); auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{58.f, 64.f}, {139.f, 154.f}}, *actual_literal, ErrorSpec(0.0001)); } @@ -73,7 +73,7 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) { TEST_F(ReferenceUtilTest, ReduceToColArray2D) { auto add = [](float lhs, float rhs) { return lhs + rhs; }; auto result = ReferenceUtil::ReduceToColArray2D(*matrix_, 0.0f, add); - auto actual_literal = Literal::CreateR1(*result); + auto actual_literal = LiteralUtil::CreateR1(*result); LiteralTestUtil::ExpectR1Near({6.f, 15.f}, *actual_literal, 
ErrorSpec(0.0001)); } @@ -81,13 +81,13 @@ TEST_F(ReferenceUtilTest, ReduceToColArray2D) { TEST_F(ReferenceUtilTest, ReduceToRowArray2D) { auto add = [](float lhs, float rhs) { return lhs + rhs; }; auto result = ReferenceUtil::ReduceToRowArray2D(*matrix_, 0.0f, add); - auto actual_literal = Literal::CreateR1(*result); + auto actual_literal = LiteralUtil::CreateR1(*result); LiteralTestUtil::ExpectR1Near({5.f, 7.f, 9.f}, *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, Reduce4Dto1DZeroSizedArray) { - auto result = Literal::CreateR1(ReferenceUtil::Reduce4DTo1D( + auto result = LiteralUtil::CreateR1(ReferenceUtil::Reduce4DTo1D( Array4D(1, 0, 1, 1), /*init=*/0, /*dims=*/{0, 1, 2}, [](float a, float b) { return a + b; })); LiteralTestUtil::ExpectR1Equal({0}, *result); @@ -96,7 +96,7 @@ TEST_F(ReferenceUtilTest, Reduce4Dto1DZeroSizedArray) { TEST_F(ReferenceUtilTest, MapArray2D) { auto identity = [](float value) { return log(exp(value)); }; auto result = ReferenceUtil::MapArray2D(*matrix_, identity); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *actual_literal, ErrorSpec(0.0001)); } @@ -106,18 +106,18 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) { return value + row + col; }; auto result = ReferenceUtil::MapWithIndexArray2D(*matrix_, add_index); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 3.f, 5.f}, {5.f, 7.f, 9.f}}, *actual_literal, ErrorSpec(0.0001)); } TEST_F(ReferenceUtilTest, MapArray4D) { - auto input = MakeUnique>(/*planes=*/2, /*depth=*/3, - /*height=*/4, /*width=*/5); + auto input = absl::make_unique>(/*planes=*/2, /*depth=*/3, + /*height=*/4, /*width=*/5); input->FillWithMultiples(1.0f); auto multiply_by_two = [](float value) { return 2 * value; }; auto result = 
ReferenceUtil::MapArray4D(*input, multiply_by_two); - auto actual_literal = Literal::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); Array4D expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5); expected.FillWithMultiples(2.0f); @@ -126,15 +126,15 @@ TEST_F(ReferenceUtilTest, MapArray4D) { } TEST_F(ReferenceUtilTest, MapWithIndexArray4D) { - auto input = MakeUnique>(/*planes=*/2, /*depth=*/3, - /*height=*/4, /*width=*/5); + auto input = absl::make_unique>(/*planes=*/2, /*depth=*/3, + /*height=*/4, /*width=*/5); input->FillWithMultiples(1.0f); auto subtract_index = [](float value, int64 plane, int64 depth, int64 height, int64 width) { return value - (3 * 4 * 5 * plane + 4 * 5 * depth + 5 * height + width); }; auto result = ReferenceUtil::MapWithIndexArray4D(*input, subtract_index); - auto actual_literal = Literal::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); Array4D expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5); expected.Fill(0.0f); @@ -144,7 +144,7 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) { TEST_F(ReferenceUtilTest, SliceArray2D) { auto result = ReferenceUtil::Slice2D(*matrix_, {{0, 0}}, {{2, 2}}, {{1, 1}}); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 2.f}, {4.f, 5.f}}, *actual_literal, ErrorSpec(0.0001)); @@ -152,7 +152,7 @@ TEST_F(ReferenceUtilTest, SliceArray2D) { TEST_F(ReferenceUtilTest, SliceStridedArray2D) { auto result = ReferenceUtil::Slice2D(*matrix_, {{0, 0}}, {{2, 3}}, {{1, 2}}); - auto actual_literal = Literal::CreateR2FromArray2D(*result); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result); LiteralTestUtil::ExpectR2Near({{1.f, 3.f}, {4.f, 6.f}}, *actual_literal, ErrorSpec(0.0001)); @@ -164,7 +164,7 @@ TEST_F(ReferenceUtilTest, SliceArray3D) { auto result = 
ReferenceUtil::Slice3D(input, {{0, 0, 0}}, {{2, 2, 2}}, {{1, 1, 1}}); - auto actual_literal = Literal::CreateR3FromArray3D(*result); + auto actual_literal = LiteralUtil::CreateR3FromArray3D(*result); LiteralTestUtil::ExpectR3Near( {{{0.f, 1.f}, {4.f, 5.f}}, {{12.f, 13.f}, {16.f, 17.f}}}, *actual_literal, @@ -177,7 +177,7 @@ TEST_F(ReferenceUtilTest, SliceStridedArray3D) { auto result = ReferenceUtil::Slice3D(input, {{0, 0, 0}}, {{2, 3, 4}}, {{1, 2, 2}}); - auto actual_literal = Literal::CreateR3FromArray3D(*result); + auto actual_literal = LiteralUtil::CreateR3FromArray3D(*result); LiteralTestUtil::ExpectR3Near( {{{0.f, 2.f}, {8.f, 10.f}}, {{12.f, 14.f}, {20.f, 22.f}}}, @@ -190,7 +190,7 @@ TEST_F(ReferenceUtilTest, SliceArray4D) { auto result = ReferenceUtil::Slice4D(input, {{1, 0, 0, 0}}, {{2, 2, 2, 2}}, {{1, 1, 1, 1}}); - auto actual_literal = Literal::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); LiteralTestUtil::ExpectR4Near( {{{{60.f, 61.f}, {65.f, 66.f}}, {{80.f, 81.f}, {85.f, 86.f}}}}, @@ -203,7 +203,7 @@ TEST_F(ReferenceUtilTest, SliceStridedArray4D) { auto result = ReferenceUtil::Slice4D(input, {{1, 0, 0, 0}}, {{2, 3, 4, 5}}, {{1, 2, 2, 2}}); - auto actual_literal = Literal::CreateR4FromArray4D(*result); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result); LiteralTestUtil::ExpectR4Near( {{{{60.f, 62.f, 64.f}, {70.f, 72.f, 74.f}}, @@ -218,7 +218,7 @@ TEST_F(ReferenceUtilTest, ConvArray3DWithSamePadding) { ReferenceUtil::ConvArray3D(input, weights, 1, Padding::kSame); Array3D expected = {{{17, 28, 39, 20}}}; - auto actual_literal = Literal::CreateR3FromArray3D(*actual); + auto actual_literal = LiteralUtil::CreateR3FromArray3D(*actual); LiteralTestUtil::ExpectR3NearArray3D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -231,7 +231,7 @@ TEST_F(ReferenceUtilTest, ConvArray3DWithValidPadding) { ReferenceUtil::ConvArray3D(input, weights, 1, Padding::kValid); Array3D expected = {{{17, 28, 39}}}; - 
auto actual_literal = Literal::CreateR3FromArray3D(*actual); + auto actual_literal = LiteralUtil::CreateR3FromArray3D(*actual); LiteralTestUtil::ExpectR3NearArray3D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -266,7 +266,7 @@ TEST_F(ReferenceUtilTest, ConvWithSamePadding) { })); // clang-format on - auto actual_literal = Literal::CreateR4FromArray4D(*actual); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual); LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -300,7 +300,7 @@ TEST_F(ReferenceUtilTest, ConvWithValidPadding) { })); // clang-format on - auto actual_literal = Literal::CreateR4FromArray4D(*actual); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual); LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -356,7 +356,7 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithSamePadding) { }}); // clang-format on - auto actual_literal = Literal::CreateR4FromArray4D(*actual); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual); LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -409,7 +409,7 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) { Array4D expected({{{{2514, 2685}}}}); // clang-format on - auto actual_literal = Literal::CreateR4FromArray4D(*actual); + auto actual_literal = LiteralUtil::CreateR4FromArray4D(*actual); LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal, ErrorSpec(0.0001)); @@ -422,7 +422,7 @@ TEST_F(ReferenceUtilTest, ApplyElementwise2D) { auto actual = ReferenceUtil::ApplyElementwise2D( [](float x, float y, float z) { return 100 * x + 10 * y + z; }, a, b, c); - auto actual_literal = Literal::CreateR2FromArray2D(*actual); + auto actual_literal = LiteralUtil::CreateR2FromArray2D(*actual); LiteralTestUtil::ExpectR2Near({{300.f, 600.f}, {900.f, 1200.f}}, *actual_literal, ErrorSpec(0.0001)); } diff --git a/tensorflow/compiler/xla/rpc/BUILD 
b/tensorflow/compiler/xla/rpc/BUILD index 0d56a9a477b15964ad45e798865aa8d2c7385073..97fcd37f6b89d6dd737c233ef19f55a8faa1b624 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -39,10 +39,11 @@ tf_cc_binary( srcs = ["grpc_service_main.cc"], deps = [ ":grpc_service", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", - "@grpc//:grpc++_unsecure", + "@com_google_absl//absl/strings:str_format", ], ) @@ -54,14 +55,15 @@ tf_cc_test( ], deps = [ ":grpc_stub", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@grpc//:grpc++_unsecure", + "@com_google_absl//absl/strings:str_format", ], ) @@ -71,9 +73,9 @@ cc_library( hdrs = ["grpc_service.h"], deps = [ ":xla_service_proto", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core/distributed_runtime/rpc:grpc_util", - "@grpc//:grpc++_unsecure", ], ) diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc index 313f11a9a957155eb277dc02ba5d2565c87e0235..43fd8fe1bd0f41eb2ac5c42021a8ca4f63282646 100644 --- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc +++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc @@ -20,15 +20,15 @@ limitations under the License. 
#include #include -#include "grpc++/create_channel.h" -#include "grpc++/security/credentials.h" +#include "grpcpp/create_channel.h" +#include "grpcpp/security/credentials.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/rpc/grpc_stub.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/subprocess.h" @@ -46,7 +46,7 @@ class GRPCClientTestBase : public ::testing::Test { int port = tensorflow::internal::PickUnusedPortOrDie(); subprocess_.SetProgram( service_main_path, - {service_main_path, tensorflow::strings::Printf("--port=%d", port)}); + {service_main_path, absl::StrFormat("--port=%d", port)}); subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_DUPPARENT); subprocess_.SetChannelAction(tensorflow::CHAN_STDERR, @@ -54,9 +54,8 @@ class GRPCClientTestBase : public ::testing::Test { CHECK(subprocess_.Start()); LOG(INFO) << "Launched subprocess"; - auto channel = - ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port), - ::grpc::InsecureChannelCredentials()); + auto channel = ::grpc::CreateChannel(absl::StrFormat("localhost:%d", port), + ::grpc::InsecureChannelCredentials()); channel->WaitForConnected(gpr_time_add( gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN))); LOG(INFO) << "Channel to server is connected on port " << port; @@ -85,19 +84,19 @@ TEST_F(GRPCClientTestBase, ItsAlive) { TEST_F(GRPCClientTestBase, AxpyTenValues) { XlaBuilder builder("axpy_10"); - auto alpha = builder.ConstantR0(3.1415926535); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, 
-5.0, 5.0}); - auto y = builder.ConstantR1( - {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0}); - auto ax = builder.Mul(alpha, x); - auto axpy = builder.Add(ax, y); + auto alpha = ConstantR0(&builder, 3.1415926535); + auto x = ConstantR1( + &builder, {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); + auto y = ConstantR1( + &builder, {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0}); + auto ax = Mul(alpha, x); + Add(ax, y); std::vector expected = { 1.85840735, -1.85840735, 2.28318531, -2.28318531, -6.42477796, 6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327}; std::unique_ptr expected_literal = - Literal::CreateR1(expected); + LiteralUtil::CreateR1(expected); TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer( computation, {}, nullptr)); diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc index 5f4dc6bd08f18b50e60b173432d3d305759bccea..4e1435fa30a24c320ddbedb84d37b369a3158a54 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service.cc @@ -32,19 +32,6 @@ namespace xla { return tensorflow::ToGrpcStatus(s); } -::grpc::Status GRPCService::Computation(::grpc::ServerContext* context, - const ComputationRequest* arg, - ComputationResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->Computation(arg, result); }); -} - -::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context, - const OpRequest* arg, OpResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->Op(arg, result); }); -} - ::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context, const UnregisterRequest* arg, UnregisterResponse* result) { @@ -60,21 +47,6 @@ namespace xla { }); } -::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context, - const SetReturnValueRequest* arg, - SetReturnValueResponse* 
results) { - return DelegateRPC([this, arg, results]() { - return service_->SetReturnValue(arg, results); - }); -} - -::grpc::Status GRPCService::Execute(::grpc::ServerContext* context, - const ExecuteRequest* arg, - ExecuteResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->Execute(arg, result); }); -} - ::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/, const ExecuteGraphRequest* arg, ExecuteResponse* result) { @@ -82,13 +54,6 @@ namespace xla { [this, arg, result]() { return service_->ExecuteGraph(arg, result); }); } -::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context, - const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->ExecuteAsync(arg, result); }); -} - ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context, const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) { @@ -136,20 +101,6 @@ namespace xla { [this, arg, result]() { return service_->ResetDevice(arg, result); }); } -::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context, - const IsConstantRequest* arg, - IsConstantResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->IsConstant(arg, result); }); -} - -::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context, - const ComputeConstantRequest* arg, - ComputeConstantResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->ComputeConstant(arg, result); }); -} - ::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context, const GetShapeRequest* arg, GetShapeResponse* result) { @@ -157,43 +108,4 @@ namespace xla { [this, arg, result]() { return service_->GetShape(arg, result); }); } -::grpc::Status GRPCService::GetComputationShape( - ::grpc::ServerContext* context, const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) { - return 
DelegateRPC([this, arg, result]() { - return service_->GetComputationShape(arg, result); - }); -} - -::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context, - const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->GetLocalShape(arg, result); }); -} - -::grpc::Status GRPCService::GetComputationStats( - ::grpc::ServerContext* context, const ComputationStatsRequest* arg, - ComputationStatsResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->GetComputationStats(arg, result); - }); -} - -::grpc::Status GRPCService::SnapshotComputation( - ::grpc::ServerContext* context, const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->SnapshotComputation(arg, result); - }); -} - -::grpc::Status GRPCService::LoadComputationSnapshot( - ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->LoadComputationSnapshot(arg, result); - }); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h index 50f02796f2d45baf894841782cd96d8d51a5ba00..ca1b09b648013ad45d806040c5ddcf11d9e5604e 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.h +++ b/tensorflow/compiler/xla/rpc/grpc_service.h @@ -16,7 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_ #define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_ -#include "grpc++/server_context.h" +#include "grpcpp/server_context.h" #include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h" #include "tensorflow/compiler/xla/service/service.h" @@ -31,13 +31,6 @@ class GRPCService : public grpc::XlaService::Service { static StatusOr> NewService( se::Platform* platform = nullptr); - ::grpc::Status Computation(::grpc::ServerContext* context, - const ComputationRequest* arg, - ComputationResponse* result) override; - - ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg, - OpResponse* result) override; - ::grpc::Status Unregister(::grpc::ServerContext* context, const UnregisterRequest* arg, UnregisterResponse* result) override; @@ -46,22 +39,10 @@ class GRPCService : public grpc::XlaService::Service { const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - ::grpc::Status SetReturnValue(::grpc::ServerContext* context, - const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - ::grpc::Status Execute(::grpc::ServerContext* context, - const ExecuteRequest* arg, - ExecuteResponse* result) override; - ::grpc::Status ExecuteGraph(::grpc::ServerContext* context, const ExecuteGraphRequest* arg, ExecuteResponse* result) override; - ::grpc::Status ExecuteAsync(::grpc::ServerContext* context, - const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; - ::grpc::Status WaitForExecution(::grpc::ServerContext* context, const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) override; @@ -86,38 +67,10 @@ class GRPCService : public grpc::XlaService::Service { const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; - ::grpc::Status IsConstant(::grpc::ServerContext* context, - const IsConstantRequest* arg, - IsConstantResponse* result) override; - - ::grpc::Status ComputeConstant(::grpc::ServerContext* context, - const 
ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; - ::grpc::Status GetShape(::grpc::ServerContext* context, const GetShapeRequest* arg, GetShapeResponse* result) override; - ::grpc::Status GetComputationShape( - ::grpc::ServerContext* context, const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) override; - - ::grpc::Status GetLocalShape(::grpc::ServerContext* context, - const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; - - ::grpc::Status GetComputationStats(::grpc::ServerContext* context, - const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - - ::grpc::Status SnapshotComputation( - ::grpc::ServerContext* context, const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) override; - - ::grpc::Status LoadComputationSnapshot( - ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) override; - private: std::unique_ptr<::xla::Service> service_; diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc index e29908ccec80db76e3b5b856e57382c56430c379..d6b5149a24c491d1e9d7cd9119b36d7eb2ad65d3 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service_main.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc @@ -15,11 +15,11 @@ limitations under the License. // Basic server binary that exposes a xla::Service through a GRPC interface // on a configurable port. 
-#include "grpc++/security/server_credentials.h" -#include "grpc++/server.h" -#include "grpc++/server_builder.h" +#include "grpcpp/security/server_credentials.h" +#include "grpcpp/server.h" +#include "grpcpp/server_builder.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/rpc/grpc_service.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/command_line_flags.h" @@ -44,7 +44,7 @@ int RealMain(int argc, char** argv) { xla::GRPCService::NewService().ConsumeValueOrDie(); ::grpc::ServerBuilder builder; - string server_address(tensorflow::strings::Printf("localhost:%d", port)); + string server_address(absl::StrFormat("localhost:%d", port)); builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials()); builder.RegisterService(service.get()); diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc index 620ac6cec4f76d938e57e87849066df59514938a..7b8ab158e1396d7087a407be180ab44d2e16e121 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.cc +++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc @@ -62,21 +62,6 @@ Status GRPCStub::ResetDevice(const ResetDeviceRequest* request, }); } -Status GRPCStub::LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->LoadComputationSnapshot(context, *request, response); - }); -} - -Status GRPCStub::Execute(const ExecuteRequest* request, - ExecuteResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->Execute(context, *request, response); - }); -} - Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request, ExecuteResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) 
{ @@ -84,13 +69,6 @@ Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request, }); } -Status GRPCStub::ExecuteParallel(const ExecuteParallelRequest* request, - ExecuteParallelResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ExecuteParallel(context, *request, response); - }); -} - Status GRPCStub::ExecuteGraphParallel( const ExecuteGraphParallelRequest* request, ExecuteParallelResponse* response) { @@ -99,13 +77,6 @@ Status GRPCStub::ExecuteGraphParallel( }); } -Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request, - ExecuteAsyncResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ExecuteAsync(context, *request, response); - }); -} - Status GRPCStub::WaitForExecution(const WaitForExecutionRequest* request, WaitForExecutionResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { @@ -120,13 +91,6 @@ Status GRPCStub::DeconstructTuple(const DeconstructTupleRequest* request, }); } -Status GRPCStub::GetComputationStats(const ComputationStatsRequest* request, - ComputationStatsResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetComputationStats(context, *request, response); - }); -} - Status GRPCStub::GetComputationGraphStats( const ComputationGraphStatsRequest* request, ComputationStatsResponse* response) { @@ -135,13 +99,6 @@ Status GRPCStub::GetComputationGraphStats( }); } -Status GRPCStub::GetComputationShape(const GetComputationShapeRequest* request, - GetComputationShapeResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetComputationShape(context, *request, response); - }); -} - Status GRPCStub::GetShape(const GetShapeRequest* request, GetShapeResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* 
context) { @@ -163,48 +120,6 @@ Status GRPCStub::CreateChannelHandle(const CreateChannelHandleRequest* request, }); } -// Methods used by ComputationBuilder. -Status GRPCStub::Computation(const ComputationRequest* request, - ComputationResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->Computation(context, *request, response); - }); -} - -Status GRPCStub::Op(const OpRequest* request, OpResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->CreateOp(context, *request, response); - }); -} - -Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request, - GetLocalShapeResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetLocalShape(context, *request, response); - }); -} - -Status GRPCStub::SetReturnValue(const SetReturnValueRequest* request, - SetReturnValueResponse* responses) { - return MakeRPC([this, request, responses](::grpc::ClientContext* context) { - return grpc_stub_->SetReturnValue(context, *request, responses); - }); -} - -Status GRPCStub::IsConstant(const IsConstantRequest* request, - IsConstantResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->IsConstant(context, *request, response); - }); -} - -Status GRPCStub::ComputeConstant(const ComputeConstantRequest* request, - ComputeConstantResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ComputeConstant(context, *request, response); - }); -} - Status GRPCStub::ComputeConstantGraph( const ComputeConstantGraphRequest* request, ComputeConstantResponse* response) { @@ -213,14 +128,6 @@ Status GRPCStub::ComputeConstantGraph( }); } -// Methods used by Computation. 
-Status GRPCStub::SnapshotComputation(const SnapshotComputationRequest* request, - SnapshotComputationResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->SnapshotComputation(context, *request, response); - }); -} - // Methods used by GlobalData. Status GRPCStub::Unregister(const UnregisterRequest* request, UnregisterResponse* response) { diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h index 5906d45769b5749b0c590dbc0e1972077dc3e7ba..8dfcb761387d608abbb1f62974f49b976a7ff7ff 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.h +++ b/tensorflow/compiler/xla/rpc/grpc_stub.h @@ -43,39 +43,21 @@ class GRPCStub : public ServiceInterface { Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; - Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* result) override; - - Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override; - Status ExecuteGraph(const ExecuteGraphRequest* request, ExecuteResponse* response) override; - Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override; - Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request, ExecuteParallelResponse* response) override; - Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; - Status WaitForExecution(const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) override; Status DeconstructTuple(const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - Status GetComputationStats(const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - Status GetComputationGraphStats(const ComputationGraphStatsRequest* request, ComputationStatsResponse* response) override; - Status GetComputationShape(const GetComputationShapeRequest* arg, - 
GetComputationShapeResponse* result) override; - Status GetShape(const GetShapeRequest* arg, GetShapeResponse* result) override; @@ -85,30 +67,9 @@ class GRPCStub : public ServiceInterface { Status CreateChannelHandle(const CreateChannelHandleRequest* arg, CreateChannelHandleResponse* result) override; - // Methods used by ComputationBuilder. - Status Computation(const ComputationRequest* arg, - ComputationResponse* result) override; - - Status Op(const OpRequest* arg, OpResponse* result) override; - Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; - - Status SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) override; - - Status ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; - Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) override; - // Methods used by Computation. - Status SnapshotComputation(const SnapshotComputationRequest* ag, - SnapshotComputationResponse* result) override; - // Methods used by GlobalData. Status Unregister(const UnregisterRequest* arg, UnregisterResponse* result) override; diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto index c47164ee1b7657ae378a053f553442bee751753e..551ae895e05586daec0ffcd425f4950f76bdd50d 100644 --- a/tensorflow/compiler/xla/rpc/xla_service.proto +++ b/tensorflow/compiler/xla/rpc/xla_service.proto @@ -75,19 +75,7 @@ service XlaService { rpc GetShape(GetShapeRequest) returns (GetShapeResponse) { } - // Requests the program shape of the referenced computation. - rpc GetComputationShape(GetComputationShapeRequest) - returns (GetComputationShapeResponse) { - } - - // Requests the statistics of the given computation. 
- rpc GetComputationStats(ComputationStatsRequest) - returns (ComputationStatsResponse) { - } - // Requests the statistics of the given computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. rpc GetComputationGraphStats(ComputationGraphStatsRequest) returns (ComputationStatsResponse) { } @@ -121,25 +109,12 @@ service XlaService { rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) { } - // Tests if an expression is a compile-time constant. - rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) { - } - - // Computes the value of a constant expression. - rpc ComputeConstant(ComputeConstantRequest) - returns (ComputeConstantResponse) { - } - // Computes the value of a constant expression. The request contains the // computation graph for the constant expression. rpc ComputeConstantGraph(ComputeConstantGraphRequest) returns (ComputeConstantResponse) { } - // Retrieves the inferred shape for a value within a computation. - rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) { - } - // Requests one or more device handles from the target. The returned device // handles can be used to specify the device on which to execute computations // or transfer data. @@ -153,32 +128,6 @@ service XlaService { returns (CreateChannelHandleResponse) { } - // Requests that the referenced computation be specialized for the provided - // arguments for subsequent execution. This permits things such as value - // specialization. - rpc Specialize(SpecializeRequest) returns (SpecializeResponse) { - } - - // Modifies the provided computation so that subsequent executions - // will compute the provided ComputationDataHandle, rather than the - // last expression enqueued on that Computation. - rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) { - } - - // Computation creates a new computation with the given name. - // A unique ComputationHandle is returned. 
- rpc Computation(ComputationRequest) returns (ComputationResponse) { - } - - // Adds a new op to a computation. - rpc CreateOp(OpRequest) returns (OpResponse) { - } - - // Invokes the provided computation with the provided global data passed as - // immutable arguments. Returns global data output and execution timing. - rpc Execute(ExecuteRequest) returns (ExecuteResponse) { - } - // Invokes the provided computation with the provided global data passed as // immutable arguments. The request contains the whole computation graph. // Returns global data output and execution timing. @@ -188,38 +137,13 @@ service XlaService { // Invokes the provided list of computations in parallel with the provided // global data for each computation. Returns a list of global data output and // execution timing. - rpc ExecuteParallel(ExecuteParallelRequest) - returns (ExecuteParallelResponse) { - } - - // Invokes the provided list of computations in parallel with the provided - // global data for each computation. Returns a list of global data output and - // execution timing. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. rpc ExecuteGraphParallel(ExecuteGraphParallelRequest) returns (ExecuteParallelResponse) { } - // Invokes the provided computation with the provided global data passed as - // immutable arguments. Returns a handle to the execution. - rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) { - } - // Waits until the given execution (aysnchronously launched) is complete, and // returns the global data output. rpc WaitForExecution(WaitForExecutionRequest) returns (WaitForExecutionResponse) { } - - // Serializes a computation to proto form, so it can be loaded via - // LoadComputationSnapshot. - rpc SnapshotComputation(SnapshotComputationRequest) - returns (SnapshotComputationResponse) { - } - - // Loads a computation from a captured snapshot. 
- rpc LoadComputationSnapshot(LoadComputationSnapshotRequest) - returns (LoadComputationSnapshotResponse) { - } } diff --git a/tensorflow/compiler/xla/scanner.cc b/tensorflow/compiler/xla/scanner.cc deleted file mode 100644 index f23a1417fcec9b567f330d78957e94eca91a49da..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/scanner.cc +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/scanner.h" - -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" - -namespace xla { -namespace { - -// Returns true if c can be the first character in an identifier. -bool IsIdentifierFirst(int c) { return std::isalpha(c) || c == '_'; } - -// Returns true if c can be the non-first character in an identifier. -bool IsIdentifierLater(int c) { return std::isalnum(c) || c == '_'; } - -// Returns true if str is an identifier. 
-bool IsIdentifier(tensorflow::StringPiece str) { - if (str.empty() || !IsIdentifierFirst(str[0])) { - return false; - } - for (int64 i = 1; i < str.size(); ++i) { - if (!IsIdentifierLater(str[i])) { - return false; - } - } - return true; -} - -} // namespace - -Scanner::Scanner(tensorflow::StringPiece input) : input_(input), position_(0) {} - -bool Scanner::ok() const { return status().ok(); } - -const Status& Scanner::status() const { return status_; } - -bool Scanner::Match(tensorflow::StringPiece match) { - SkipWhitespace(); - if (ok() && position_ + match.size() <= input_.size() && - std::equal(match.begin(), match.end(), input_.begin() + position_)) { - SkipChars(match.size()); - - VLOG(10) << "Matched \"" << match << "\""; - return true; - } else { - return false; - } -} - -void Scanner::Expect(tensorflow::StringPiece expect) { - if (!Match(expect)) { - SetError(tensorflow::strings::StrCat("Expected \"", expect, "\".")); - } -} - -bool Scanner::MatchReadIdentifier(string* identifier) { - SkipWhitespace(); - if (!IsIdentifierFirst(PeekChar())) { - return false; - } - identifier->clear(); - do { - *identifier += ReadChar(); - } while (IsIdentifierLater(PeekChar())); - - VLOG(10) << "Read identifier " << identifier; - CHECK(IsIdentifier(*identifier)); - return true; -} - -string Scanner::ReadIdentifier() { - string identifier; - if (!MatchReadIdentifier(&identifier)) { - SetError("Expected identifier."); - } - return identifier; -} - -void Scanner::ExpectIdentifier(tensorflow::StringPiece expect) { - CHECK(IsIdentifier(expect)); - - string identifier; - if (!MatchReadIdentifier(&identifier)) { - SetError(tensorflow::strings::StrCat("Expected identifier ", expect, ".")); - } - if (identifier != expect) { - SetError(tensorflow::strings::StrCat("Expected identifier ", expect, - ", but got ", identifier, ".")); - } -} - -// Matches the end of the input, also known as End Of File (EOF). 
-bool Scanner::MatchEof() { - SkipWhitespace(); - return PeekChar() == EOF; -} - -void Scanner::ExpectEof() { - if (!MatchEof()) { - SetError("Expected end of input."); - } -} - -// Reads a vector of the format "(1, 2, 3)". -std::vector Scanner::ReadIntVector() { - std::vector ints; - Expect("("); - if (!Match(")") && ok()) { - ints.push_back(ReadInt()); - while (Match(",")) { - ints.push_back(ReadInt()); - } - Expect(")"); - } - - VLOG(10) << "Read int vector with " << ints.size() << " elements."; - return ints; -} - -int64 Scanner::ReadInt() { - bool negative = Match("-"); - if (!PeekDigit()) { - SetError("Expected integer."); - return 0; - } - - int64 integer = 0; - do { - integer = (ReadChar() - '0') + integer * 10; - } while (PeekDigit()); - integer = negative ? -integer : integer; - - VLOG(10) << "Read integer " << integer; - return integer; -} - -void Scanner::SkipWhitespace() { - while (PeekWhitespace()) { - SkipChars(1); - } -} - -int Scanner::ReadChar() { - int c = PeekChar(); - SkipChars(1); - - VLOG(20) << "Read char " << c; - return c; -} - -int Scanner::PeekChar() const { - return ok() && position_ < input_.size() ? input_[position_] : EOF; -} - -bool Scanner::PeekDigit() const { - // Do not use std::isdigit since it depends on the locale and we do not - // handle any digits beyond 0-9. - const char c = PeekChar(); - return '0' <= c && c <= '9'; -} - -bool Scanner::PeekAlnum() const { return std::isalnum(PeekChar()); } - -bool Scanner::PeekWhitespace() const { return std::isspace(PeekChar()); } - -void Scanner::SkipChars(int64 count) { - CHECK_GE(count, 0); - position_ += count; -} - -void Scanner::SetError(string error_message) { - // Only the first error is recorded since any later errors will likely be a - // consequence of the first error. 
- if (ok()) { - status_ = InvalidArgumentStrCat(std::move(error_message)); - position_ = input_.size(); - VLOG(10) << "Failed scanner with error " << status_.ToString(); - } else { - VLOG(10) << "Error on already failed scanner is " << error_message; - } -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/scanner.h b/tensorflow/compiler/xla/scanner.h deleted file mode 100644 index 86b04ae7f9a04c6197ed5bfe9aed2466535e507f..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/scanner.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SCANNER_H_ -#define TENSORFLOW_COMPILER_XLA_SCANNER_H_ - -#include "tensorflow/compiler/xla/status.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" - -namespace xla { - -// Simple class for parsing data. The concepts for the interface are: -// -// Match(x): Returns true if x is next in the input and in that case skips -// past it. Otherwise returns false. -// -// Expect(x): As Match(x), but requires x to be next in the input. -// -// MatchReadX(x): Returns true if an X is next in the input and in that case -// skips past it and assigns it to x. Otherwise returns false. -// -// ReadX(): As ReadMatchX(), but requires an X to be next in the input and -// returns it. 
-// -// PeekX(): Returns true if an X is next in the input and does not skip -// past it either way. -// -// All of these, except those that work on individual characters, skip -// whitespace. -// -// If a requirement is not met, the error is available in status(). A Scanner -// with a failed status() will behave as though the rest of the input is EOF and -// will not record further errors after that point. -class Scanner { - public: - Scanner(tensorflow::StringPiece input); - - bool ok() const; - const Status& status() const; - - bool Match(tensorflow::StringPiece match); - void Expect(tensorflow::StringPiece expect); - - // Match-reads an identifier. An identifier starts with an alphabetic - // character or an underscore followed by any number of characters that are - // each alphanumeric or underscore. - bool MatchReadIdentifier(string* identifier); - - string ReadIdentifier(); - - void ExpectIdentifier(tensorflow::StringPiece expect); - - // Matches the end of the input, also known as End Of File (EOF). - bool MatchEof(); - void ExpectEof(); - - // Reads a vector of the format "(1, 4, 5)". - std::vector ReadIntVector(); - - // Reads an integer. Can start with a minus but not a plus. - int64 ReadInt(); - - // Keeps skipping until encountering a non-whitespace character. - void SkipWhitespace(); - - // *** Below here are character-level methods that do not skip whitespace. - - int ReadChar(); - int PeekChar() const; - bool PeekDigit() const; - bool PeekAlnum() const; - bool PeekWhitespace() const; - - // Skip past the next count characters. - void SkipChars(int64 count); - - private: - // Sets a failed status. The input is in effect replaced with EOF after - // this. Only the first error is recorded. 
- void SetError(string error_message); - - const tensorflow::StringPiece input_; - int64 position_; - Status status_; -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SCANNER_H_ diff --git a/tensorflow/compiler/xla/scanner_test.cc b/tensorflow/compiler/xla/scanner_test.cc deleted file mode 100644 index 10cd0c6a042f3bb97d3b6797a51302001d8f0ed0..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/scanner_test.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// TODO(b/80179519): Fix open source build for real. 
-#if 0 -#include "tensorflow/compiler/xla/scanner.h" - -#include - -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/env.h" - -namespace xla { -namespace { - -TEST(Scanner, Empty) { - Scanner scanner(""); - - EXPECT_EQ(scanner.PeekChar(), EOF); - EXPECT_TRUE(scanner.MatchEof()); - EXPECT_TRUE(scanner.Match("")); - EXPECT_FALSE(scanner.Match("1")); - EXPECT_TRUE(scanner.ok()); -} - -TEST(Scanner, Prefix) { - Scanner scanner("1234 5"); - EXPECT_FALSE(scanner.MatchEof()); - EXPECT_TRUE(scanner.Match("12")); - EXPECT_TRUE(scanner.Match("34 ")); - EXPECT_FALSE(scanner.MatchEof()); - EXPECT_FALSE(scanner.Match("5 ")); - EXPECT_TRUE(scanner.Match("5")); - EXPECT_TRUE(scanner.MatchEof()); -} - -TEST(Scanner, Whitespace) { - Scanner scanner(" \t\n\r 1\t2\n\n"); - - EXPECT_FALSE(scanner.Match(" ")); - EXPECT_TRUE(scanner.Match("1")); - EXPECT_TRUE(scanner.Match("2")); - EXPECT_TRUE(scanner.MatchEof()); - EXPECT_TRUE(scanner.ok()); -} - -TEST(Scanner, Fail) { - Scanner scanner("153 4q"); - - scanner.Expect("5"); - EXPECT_FALSE(scanner.ok()); - EXPECT_FALSE(scanner.status().ok()); - - EXPECT_TRUE(scanner.MatchEof()); -} - -TEST(Scanner, Identifier) { - Scanner scanner("1 q1 _1_ _1a= qqb"); - - string identifier = "foo"; - EXPECT_FALSE(scanner.MatchReadIdentifier(&identifier)); - EXPECT_EQ(identifier, "foo"); - scanner.Match("1"); - - EXPECT_TRUE(scanner.MatchReadIdentifier(&identifier)); - EXPECT_EQ(identifier, "q1"); - - scanner.ExpectIdentifier("_1_"); - EXPECT_TRUE(scanner.ok()); - - scanner.ExpectIdentifier("_1a"); - EXPECT_TRUE(scanner.ok()); - - // The = after _1a is not included in the identifier. - scanner.Expect("="); - - // The expected identifier matches a prefix but is not the full identifier in - // the input. 
- EXPECT_TRUE(scanner.ok()); - scanner.ExpectIdentifier("qq"); - EXPECT_FALSE(scanner.ok()); -} - -TEST(Scanner, Int) { - Scanner scanner("1_2 3% -1 124345 -363 0 -0"); - EXPECT_EQ(1, scanner.ReadInt()); - EXPECT_TRUE(scanner.Match("_")); - EXPECT_EQ(2, scanner.ReadInt()); - EXPECT_EQ(3, scanner.ReadInt()); - EXPECT_TRUE(scanner.Match("%")); - EXPECT_EQ(-1, scanner.ReadInt()); - EXPECT_EQ(124345, scanner.ReadInt()); - EXPECT_EQ(-363, scanner.ReadInt()); - EXPECT_EQ(0, scanner.ReadInt()); - EXPECT_EQ(0, scanner.ReadInt()); - EXPECT_TRUE(scanner.MatchEof()); -} - -TEST(Scanner, IntVector) { - Scanner scanner("()(0) (-1,2) ( 3 , 4 )"); - EXPECT_THAT(scanner.ReadIntVector(), testing::IsEmpty()); - EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(0)); - EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(-1, 2)); - EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(3, 4)); - EXPECT_TRUE(scanner.MatchEof()); - EXPECT_TRUE(scanner.ok()); -} - -} // namespace -} // namespace xla -#endif diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 4d653a0196f70fa2f860d754d2ed57c976fb0eb5..26b48cf4196ce24a8a20f407f698d951e18193f9 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -21,13 +21,6 @@ load( "tf_proto_library_py", ) -xla_proto_library( - name = "session_proto", - srcs = ["session.proto"], - visibility = ["//visibility:public"], - deps = ["//tensorflow/compiler/xla:xla_data_proto"], -) - xla_proto_library( name = "hlo_proto", srcs = ["hlo.proto"], @@ -39,6 +32,7 @@ tf_proto_library_py( name = "hlo_proto", # bzl adds a _py suffix only to the OSS target. 
srcs = ["hlo.proto"], visibility = ["//visibility:public"], + deps = ["//tensorflow/compiler/xla:xla_data_proto_py"], ) xla_proto_library( @@ -75,6 +69,7 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -105,9 +100,11 @@ cc_library( ":bfloat16_support", ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -142,7 +139,7 @@ cc_library( ":hlo_dce", ":hlo_pass", ":tuple_simplifier", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", @@ -181,6 +178,10 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -188,6 +189,7 @@ tf_cc_test( name = "shape_inference_test", srcs = ["shape_inference_test.cc"], deps = [ + ":hlo", ":shape_inference", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -196,6 +198,7 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -231,8 +234,10 @@ cc_library( hdrs = ["hlo_evaluator.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_query", ":shape_inference", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", @@ -241,6 +246,12 @@ cc_library( "//tensorflow/compiler/xla:window_util", 
"//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -250,7 +261,7 @@ tf_cc_test( deps = [ ":hlo", ":hlo_evaluator", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", @@ -260,13 +271,14 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo_element_type_converter", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -276,6 +288,7 @@ cc_library( "dfs_hlo_visitor.cc", "hlo_computation.cc", "hlo_instruction.cc", + "hlo_instructions.cc", "hlo_module.cc", "hlo_opcode.cc", "hlo_sharding.cc", @@ -287,17 +300,19 @@ cc_library( "hlo_computation.h", "hlo_domain_metadata.h", "hlo_instruction.h", + "hlo_instructions.h", "hlo_module.h", "hlo_opcode.h", "hlo_sharding.h", ], deps = [ + ":hlo_casting_utils", ":hlo_module_config", ":hlo_proto", ":hlo_reachability", ":name_uniquer", - ":versioned_computation_handle", "//tensorflow/compiler/xla:array", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_tree", @@ -309,8 +324,14 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", 
"//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:human_readable_json", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -337,7 +358,7 @@ cc_library( deps = [ ":hlo", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -348,8 +369,8 @@ tf_cc_test( ":hlo", ":pattern_matcher", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:test", ], ) @@ -363,6 +384,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/types:span", ], ) @@ -387,9 +409,10 @@ cc_library( deps = [ ":hlo", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", - "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -398,36 +421,29 @@ tf_cc_test( srcs = ["hlo_matchers_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) -cc_library( - name = "versioned_computation_handle", - srcs = ["versioned_computation_handle.cc"], - hdrs = ["versioned_computation_handle.h"], - deps = [ - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", - ], -) - tf_cc_test( name = "hlo_instruction_test", srcs = ["hlo_instruction_test.cc"], deps = [ ":hlo", - 
"//tensorflow/compiler/xla:literal_util", + ":hlo_parser", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -436,15 +452,15 @@ tf_cc_test( srcs = ["hlo_sharding_test.cc"], deps = [ ":hlo", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -457,6 +473,9 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -465,7 +484,7 @@ tf_cc_test( srcs = ["call_graph_test.cc"], deps = [ ":call_graph", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", @@ -499,6 +518,7 @@ cc_library( hdrs = ["call_inliner.h"], deps = [ ":call_graph", + ":hlo_dce", ":hlo_pass", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", @@ -514,7 +534,7 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -524,6 +544,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -533,7 +554,7 @@ tf_cc_test( deps = [ ":call_graph", ":flatten_call_graph", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", @@ -547,45 +568,6 @@ tf_cc_test( ], ) -cc_library( - name = "user_computation", - srcs = ["user_computation.cc"], - hdrs = ["user_computation.h"], - deps = [ - ":hlo", - ":session_proto", - ":shape_inference", - ":versioned_computation_handle", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/core:lib", - ], -) - -tf_cc_test( - name = "user_computation_test", - srcs = ["user_computation_test.cc"], - deps = [ - ":hlo_matchers", - ":user_computation", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:test", - ], -) - cc_library( name = "platform_util", srcs = ["platform_util.cc"], @@ -598,6 +580,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", ], ) 
@@ -610,16 +593,19 @@ cc_library( ":computation_placer", ":device_memory_allocator", ":platform_util", - ":pool", + ":stream_pool", ":transfer_manager", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//third_party/eigen3", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -631,10 +617,8 @@ cc_library( ":allocation_tracker", ":backend", ":channel_tracker", - ":compilation_cache", ":compiler", ":computation_layout", - ":computation_tracker", ":device_memory_allocator", ":executable", ":execution_tracker", @@ -645,11 +629,9 @@ cc_library( ":hlo_module_config", ":hlo_proto_util", ":platform_util", - ":session_proto", ":source_map_util", + ":stream_pool", ":transfer_manager", - ":user_computation", - ":versioned_computation_handle", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:service_interface", @@ -663,7 +645,12 @@ cc_library( "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], alwayslink = 1, ) @@ -676,7 +663,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":computation_tracker", ":device_memory_allocator", ":executable", ":hlo", @@ -685,8 +671,6 @@ cc_library( ":platform_util", ":service", ":shaped_buffer", - ":user_computation", - ":versioned_computation_handle", "//tensorflow/compiler/xla:execution_options_util", 
"//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", @@ -696,9 +680,13 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:executable_build_options", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -710,7 +698,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":computation_tracker", ":platform_util", ":service", "//tensorflow/compiler/xla:status_macros", @@ -722,6 +709,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", ], ) @@ -772,6 +760,10 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -789,6 +781,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:ptr_util", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -806,11 +799,11 @@ cc_library( ":hlo_execution_profile", ":hlo_graph_dumper", ":hlo_proto", - ":pool", - ":session_proto", + ":maybe_owning_device_memory", ":shaped_buffer", - ":versioned_computation_handle", + ":stream_pool", "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -821,6 +814,10 @@ cc_library( "//tensorflow/core:lib_internal", 
"//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", ], ) @@ -839,6 +836,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/types:span", ], ) @@ -859,7 +857,7 @@ cc_library( hdrs = ["transfer_manager.h"], deps = [ ":shaped_buffer", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -868,6 +866,9 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -886,6 +887,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -895,32 +898,14 @@ cc_library( hdrs = ["execution_tracker.h"], deps = [ ":backend", - ":pool", + ":stream_pool", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", - ], -) - -cc_library( - name = "computation_tracker", - srcs = ["computation_tracker.cc"], - hdrs = ["computation_tracker.h"], - deps = [ - ":hlo", - ":hlo_module_config", - ":session_proto", - ":user_computation", - ":versioned_computation_handle", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", 
- "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -930,9 +915,6 @@ cc_library( hdrs = ["channel_tracker.h"], deps = [ ":hlo", - ":session_proto", - ":user_computation", - ":versioned_computation_handle", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -941,6 +923,9 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -951,6 +936,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -985,6 +971,8 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -994,12 +982,14 @@ tf_cc_test( deps = [ ":buffer_liveness", ":hlo", + ":hlo_dataflow_analysis", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/memory", ], ) @@ -1025,9 +1015,12 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -1038,24 +1031,25 @@ tf_cc_test( ":buffer_assignment", ":buffer_value", ":call_graph", - ":computation_tracker", ":copy_insertion", ":cpu_plugin", ":flatten_call_graph", ":hlo", ":hlo_ordering", ":hlo_scheduling", - 
"//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -1075,6 +1069,8 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1090,9 +1086,9 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -1110,6 +1106,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -1123,11 +1120,12 @@ tf_cc_test( ":hlo_ordering", ":hlo_value", ":tuple_points_to_analysis", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -1137,12 +1135,16 @@ cc_library( hdrs = ["hlo_module_group_metadata.h"], deps = [ ":hlo", + ":hlo_casting_utils", + ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", 
"//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", ], ) @@ -1152,6 +1154,7 @@ cc_library( hdrs = ["hlo_module_group_util.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_module_group_metadata", ":hlo_reachability", "//tensorflow/compiler/xla:status", @@ -1160,6 +1163,9 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1179,6 +1185,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) @@ -1186,16 +1193,18 @@ tf_cc_test( name = "hlo_scheduling_test", srcs = ["hlo_scheduling_test.cc"], deps = [ - ":buffer_value", + ":heap_simulator", ":hlo", + ":hlo_dce", ":hlo_ordering", + ":hlo_parser", ":hlo_scheduling", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + "//tensorflow/core:test", ], ) @@ -1205,7 +1214,7 @@ cc_library( hdrs = ["hlo_query.h"], deps = [ ":hlo", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", ], ) @@ -1219,6 +1228,7 @@ cc_library( ":hlo_pass", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", ], ) @@ -1228,9 +1238,23 @@ tf_cc_test( deps = [ ":hlo_matchers", ":instruction_fusion", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + ], +) + +cc_library( + name = "multi_output_fusion", + srcs = ["multi_output_fusion.cc"], + hdrs = ["multi_output_fusion.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -1241,9 +1265,13 @@ cc_library( deps = [ ":hlo", ":shape_inference", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -1259,8 +1287,10 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1271,6 +1301,7 @@ cc_library( deps = [ ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1278,6 +1309,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -1290,8 +1323,25 @@ cc_library( ":hlo_creation_utils", ":hlo_pass", ":while_util", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", + "@com_google_absl//absl/algorithm:container", + ], +) + +cc_library( + name = "scatter_expander", + srcs = ["scatter_expander.cc"], + hdrs = ["scatter_expander.h"], 
+ deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_pass", + ":while_util", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:statusor", + "@com_google_absl//absl/algorithm:container", ], ) @@ -1303,8 +1353,9 @@ tf_cc_test( ":batchnorm_expander", ":hlo", ":hlo_matchers", + ":hlo_parser", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -1313,6 +1364,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -1322,10 +1374,12 @@ cc_library( hdrs = ["algebraic_simplifier.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_creation_utils", ":hlo_pass", ":hlo_query", ":pattern_matcher", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1334,6 +1388,11 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -1343,9 +1402,10 @@ tf_cc_test( deps = [ ":algebraic_simplifier", ":hlo", + ":hlo_casting_utils", ":hlo_matchers", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -1357,6 +1417,8 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -1368,8 +1430,7 
@@ cc_library( ":hlo", ":hlo_creation_utils", ":hlo_pass", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", ], ) @@ -1381,7 +1442,7 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -1402,9 +1463,9 @@ tf_cc_test( deps = [ ":gather_expander", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -1416,12 +1477,13 @@ cc_library( ":call_inliner", ":hlo", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -1432,6 +1494,7 @@ tf_cc_test( ":conditional_simplifier", ":hlo", ":hlo_matchers", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -1443,6 +1506,52 @@ tf_cc_test( ], ) +cc_library( + name = "convolution_feature_group_converter", + srcs = ["convolution_feature_group_converter.cc"], + hdrs = ["convolution_feature_group_converter.h"], + deps = [ + ":hlo", + ":hlo_pass", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + 
"@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "convolution_feature_group_converter_test", + size = "small", + srcs = ["convolution_feature_group_converter_test.cc"], + deps = [ + ":convolution_feature_group_converter", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + ], +) + +cc_library( + name = "while_loop_analysis", + srcs = ["while_loop_analysis.cc"], + hdrs = ["while_loop_analysis.h"], + deps = [ + ":hlo", + ":hlo_evaluator", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "while_loop_simplifier", srcs = ["while_loop_simplifier.cc"], @@ -1450,10 +1559,12 @@ cc_library( deps = [ ":call_inliner", ":hlo", - ":hlo_evaluator", ":hlo_pass", + ":while_loop_analysis", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -1467,6 +1578,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -1491,7 +1603,7 @@ tf_cc_test( deps = [ ":defuser", ":hlo_matchers", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", ], @@ -1519,7 +1631,7 @@ tf_cc_test( deps = [ ":hlo_matchers", ":implicit_broadcast_remover", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", ], @@ -1561,7 +1673,7 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":tuple_simplifier", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -1576,11 +1688,12 @@ 
cc_library( hdrs = ["reshape_mover.h"], deps = [ ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", ], ) @@ -1591,7 +1704,7 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":reshape_mover", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", @@ -1601,6 +1714,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -1615,6 +1729,7 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -1626,7 +1741,7 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":inliner", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", @@ -1634,6 +1749,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/memory", ], ) @@ -1643,7 +1759,7 @@ cc_library( hdrs = ["computation_placer.h"], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", @@ -1653,6 +1769,8 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + 
"@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], alwayslink = True, # Contains per-platform computation placer registration ) @@ -1666,6 +1784,8 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1675,7 +1795,7 @@ cc_library( hdrs = ["generic_transfer_manager.h"], deps = [ ":transfer_manager", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -1703,6 +1823,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/types:span", ], ) @@ -1710,14 +1831,11 @@ tf_cc_test( name = "hlo_cost_analysis_test", srcs = ["hlo_cost_analysis_test.cc"], deps = [ - ":computation_tracker", ":cpu_plugin", ":hlo", ":hlo_cost_analysis", ":local_service", ":service", - ":user_computation", - ":versioned_computation_handle", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", @@ -1725,8 +1843,8 @@ tf_cc_test( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1746,6 +1864,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/algorithm:container", + 
"@com_google_absl//absl/memory", ], ) @@ -1756,10 +1876,11 @@ tf_cc_test( ":cpu_plugin", ":hlo_cost_analysis", ":hlo_execution_profile", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -1769,7 +1890,7 @@ tf_cc_test( deps = [ ":hlo", ":hlo_matchers", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", @@ -1784,12 +1905,15 @@ tf_cc_binary( deps = [ ":hlo", ":hlo_graph_dumper", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -1798,7 +1922,7 @@ tf_cc_test( srcs = ["hlo_module_test.cc"], deps = [ ":hlo", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", @@ -1806,6 +1930,8 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -1821,6 +1947,8 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1848,6 +1976,8 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + 
"@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1865,6 +1995,9 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1883,6 +2016,10 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1896,7 +2033,7 @@ tf_cc_test( ":hlo_matchers", ":hlo_ordering", ":instruction_fusion", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", @@ -1924,6 +2061,8 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -1933,15 +2072,15 @@ tf_cc_test( deps = [ ":hlo", ":hlo_liveness_analysis", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -1960,6 +2099,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -1980,6 +2120,8 @@ cc_library( 
"//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1994,12 +2136,13 @@ tf_cc_test( ":hlo_matchers", ":hlo_ordering", ":instruction_fusion", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -2017,6 +2160,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", ], ) @@ -2026,6 +2170,7 @@ cc_library( hdrs = ["tuple_points_to_analysis.h"], deps = [ ":hlo", + ":hlo_dataflow_analysis", ":logical_buffer", ":logical_buffer_analysis", "//tensorflow/compiler/xla:shape_tree", @@ -2035,6 +2180,11 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -2046,6 +2196,7 @@ tf_cc_test( ":hlo_matchers", ":instruction_fusion", ":tuple_points_to_analysis", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -2058,20 +2209,6 @@ tf_cc_test( ], ) -cc_library( - name = "compilation_cache", - srcs = ["compilation_cache.cc"], - hdrs = ["compilation_cache.h"], - deps = [ - ":executable", - ":hlo_module_config", - ":versioned_computation_handle", - "//tensorflow/compiler/xla:types", 
- "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", - ], -) - cc_library( name = "layout_assignment", srcs = [ @@ -2083,6 +2220,7 @@ cc_library( deps = [ ":computation_layout", ":hlo", + ":hlo_casting_utils", ":hlo_dce", ":hlo_graph_dumper", ":hlo_pass", @@ -2097,6 +2235,10 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -2119,6 +2261,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -2131,7 +2274,7 @@ tf_cc_test( ":hlo_graph_dumper", ":hlo_matchers", ":hlo_runner", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", @@ -2182,10 +2325,14 @@ cc_library( hdrs = ["hlo_verifier.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_pass", ":shape_inference", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -2194,6 +2341,7 @@ tf_cc_test( srcs = ["hlo_verifier_test.cc"], deps = [ ":hlo", + ":hlo_parser", ":hlo_verifier", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -2213,6 +2361,7 @@ cc_library( ":buffer_liveness", ":buffer_value", ":call_graph", + ":copy_insertion", ":flatten_call_graph", ":hlo", ":hlo_dce", @@ -2226,6 +2375,10 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ 
-2233,6 +2386,7 @@ tf_cc_test( name = "hlo_rematerialization_test", srcs = ["hlo_rematerialization_test.cc"], deps = [ + ":flatten_call_graph", ":hlo", ":hlo_matchers", ":hlo_ordering", @@ -2242,6 +2396,7 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", ], ) @@ -2251,6 +2406,7 @@ tf_cc_test( deps = [ ":hlo", ":hlo_dce", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", @@ -2262,6 +2418,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -2271,16 +2428,16 @@ tf_cc_test( deps = [ ":hlo", ":hlo_module_dce", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -2295,18 +2452,19 @@ tf_cc_test( ":hlo", ":hlo_matchers", ":layout_assignment", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", 
"//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -2343,6 +2501,9 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -2354,7 +2515,7 @@ cc_library( ":hlo", ":hlo_domain_map", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", @@ -2370,16 +2531,17 @@ tf_cc_test( ":hlo", ":hlo_cse", ":hlo_matchers", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -2392,10 +2554,11 @@ cc_library( ":hlo_evaluator", ":hlo_pass", ":hlo_query", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -2406,8 +2569,9 @@ tf_cc_test( ":hlo", ":hlo_constant_folding", ":hlo_matchers", + ":hlo_parser", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", @@ -2427,6 +2591,7 @@ cc_library( 
"//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -2441,6 +2606,22 @@ cc_library( "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "hlo_domain_verifier", + srcs = ["hlo_domain_verifier.cc"], + hdrs = ["hlo_domain_verifier.h"], + deps = [ + ":hlo", + ":hlo_domain_map", + ":hlo_graph_dumper", + ":hlo_pass", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", ], ) @@ -2463,12 +2644,11 @@ cc_library( hdrs = ["hlo_domain_remover.h"], deps = [ ":hlo", - ":hlo_domain_isolator", ":hlo_domain_map", + ":hlo_domain_verifier", ":hlo_graph_dumper", ":hlo_pass", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", ], ) @@ -2480,13 +2660,15 @@ tf_cc_test( ":hlo", ":hlo_domain_isolator", ":hlo_domain_remover", + ":hlo_parser", ":hlo_sharding_metadata", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -2499,7 +2681,7 @@ cc_library( ":hlo_evaluator", ":hlo_pass", ":hlo_query", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", @@ -2533,6 +2715,22 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "maybe_owning_device_memory", + srcs = [ + "maybe_owning_device_memory.cc", + ], + hdrs = [ 
+ "maybe_owning_device_memory.h", + ], + deps = [ + ":device_memory_allocator", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:variant", ], ) @@ -2542,6 +2740,7 @@ cc_library( hdrs = ["elemental_ir_emitter.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_module_config", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -2550,11 +2749,14 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", "@llvm//:core", "@llvm//:transform_utils", ], @@ -2571,10 +2773,10 @@ xla_test( "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -2586,10 +2788,11 @@ cc_library( ":computation_layout", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -2602,6 +2805,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ 
-2632,15 +2836,14 @@ cc_library( name = "hlo_tfgraph_builder", srcs = ["hlo_tfgraph_builder.cc"], hdrs = ["hlo_tfgraph_builder.h"], - visibility = ["//tensorflow/compiler/xla/tools:__pkg__"], deps = [ ":hlo", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -2663,9 +2866,10 @@ cc_library( hdrs = ["hlo_graph_dumper.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_execution_profile", ":hlo_tfgraph_builder", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:window_util", @@ -2673,6 +2877,9 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", ], alwayslink = 1, ) @@ -2683,11 +2890,13 @@ tf_cc_test( deps = [ ":hlo", ":hlo_graph_dumper", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -2714,16 +2923,16 @@ tf_cc_test( ":hlo_matchers", ":shape_inference", ":transpose_folding", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", + 
"//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", ], ) @@ -2735,7 +2944,7 @@ cc_library( deps = [ ":hlo", ":hlo_pass", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", @@ -2750,13 +2959,13 @@ tf_cc_test( ":hlo", ":shape_inference", ":zero_sized_hlo_elimination", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2764,21 +2973,25 @@ tf_cc_test( ) cc_library( - name = "pool", - hdrs = ["pool.h"], + name = "stream_pool", + srcs = ["stream_pool.cc"], + hdrs = ["stream_pool.h"], deps = [ - "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", ], ) tf_cc_test( - name = "pool_test", - srcs = ["pool_test.cc"], + name = "stream_pool_test", + srcs = ["stream_pool_test.cc"], deps = [ - ":pool", + ":stream_pool", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:stream_executor_no_cuda", ], ) @@ -2860,11 +3073,13 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:backend", 
"//tensorflow/compiler/xla/service:compiler", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//third_party/eigen3", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -2885,7 +3100,7 @@ cc_library( hdrs = ["tuple_util.h"], deps = [ ":hlo", - "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -2896,8 +3111,8 @@ tf_cc_test( ":tuple_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -2910,7 +3125,9 @@ cc_library( ":hlo", ":hlo_creation_utils", ":tuple_util", - "//tensorflow/core:lib", + "//tensorflow/compiler/xla:literal_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", ], ) @@ -2920,9 +3137,11 @@ tf_cc_test( deps = [ ":while_util", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + "@com_google_absl//absl/algorithm:container", ], ) @@ -2938,6 +3157,8 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", ], ) @@ -2948,8 +3169,8 @@ tf_cc_test( ":hlo_matchers", ":while_loop_invariant_code_motion", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:test", ], ) 
@@ -2965,6 +3186,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", ], ) @@ -2975,8 +3197,8 @@ tf_cc_test( ":hlo_matchers", ":while_loop_constant_sinking", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:test", ], ) @@ -2998,13 +3220,13 @@ cc_library( cc_library( name = "source_map_util", - srcs = ["source_map_util.cc"], + srcs = [], hdrs = ["source_map_util.h"], deps = [ ":executable", "//tensorflow/compiler/xla:status", - "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", ], ) @@ -3019,6 +3241,10 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:ptr_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -3029,9 +3255,83 @@ tf_cc_test( ":hlo_matchers", ":indexed_array_analysis", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + "//tensorflow/core:test", + ], +) + +cc_library( + name = "hlo_parser", + srcs = ["hlo_parser.cc"], + hdrs = ["hlo_parser.h"], + deps = [ + ":hlo", + ":hlo_lexer", + ":hlo_sharding_metadata", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib_internal", + "@com_google_absl//absl/algorithm:container", + 
"@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], +) + +tf_cc_test( + name = "hlo_parser_test", + size = "small", + srcs = ["hlo_parser_test.cc"], + deps = [ + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", # fixdeps: keep + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "hlo_lexer", + srcs = ["hlo_lexer.cc"], + hdrs = [ + "hlo_lexer.h", + "hlo_token.h", + ], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "hlo_casting_utils", + hdrs = ["hlo_casting_utils.h"], + deps = ["//tensorflow/core:lib"], +) + +tf_cc_test( + name = "hlo_casting_utils_test", + srcs = ["hlo_casting_utils_test.cc"], + deps = [ + ":hlo", + ":hlo_casting_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:test", ], ) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index c65c91e8e0a6e2511a2068e225e4f5572385c851..7c078f07d72ab4243d50b7f7910cb7c794e306c4 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -22,12 +22,20 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" @@ -39,8 +47,6 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -50,20 +56,15 @@ namespace { namespace m = match; -// Returns whether operand is a literal with the given value. 
-bool IsLiteralWithValue(const HloInstruction* operand, int8 value) { - return operand->opcode() == HloOpcode::kConstant && - operand->literal().IsAll(value); -} - bool IsAll(const HloInstruction* op, int8 value) { - if (IsLiteralWithValue(op, value)) { - return true; - } - if (op->opcode() == HloOpcode::kBroadcast && IsAll(op->operand(0), value)) { - return true; + switch (op->opcode()) { + case HloOpcode::kBroadcast: + return IsAll(op->operand(0), value); + case HloOpcode::kConstant: + return op->literal().IsAll(value); + default: + return false; } - return false; } // Returns whether the given transpose produces a result which is bit-wise @@ -75,21 +76,22 @@ bool TransposeIsBitcast(const HloInstruction* transpose) { transpose->dimensions()); } -// Returns true if the given reshape produces a result which is bit-wise +// Returns true if the given reshape/copy produces a result which is bit-wise // identical to its operand and thus may be replaced with a bitcast. // // This function is conservative -- even if this function returns false, the // reshape may still be a bitcast. For example, a reshape from [28x28] to [784]. -bool ReshapeIsBitcast( - const HloInstruction* reshape, +bool ReshapeOrCopyIsBitcast( + const HloInstruction* instr, const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) { - CHECK_EQ(HloOpcode::kReshape, reshape->opcode()); + CHECK(HloOpcode::kReshape == instr->opcode() || + HloOpcode::kCopy == instr->opcode()); - const HloInstruction* operand = reshape->operand(0); + const HloInstruction* operand = instr->operand(0); // Can't insert bitcasts if the compiler used a memory layout which isn't // compatible. 
- return ShapeUtil::ReshapeIsBitcast(operand->shape(), reshape->shape()) && - valid_bitcast_callback(operand->shape(), reshape->shape()); + return ShapeUtil::ReshapeIsBitcast(operand->shape(), instr->shape()) && + valid_bitcast_callback(operand->shape(), instr->shape()); } // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain @@ -125,6 +127,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleImag(HloInstruction* imag) override; + Status HandleIota(HloInstruction* instruction) override; + Status HandleConvolution(HloInstruction* convolution) override; Status HandleDivide(HloInstruction* divide) override; @@ -153,15 +157,14 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleDynamicUpdateSlice( HloInstruction* dynamic_update_slice) override; + Status HandleSort(HloInstruction* sort) override; + Status HandleTranspose(HloInstruction* transpose) override; Status HandleSubtract(HloInstruction* sub) override; Status HandleMap(HloInstruction* map) override; - Status HandleMaximum(HloInstruction* maximum) override; - Status HandleMinimum(HloInstruction* minimum) override; - // Returns whether algebraic simplification has occurred. const bool changed() const { return changed_; } @@ -200,8 +203,9 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // Helper method to perform and add reduction in a single dimension. 
HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) { - HloInstruction* zero = computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction* zero = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()).CloneToUnique())); HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation(); Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape()); return computation_->AddInstruction(HloInstruction::CreateReduce( @@ -233,10 +237,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* operand, HloInstruction* max, HloInstruction* max_operand); - // A Reshape or Broadcast that feeds an element-wise operation with a unique - // non-scalar operand can sink to after the operation. - StatusOr TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( - HloInstruction* reshape_or_broadcast); + // A Broadcast that feeds an element-wise operation with a unique non-scalar + // operand can sink to after the operation. + StatusOr TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( + HloInstruction* broadcast); // Replaces the existing HLO instruction old_instruction, with // new_instruction, and marks the optimizer status as changed. @@ -269,7 +273,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { StatusOr OptimizeDotOfConcat(HloInstruction* dot); StatusOr OptimizeDotOfConcatHelper( - const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim, + const HloInstruction& dot, HloInstruction* lhs, int64 lhs_contracting_dim, HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped); StatusOr OptimizeDotOfGather(HloInstruction* dot); @@ -433,14 +437,21 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) { copy, HloInstruction::CreateUnary(copy->shape(), HloOpcode::kCopy, op)); } // All copies can be eliminated (assuming layout constraints are satisified). 
- ReplaceInstructionIfSameShape(copy, copy->mutable_operand(0)); + if (ReplaceInstructionIfSameShape(copy, copy->mutable_operand(0))) { + return Status::OK(); + } + + if (is_layout_sensitive_ && + ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) { + ReplaceWithBitcast(copy); + } + return Status::OK(); } Status AlgebraicSimplifierVisitor::HandleConcatenate( HloInstruction* concatenate) { - tensorflow::gtl::ArraySlice operands( - concatenate->operands()); + absl::Span operands(concatenate->operands()); if (operands.size() == 1) { // Unary concatenates are useless. ReplaceInstructionIfSameShape(concatenate, operands[0]); @@ -449,7 +460,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( // Filter out and remove empty operands. std::vector nonempty_operands; for (HloInstruction* operand : operands) { - if (!ShapeUtil::HasZeroElements(operand->shape())) { + if (!ShapeUtil::IsZeroElementArray(operand->shape())) { nonempty_operands.push_back(operand); } } @@ -528,17 +539,29 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) { constant, BuildTupleConstant(computation_, constant->literal())); } + if (constant->shape().element_type() == TOKEN) { + return Status::OK(); + } + // If a literal is all the same element replace it with a scalar broadcast. if (ShapeUtil::ElementsIn(constant->shape()) > 1 && constant->literal().IsAllFirst()) { - std::unique_ptr unique_scalar = - MakeUnique(constant->literal().GetFirstScalarLiteral()); + std::unique_ptr unique_scalar = absl::make_unique( + LiteralUtil::GetFirstScalarLiteral(constant->literal())); HloInstruction* scalar = computation_->AddInstruction( HloInstruction::CreateConstant(std::move(unique_scalar))); return ReplaceWithNewInstruction( constant, HloInstruction::CreateBroadcast(constant->shape(), scalar, {})); } + + // If a literal is an increasing sequence from zero, replace it with an iota. 
+ if (ShapeUtil::Rank(constant->shape()) == 1 && + ShapeUtil::ElementsIn(constant->shape()) > 1 && + constant->literal().IsR1Iota()) { + return ReplaceWithNewInstruction( + constant, HloInstruction::CreateIota(constant->shape(), 0)); + } return Status::OK(); } @@ -563,6 +586,14 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) { return Status::OK(); } +namespace { +template +Status InvertConstant(const HloInstruction& constant, Literal* result) { + return result->Populate([&](absl::Span indices) { + return T{1.0} / constant.literal().Get(indices); + }); +} +} // namespace Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) { Shape* shape; @@ -624,14 +655,31 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) { // (Backends can do this transformation, but generally only if the constant is // a scalar.) if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) { - HloInstruction* one = - computation_->AddInstruction(HloInstruction::CreateConstant( - Literal::One(a->shape().element_type()).CloneToUnique())); - HloInstruction* inverse = computation_->AddInstruction( - HloInstruction::CreateBinary(b->shape(), HloOpcode::kDivide, one, b)); - return ReplaceWithNewInstruction( - divide, HloInstruction::CreateBinary(divide->shape(), - HloOpcode::kMultiply, a, inverse)); + Literal new_literal(b->shape()); + switch (b->shape().element_type()) { + case F16: + TF_RETURN_IF_ERROR(InvertConstant(*b, &new_literal)); + break; + case F32: + TF_RETURN_IF_ERROR(InvertConstant(*b, &new_literal)); + break; + case BF16: + TF_RETURN_IF_ERROR(InvertConstant(*b, &new_literal)); + break; + case F64: + TF_RETURN_IF_ERROR(InvertConstant(*b, &new_literal)); + break; + case C64: + TF_RETURN_IF_ERROR(InvertConstant(*b, &new_literal)); + break; + default: + return Status::OK(); + } + auto inverse = computation_->AddInstruction( + HloInstruction::CreateConstant((new_literal.CloneToUnique()))); + TF_ASSIGN_OR_RETURN(auto 
new_divide, + MakeBinaryHlo(HloOpcode::kMultiply, a, inverse)); + return ReplaceInstruction(divide, new_divide); } // (A / B) / (C / D) => (A / B)*(D / C) => (A * D) / (B * C) @@ -651,18 +699,18 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) { if (Match(divide, m::Divide(m::Divide(m::Op(&a), m::Op(&b)), m::Op(&c)))) { TF_ASSIGN_OR_RETURN(auto b_times_c, MakeBinaryHlo(HloOpcode::kMultiply, b, c)); - return ReplaceWithNewInstruction( - divide, HloInstruction::CreateBinary(divide->shape(), - HloOpcode::kDivide, a, b_times_c)); + TF_ASSIGN_OR_RETURN(auto new_divide, + MakeBinaryHlo(HloOpcode::kDivide, a, b_times_c)); + return ReplaceInstruction(divide, new_divide); } // A / (B / C) => (A*C) / B if (Match(divide, m::Divide(m::Op(&a), m::Divide(m::Op(&b), m::Op(&c))))) { TF_ASSIGN_OR_RETURN(auto a_times_c, MakeBinaryHlo(HloOpcode::kMultiply, a, c)); - return ReplaceWithNewInstruction( - divide, HloInstruction::CreateBinary(divide->shape(), - HloOpcode::kDivide, a_times_c, b)); + TF_ASSIGN_OR_RETURN(auto new_divide, + MakeBinaryHlo(HloOpcode::kDivide, a_times_c, b)); + return ReplaceInstruction(divide, new_divide); } return Status::OK(); @@ -793,18 +841,18 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfConcat( TF_ASSIGN_OR_RETURN( HloInstruction * optimized_lhs_concat, - OptimizeDotOfConcatHelper(dot->shape(), lhs, lhs_contracting_dim, rhs, + OptimizeDotOfConcatHelper(*dot, lhs, lhs_contracting_dim, rhs, rhs_contracting_dim, /*swapped=*/false)); if (optimized_lhs_concat) { return optimized_lhs_concat; } - return OptimizeDotOfConcatHelper(dot->shape(), rhs, rhs_contracting_dim, lhs, + return OptimizeDotOfConcatHelper(*dot, rhs, rhs_contracting_dim, lhs, lhs_contracting_dim, /*swapped=*/true); } StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper( - const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim, + const HloInstruction& dot, HloInstruction* lhs, int64 lhs_contracting_dim, HloInstruction* rhs, int64 
rhs_contracting_dim, bool swapped) { bool can_optimize = lhs->opcode() == HloOpcode::kConcatenate && lhs->concatenate_dimension() == lhs_contracting_dim && @@ -903,11 +951,12 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper( } auto* new_dot = computation_->AddInstruction(HloInstruction::CreateDot( - dot_shape, new_dot_lhs, new_dot_rhs, new_dot_dnums)); + dot.shape(), new_dot_lhs, new_dot_rhs, new_dot_dnums)); + new_dot->set_precision_config(dot.precision_config()); if (add_result) { add_result = computation_->AddInstruction(HloInstruction::CreateBinary( - dot_shape, HloOpcode::kAdd, add_result, new_dot)); + dot.shape(), HloOpcode::kAdd, add_result, new_dot)); } else { add_result = new_dot; } @@ -1006,6 +1055,7 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfGather( auto memoized_shape = ShapeUtil::MakeShape(F32, {m, n}); auto* memoized_inst = computation_->AddInstruction(HloInstruction::CreateDot( memoized_shape, left_operand, right_operand, dnums)); + memoized_inst->set_precision_config(dot->precision_config()); // Get pair {start, 0} or {0, start}. HloInstruction* original_start_indices = lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1); @@ -1058,11 +1108,11 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { } // Replace a zero element dot with a broadcast of the constant 0. 
- if (ShapeUtil::HasZeroElements(dot->shape()) || - ShapeUtil::HasZeroElements(lhs->shape()) || - ShapeUtil::HasZeroElements(rhs->shape())) { + if (ShapeUtil::IsZeroElementArray(dot->shape()) || + ShapeUtil::IsZeroElementArray(lhs->shape()) || + ShapeUtil::IsZeroElementArray(rhs->shape())) { auto zero = computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); return ReplaceWithNewInstruction( dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {})); } @@ -1103,6 +1153,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), rhs->mutable_operand(0), lhs->mutable_operand(0), dot_dimension_numbers)); + new_dot->set_precision_config(dot->precision_config()); return ReplaceWithNewInstruction( dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0})); } @@ -1124,6 +1175,19 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } + // 0*A => 0. Only applies for integral types for correct NaN-handling. + if (IsAll(lhs, 0) && + primitive_util::IsIntegralType(multiply->shape().element_type()) && + ReplaceInstructionIfSameShape(multiply, lhs)) { + return Status::OK(); + } + // A*0 => 0 + if (IsAll(rhs, 0) && + primitive_util::IsIntegralType(multiply->shape().element_type()) && + ReplaceInstructionIfSameShape(multiply, rhs)) { + return Status::OK(); + } + // exp(A) * exp(B) => exp(A+B) if (Match(multiply, m::Multiply(m::Exp(m::Op(&lhs)), m::Exp(m::Op(&rhs))))) { auto add = computation_->AddInstruction(HloInstruction::CreateBinary( @@ -1185,9 +1249,8 @@ namespace { // return value = {1, 3} // // Precondition: input_dim_indices is sorted. 
-std::pair> ReshapeLeavesDimensionsUnmodified( - const HloInstruction* hlo, - tensorflow::gtl::ArraySlice input_dim_indices) { +absl::optional> ReshapeLeavesDimensionsUnmodified( + const HloInstruction* hlo, absl::Span input_dim_indices) { CHECK_EQ(HloOpcode::kReshape, hlo->opcode()); CHECK(std::is_sorted(input_dim_indices.begin(), input_dim_indices.end())); @@ -1205,11 +1268,11 @@ std::pair> ReshapeLeavesDimensionsUnmodified( } if (i >= unmodified_dims.size() || unmodified_dims[i].first != input_dim_index) { - return std::make_pair(false, std::vector()); + return absl::nullopt; } output_dim_indices.push_back(unmodified_dims[i].second); } - return std::make_pair(true, output_dim_indices); + return output_dim_indices; } // Returns true if the output of "instruction" is a permutation of the @@ -1221,9 +1284,10 @@ bool OutputIsPermutationOfOperandElements(HloInstruction* instruction, switch (instruction->opcode()) { case HloOpcode::kReshape: case HloOpcode::kReverse: - case HloOpcode::kSort: case HloOpcode::kTranspose: return true; + case HloOpcode::kSort: + return (!ShapeUtil::IsTuple(instruction->shape())); default: return false; } @@ -1305,7 +1369,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { // broadcast after the unary element-wise operation. TF_ASSIGN_OR_RETURN( bool sink_succeeded, - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(broadcast)); + TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(broadcast)); changed_ |= sink_succeeded; if (sink_succeeded) { return Status::OK(); @@ -1337,6 +1401,15 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } + // broadcast(iota) -> iota. + if (operand->opcode() == HloOpcode::kIota) { + return ReplaceWithNewInstruction( + broadcast, + HloInstruction::CreateIota( + broadcast->shape(), + dims[Cast(operand)->iota_dimension()])); + } + // Merge two consecutive broadcasts into a single one. 
if (operand->opcode() == HloOpcode::kBroadcast) { std::vector new_dimensions; @@ -1391,8 +1464,21 @@ Status AlgebraicSimplifierVisitor::HandleImag(HloInstruction* imag) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleIota(HloInstruction* instruction) { + // iota -> zero if the iota dimension never produces an element other than + // zero. + auto* iota = Cast(instruction); + if (iota->shape().dimensions(iota->iota_dimension()) <= 1) { + auto zero = computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(iota->shape().element_type()).CloneToUnique())); + return ReplaceWithNewInstruction( + iota, HloInstruction::CreateBroadcast(iota->shape(), zero, {})); + } + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { - if (ShapeUtil::HasZeroElements(pad->operand(0)->shape())) { + if (ShapeUtil::IsZeroElementArray(pad->operand(0)->shape())) { return ReplaceWithNewInstruction( pad, HloInstruction::CreateBroadcast(pad->shape(), pad->mutable_operand(1), {})); @@ -1487,7 +1573,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { CHECK(Match(power, m::Power(m::Op(&lhs), m::Op(&rhs)))); if (IsAll(rhs, 0)) { auto one = HloInstruction::CreateConstant( - Literal::One(power->shape().element_type()).CloneToUnique()); + LiteralUtil::One(power->shape().element_type()).CloneToUnique()); std::unique_ptr ones; if (ShapeUtil::IsScalar(power->shape())) { ones = std::move(one); @@ -1522,7 +1608,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString(); if (IsAll(rhs, -1)) { auto* one = computation_->AddInstruction(HloInstruction::CreateConstant( - Literal::One(rhs->shape().element_type()).CloneToUnique())); + LiteralUtil::One(rhs->shape().element_type()).CloneToUnique())); // Explicitly broadcast scalar 1 to the output shape, to avoid implicit // broadcast in divide HLO as we are trying to 
eliminate implicit @@ -1557,15 +1643,16 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { return Status::OK(); } -StatusOr AlgebraicSimplifierVisitor:: - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( - HloInstruction* reshape_or_broadcast) { +StatusOr +AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( + HloInstruction* broadcast) { + TF_RET_CHECK(broadcast->opcode() == HloOpcode::kBroadcast); bool changed = false; - if (ShapeUtil::IsScalar(reshape_or_broadcast->shape())) { + if (ShapeUtil::IsScalar(broadcast->shape())) { return false; } - HloInstruction* operand = reshape_or_broadcast->mutable_operand(0); - for (HloInstruction* user : reshape_or_broadcast->users()) { + HloInstruction* operand = broadcast->mutable_operand(0); + for (HloInstruction* user : broadcast->users()) { if (user->user_count() == 0 && user != computation_->root_instruction()) { continue; } @@ -1583,55 +1670,50 @@ StatusOr AlgebraicSimplifierVisitor:: continue; } - int64 reshape_or_broadcast_operand_index = -1; // Find the unique non-scalar operand or continue if there isn't one. 
- int64 scalar_count = 0; - for (int64 i = 0; i < user->operand_count(); ++i) { - if (ShapeUtil::IsScalar(user->operand(i)->shape())) { - ++scalar_count; - } else { - reshape_or_broadcast_operand_index = i; + int64 scalar_broadcast_count = 0; + int64 broadcast_use_count = 0; + for (HloInstruction* user_operand : user->operands()) { + if (user_operand->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { + ++scalar_broadcast_count; + } else if (broadcast == user_operand) { + ++broadcast_use_count; } } - if (scalar_count != user->operand_count() - 1) { + if (scalar_broadcast_count + broadcast_use_count != user->operand_count()) { continue; } - VLOG(4) << "Sinking reshape or broadcast after user:"; - VLOG(4) << " old reshape/broadcast: " << reshape_or_broadcast->ToString(); + std::vector new_operands; + new_operands.reserve(user->operand_count()); + + for (HloInstruction* user_operand : user->operands()) { + if (user_operand->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { + new_operands.push_back( + computation_->AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType( + operand->shape(), user_operand->shape().element_type()), + user_operand->mutable_operand(0), {}))); + } else { + CHECK_EQ(broadcast, user_operand); + new_operands.push_back(operand); + } + } + VLOG(4) << "Sinking broadcast after user:"; + VLOG(4) << " old broadcast: " << broadcast->ToString(); VLOG(4) << " old user: " << user->ToString(); - CHECK_EQ(user->operand(reshape_or_broadcast_operand_index), - reshape_or_broadcast); - auto new_user_operands = user->operands(); - new_user_operands[reshape_or_broadcast_operand_index] = operand; - auto new_user = computation_->AddInstruction(user->CloneWithNewOperands( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(operand->shape().dimensions()), - LayoutUtil::MinorToMajor(operand->shape())), - new_user_operands)); 
+ HloInstruction* new_user = + computation_->AddInstruction(user->CloneWithNewOperands( + ShapeUtil::ChangeElementType(operand->shape(), + user->shape().element_type()), + new_operands)); VLOG(4) << " new user: " << new_user->ToString(); - HloInstruction* new_reshape_or_broadcast = nullptr; - if (reshape_or_broadcast->opcode() == HloOpcode::kReshape) { - new_reshape_or_broadcast = - computation_->AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(reshape_or_broadcast->shape().dimensions()), - LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())), - new_user)); - } else { - TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast); - new_reshape_or_broadcast = - computation_->AddInstruction(HloInstruction::CreateBroadcast( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(reshape_or_broadcast->shape().dimensions()), - LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())), - new_user, reshape_or_broadcast->dimensions())); - } - VLOG(4) << " new reshape/broadcast: " - << new_reshape_or_broadcast->ToString(); - TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_reshape_or_broadcast)); + HloInstruction* new_broadcast = + computation_->AddInstruction(HloInstruction::CreateBroadcast( + user->shape(), new_user, broadcast->dimensions())); + VLOG(4) << " new broadcast: " << new_broadcast->ToString(); + TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_broadcast)); changed = true; } return changed; @@ -1642,7 +1724,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { // Reshape directly to empty constant if the shape contains zero-element // dimension. 
- if (ShapeUtil::HasZeroElements(reshape->shape())) { + if (ShapeUtil::IsZeroElementArray(reshape->shape())) { auto empty_constant = HloInstruction::CreateConstant( Literal::CreateFromShape(reshape->shape())); @@ -1661,32 +1743,39 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { reshape, HloInstruction::CreateReshape(reshape->shape(), operand->mutable_operand(0))); } + if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) { + *operand->mutable_shape() = reshape->shape(); + return ReplaceInstruction(reshape, operand); + } if (HloOpcode::kBroadcast == reshape->operand(0)->opcode()) { auto opt_dims = ReshapeLeavesDimensionsUnmodified( reshape, reshape->operand(0)->dimensions()); - if (opt_dims.first) { + if (opt_dims.has_value()) { return ReplaceWithNewInstruction( reshape, HloInstruction::CreateBroadcast( reshape->shape(), reshape->mutable_operand(0)->mutable_operand(0), - opt_dims.second)); + *opt_dims)); } } - // A Reshape that feeds a unary element-wise operation can sink the - // reshape after the unary element-wise operation. - TF_ASSIGN_OR_RETURN( - bool sink_succeeded, - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(reshape)); - changed_ |= sink_succeeded; - if (sink_succeeded) { - return Status::OK(); + // reshape(iota) -> iota. + if (operand->opcode() == HloOpcode::kIota) { + auto* iota = Cast(operand); + auto opt_dims = + ReshapeLeavesDimensionsUnmodified(reshape, {iota->iota_dimension()}); + if (opt_dims.has_value()) { + CHECK_EQ(opt_dims->size(), 1); + return ReplaceWithNewInstruction( + reshape, + HloInstruction::CreateIota(reshape->shape(), opt_dims->front())); + } } // Make this a bitcast if possible. 
if (is_layout_sensitive_ && - ReshapeIsBitcast(reshape, valid_bitcast_callback_)) { + ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) { ReplaceWithBitcast(reshape); return Status::OK(); } @@ -1712,19 +1801,37 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { if (ReplaceInstructionIfSameShape(slice, slice->mutable_operand(0))) { return Status::OK(); } + + auto is_unstrided_slice = [](const HloInstruction* hlo) { + return absl::c_all_of(hlo->slice_strides(), + [](int64 stride) { return stride == 1; }); + }; + if (slice->operand(0)->opcode() == HloOpcode::kSlice && + is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) { + HloInstruction* operand_slice = slice->mutable_operand(0); + std::vector new_slice_starts = slice->slice_starts(); + std::vector new_slice_limits = slice->slice_limits(); + for (int64 i = 0; i < new_slice_starts.size(); ++i) { + new_slice_starts[i] += operand_slice->slice_starts(i); + new_slice_limits[i] += operand_slice->slice_starts(i); + } + return ReplaceWithNewInstruction( + slice, HloInstruction::CreateSlice( + slice->shape(), operand_slice->mutable_operand(0), + new_slice_starts, new_slice_limits, slice->slice_strides())); + } return Status::OK(); } Status AlgebraicSimplifierVisitor::HandleDynamicSlice( HloInstruction* dynamic_slice) { auto operand = dynamic_slice->mutable_operand(0); - auto start_indices = dynamic_slice->operand(1); if (ShapeUtil::IsScalar(dynamic_slice->shape())) { return ReplaceInstruction(dynamic_slice, operand); } - // DynamicSlice where operand has the same size as the output and - // start_indices are all zero is simply equal to operand. - if (IsAll(start_indices, 0) && SameShape(operand, dynamic_slice)) { + // DynamicSlice where operand has the same size as the output is simply equal + // to operand. 
+ if (SameShape(operand, dynamic_slice)) { return ReplaceInstruction(dynamic_slice, operand); } return Status::OK(); @@ -1733,27 +1840,17 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice( Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice( HloInstruction* dynamic_update_slice) { auto update = dynamic_update_slice->mutable_operand(1); - auto start_indices = dynamic_update_slice->operand(2); - // DynamicUpdateSlice on a scalar just passes through the update argument. - if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) { - return ReplaceInstruction(dynamic_update_slice, update); - } - // DynamicUpdateSlice where operand and update have the same size and - // start_indices are all zero is simply equal to update. - // - // (We require start_indices to be all zero because we want this optimization - // not to affect the visible behavior of this op even when the indices are out - // of range. Currently dynamic-update-slice wraps out-of-range indices, so - // we can only remove the op if its indices never wrap.) - if (IsAll(start_indices, 0) && SameShape(dynamic_update_slice, update)) { + // DynamicUpdateSlice where operand and update have the same size is simply + // equal to update. + if (SameShape(dynamic_update_slice, update)) { return ReplaceInstruction(dynamic_update_slice, update); } // If any dimension of update is 0, elide the DynamicUpdateSlice. This // optimization becomes invalid should we later prefer to warn about out of // bound indices. - if (ShapeUtil::HasZeroElements(update->shape())) { + if (ShapeUtil::IsZeroElementArray(update->shape())) { return ReplaceInstruction(dynamic_update_slice, dynamic_update_slice->mutable_operand(0)); } @@ -1761,12 +1858,18 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice( } Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Most of those optimizations can be done for multi-output + // reduces. 
+ if (ShapeUtil::IsTuple(reduce->shape())) { + return Status::OK(); + } + auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); - tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); + absl::Span dimensions(reduce->dimensions()); HloComputation* function = reduce->to_apply(); - if (ShapeUtil::HasZeroElements(arg->shape()) || - ShapeUtil::HasZeroElements(reduce->shape())) { + if (ShapeUtil::IsZeroElementArray(arg->shape()) || + ShapeUtil::IsZeroElementArray(reduce->shape())) { return ReplaceWithNewInstruction( reduce, HloInstruction::CreateBroadcast(reduce->shape(), init_value, {})); @@ -1788,6 +1891,46 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { new_reduce_dimensions, function)); } + // If the reduction results in the same number of elements, then the only + // possible side effect would be a reshape. Since the init_value is an + // identity of the reduction function, we can therefore replace the reduce + // with a simple reshape, ignoring the reduction function completely. + if (ShapeUtil::ElementsIn(reduce->shape()) == + ShapeUtil::ElementsIn(arg->shape())) { + return ReplaceWithNewInstruction( + reduce, HloInstruction::CreateReshape(reduce->shape(), arg)); + } + + // If a reduce feeds a reduce with the same computation and initial value, + // they can be combined into a single reduce. + if (arg->opcode() == HloOpcode::kReduce && + init_value->Identical(*arg->operand(1)) && + *function == *arg->to_apply()) { + // Create a new reduce with the combined reduction dimensions of both + // reduces. + std::vector arg_dims = arg->dimensions(); + std::sort(arg_dims.begin(), arg_dims.end()); + std::vector reduce_dims = reduce->dimensions(); + std::sort(reduce_dims.begin(), reduce_dims.end()); + // Transform reduce_dims to the same rank as the operand of the operand. 
+ for (int64 arg_dim : arg_dims) { + for (int64& dim : reduce_dims) { + if (dim >= arg_dim) { + ++dim; + } + } + } + std::vector new_dimensions; + new_dimensions.reserve(arg->dimensions().size() + + reduce->dimensions().size()); + std::merge(arg_dims.begin(), arg_dims.end(), reduce_dims.begin(), + reduce_dims.end(), std::back_inserter(new_dimensions)); + return ReplaceWithNewInstruction( + reduce, + HloInstruction::CreateReduce(reduce->shape(), arg->mutable_operand(0), + init_value, new_dimensions, function)); + } + // A reshape that collapses multiple dimensions into a dimension being // reduced can just reduce all of those dimensions instead of doing a // collapsing reshape before a reduction. @@ -1832,21 +1975,33 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { new_reduce_dimensions, function)); } } - if (ShapeUtil::ElementsIn(reduce->shape()) == - ShapeUtil::ElementsIn(arg->shape()) || - ShapeUtil::HasZeroElements(arg->shape())) { - auto reshape = computation_->AddInstruction( - HloInstruction::CreateReshape(reduce->shape(), arg)); - return ReplaceWithNewInstruction( - reduce, HloInstruction::CreateMap(reduce->shape(), - {init_value, reshape}, function)); + // Convert Reduce(concat({a,b,...})) to + // map(reduce(a),map(reduce(b),...,)) + // + // This should make fusion easier or use less memory bandwidth in the unfused + // case. 
+ if (arg->opcode() == HloOpcode::kConcatenate && + absl::c_linear_search(reduce->dimensions(), + arg->concatenate_dimension())) { + HloInstruction* old_reduce = nullptr; + for (HloInstruction* operand : arg->operands()) { + HloInstruction* new_reduce = computation_->AddInstruction( + HloInstruction::CreateReduce(reduce->shape(), operand, init_value, + reduce->dimensions(), function)); + if (old_reduce != nullptr) { + new_reduce = computation_->AddInstruction(HloInstruction::CreateMap( + reduce->shape(), {old_reduce, new_reduce}, function)); + } + old_reduce = new_reduce; + } + return ReplaceInstruction(reduce, old_reduce); } return Status::OK(); } Status AlgebraicSimplifierVisitor::HandleReduceWindow( HloInstruction* reduce_window) { - if (ShapeUtil::HasZeroElements(reduce_window->operand(0)->shape())) { + if (ShapeUtil::IsZeroElementArray(reduce_window->operand(0)->shape())) { return ReplaceWithNewInstruction( reduce_window, HloInstruction::CreateBroadcast(reduce_window->shape(), @@ -1860,7 +2015,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow( return ReplaceWithNewInstruction( reduce_window, HloInstruction::CreateMap(reduce_window->shape(), - {operand, reduce_window->mutable_operand(1)}, + {reduce_window->mutable_operand(1), operand}, function)); } @@ -1880,9 +2035,9 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow( VLOG(10) << "Considering folding Pad: " << pad->ToString() << "\ninto reduce-window: " << reduce_window->ToString() - << (convert != nullptr ? tensorflow::strings::StrCat( - "\nvia convert: ", convert->ToString()) - : ""); + << (convert != nullptr + ? absl::StrCat("\nvia convert: ", convert->ToString()) + : ""); // Do not fold interior padding into ReduceWindow since the backends do not // support it. 
@@ -2014,6 +2169,21 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow( /*reduce_computation=*/function)); } +Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) { + auto operand = sort->mutable_operand(0); + int64 dimension_to_sort = sort->dimensions(0); + if (ShapeUtil::IsZeroElementArray(operand->shape()) || + operand->shape().dimensions(dimension_to_sort) <= 1) { + if (sort->operand_count() == 1) { + return ReplaceInstruction(sort, operand); + } + // If it is key/value sort, the output of sort is a tuple. + return ReplaceWithNewInstruction( + sort, HloInstruction::CreateTuple({operand, sort->mutable_operand(1)})); + } + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { auto operand = transpose->mutable_operand(0); if (std::is_sorted(transpose->dimensions().begin(), @@ -2030,6 +2200,11 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { transpose->dimensions()))); } + if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) { + *operand->mutable_shape() = transpose->shape(); + return ReplaceInstruction(transpose, operand); + } + if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) { ReplaceWithBitcast(transpose); return Status::OK(); @@ -2042,19 +2217,152 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( HloInstruction* convolution) { auto lhs = convolution->mutable_operand(0); auto rhs = convolution->mutable_operand(1); - if (ShapeUtil::HasZeroElements(lhs->shape()) || - ShapeUtil::HasZeroElements(rhs->shape())) { + if (ShapeUtil::IsZeroElementArray(lhs->shape()) || + ShapeUtil::IsZeroElementArray(rhs->shape())) { return ReplaceWithNewInstruction( convolution, HloInstruction::CreateBroadcast( convolution->shape(), - computation_->AddInstruction(HloInstruction::CreateConvert( - ShapeUtil::MakeShape(convolution->shape().element_type(), {}), - computation_->AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(0.0f))))), + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(convolution->shape().element_type()) + .CloneToUnique())), {})); } + const auto& window = convolution->window(); + const ConvolutionDimensionNumbers& dnums = + convolution->convolution_dimension_numbers(); + + // Try to merge padding/dilation of the input with the convolution's window. + TF_ASSIGN_OR_RETURN(bool folded_input_pad, [&]() -> StatusOr { + if (lhs->opcode() != HloOpcode::kPad) { + return false; + } + + // Convolution's padding is always zero, so bail if the kPad is adding + // something other than zero. + if (!IsAll(lhs->operand(1), 0)) { + return false; + } + + const auto& padding = lhs->padding_config(); + + // Can't pad batch or feature dims. + for (int64 dim : + {dnums.input_batch_dimension(), dnums.input_feature_dimension()}) { + const auto& p = padding.dimensions(dim); + if (p.edge_padding_low() != 0 || p.edge_padding_high() != 0 || + p.interior_padding() != 0) { + return false; + } + } + + // Compute the window which is the result of merging the kPad and the + // convolution's existing window. + Window new_window = window; + for (int64 dim = 0; dim < dnums.input_spatial_dimensions_size(); ++dim) { + auto& w = *new_window.mutable_dimensions(dim); + const auto& p = padding.dimensions(dnums.input_spatial_dimensions(dim)); + // Edge padding composes with itself in the straightforward way, but + // composing interior padding is nontrivial, and we cowardly refuse to + // think about it. If we see interior padding in either the kPad or conv, + // bail if there's any sort of padding in the other. 
+ if (p.interior_padding() != 0 && + (w.padding_low() != 0 || w.padding_high() != 0 || + w.base_dilation() != 1)) { + return false; + } + if (w.base_dilation() != 1 && + (p.edge_padding_low() != 0 || p.edge_padding_high() != 0 || + p.interior_padding() != 0)) { + return false; + } + + w.set_padding_low(w.padding_low() + p.edge_padding_low()); + w.set_padding_high(w.padding_high() + p.edge_padding_high()); + if (p.interior_padding() != 0) { + CHECK_EQ(w.base_dilation(), 1); + w.set_base_dilation(1 + p.interior_padding()); + } + } + + auto new_conv = convolution->CloneWithNewOperands( + convolution->shape(), {lhs->mutable_operand(0), rhs}); + new_conv->set_window(new_window); + TF_RETURN_IF_ERROR( + ReplaceWithNewInstruction(convolution, std::move(new_conv))); + return true; + }()); + + if (folded_input_pad) { + return Status::OK(); + } + + // Try to merge dilation of the filter with the convolution's window. + TF_ASSIGN_OR_RETURN(bool folded_filter_pad, [&]() -> StatusOr { + if (rhs->opcode() != HloOpcode::kPad) { + return false; + } + + // Convolution's padding is always zero, so bail if the kPad is adding + // something other than zero. + if (!IsAll(rhs->operand(1), 0)) { + return false; + } + + const auto& padding = rhs->padding_config(); + + // Can't pad or dilate feature dims. + for (int64 dim : {dnums.kernel_input_feature_dimension(), + dnums.kernel_output_feature_dimension()}) { + const auto& p = padding.dimensions(dim); + if (p.edge_padding_low() != 0 || p.edge_padding_high() != 0 || + p.interior_padding() != 0) { + return false; + } + } + + // Compute the window which is the result of merging the kPad and the + // convolution's existing window. 
+ Window new_window = convolution->window(); + for (int64 dim = 0; dim < dnums.kernel_spatial_dimensions_size(); ++dim) { + auto& w = *new_window.mutable_dimensions(dim); + const auto& p = padding.dimensions(dnums.kernel_spatial_dimensions(dim)); + + // We can only do this transformation if p adds dilation to the filter -- + // edge padding on the filter is not supported in conv. + if (p.edge_padding_low() != 0 || p.edge_padding_high() != 0) { + return false; + } + + // Nothing to do if the kPad for this dim is entirely a nop. + if (p.interior_padding() == 0) { + continue; + } + + // We cowardly refuse to think about how dilation composes with itself; + // bail if both the kPad and conv have dilation on this dimension. + if (w.window_dilation() > 1) { + return false; + } + CHECK_EQ(w.window_dilation(), 1); + w.set_window_dilation(1 + p.interior_padding()); + w.set_size(rhs->operand(0)->shape().dimensions( + dnums.kernel_spatial_dimensions(dim))); + } + + auto new_conv = convolution->CloneWithNewOperands( + convolution->shape(), {lhs, rhs->mutable_operand(0)}); + new_conv->set_window(new_window); + TF_RETURN_IF_ERROR( + ReplaceWithNewInstruction(convolution, std::move(new_conv))); + return true; + }()); + + if (folded_filter_pad) { + return Status::OK(); + } + if (!enable_conv_simplification_) { return Status::OK(); } @@ -2071,8 +2379,6 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( return Status::OK(); } - const ConvolutionDimensionNumbers& dnums = - convolution->convolution_dimension_numbers(); const Shape& input_shape = lhs->shape(); const Shape& filter_shape = rhs->shape(); const Shape& convolution_shape = convolution->shape(); @@ -2172,6 +2478,8 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( dot_dimension_numbers.add_rhs_contracting_dimensions(0); auto dot = computation_->AddInstruction(HloInstruction::CreateDot( dot_output_shape, new_lhs, new_rhs, dot_dimension_numbers)); + dot->set_precision_config(convolution->precision_config()); + 
return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot)); } @@ -2223,68 +2531,6 @@ Status AlgebraicSimplifierVisitor::HandleMap(HloInstruction* map) { return ReplaceWithNewInstruction(map, std::move(clone)); } -Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum) { - // Match the following tree: - // min_operand operand - // \ / - // max_operand min - // \ / - // max - // where max_operand and min_operand are scalar constants. - { - HloInstruction* min; - HloInstruction* max_operand; - HloInstruction* min_operand; - HloInstruction* operand; - - if (hlo_query::MatchBinaryInstructionOperandOpcode( - HloOpcode::kMinimum, maximum, - /*matching_operand=*/&min, - /*other_operand=*/&max_operand) && - hlo_query::MatchBinaryInstructionOperand( - hlo_query::IsScalarConstant, min, - /*matching_operand=*/&min_operand, - /*other_operand=*/&operand) && - TransformToClampIfSameShape(maximum, min, min_operand, operand, maximum, - max_operand)) { - return Status::OK(); - } - } - - return Status::OK(); -} - -Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum) { - // Match the following tree: - // max_operand operand - // \ / - // min_operand max - // \ / - // min - // where max_operand and min_operand are scalar constants. 
- { - HloInstruction* max; - HloInstruction* max_operand; - HloInstruction* min_operand; - HloInstruction* operand; - - if (hlo_query::MatchBinaryInstructionOperandOpcode( - HloOpcode::kMaximum, minimum, - /*matching_operand=*/&max, - /*other_operand=*/&min_operand) && - hlo_query::MatchBinaryInstructionOperand( - hlo_query::IsScalarConstant, max, - /*matching_operand=*/&max_operand, - /*other_operand=*/&operand) && - TransformToClampIfSameShape(minimum, minimum, min_operand, operand, max, - max_operand)) { - return Status::OK(); - } - } - - return Status::OK(); -} - StatusOr AlgebraicSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(2, "AlgebraicSimplifier::Run(), before:\n" + module->ToString()); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index c48196e861a559a5abfa360841ec70b39356fa2b..b864c372fa5877ca329d2efbbf7d747c763ae2c0 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -47,7 +47,7 @@ class AlgebraicSimplifier : public HloPassInterface { enable_dot_strength_reduction_(enable_dot_strength_reduction), enable_conv_simplification_(enable_conv_simplification) {} ~AlgebraicSimplifier() override = default; - tensorflow::StringPiece name() const override { return "algsimp"; } + absl::string_view name() const override { return "algsimp"; } // Run algebraic simplification on the given computation. Returns whether the // computation was changed. diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index d5f0afe960f414cb611ef84f1d25a8009f1af78a..43a891e4fa163e833692a8e71b8f2f21d377e323 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -18,11 +18,15 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" @@ -34,13 +38,12 @@ limitations under the License. #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" - -using ::testing::ElementsAre; namespace xla { namespace { +using ::testing::ElementsAre; + namespace op = xla::testing::opcode_matchers; AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() { @@ -60,7 +63,7 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero)); @@ -74,6 +77,64 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { EXPECT_EQ(root, param0); } +// Test that A * 0 is simplified to 0 +TEST_F(AlgebraicSimplifierTest, MulZero) { + Shape r0s32 = ShapeUtil::MakeShape(S32, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0s32, "param0")); + 
HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0s32, HloOpcode::kMultiply, param0, zero)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + EXPECT_EQ(computation->root_instruction(), zero); +} + +// Test that Reduce(Reduce(A)) -> Reduce(A) +TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { + HloComputation::Builder builder(TestName()); + // Create add computation. + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + HloComputation* add_computation = nullptr; + { + HloComputation::Builder builder(TestName() + ".add"); + const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); + HloInstruction* p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "p0")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); + add_computation = module().AddEmbeddedComputation(builder.Build()); + } + Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, r4f32, "param")); + std::vector dims0({0}); + Shape r3f32 = ShapeUtil::MakeShape(F32, {5, 6, 7}); + HloInstruction* reduce0 = builder.AddInstruction( + HloInstruction::CreateReduce(r3f32, param, zero, dims0, add_computation)); + std::vector dims1({1, 2}); + Shape r1f32 = ShapeUtil::MakeShape(F32, {5}); + builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero, + dims1, add_computation)); + 
module().AddEntryComputation(builder.Build()); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Reduce(param, zero)); + EXPECT_EQ(root->dimensions(), std::vector({0, 2, 3})); +} + // Test that Const + A is canonicalized to A + Const. TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { Shape r0f32 = ShapeUtil::MakeShape(F32, {}); @@ -81,7 +142,7 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0)); @@ -102,9 +163,9 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); HloInstruction* constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.14159f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.14159f))); HloInstruction* add1 = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, constant1)); @@ -127,7 +188,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); 
HloInstruction* bcast = builder.AddInstruction( HloInstruction::CreateBroadcast(r2f32, zero, {0, 1})); builder.AddInstruction( @@ -162,9 +223,12 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - builder.AddInstruction( - HloInstruction::CreateMap(r2f32, {param0, zero}, add_computation)); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + builder.AddInstruction(HloInstruction::CreateMap( + r2f32, + {param0, builder.AddInstruction( + HloInstruction::CreateBroadcast(r2f32, zero, {}))}, + add_computation)); auto computation = module().AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -173,7 +237,7 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { non_bitcasting_callback()); ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); root = computation->root_instruction(); - EXPECT_THAT(root, op::Add(param0, zero)); + EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero))); } TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { @@ -182,7 +246,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 0, 0}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 0, 0}))); HloInstruction* bcast = builder.AddInstruction(HloInstruction::CreateBroadcast(r2f32, zero, {1})); builder.AddInstruction( @@ -201,7 +265,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) { HloComputation::Builder builder(TestName()); builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({3.14f, 3.14f, 3.14f}))); + 
LiteralUtil::CreateR1({3.14f, 3.14f, 3.14f}))); auto computation = module().AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -217,7 +281,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) { TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) { HloComputation::Builder builder(TestName()); builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({3.14, 3.14, 4}))); + LiteralUtil::CreateR1({3.14, 3.14, 4}))); auto computation = module().AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); @@ -229,6 +293,21 @@ TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) { EXPECT_THAT(root, op::Constant()); } +TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) { + HloComputation::Builder builder(TestName()); + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f}))); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_THAT(root, op::Iota()); +} + // Test that A - 0 is simplified to A TEST_F(AlgebraicSimplifierTest, SubZero) { Shape r0f32 = ShapeUtil::MakeShape(F32, {}); @@ -236,7 +315,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero)); @@ -257,7 +336,7 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) { HloInstruction* param0 = 
builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction(HloInstruction::CreateBinary( r0f32, HloOpcode::kSubtract, param0, constant)); @@ -329,17 +408,16 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C). TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); Shape r2f32 = ShapeUtil::MakeShape(F32, {42, 123}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r0f32, "param0")); + HloInstruction::CreateParameter(0, r2f32, "param0")); HloInstruction* param1 = builder.AddInstruction( HloInstruction::CreateParameter(1, r2f32, "param1")); HloInstruction* param2 = builder.AddInstruction( HloInstruction::CreateParameter(2, r2f32, "param2")); HloInstruction* param3 = builder.AddInstruction( - HloInstruction::CreateParameter(3, r0f32, "param3")); + HloInstruction::CreateParameter(3, r2f32, "param3")); HloInstruction* div0 = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, param1)); HloInstruction* div1 = builder.AddInstruction( @@ -360,8 +438,6 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { EXPECT_THAT( computation->root_instruction(), op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2))); - EXPECT_TRUE( - ShapeUtil::Compatible(computation->root_instruction()->shape(), r2f32)); } // Test that A/exp(B) is simplified to A*exp(-B). @@ -421,7 +497,6 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) { // Test that broadcasting is done on the right step when simplifying A/pow(B,C) // to A*pow(B,-C). 
TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); Shape r1f32 = ShapeUtil::MakeShape(F32, {7}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( @@ -429,7 +504,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { HloInstruction* param1 = builder.AddInstruction( HloInstruction::CreateParameter(1, r1f32, "param1")); HloInstruction* param2 = builder.AddInstruction( - HloInstruction::CreateParameter(2, r0f32, "param2")); + HloInstruction::CreateParameter(2, r1f32, "param2")); HloInstruction* power = builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param1, param2)); builder.AddInstruction( @@ -446,14 +521,9 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { ASSERT_THAT(computation->root_instruction(), op::Multiply(param0, op::Power(param1, op::Negate(param2)))); - - const HloInstruction* negate = - computation->root_instruction()->operand(1)->operand(1); - const Shape& negate_shape = negate->shape(); - EXPECT_EQ(0, negate_shape.dimensions_size()); } -// A / Const => A * (1 / Const) +// A / Const => A * InvertedConst TEST_F(AlgebraicSimplifierTest, DivideByConstant) { Shape r1f32 = ShapeUtil::MakeShape(F32, {3}); HloComputation::Builder builder(TestName()); @@ -461,7 +531,7 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { HloInstruction::CreateParameter(0, r1f32, "param0")); HloInstruction* constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({0.f, 1.f, 2.f}))); + LiteralUtil::CreateR1({1.f, 2.f, 3.f}))); builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, constant)); @@ -472,20 +542,19 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), - op::Multiply(param0, op::Divide(op::Constant(), constant))); + op::Multiply(param0, op::Constant())); } // 
pow(pow(A, X), Y) => pow(A, X*Y) TEST_F(AlgebraicSimplifierTest, PowerOfPower) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); Shape r1f32 = ShapeUtil::MakeShape(F32, {7}); HloComputation::Builder builder(TestName()); HloInstruction* base = builder.AddInstruction( HloInstruction::CreateParameter(0, r1f32, "param0")); HloInstruction* exp1 = builder.AddInstruction( - HloInstruction::CreateParameter(1, r0f32, "param1")); + HloInstruction::CreateParameter(1, r1f32, "param1")); HloInstruction* exp2 = builder.AddInstruction( - HloInstruction::CreateParameter(2, r0f32, "param2")); + HloInstruction::CreateParameter(2, r1f32, "param2")); HloInstruction* inner_power = builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, base, exp1)); builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, @@ -502,15 +571,14 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex // numbers. TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) { - Shape r0c64 = ShapeUtil::MakeShape(C64, {}); Shape r1c64 = ShapeUtil::MakeShape(C64, {7}); HloComputation::Builder builder(TestName()); HloInstruction* base = builder.AddInstruction( HloInstruction::CreateParameter(0, r1c64, "param0")); HloInstruction* exp1 = builder.AddInstruction( - HloInstruction::CreateParameter(1, r0c64, "param1")); + HloInstruction::CreateParameter(1, r1c64, "param1")); HloInstruction* exp2 = builder.AddInstruction( - HloInstruction::CreateParameter(2, r0c64, "param2")); + HloInstruction::CreateParameter(2, r1c64, "param2")); HloInstruction* inner_power = builder.AddInstruction( HloInstruction::CreateBinary(r1c64, HloOpcode::kPower, base, exp1)); builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower, @@ -529,7 +597,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* one 
= builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one)); @@ -550,7 +618,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); HloInstruction* one = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 1.0}, {1.0, 1.0}}))); + LiteralUtil::CreateR2({{1.0, 1.0}, {1.0, 1.0}}))); HloInstruction* div = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one)); @@ -830,7 +898,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero)); @@ -854,7 +922,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r1f32, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero)); @@ -882,7 +950,7 @@ TEST_F(AlgebraicSimplifierTest, Pow1) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, 
param0, one)); @@ -904,7 +972,7 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* two = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two)); @@ -926,7 +994,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param0")); HloInstruction* negative_one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(-1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(-1))); builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, negative_one)); @@ -1017,7 +1085,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { builder.AddInstruction(HloInstruction::CreateReduceWindow( ShapeUtil::MakeShape(F32, {5, 2}), param, builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), window, add_computation)); module().AddEntryComputation(builder.Build()); HloPassFix simplifier(/*is_layout_sensitive=*/false, @@ -1044,7 +1112,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {5, 2}), param, builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), padding)); module().AddEntryComputation(builder.Build()); EXPECT_THAT(module().entry_computation()->root_instruction(), @@ -1086,7 +1154,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) { HloComputation::Builder builder(TestName()); HloInstruction* input = builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); @@ -1121,6 +1189,33 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) { EXPECT_THAT(computation->root_instruction(), param0); } +TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) { + HloComputation::Builder builder(TestName()); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), "param")); + *param->mutable_shape()->mutable_layout() = + LayoutUtil::MakeLayout({0, 1, 2, 3}); + HloInstruction* copy = builder.AddInstruction(HloInstruction::CreateUnary( + ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param)); + *copy->mutable_shape()->mutable_layout() = + LayoutUtil::MakeLayout({1, 2, 0, 3}); + auto computation = module().AddEntryComputation(builder.Build()); + EXPECT_THAT(computation->root_instruction(), op::Copy(param)); + + AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true, + non_bitcasting_callback()); + ASSERT_FALSE(simplifier1.Run(&module()).ValueOrDie()); + // Verify that the copy is not replaced. + EXPECT_THAT(computation->root_instruction(), op::Copy(param)); + + AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true, + bitcasting_callback()); + ASSERT_TRUE(simplifier2.Run(&module()).ValueOrDie()); + // Verify that the copy is replaced. + EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); +} + // Test that unary concatenates are removed. 
TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { Shape r1f32 = ShapeUtil::MakeShape(F32, {100}); @@ -1151,7 +1246,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { HloInstruction* param1 = builder.AddInstruction( HloInstruction::CreateParameter(1, r1f32, "param1")); HloInstruction* empty_literal = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({}))); HloInstruction* empty_slice = builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1})); @@ -1173,6 +1268,55 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { op::Concatenate(param0, param0, param1)); } +// Test that reduce of concat is simplified. +TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { + const int kParamLength = 100; + Shape r3f32 = + ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r3f32, "param0")); + HloInstruction* param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, r3f32, "param1")); + HloInstruction* param2 = builder.AddInstruction( + HloInstruction::CreateParameter(2, r3f32, "param2")); + Shape concat_shape = + ShapeUtil::MakeShape(F32, {kParamLength, 3 * kParamLength, kParamLength}); + HloInstruction* Concatenate = + builder.AddInstruction(HloInstruction::CreateConcatenate( + concat_shape, {param0, param1, param2}, 1)); + HloComputation* add_computation = nullptr; + { + HloComputation::Builder builder(TestName() + ".add"); + const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); + HloInstruction* p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "p0")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "p1")); + builder.AddInstruction( + 
HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); + add_computation = module().AddEmbeddedComputation(builder.Build()); + } + Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7}); + Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength}); + + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + builder.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, Concatenate, zero, {1, 2}, add_computation)); + + auto computation = module().AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT( + computation->root_instruction(), + op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)), + op::Reduce(param2, zero))); +} + // Test a concatenate with only empty operands is removed. TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { const int kParamLength = 100; @@ -1181,7 +1325,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r1f32, "param0")); HloInstruction* empty_literal = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({}))); HloInstruction* empty_slice = builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1})); @@ -1302,6 +1446,37 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); } +// Test transforming reshapes and transposes of rng. 
+TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { + HloComputation::Builder builder(TestName()); + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); + HloInstruction* one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); + HloInstruction* rng0 = builder.AddInstruction( + HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {2, 2}), + RandomDistribution::RNG_UNIFORM, {zero, one})); + + HloInstruction* transpose = builder.AddInstruction( + HloInstruction::CreateTranspose(rng0->shape(), rng0, {1, 0})); + Shape reshape_shape = builder + .AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {4}), transpose)) + ->shape(); + + auto computation = module().AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + // Verify that reshape(transpose(rng)) is replaced by a single rng of the + // same shape as the reshape. + EXPECT_THAT(computation->root_instruction(), op::Rng()); + EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(), + reshape_shape)); +} + // Test transforming reshapes to bitcasts under various conditions. 
TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { HloComputation::Builder builder(TestName()); @@ -1351,59 +1526,6 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape)); } -TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) { - HloComputation::Builder builder(TestName()); - HloInstruction* param = - builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), "param")); - HloInstruction* movable_reshape = - builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), param)); - HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - builder.AddInstruction( - HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), - HloOpcode::kMaximum, movable_reshape, zero)); - auto computation = module().AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Maximum(op::Reshape(param), zero)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); - - simplifier.Run(&module()).ValueOrDie(); - EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Maximum(param, zero))); -} - -// Regression test for a bug in the reshape sinking transformation, where -// moving a reshape to a scalar led to a crash. 
-TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) { - HloComputation::Builder builder(TestName()); - HloInstruction* param = - builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 1}), "param")); - HloInstruction* reshape = builder.AddInstruction( - HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {}), param)); - HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1., 2., 3.}))); - builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(F32, {3}), HloOpcode::kMaximum, reshape, zero)); - auto computation = module().AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Maximum(op::Reshape(param), zero)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); - - simplifier.Run(&module()).ValueOrDie(); - - EXPECT_THAT(computation->root_instruction(), - op::Maximum(op::Reshape(param), zero)); -} - // Regression test for a bug where if we failed to sink a reshape, we'd set the // 'changed' bit in AlgebraicSimplifier to false. 
TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { @@ -1416,7 +1538,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "param0")), builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{0, 0}, {0, 0}}))))); + LiteralUtil::CreateR2({{0, 0}, {0, 0}}))))); builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add)); @@ -1439,7 +1561,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) { builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "param0")), builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{0, 0}, {0, 0}}))))); + LiteralUtil::CreateR2({{0, 0}, {0, 0}}))))); builder.AddInstruction( HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add, @@ -1716,13 +1838,133 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) { op::Reshape(op::Broadcast(param))); } +TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { + HloComputation::Builder builder(TestName()); + auto iota = builder.AddInstruction(HloInstruction::CreateIota( + ShapeUtil::MakeShape(F32, {1, 2, 3, 7, 12, 1}), 2)); + Shape result_shape = ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}); + builder.AddInstruction(HloInstruction::CreateReshape(result_shape, iota)); + + auto computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape)); +} + +TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) { + HloComputation::Builder builder(TestName()); + auto iota = 
builder.AddInstruction( + HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {1, 1}), 0)); + auto result_shape = iota->shape(); + + auto computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Iota()); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + auto root = computation->root_instruction(); + EXPECT_THAT(root, op::Broadcast(op::Constant())); + EXPECT_EQ(0.0f, root->operand(0)->literal().GetFirstElement()); + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape)); +} + +TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) { + HloComputation::Builder builder(TestName()); + auto iota = builder.AddInstruction( + HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2}), 1)); + builder.AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6}), iota)); + + auto computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); +} + +TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { + HloComputation::Builder builder(TestName()); + auto iota = builder.AddInstruction( + HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4}), 2)); + builder.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), iota)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + 
ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Iota()); + EXPECT_EQ(Cast(computation->root_instruction()) + ->iota_dimension(), + 3); +} + +TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) { + HloComputation::Builder builder(TestName()); + auto iota = builder.AddInstruction( + HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 2}), 2)); + builder.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {6, 1, 1, 2}), iota)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Iota()); + const int64 iota_dim = + Cast(computation->root_instruction()) + ->iota_dimension(); + EXPECT_THAT(iota_dim, ::testing::AnyOf(1, 2, 3)); +} + +TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) { + HloComputation::Builder builder(TestName()); + auto iota = builder.AddInstruction( + HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4, 2}), 2)); + builder.AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6, 8}), iota)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); +} + TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { HloComputation::Builder builder(TestName()); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {2, 2}), "param")); HloInstruction* zero 
= builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); PaddingConfig no_padding; for (int i = 0; i < 2; ++i) { auto dimension = no_padding.add_dimensions(); @@ -1740,7 +1982,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -1753,7 +1995,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {10, 10}), "param")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); PaddingConfig padding; int64 low_padding[2] = {-1, -2}; int64 high_padding[2] = {2, -3}; @@ -1785,7 +2027,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); EXPECT_TRUE(has_negative_padding(pad)); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero))); EXPECT_FALSE( @@ -1807,7 +2049,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -1830,11 +2072,336 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + 
ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } +TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) { + HloComputation::Builder builder(TestName()); + const int64 dim0 = 11; + const int64 dim1 = 12; + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param")); + HloInstruction* original_slice = + builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {dim0 - 2, dim1 - 4}), param, + /*start_indices=*/{1, 2}, + /*limit_indices=*/{dim0 - 1, dim1 - 2}, /*strides=*/{1, 1})); + + builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice, + /*start_indices=*/{2, 3}, + /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1})); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param))); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Slice(param)); + EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3); + EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5); + EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2); + EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4); +} + +TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {1}); + auto keys = builder.AddInstruction( + HloInstruction::CreateParameter(0, keys_shape, "keys")); + builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + 
AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), keys); +} + +TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {5, 0}); + Shape values_shape = ShapeUtil::MakeShape(S32, {5, 0}); + auto keys = builder.AddInstruction( + HloInstruction::CreateParameter(0, keys_shape, "keys")); + auto values = builder.AddInstruction( + HloInstruction::CreateParameter(1, values_shape, "values")); + builder.AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values)); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + EXPECT_THAT(computation->root_instruction(), op::Tuple(keys, values)); +} + +// Used for TEST_Ps that test merging (or not) of a kPad instruction into a +// convolution's Window. 
+struct ConvPaddingTestcase { + ConvPaddingTestcase(absl::string_view padding, + absl::string_view orig_conv_window, + absl::string_view expected_conv_window) + : ConvPaddingTestcase(padding, orig_conv_window, expected_conv_window, + /*pad_value=*/0) {} + + ConvPaddingTestcase(absl::string_view padding, + absl::string_view orig_conv_window, + absl::string_view expected_conv_window, float pad_value) + : padding(padding), + orig_conv_window(orig_conv_window), + expected_conv_window(expected_conv_window), + pad_value(pad_value) {} + + string ToString() const { + return absl::StrFormat( + "padding=%s, orig_conv_window=%s, expected_conv_window=%s, " + "pad_value=%f", + padding, orig_conv_window, expected_conv_window, pad_value); + } + + string padding; + string orig_conv_window; + string expected_conv_window; + float pad_value; +}; + +// ConvInputPaddingTest (and its one associated TEST_P testcase) checks that a +// computation that does +// +// conv(pad(param0, padding=padding), param1), window=orig_conv_window +// +// gets transformed by AlgebraicSimplifier to +// +// conv(param0, param1), window=expected_conv_window +// +// or, if expected_conv_window is the empty string, checks that +// AlgebraicSimplifier does *not* transform the original convolution. +class ConvInputPaddingTest + : public AlgebraicSimplifierTest, + public ::testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P( + ConvInputPaddingTestCases, ConvInputPaddingTest, + ::testing::ValuesIn(std::vector{ + // Merge this edge padding into the conv. + {"0_0x0_0x1_1x2_2", "", "pad=1_1x2_2"}, + // Merge this edge padding with the conv's edge padding. + {"0_0x0_0x1_2x3_4", "pad=10_10x20_20", "pad=11_12x23_24"}, + // Merge this interior-padded kPad with the unpadded conv. The 3x6 + // interior padding gets transformed to 4x7 conv lhs dilation. + {"0_0x0_0x1_2_3x4_5_6", "", "pad=1_2x4_5 lhs_dilate=4x7"}, + // kPad has dilation on one dim, conv has it on the other; merge them. 
+ {"0_0x0_0x0_0_1x0_0_0", "lhs_dilate=1x10", "lhs_dilate=2x10"}, + // kPad has dilation and edge padding on one dim, conv has them on the + // other; merge them. + {"0_0x0_0x0_1_1x0_0_0", "pad=0_0x3_0 lhs_dilate=1x10", + "pad=0_1x3_0 lhs_dilate=2x10"}, + + // Don't transform if the pad value is nonzero. + {"0_0x0_0x1_1x2_2", "", "", /*pad_value=*/1}, + + // We refuse to transform the following because on some dimension, one + // of the kPad and conv has dilation and the other has some sort of + // padding. + {"0_0x0_0x0_0_1x0_0", "pad=1_0x0_0", ""}, + {"0_0x0_0x0_0_1x0_0", "pad=0_1x0_0", ""}, + {"0_0x0_0x0_0_1x0_0", "lhs_dilate=2x1", ""}, + {"0_0x0_0x1_0_0x0_0", "lhs_dilate=2x1", ""}, + {"0_0x0_0x0_1_0x0_0", "lhs_dilate=2x1", ""}, + {"0_0x0_0x0_0_1x0_0", "lhs_dilate=2x1", ""}, + + // We can't merge feature or batch padding into the conv. + {"1_0x0_0x0_0x0_0", "", ""}, + {"0_0x1_0x0_0x0_0", "", ""}, + })); + +TEST_P(ConvInputPaddingTest, DoTest) { + ConvPaddingTestcase testcase = GetParam(); + + // It would be better to put the testcase's ToString into the test name, but + // gUnit has constraints on what can go into test names, and any reasonable + // implementation of ToString() seems to violate them. 
+ SCOPED_TRACE(testcase.ToString()); + + auto builder = HloComputation::Builder(TestName()); + auto* input = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1024, 128, 100, 100}), // bf01 + "input")); + auto* pad_value = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(testcase.pad_value))); + + PaddingConfig padding_config = + ParsePaddingConfig(testcase.padding).ValueOrDie(); + auto* lhs_pad = builder.AddInstruction(HloInstruction::CreatePad( + ShapeInference::InferPadShape(input->shape(), pad_value->shape(), + padding_config) + .ValueOrDie(), + input, pad_value, padding_config)); + + auto* filter = builder.AddInstruction(HloInstruction::CreateParameter( + 1, + ShapeUtil::MakeShape( + F32, {lhs_pad->shape().dimensions(1), 256, 3, 3}), // io01 + "input")); + + ConvolutionDimensionNumbers dnums = + ParseConvolutionDimensionNumbers("bf01_io01->bf01").ValueOrDie(); + Window window = + ParseWindow(absl::StrCat("size=3x3 ", testcase.orig_conv_window)) + .ValueOrDie(); + builder.AddInstruction(HloInstruction::CreateConvolve( + ShapeInference::InferConvolveShape(lhs_pad->shape(), filter->shape(), + window, dnums) + .ValueOrDie(), + lhs_pad, filter, window, dnums)); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + if (testcase.expected_conv_window.empty()) { + ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); + } else { + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + auto* conv = module->entry_computation()->root_instruction(); + SCOPED_TRACE(module->ToString()); + ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); + EXPECT_EQ(window_util::ToString(conv->window()), + absl::StrCat("size=3x3 ", testcase.expected_conv_window)); + } +} + +// ConvFilterPaddingTest (and its one associated TEST_P) checks that a +// computation that does +// +// 
conv(param0, pad(param1, padding=padding)), window=orig_conv_window +// +// gets transformed by AlgebraicSimplifier to +// +// conv(param0, param1), window=expected_conv_window +// +// or, if expected_conv_window is the empty string, checks that +// AlgebraicSimplifier does *not* transform the original convolution. +class ConvFilterPaddingTest + : public AlgebraicSimplifierTest, + public ::testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P( + ConvFilterPaddingTestCases, ConvFilterPaddingTest, + ::testing::ValuesIn(std::vector{ + // Can only merge interior padding on the filter's spatial dimensions; + // all + // other paddings (edge padding and interior padding on the channel + // dims) + // should be rejected out of hand. + {"1_0_0x0_0_0x0_0x0_0", "", ""}, + {"0_1_0x0_0_0x0_0x0_0", "", ""}, + {"0_0_1x0_0_0x0_0x0_0", "", ""}, + {"0_0_0x1_0_0x0_0x0_0", "", ""}, + {"0_0_0x0_1_0x0_0x0_0", "", ""}, + {"0_0_0x0_0_1x0_0x0_0", "", ""}, + {"0_0_0x0_0_0x1_0x0_0", "", ""}, + {"0_0_0x0_0_0x0_1x0_0", "", ""}, + {"0_0_0x0_0_0x0_0x1_0", "", ""}, + {"0_0_0x0_0_0x0_0x0_1", "", ""}, + + // Interior padding on spatial dims can be merged into the conv, so long + // as the conv and pad don't have interior padding on the same dim. + {"0_0x0_0x0_0_5x0_0", "", "rhs_dilate=6x1"}, + {"0_0x0_0x0_0x0_0_10", "", "rhs_dilate=1x11"}, + {"0_0x0_0x0_0_10x0_0_100", "", "rhs_dilate=11x101"}, + {"0_0x0_0x0_0_1x0_0", "rhs_dilate=1x10", "rhs_dilate=2x10"}, + {"0_0x0_0x0_0x0_0_5", "rhs_dilate=10x1", "rhs_dilate=10x6"}, + + // Can't merge if for a given dim there's interior padding on both the + // pad and conv. + {"0_0x0_0x0_0_1x0_0", "rhs_dilate=2x10", ""}, + {"0_0x0_0x0_0x0_0_5", "rhs_dilate=10x2", ""}, + + // Don't transform if the pad value is nonzero. 
+ {"0_0x0_0x0_0_5x0_0", "", "", /*pad_value=*/1}, + })); + +TEST_P(ConvFilterPaddingTest, DoIt) { + ConvPaddingTestcase testcase = GetParam(); + + // It would be better to put the testcase's ToString into the test name, but + // gUnit has constraints on what can go into test names, and any reasonable + // implementation of ToString() seems to violate them. + SCOPED_TRACE(testcase.ToString()); + + auto builder = HloComputation::Builder(TestName()); + auto* pad_value = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(testcase.pad_value))); + auto* filter = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {128, 256, 3, 3}), // io01 + "input")); + PaddingConfig padding_config = + ParsePaddingConfig(testcase.padding).ValueOrDie(); + auto* rhs_pad = builder.AddInstruction(HloInstruction::CreatePad( + ShapeInference::InferPadShape(filter->shape(), pad_value->shape(), + padding_config) + .ValueOrDie(), + filter, pad_value, padding_config)); + + auto* input = builder.AddInstruction(HloInstruction::CreateParameter( + 0, + ShapeUtil::MakeShape( + F32, {1024, rhs_pad->shape().dimensions(0), 100, 100}), // bf01 + "input")); + + ConvolutionDimensionNumbers dnums = + ParseConvolutionDimensionNumbers("bf01_io01->bf01").ValueOrDie(); + Window window = ParseWindow(absl::StrFormat("size=%dx%d %s", + rhs_pad->shape().dimensions(2), + rhs_pad->shape().dimensions(3), + testcase.orig_conv_window)) + .ValueOrDie(); + auto* orig_conv = builder.AddInstruction(HloInstruction::CreateConvolve( + ShapeInference::InferConvolveShape(input->shape(), rhs_pad->shape(), + window, dnums) + .ValueOrDie(), + input, rhs_pad, window, dnums)); + + // Add a PrecisionConfig and check that AlgebraicSimplifier keeps it in place + // after the transformation. 
+ PrecisionConfigProto precision_config; + precision_config.add_operand_precision(PrecisionConfigProto::HIGH); + precision_config.add_operand_precision(PrecisionConfigProto::HIGHEST); + orig_conv->set_precision_config(precision_config); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + if (testcase.expected_conv_window.empty()) { + ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); + } else { + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); + auto* conv = module->entry_computation()->root_instruction(); + SCOPED_TRACE(module->ToString()); + ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter())); + EXPECT_EQ(window_util::ToString(conv->window()), + absl::StrFormat("size=%dx%d %s", + conv->operand(1)->shape().dimensions(2), + conv->operand(1)->shape().dimensions(3), + testcase.expected_conv_window)); + EXPECT_THAT( + conv->precision_config().operand_precision(), + ElementsAre(PrecisionConfigProto::HIGH, PrecisionConfigProto::HIGHEST)); + } +} + TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { struct ConvTestOptions { int in_batch = 10; @@ -1866,7 +2433,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { // Builds a convolution from and runs algebraic simplification on // the computation. Returns a string description of the result of // simplification. 
- auto build_and_simplify = [&options, this]() -> string { + auto build_and_simplify = [&]() -> string { HloComputation::Builder b(TestName()); Window window; @@ -1938,7 +2505,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { auto out_dims = in_dims; out_dims[in_channel_idx] = options.f_output_channels; - auto make_shape = [](tensorflow::gtl::ArraySlice dims, + auto make_shape = [](absl::Span dims, bool minor_to_major_layout) { if (minor_to_major_layout) { return ShapeUtil::MakeShapeWithLayout(F32, dims, {0, 1, 2, 3}); @@ -1958,7 +2525,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter, window, dnums)); - auto module = CreateNewModule(); + // TODO(b/80488902): verify this module. + auto module = HloTestBase::CreateNewModule(); auto* computation = module->AddEntryComputation(b.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, @@ -1971,9 +2539,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { root->operand(0)->opcode() == HloOpcode::kDot) { auto lhs_shape = root->operand(0)->operand(0)->shape(); auto rhs_shape = root->operand(0)->operand(1)->shape(); - return tensorflow::strings::StrCat( - tensorflow::str_util::Join(lhs_shape.dimensions(), "x"), " DOT ", - tensorflow::str_util::Join(rhs_shape.dimensions(), "x")); + return absl::StrCat(absl::StrJoin(lhs_shape.dimensions(), "x"), " DOT ", + absl::StrJoin(rhs_shape.dimensions(), "x")); } return "UNEXPECTED CHANGE"; }; @@ -2063,160 +2630,6 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { EXPECT_EQ("NO_CHANGE", build_and_simplify()); } -// Test that max(min(A, x), y) is transformed to clamp(y, A, x) -TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); - HloComputation::Builder builder(TestName()); - HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r0f32, "param0")); - HloInstruction* min_value = 
builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - HloInstruction* max_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); - HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32, HloOpcode::kMinimum, param0, min_value)); - builder.AddInstruction( - HloInstruction::CreateBinary(r0f32, HloOpcode::kMaximum, min, max_value)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Maximum(op::Minimum(param0, min_value), max_value)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Clamp(max_value, param0, min_value)); -} - -// Test that min(max(A, x), y) is transformed to clamp(x, A, y) for scalar -// values. -TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); - HloComputation::Builder builder(TestName()); - HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r0f32, "param0")); - HloInstruction* min_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - HloInstruction* max_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); - HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32, HloOpcode::kMaximum, param0, max_value)); - builder.AddInstruction( - HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Maximum(param0, max_value), min_value)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); 
- ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Clamp(max_value, param0, min_value)); -} - -// Test that min(max(A, x), y) is transformed to clamp(x, A, y) for -// broadcasted scalar values. -TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); - Shape r1f32 = ShapeUtil::MakeShape(F32, {100}); - HloComputation::Builder builder(TestName()); - HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r1f32, "param0")); - HloInstruction* min_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - HloInstruction* max_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); - HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary( - r1f32, HloOpcode::kMaximum, param0, max_value)); - builder.AddInstruction( - HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, max, min_value)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Maximum(param0, max_value), min_value)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Clamp(max_value, param0, min_value)); -} - -// Test that min(max(A, non-constant1), non-constant2) is not canonicalized to -// clamp(non-constant1, A, non-constant2) -TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); - HloComputation::Builder builder(TestName()); - HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r0f32, "param0")); - HloInstruction* min_value = builder.AddInstruction( - HloInstruction::CreateParameter(1, r0f32, "param1")); - HloInstruction* 
max_value = builder.AddInstruction( - HloInstruction::CreateParameter(2, r0f32, "param2")); - HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32, HloOpcode::kMaximum, param0, max_value)); - builder.AddInstruction( - HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Maximum(param0, max_value), min_value)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Maximum(param0, max_value), min_value)); -} - -// Test that min(f(max(A, constant1)), constant2) is not transformed to -// clamp(constant1, A, constant2) -TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) { - Shape r0f32 = ShapeUtil::MakeShape(F32, {}); - HloComputation::Builder builder(TestName()); - HloInstruction* param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, r0f32, "param0")); - HloInstruction* min_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - HloInstruction* max_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); - HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32, HloOpcode::kMaximum, param0, max_value)); - HloInstruction* fmax = builder.AddInstruction( - HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, max, max_value)); - builder.AddInstruction(HloInstruction::CreateBinary( - r0f32, HloOpcode::kMinimum, fmax, min_value)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Add(op::Maximum(param0, max_value), max_value), - min_value)); - - 
AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Minimum(op::Add(op::Maximum(param0, max_value), max_value), - min_value)); -} - // Test that slice(broadcast(/*scalar value*/)) simplifies to a single // broadcast. TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { @@ -2226,10 +2639,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { HloInstruction::CreateParameter(0, r0f32, "scalar_param")); Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6, 7}); - HloInstruction* broadcast = - builder.AddInstruction(HloInstruction::CreateBroadcast( - broadcast_shape, scalar_param, - AsInt64Slice(broadcast_shape.dimensions()))); + HloInstruction* broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(broadcast_shape, scalar_param, {})); Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3}); HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice( @@ -2245,10 +2656,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); // Running simplification again should not result in any further changes. 
- ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(scalar_param)); @@ -2260,13 +2671,11 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { HloComputation::Builder builder(TestName()); HloInstruction* forty_two = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6}); - HloInstruction* broadcast = - builder.AddInstruction(HloInstruction::CreateBroadcast( - broadcast_shape, forty_two, - AsInt64Slice(broadcast_shape.dimensions()))); + HloInstruction* broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(broadcast_shape, forty_two, {})); HloInstruction* transpose = builder.AddInstruction(HloInstruction::CreateTranspose( @@ -2285,7 +2694,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(forty_two)); @@ -2294,7 +2703,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x). TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { - auto module = CreateNewModule(); + // TODO(b/80488902): verify this module. + auto module = HloTestBase::CreateNewModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. 
@@ -2308,7 +2718,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { padding.mutable_dimensions(3)->set_edge_padding_high(2); HloInstruction* pad_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0f))); HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {1, 3, 3, 5}), operand, pad_value, padding)); @@ -2339,7 +2749,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { const Shape reduce_window_shape = ShapeUtil::MakeShape(F32, {111, 113, 113, 115}); HloInstruction* reduce_init_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0f))); HloInstruction* reduce_window = builder.AddInstruction(HloInstruction::CreateReduceWindow( reduce_window_shape, pad, reduce_init_value, window, @@ -2375,7 +2785,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { // Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to // ReduceWindow(Convert(op), x). TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { - auto module = CreateNewModule(); + // TODO(b/80488902): verify this module. + auto module = HloTestBase::CreateNewModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. 
@@ -2389,7 +2800,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { padding.mutable_dimensions(3)->set_edge_padding_high(2); HloInstruction* pad_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0f))); HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(BF16, {1, 3, 3, 5}), parameter, pad_value, padding)); @@ -2424,7 +2835,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { const Shape reduce_window_shape = ShapeUtil::MakeShape(F32, {111, 113, 113, 115}); HloInstruction* reduce_init_value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0f))); HloInstruction* reduce_window = builder.AddInstruction(HloInstruction::CreateReduceWindow( reduce_window_shape, convert, reduce_init_value, window, @@ -2470,7 +2881,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(a, root); @@ -2495,9 +2906,9 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { HloComputation::Builder call_builder(TestName() + ".Call"); HloInstruction* zero = call_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0.0f}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0.0f}))); HloInstruction* one = call_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1.0f}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1.0f}))); call_builder.AddInstruction( HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get())); @@ -2513,9 +2924,9 @@ TEST_F(AlgebraicSimplifierTest, 
ConstantTupleBecomesTupleOfConstants) { HloComputation::Builder builder(TestName()); const float constant_scalar = 7.3f; std::initializer_list constant_vector = {1.1f, 2.0f, 3.3f}; - std::unique_ptr value = - Literal::MakeTuple({Literal::CreateR0(constant_scalar).get(), - Literal::CreateR1(constant_vector).get()}); + std::unique_ptr value = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(constant_scalar).get(), + LiteralUtil::CreateR1(constant_vector).get()}); builder.AddInstruction(HloInstruction::CreateConstant(std::move(value))); auto computation = module().AddEntryComputation(builder.Build()); @@ -2538,8 +2949,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { shape, builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "slice_from")), - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 0, 0}))), + builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")), /*slice_sizes=*/{10, 100, 1000})); auto computation = module().AddEntryComputation(builder.Build()); @@ -2572,8 +2983,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) { builder.AddInstruction( HloInstruction::CreateParameter(2, slice_shape, "to_update")), slice, - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 0, 0}))))); + builder.AddInstruction(HloInstruction::CreateParameter( + 3, ShapeUtil::MakeShape(U32, {3}), "update_indices")))); auto computation = module().AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, @@ -2588,7 +2999,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { HloComputation::Builder builder(TestName()); Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); HloInstruction* input_array = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({3, 4}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({3, 4}))); HloInstruction* inner_bcast = builder.AddInstruction( 
HloInstruction::CreateBroadcast(r2f32, input_array, {1})); Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2}); @@ -2632,6 +3043,47 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { EXPECT_THAT(root->dimensions(), ElementsAre(1, 3)); } +// Test that a broadcast of an iota can be merged to one iota. +TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { + HloComputation::Builder builder(TestName()); + Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2}); + HloInstruction* iota = + builder.AddInstruction(HloInstruction::CreateIota(r2f32, 1)); + Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2}); + builder.AddInstruction(HloInstruction::CreateBroadcast(r3f32, iota, {0, 2})); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_THAT(root, op::Iota()); + EXPECT_EQ(Cast(root)->iota_dimension(), 2); +} + +// Test that a broadcast of an iota can be merged to one iota. 
+TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) { + HloComputation::Builder builder(TestName()); + Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3}); + HloInstruction* iota = + builder.AddInstruction(HloInstruction::CreateIota(r3f32, 1)); + Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 2, 5, 3}); + builder.AddInstruction( + HloInstruction::CreateBroadcast(r4f32, iota, {1, 2, 3})); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_THAT(root, op::Iota()); + EXPECT_EQ(Cast(root)->iota_dimension(), 2); +} + struct PadReduceWindowEffectiveBroadcastCase { std::vector input_spatials; std::vector symmetric_pad_spatials; @@ -2644,11 +3096,10 @@ struct PadReduceWindowEffectiveBroadcastCase { bool should_become_broadcast; string ToTestCaseName() const { - return tensorflow::strings::StrCat( - tensorflow::str_util::Join(input_spatials, ","), ";", - tensorflow::str_util::Join(symmetric_pad_spatials, ","), ";", - tensorflow::str_util::Join(reduce_window_spatials, ","), ";", prepend_a, - ";", should_become_broadcast); + return absl::StrCat(absl::StrJoin(input_spatials, ","), ";", + absl::StrJoin(symmetric_pad_spatials, ","), ";", + absl::StrJoin(reduce_window_spatials, ","), ";", + prepend_a, ";", should_become_broadcast); } }; @@ -2666,8 +3117,8 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { // a and b are parallel bounds we can either turn into a B F S0 S1 or // `B S0 S1 F` kind of pattern. 
- auto decorate_spatials = [¶m](tensorflow::gtl::ArraySlice spatials, - int64 a, int64 b) { + auto decorate_spatials = [¶m](absl::Span spatials, int64 a, + int64 b) { std::vector result; if (param.prepend_a) { result.push_back(a); @@ -2697,7 +3148,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( pad_shape, input, builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), padding)); HloComputation* add_computation = nullptr; @@ -2716,7 +3167,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { Window window = window_util::MakeWindow( decorate_spatials(param.reduce_window_spatials, 1, 1)); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ShapeInference::InferReduceWindowShape( pad->shape(), zero->shape(), window, @@ -2855,7 +3306,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k}); auto* lhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.m, /*cols=*/spec.k))); Shape rhs0_shape = ShapeUtil::MakeShape(F32, {k0, spec.n}); @@ -2934,7 +3385,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) { Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.n}); auto* rhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.k, /*cols=*/spec.n))); DotDimensionNumbers dot_dnums; @@ -2981,7 +3432,7 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { HloInstruction* 
const update = builder.AddInstruction( HloInstruction::CreateParameter(1, update_shape, "update")); HloInstruction* const start_indices = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0}))); builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( dslice_shape, operand, update, start_indices)); const HloComputation* const computation = @@ -3030,7 +3481,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { int64 lhs_cols = (spec.lcd == 0) ? spec.m : (spec.k + k_increase); Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols}); auto* lhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows, /*cols=*/lhs_cols))); @@ -3038,7 +3489,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { int32 start_col = (spec.lcd == 0) ? spec.s : 0; const auto start_indices = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({start_row, start_col}))); + LiteralUtil::CreateR1({start_row, start_col}))); int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1; int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k; Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size}); @@ -3049,7 +3500,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { int64 rhs_cols = (spec.rcd == 0) ? spec.n : spec.k; Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols}); auto* rhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows, /*cols=*/rhs_cols))); @@ -3097,7 +3548,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { int64 lhs_cols = (spec.lcd == 0) ? 
spec.m : spec.k; Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols}); auto* lhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows, /*cols=*/lhs_cols))); @@ -3108,7 +3559,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { int64 rhs_cols = (spec.rcd == 0) ? spec.n : (spec.k + k_increase); Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols}); auto* rhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows, /*cols=*/rhs_cols))); @@ -3116,7 +3567,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { int32 start_col = (spec.rcd == 0) ? spec.s : 0; const auto start_indices = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({start_row, start_col}))); + LiteralUtil::CreateR1({start_row, start_col}))); int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1; int64 slice_col_size = (spec.rcd == 0) ? 1 : spec.k; Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size}); diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 95b4cb6d2e694063b648b264bd2454ae0a5469ff..1ed6142dcecdc830cb7b8386e0cc20a2ea54aa7f 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -17,15 +17,15 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -69,8 +69,7 @@ StatusOr AllocationTracker::RegisterInternal( return InvalidArgument( "AllocationTracker for platform %s cannot register buffer from " "platform %s", - backend_->platform()->Name().c_str(), - shaped_buffer.platform()->Name().c_str()); + backend_->platform()->Name(), shaped_buffer.platform()->Name()); } } @@ -91,8 +90,9 @@ StatusOr AllocationTracker::RegisterInternal( // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer // into a regular ShapedBuffer, which is stored in // handle_to_shaped_buffers_. 
- handle_to_shaped_buffers_[handle].emplace_back(MakeUnique( - ReleaseIfScopedShapedBuffer(std::move(shaped_buffer)))); + handle_to_shaped_buffers_[handle].emplace_back( + absl::make_unique( + ReleaseIfScopedShapedBuffer(std::move(shaped_buffer)))); } GlobalDataHandle result; @@ -109,11 +109,11 @@ Status AllocationTracker::Unregister(const GlobalDataHandle& data) { ResolveInternal(data)); for (const auto& shaped_buffer : replicated_buffers) { std::vector shape_indices; - ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(), - [this, &shape_indices](const Shape& /*subshape*/, - const ShapeIndex& index) { - shape_indices.push_back(index); - }); + ShapeUtil::ForEachSubshape( + shaped_buffer->on_device_shape(), + [&shape_indices](const Shape& /*subshape*/, const ShapeIndex& index) { + shape_indices.push_back(index); + }); for (const ShapeIndex& index : shape_indices) { TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index), shaped_buffer->device_ordinal())); @@ -124,7 +124,7 @@ Status AllocationTracker::Unregister(const GlobalDataHandle& data) { // "handle does not exist". auto it = handle_to_shaped_buffers_.find(data.handle()); if (it == handle_to_shaped_buffers_.end()) { - return NotFound("no allocation record for global data handle: %lld", + return NotFound("no allocation record for global data handle: %d", data.handle()); } for (auto& shaped_buffer : it->second) { @@ -143,7 +143,7 @@ StatusOr> AllocationTracker::DeconstructTuple( // the same for all buffers across replicas. 
const ShapedBuffer* shaped_buffer = replicated_buffers[0]; if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) { - return InvalidArgument("global data handle %lld is not a tuple", + return InvalidArgument("global data handle %d is not a tuple", data.handle()); } // If the on-host representation is a tuple, then the on-device one should be @@ -200,14 +200,14 @@ StatusOr> AllocationTracker::ResolveInternal( VLOG(2) << "resolve:" << data.handle(); auto it = handle_to_shaped_buffers_.find(data.handle()); if (it == handle_to_shaped_buffers_.end()) { - return NotFound("no allocation record for global data handle: %lld", + return NotFound("no allocation record for global data handle: %d", data.handle()); } std::vector replicated_buffers; for (const auto& shaped_buffer : it->second) { if (shaped_buffer == nullptr) { - return InvalidArgument( - "global data handle %lld was previously deallocated", data.handle()); + return InvalidArgument("global data handle %d was previously deallocated", + data.handle()); } replicated_buffers.push_back(shaped_buffer.get()); } diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 349b32451a697dbd6804b44cd1a36419c753bb14..5c180cbdd492031e133b81149f0f4698619b7788 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -96,31 +97,26 @@ Backend::CreateDefaultBackend() { return CreateBackend(backend_options); } -StatusOr Backend::BorrowStream(int device_ordinal) { - TF_ASSIGN_OR_RETURN(auto exec, stream_executor(device_ordinal)); - return BorrowStream(exec); +StatusOr Backend::BorrowStream(int device_ordinal) { + TF_ASSIGN_OR_RETURN(auto executor, stream_executor(device_ordinal)); + return BorrowStream(executor); } -StatusOr Backend::BorrowStream( - se::StreamExecutor* executor) { +StatusOr Backend::BorrowStream(se::StreamExecutor* executor) { tensorflow::mutex_lock l(mu_); if (0 == stream_pools_.count(executor)) { stream_pools_.emplace(std::piecewise_construct, std::forward_as_tuple(executor), - std::forward_as_tuple([executor]() { - auto stream = MakeUnique(executor); - stream->Init(); - return stream; - })); + std::forward_as_tuple()); } - return stream_pools_.at(executor).Allocate(); + return stream_pools_.at(executor).BorrowStream(executor); } -Backend::Backend( - se::Platform* platform, Compiler* compiler, - tensorflow::gtl::ArraySlice stream_executors, - TransferManager* transfer_manager, ComputationPlacer* computation_placer, - int intra_op_parallelism_threads) +Backend::Backend(se::Platform* platform, Compiler* compiler, + absl::Span stream_executors, + TransferManager* transfer_manager, + ComputationPlacer* computation_placer, + int intra_op_parallelism_threads) : platform_(platform), compiler_(compiler), transfer_manager_(transfer_manager), @@ -132,8 +128,8 @@ Backend::Backend( } } // Create a memory allocator for the valid stream executors. 
- memory_allocator_ = - MakeUnique(platform, stream_executors); + memory_allocator_ = absl::make_unique( + platform, stream_executors); CHECK(!stream_executors_.empty()) << "Service found no devices for backend " << platform_->Name() << '.'; @@ -181,7 +177,7 @@ StatusOr Backend::stream_executor( } } return InvalidArgument("device %s not supported by XLA service", - device_name(device_ordinal).c_str()); + device_name(device_ordinal)); } StatusOr Backend::devices_equivalent(int device_ordinal_a, diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index 6546602473e3381cf13879ddebd05d34d1f7a055..a2dafbe803f8bd5f23e4e9f3f6d3e6f744c9fab9 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -21,15 +21,15 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/pool.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -63,11 +63,9 @@ class BackendOptions { // // It also offers a pooling API for creation/use of initialized streams: // -// StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie(); +// StreamPool::Ptr stream = backend->BorrowStream().ConsumeValueOrDie(); class Backend { public: - using StreamPtr = Pool::SmartPtr; - // Creates a new backend. 
static StatusOr> CreateBackend( const BackendOptions& options); @@ -114,13 +112,13 @@ class Backend { // Borrows a stream for use by the caller, either by grabbing it from an // internal pool, or by constructing/initializating it, and returns the result // to the caller. - StatusOr BorrowStream(int device_ordinal); - StatusOr BorrowStream(se::StreamExecutor* executor); + StatusOr BorrowStream(int device_ordinal); + StatusOr BorrowStream(se::StreamExecutor* executor); // Returns a function to borrow a stream, as `BorrowStream` above does. // Purely for convenience, the caller could rather make this anonymous // function itself. - std::function(int)> StreamBorrower() { + std::function(int)> StreamBorrower() { return [this](int device_ordinal) { return BorrowStream(device_ordinal); }; } @@ -132,7 +130,7 @@ class Backend { // Return a string identifier for the given device, eg: "GPU:3". string device_name(int device_ordinal) const { - return tensorflow::strings::StrCat(platform_->Name(), ":", device_ordinal); + return absl::StrCat(platform_->Name(), ":", device_ordinal); } // Returns true if the devices with the given ordinals are equivalent from @@ -151,7 +149,7 @@ class Backend { private: struct EigenThreadPoolWrapper; Backend(se::Platform* platform, Compiler* compiler, - tensorflow::gtl::ArraySlice stream_executors, + absl::Span stream_executors, TransferManager* transfer_manager, ComputationPlacer* computation_placer, int intra_op_parallelism_threads); @@ -169,7 +167,7 @@ class Backend { tensorflow::mutex mu_; // Mapping from stream executor to stream pools, used by `BorrowStream` above. - std::map> stream_pools_ GUARDED_BY(mu_); + std::map stream_pools_ GUARDED_BY(mu_); // The default memory allocator to use. 
std::unique_ptr memory_allocator_; diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc index 2099916509acdbc2680cc2b5bd405e96f2f7bfb8..a16b85a0a5e3f72f54e9733bb974b01377e0c358 100644 --- a/tensorflow/compiler/xla/service/batch_dot_simplification.cc +++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/batch_dot_simplification.h" +#include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" @@ -63,6 +64,7 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot( TF_ASSIGN_OR_RETURN(HloInstruction * new_dot, MakeDotHlo(new_lhs, new_rhs, new_dim_numbers)); + new_dot->set_precision_config(batch_dot->precision_config()); TF_ASSIGN_OR_RETURN(HloInstruction * new_dot_reshaped, MakeReshapeHlo(batch_dot->shape(), new_dot)); @@ -76,7 +78,7 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot( return true; } -tensorflow::StringPiece BatchDotSimplification::name() const { +absl::string_view BatchDotSimplification::name() const { return "batch-dot-simplification"; } @@ -84,10 +86,10 @@ StatusOr BatchDotSimplification::Run(HloModule* module) { bool changed = false; std::vector dot_instrs; for (HloComputation* computation : module->MakeNonfusionComputations()) { - c_copy_if(computation->instructions(), std::back_inserter(dot_instrs), - [](HloInstruction* instr) { - return instr->opcode() == HloOpcode::kDot; - }); + absl::c_copy_if(computation->instructions(), std::back_inserter(dot_instrs), + [](HloInstruction* instr) { + return instr->opcode() == HloOpcode::kDot; + }); } for (HloInstruction* dot_instr : dot_instrs) { TF_ASSIGN_OR_RETURN(bool elided_batch_dim_from_one, diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.h 
b/tensorflow/compiler/xla/service/batch_dot_simplification.h index c0ca8d8ebac1a3b218e7bd4d6db02b69cfb6916f..79d37f08d3553321ebbabc44c8f2488b194954d5 100644 --- a/tensorflow/compiler/xla/service/batch_dot_simplification.h +++ b/tensorflow/compiler/xla/service/batch_dot_simplification.h @@ -28,7 +28,7 @@ namespace xla { class BatchDotSimplification : public HloPassInterface { public: StatusOr Run(HloModule* module) override; - tensorflow::StringPiece name() const override; + absl::string_view name() const override; private: StatusOr ElideDegenerateBatchDimensionFromBatchDot( diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc index 96e02b82b97ff2fd682638f4c6297cbc2019c481..ec281ae68fe76bac4029058997c44b1f7e71aeae 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc @@ -20,6 +20,9 @@ limitations under the License. #include #include +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -32,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -41,6 +43,8 @@ namespace xla { namespace { +using absl::optional; + // BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm // operations into smaller operations. 
class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { @@ -58,8 +62,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { // Runs the visitor on a computation. static bool Run(HloComputation* computation, bool rewrite_training_op, - bool rewrite_inference_op, bool rewrite_grad_op, - bool use_fusion); + bool rewrite_inference_op, bool rewrite_grad_op); // Returns whether any batch norm ops were rewritten. const bool changed() const { return changed_; } @@ -70,21 +73,14 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { explicit BatchNormExpanderVisitor(HloComputation* computation, bool rewrite_training_op, bool rewrite_inference_op, - bool rewrite_grad_op, bool use_fusion) + bool rewrite_grad_op) : computation_(computation), rewrite_training_op_(rewrite_training_op), rewrite_inference_op_(rewrite_inference_op), - rewrite_grad_op_(rewrite_grad_op), - use_fusion_(use_fusion) {} + rewrite_grad_op_(rewrite_grad_op) {} HloComputation* GetOrCreateScalarAddComputation( PrimitiveType primitive_type) { - HloComputation** scalar_add_computation = - &scalar_add_computations_[primitive_type]; - if (*scalar_add_computation) { - return *scalar_add_computation; - } - HloComputation::Builder b("scalar_add_computation"); Shape shape = ShapeUtil::MakeShape(primitive_type, {}); auto scalar_lhs = b.AddInstruction( @@ -93,26 +89,39 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { HloInstruction::CreateParameter(1, shape, "scalar_rhs")); auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs)); - *scalar_add_computation = - computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); - return *scalar_add_computation; + return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); } - // Current HloComputation instance the BatchNormExpander is - // traversing. 
- HloComputation* computation_; - - bool rewrite_training_op_; - bool rewrite_inference_op_; - bool rewrite_grad_op_; - bool use_fusion_; - - // Whether rewrite has occurred. - bool changed_ = false; + std::unique_ptr Rsqrt( + HloInstruction* operand, + const std::function)>& + add_instruction) { + HloInstruction* exponent = add_instruction(HloInstruction::CreateBroadcast( + operand->shape(), + add_instruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(operand->shape().element_type(), {}), + add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-0.5f))))), + {})); + return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kPower, + operand, exponent); + } - // Cached computations for adding two scalars. - tensorflow::gtl::FlatMap - scalar_add_computations_; + std::unique_ptr Mean( + int64 element_count, HloInstruction* operand, + const std::function)>& + add_instruction) { + HloInstruction* elem_count_recip = + add_instruction(HloInstruction::CreateBroadcast( + operand->shape(), + add_instruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(operand->shape().element_type(), {}), + add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(1.0 / element_count))))), + {})); + return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply, + operand, elem_count_recip); + } // Replaces the existing HLO instruction old_instruction, with // new_instruction, and marks the optimizer status as changed. @@ -136,6 +145,16 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { changed_ = true; return Status::OK(); } + // Current HloComputation instance the BatchNormExpander is + // traversing. + HloComputation* computation_; + + bool rewrite_training_op_; + bool rewrite_inference_op_; + bool rewrite_grad_op_; + + // Whether rewrite has occurred. 
+ bool changed_ = false; }; } // namespace @@ -143,13 +162,12 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { bool BatchNormExpanderVisitor::Run(HloComputation* computation, bool rewrite_training_op, bool rewrite_inference_op, - bool rewrite_grad_op, bool use_fusion) { + bool rewrite_grad_op) { BatchNormExpanderVisitor visitor( computation, /*rewrite_training_op=*/rewrite_training_op, /*rewrite_inference_op=*/rewrite_inference_op, - /*rewrite_grad_op=*/rewrite_grad_op, - /*use_fusion=*/use_fusion); + /*rewrite_grad_op=*/rewrite_grad_op); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } @@ -167,6 +185,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); // Expand batch norm training into smaller HLO ops. 
@@ -176,25 +198,21 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( int64 feature_index = batch_norm->feature_index(); const int64 feature_count = operand_shape.dimensions(feature_index); const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape); - auto elements_per_feature_literal = - Literal::CreateR0(size_in_elements / feature_count); - TF_ASSIGN_OR_RETURN(elements_per_feature_literal, - elements_per_feature_literal->Convert(ptype)); - auto elements_per_feature = add( - HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + int64 elements_per_feature_int64 = size_in_elements / feature_count; HloInstruction* scale = batch_norm->mutable_operand(1); HloInstruction* offset = batch_norm->mutable_operand(2); const Shape feature_shape = scale->shape(); - auto zero_literal = Literal::CreateR0(0.0f); + auto zero_literal = LiteralUtil::CreateR0(0.0f); TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype)); auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal))); - auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); + auto epsilon_literal = LiteralUtil::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = - add(HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon = add(HloInstruction::CreateBroadcast( + operand_shape, + add(HloInstruction::CreateConstant(std::move(epsilon_literal))), {})); std::vector dimensions_without_feature; for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) { @@ -213,8 +231,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( GetOrCreateScalarAddComputation(ptype); // X^2. - auto operand_squared = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, operand, operand)); + auto operand_squared = + add_binary(operand_shape, HloOpcode::kMultiply, operand, operand); // Sum[X]. 
auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero, dimensions_without_feature, @@ -225,71 +243,48 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( feature_shape, operand_squared, zero, dimensions_without_feature, add_reduce_computation)); - // Fuse two parallel reduces together to improve performance. - if (use_fusion_ && !batch_norm->has_sharding()) { - auto tuple = add(HloInstruction::CreateTuple({sum, squared_sum})); - - auto fused = computation_->CreateFusionInstruction( - {tuple, sum, squared_sum, operand_squared}, - HloInstruction::FusionKind::kInput); - - sum = add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0)); - - squared_sum = - add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1)); - } - // E[X]. - auto mean = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kDivide, sum, elements_per_feature)); + auto mean = add(Mean(elements_per_feature_int64, sum, add)); auto mean_broadcasted = add( HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index})); // E[X^2]. - auto square_mean = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature)); + auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add)); // E^2[X]. - auto mean_square = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kMultiply, mean, mean)); + auto mean_square = + add_binary(feature_shape, HloOpcode::kMultiply, mean, mean); // Var[X]. - auto var = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kSubtract, square_mean, mean_square)); + auto var = + add_binary(feature_shape, HloOpcode::kSubtract, square_mean, mean_square); auto var_broadcasted = add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index})); // Var[X] + epsilon. 
- auto var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon)); - - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - add(HloInstruction::CreateConstant(std::move(neg_half_literal))); + auto var_add_epsilon = + add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon); // 1 / Sqrt[Var[X] + epsilon]. - auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half)); + auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon, add)); // X - E[X]. - auto operand_minus_mean = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted)); + auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract, + operand, mean_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon]. - auto normalized = add( - HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply, - operand_minus_mean, rsqrt_var_add_epsilon)); + auto normalized = add_binary(operand_shape, HloOpcode::kMultiply, + operand_minus_mean, rsqrt_var_add_epsilon); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale. - auto scaled_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted)); + auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply, + normalized, scale_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset. 
- auto shifted_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted)); + auto shifted_normalized = add_binary(operand_shape, HloOpcode::kAdd, + scaled_normalized, offset_broadcasted); auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var}); @@ -297,16 +292,22 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( int64 instruction_count_after = computation_->instruction_count(); CHECK_EQ(instruction_count_after, instruction_count_before + added_instructions.size()); + const HloSharding& sharding = batch_norm->sharding(); HloSharding operand_sharding = - batch_norm->sharding().GetAsShapeTree(batch_norm->shape()).element({0}); + sharding.GetAsShapeTree(batch_norm->shape()).element({0}); + optional unique_device = batch_norm->sharding_unique_device(); + HloSharding default_sharding = + unique_device.has_value() + ? HloSharding::AssignDevice(unique_device.value()) + : HloSharding::Replicate(); for (HloInstruction* inst : added_instructions) { if (ShapeUtil::Equal(inst->shape(), operand_shape)) { inst->set_sharding(operand_sharding); } else { - inst->set_sharding(HloSharding::Replicate()); + inst->set_sharding(default_sharding); } } - tuple->set_sharding(batch_norm->sharding()); + tuple->set_sharding(sharding); } TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple))); return Status::OK(); @@ -329,10 +330,13 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( HloInstruction* var = batch_norm->mutable_operand(4); const Shape feature_shape = scale->shape(); - auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); + auto epsilon_literal = LiteralUtil::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = computation_->AddInstruction( - HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon = computation_->AddInstruction(HloInstruction::CreateBroadcast( + 
operand_shape, + computation_->AddInstruction( + HloInstruction::CreateConstant(std::move(epsilon_literal))), + {})); std::vector dimensions_without_feature; @@ -349,6 +353,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); auto scale_broadcasted = add( @@ -364,30 +372,23 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index})); // Var[X] + epsilon. - auto var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon)); - - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - add(HloInstruction::CreateConstant(std::move(neg_half_literal))); + auto var_add_epsilon = + add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon); // 1 / Sqrt[Var[X] + epsilon]. - auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half)); + auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon, add)); // X - E[X]. - auto operand_minus_mean = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted)); + auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract, + operand, mean_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon]. 
- auto normalized = add( - HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply, - operand_minus_mean, rsqrt_var_add_epsilon)); + auto normalized = add_binary(operand_shape, HloOpcode::kMultiply, + operand_minus_mean, rsqrt_var_add_epsilon); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale. - auto scaled_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted)); + auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply, + normalized, scale_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset. auto shifted_normalized = HloInstruction::CreateBinary( @@ -397,14 +398,20 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( CHECK_EQ(instruction_count_after, instruction_count_before + added_instructions.size()); if (batch_norm->has_sharding()) { + const HloSharding& sharding = batch_norm->sharding(); + optional unique_device = batch_norm->sharding_unique_device(); + HloSharding default_sharding = + unique_device.has_value() + ? 
HloSharding::AssignDevice(unique_device.value()) + : HloSharding::Replicate(); for (HloInstruction* inst : added_instructions) { if (ShapeUtil::Equal(inst->shape(), operand_shape)) { - inst->set_sharding(batch_norm->sharding()); + inst->set_sharding(sharding); } else { - inst->set_sharding(HloSharding::Replicate()); + inst->set_sharding(default_sharding); } } - shifted_normalized->set_sharding(batch_norm->sharding()); + shifted_normalized->set_sharding(sharding); } TF_CHECK_OK( ReplaceWithNewInstruction(batch_norm, std::move(shifted_normalized))); @@ -435,6 +442,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); HloInstruction* activation = batch_norm->mutable_operand(0); @@ -450,26 +461,20 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape); const int64 feature_count = activation_shape.dimensions(feature_index); - auto elements_per_feature_literal = - Literal::CreateR0(size_in_elements / feature_count); - TF_ASSIGN_OR_RETURN(elements_per_feature_literal, - elements_per_feature_literal->Convert(ptype)); - auto elements_per_feature = add( - HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + const int64 elements_per_feature_int64 = size_in_elements / feature_count; - auto zero_literal = Literal::CreateR0(0.0f); + auto zero_literal = LiteralUtil::CreateR0(0.0f); TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype)); auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal))); - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - 
add(HloInstruction::CreateConstant(std::move(neg_half_literal))); - - auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); + auto epsilon_literal = LiteralUtil::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = + auto epsilon_scalar = add(HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon_activation = add( + HloInstruction::CreateBroadcast(activation_shape, epsilon_scalar, {})); + auto epsilon_feature = + add(HloInstruction::CreateBroadcast(feature_shape, epsilon_scalar, {})); std::vector dimensions_without_feature; @@ -489,26 +494,23 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index})); // rsqrt[Var[X] + epsilon]. - auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kPower, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd, - variance_broadcasted, epsilon)), - neg_half)); - - auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kPower, - add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance, - epsilon)), - neg_half)); + auto rsqrt_var_add_epsilon_broadcasted = + add(Rsqrt(add_binary(activation_shape, HloOpcode::kAdd, + variance_broadcasted, epsilon_activation), + add)); + + auto rsqrt_var_add_epsilon = add(Rsqrt( + add_binary(feature_shape, HloOpcode::kAdd, variance, epsilon_feature), + add)); // X - E[X]. - auto activation_minus_mean = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted)); + auto activation_minus_mean = add_binary( + activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted); // Grad[Y] * (X - E[X]). 
auto grad_output_times_activiation_minus_mean = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - grad_output, activation_minus_mean)); + add_binary(activation_shape, HloOpcode::kMultiply, grad_output, + activation_minus_mean); HloComputation* add_reduce_computation = GetOrCreateScalarAddComputation(ptype); @@ -524,25 +526,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( feature_shape, grad_output, zero, dimensions_without_feature, add_reduce_computation)); - if (use_fusion_ && !batch_norm->has_sharding()) { - auto tuple = add(HloInstruction::CreateTuple( - {sum_grad_output_times_activiation_minus_mean, grad_beta})); - - auto fused = computation_->CreateFusionInstruction( - {tuple, sum_grad_output_times_activiation_minus_mean, grad_beta}, - HloInstruction::FusionKind::kInput); - - sum_grad_output_times_activiation_minus_mean = - add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0)); - - grad_beta = - add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1)); - } - // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]). 
- auto grad_scale = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kMultiply, - sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon)); + auto grad_scale = add_binary(feature_shape, HloOpcode::kMultiply, + sum_grad_output_times_activiation_minus_mean, + rsqrt_var_add_epsilon); // I2 = Sum(Grad[Y]) auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta, @@ -554,55 +541,62 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( {feature_index})); // I4 = (X - E[X]) * I3 - auto i4 = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean)); + auto i4 = add_binary(activation_shape, HloOpcode::kMultiply, i3, + activation_minus_mean); // I5 = I4 / (Var[X] + epsilon) - auto i5 = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kDivide, i4, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd, - variance_broadcasted, epsilon)))); + auto i5 = add_binary(activation_shape, HloOpcode::kDivide, i4, + add_binary(activation_shape, HloOpcode::kAdd, + variance_broadcasted, epsilon_activation)); // scale * rsqrt[Var[X] + epsilon] * 1/N - auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kMultiply, scale_broadcasted, - rsqrt_var_add_epsilon_broadcasted)); + auto scale_times_rsqrt_var_add_epsilon = + add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted, + rsqrt_var_add_epsilon_broadcasted); - scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon, - elements_per_feature)); + scale_times_rsqrt_var_add_epsilon = add( + Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add)); - auto i1 = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - grad_output, elements_per_feature)); + auto elements_per_feature_literal = + 
LiteralUtil::CreateR0(elements_per_feature_int64); + TF_ASSIGN_OR_RETURN(elements_per_feature_literal, + elements_per_feature_literal->Convert(ptype)); + auto elements_per_feature = add( + HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output, + add(HloInstruction::CreateBroadcast( + activation_shape, elements_per_feature, {}))); // I6 = I1 - I2 - I5 - auto i6 = add(HloInstruction::CreateBinary( + auto i6 = add_binary( activation_shape, HloOpcode::kSubtract, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract, - i1, i2)), - i5)); + add_binary(activation_shape, HloOpcode::kSubtract, i1, i2), i5); // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6. - auto grad_activation = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - scale_times_rsqrt_var_add_epsilon, i6)); + auto grad_activation = add_binary(activation_shape, HloOpcode::kMultiply, + scale_times_rsqrt_var_add_epsilon, i6); auto tuple = HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta}); if (batch_norm->has_sharding()) { + const HloSharding& sharding = batch_norm->sharding(); int64 instruction_count_after = computation_->instruction_count(); CHECK_EQ(instruction_count_after, instruction_count_before + added_instructions.size()); HloSharding activation_sharding = - batch_norm->sharding().GetAsShapeTree(batch_norm->shape()).element({0}); + sharding.GetAsShapeTree(batch_norm->shape()).element({0}); + auto unique_device = batch_norm->sharding_unique_device(); + HloSharding default_sharding = + unique_device.has_value() + ? 
HloSharding::AssignDevice(unique_device.value()) + : HloSharding::Replicate(); for (HloInstruction* inst : added_instructions) { if (ShapeUtil::Equal(inst->shape(), activation_shape)) { inst->set_sharding(activation_sharding); } else { - inst->set_sharding(HloSharding::Replicate()); + inst->set_sharding(default_sharding); } } - tuple->set_sharding(batch_norm->sharding()); + tuple->set_sharding(sharding); } TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple))); @@ -615,8 +609,8 @@ StatusOr BatchNormExpander::Run(HloModule* module) { bool changed = false; for (auto* comp : module->MakeNonfusionComputations()) { if (BatchNormExpanderVisitor::Run(comp, rewrite_training_op_, - rewrite_inference_op_, rewrite_grad_op_, - use_fusion_)) { + rewrite_inference_op_, + rewrite_grad_op_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.h b/tensorflow/compiler/xla/service/batchnorm_expander.h index 4ad987085da91684bb7891070afeefd19be4138f..76e32174f3ee7d319df6f1f465e19d265d5330f2 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.h +++ b/tensorflow/compiler/xla/service/batchnorm_expander.h @@ -31,13 +31,12 @@ class BatchNormExpander : public HloPassInterface { // When use_fusion is set, a multi-output fusion node is created. BatchNormExpander(bool rewrite_training_op = false, bool rewrite_inference_op = false, - bool rewrite_grad_op = false, bool use_fusion = true) + bool rewrite_grad_op = false) : rewrite_training_op_(rewrite_training_op), rewrite_inference_op_(rewrite_inference_op), - rewrite_grad_op_(rewrite_grad_op), - use_fusion_(use_fusion) {} + rewrite_grad_op_(rewrite_grad_op) {} ~BatchNormExpander() = default; - tensorflow::StringPiece name() const override { return "batchnorm_expander"; } + absl::string_view name() const override { return "batchnorm_expander"; } // Run operation expander on the given computation. Returns whether the // computation was changed. 
@@ -47,7 +46,6 @@ class BatchNormExpander : public HloPassInterface { bool rewrite_training_op_; bool rewrite_inference_op_; bool rewrite_grad_op_; - bool use_fusion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc index aa36e64b07099a372dab67babc7a18a2d39596bc..aba0d9bb5b977d89656580df46838eefb8cd6662 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc @@ -18,20 +18,20 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace { @@ -114,5 +114,33 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) { EXPECT_EQ(root->opcode(), HloOpcode::kTuple); } +TEST_F(BatchNormExpanderTest, BatchNormTrainingSharding) { + const char* module_str = R"( +HloModule module +ENTRY entry { + %param.0 = f32[8,4] parameter(0) + %param.1 = f32[4] parameter(1) + %param.2 = f32[4] parameter(2) + ROOT %batch-norm-training = (f32[8,4], f32[4], f32[4]) + batch-norm-training(f32[8,4] %param.0, f32[4] %param.1, f32[4] %param.2), + 
epsilon=0.001, feature_index=1, sharding={maximal device=1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(module_str)); + BatchNormExpander rewriter(/*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); + + for (auto* instruction : module->entry_computation()->instructions()) { + if (instruction->opcode() == HloOpcode::kParameter) { + continue; + } + auto device = instruction->sharding_unique_device(); + ASSERT_TRUE(device); + EXPECT_EQ(*device, 1); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc index 1b8b2d204503576c3fcb02f6d5b37f2db45e1768..d63287539dfde5bb4890ab8303ef2205133d8125 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc @@ -15,12 +15,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/bfloat16_conversion_folding.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h index c9398387098fad84ba28735c30e426fedd9b0cb0..5dcd31b83d24f836d31f44181f39cb8371ca1033 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h @@ -37,7 +37,7 @@ class BFloat16ConversionFolding : public HloPassInterface { : bfloat16_support_(bfloat16_support) {} ~BFloat16ConversionFolding() override = default; - tensorflow::StringPiece name() const override { return "bfloat16-fold"; } + absl::string_view name() const override { return "bfloat16-fold"; } // Run BF16 conversion folding on the given computation. Returns whether the // computation was changed. 
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc index 28e71c2054f59ba4d5d096bf7d898161877bb42f..6363a21c3bafe8353a6ebfde405bb7a3736c2074 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc @@ -211,6 +211,17 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) { TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { auto builder = HloComputation::Builder(TestName()); + + auto module = CreateNewModule(); + HloComputation::Builder sum_builder("add"); + auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); + auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y")); + sum_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y)); + HloComputation* sum = module->AddEmbeddedComputation(sum_builder.Build()); + Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4}); @@ -223,7 +234,9 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateCrossReplicaSum( - ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b})); + ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}, + sum, /*replica_groups=*/{}, /*barrier=*/"", + /*all_reduce_id=*/absl::nullopt)); HloInstruction* gte_a = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32_shape, crs, 0)); HloInstruction* gte_b = builder.AddInstruction( @@ -233,7 +246,6 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { HloInstruction* tuple = builder.AddInstruction( HloInstruction::CreateTuple({gte_a, 
convert_gte_b})); - auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(FoldConversions(module.get())); diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc index 14c54ddd135af024327f63418b410da1ed3c4fd4..d5b1148058898596bfdb837826a590bbc74e202a 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc @@ -15,12 +15,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -34,9 +35,6 @@ class BFloat16NormalizationVisitor : public DfsHloVisitorWithDefault { Status DefaultAction(HloInstruction* hlo) override; - // Special handling for cross-replica-sum which can have a tuple output. - Status HandleCrossReplicaSum(HloInstruction* crs) override; - static bool Run(HloComputation* computation, const BFloat16Support* bfloat16_support) { BFloat16NormalizationVisitor visitor(computation, bfloat16_support); @@ -49,6 +47,10 @@ class BFloat16NormalizationVisitor : public DfsHloVisitorWithDefault { // conversions between F32 and BF16 to make it supported. Status HandleInstruction(HloInstruction* hlo); + // Handle instructions with tuple outputs by examining each output + // independently. + Status HandleMultipleOutputs(HloInstruction* hlo); + // Inserts a conversion HLO that changes the given HLO's output type. 
Status InsertConvertAfterOutput(HloInstruction* hlo, PrimitiveType to, HloComputation* computation); @@ -67,8 +69,7 @@ class BFloat16NormalizationVisitor : public DfsHloVisitorWithDefault { // Inserts conversion HLOs to replace the called computations' BF16 // operands/outputs to F32. Status ConvertCalledComputations( - HloInstruction* hlo, - tensorflow::gtl::ArraySlice bf16_called_comps); + HloInstruction* hlo, absl::Span bf16_called_comps); HloComputation* computation_; const BFloat16Support* bfloat16_support_; @@ -112,8 +113,7 @@ Status BFloat16NormalizationVisitor::InsertConvertBeforeOperand( } Status BFloat16NormalizationVisitor::ConvertCalledComputations( - HloInstruction* hlo, - tensorflow::gtl::ArraySlice bf16_called_comps) { + HloInstruction* hlo, absl::Span bf16_called_comps) { std::map cloned_computations; for (auto& comp : bf16_called_comps) { auto cloned = comp->parent()->AddEmbeddedComputation(comp->Clone()); @@ -144,26 +144,22 @@ Status BFloat16NormalizationVisitor::ConvertCalledComputations( return Status::OK(); } -Status BFloat16NormalizationVisitor::HandleCrossReplicaSum( - HloInstruction* crs) { - if (!ShapeUtil::IsTuple(crs->shape())) { - return HandleInstruction(crs); - } - - std::vector operand_types(crs->operand_count()); - std::vector output_types(crs->operand_count()); +Status BFloat16NormalizationVisitor::HandleMultipleOutputs( + HloInstruction* hlo) { + std::vector operand_types(hlo->operand_count()); + std::vector output_types(hlo->operand_count()); int64 f32_count = 0; int64 bf16_count = 0; bool has_unsupported_bf16_operand = false; bool has_unsupported_bf16_output = false; - for (int64 i = 0; i < crs->operand_count(); ++i) { - operand_types[i] = crs->operand(i)->shape().element_type(); - output_types[i] = ShapeUtil::GetSubshape(crs->shape(), {i}).element_type(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + operand_types[i] = hlo->operand(i)->shape().element_type(); + output_types[i] = ShapeUtil::GetSubshape(hlo->shape(), 
{i}).element_type(); if (operand_types[i] == F32) { f32_count += 1; } else if (operand_types[i] == BF16) { bf16_count += 1; - if (!bfloat16_support_->SupportsBF16Operand(*crs, i)) { + if (!bfloat16_support_->SupportsBF16Operand(*hlo, i)) { has_unsupported_bf16_operand = true; } } @@ -171,7 +167,7 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum( f32_count += 1; } else if (output_types[i] == BF16) { bf16_count += 1; - if (!bfloat16_support_->SupportsBF16Output(*crs)) { + if (!bfloat16_support_->SupportsBF16Output(*hlo)) { has_unsupported_bf16_output = true; } } @@ -185,43 +181,43 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum( if (operand_types[i] != BF16) { return false; } - if (!bfloat16_support_->SupportsBF16Operand(*crs, i)) { + if (!bfloat16_support_->SupportsBF16Operand(*hlo, i)) { return true; } - if (bfloat16_support_->SupportsMixedPrecisions(*crs)) { + if (bfloat16_support_->SupportsMixedPrecisions(*hlo)) { return false; } return has_unsupported_bf16_operand || has_unsupported_bf16_output || f32_count > 0; }; - for (int64 i = 0; i < crs->operand_count(); ++i) { + for (int64 i = 0; i < hlo->operand_count(); ++i) { if (should_convert_operand(i)) { - TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(crs, i, F32, computation_)); + TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_)); f32_count += 1; bf16_count -= 1; } } if (!has_unsupported_bf16_output && - (bfloat16_support_->SupportsMixedPrecisions(*crs) || f32_count == 0 || + (bfloat16_support_->SupportsMixedPrecisions(*hlo) || f32_count == 0 || bf16_count == 0)) { return Status::OK(); } - std::vector materialized_users = crs->users(); - std::vector output_elements(crs->operand_count()); - auto original_shape = crs->shape(); - for (int64 i = 0; i < crs->operand_count(); ++i) { - auto subshape = ShapeUtil::GetMutableSubshape(crs->mutable_shape(), {i}); + std::vector materialized_users = hlo->users(); + std::vector output_elements(hlo->operand_count()); + auto 
original_shape = hlo->shape(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + auto subshape = ShapeUtil::GetMutableSubshape(hlo->mutable_shape(), {i}); if (output_types[i] != BF16) { output_elements[i] = computation_->AddInstruction( - HloInstruction::CreateGetTupleElement(*subshape, crs, i)); + HloInstruction::CreateGetTupleElement(*subshape, hlo, i)); continue; } subshape->set_element_type(F32); auto gte = computation_->AddInstruction( - HloInstruction::CreateGetTupleElement(*subshape, crs, i)); + HloInstruction::CreateGetTupleElement(*subshape, hlo, i)); output_elements[i] = computation_->AddInstruction(HloInstruction::CreateConvert( ShapeUtil::ChangeElementType(*subshape, BF16), gte)); @@ -229,11 +225,11 @@ Status BFloat16NormalizationVisitor::HandleCrossReplicaSum( auto tuple = computation_->AddInstruction( HloInstruction::CreateTuple(output_elements)); - // Use the crs' shape temporarily, in order to pass checks in + // Use the hlo' shape temporarily, in order to pass checks in // ReplaceUseWith. - *tuple->mutable_shape() = crs->shape(); + *tuple->mutable_shape() = hlo->shape(); for (auto* user : materialized_users) { - TF_RETURN_IF_ERROR(crs->ReplaceUseWith(user, tuple)); + TF_RETURN_IF_ERROR(hlo->ReplaceUseWith(user, tuple)); } *tuple->mutable_shape() = original_shape; return Status::OK(); @@ -361,6 +357,12 @@ Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) { hlo->opcode() == HloOpcode::kConditional) { return Status::OK(); } + // TODO(b/112040122): Correctly normalize variadic reduce. 
+ if ((hlo->opcode() == HloOpcode::kSort || + hlo->opcode() == HloOpcode::kCrossReplicaSum) && + ShapeUtil::IsTuple(hlo->shape())) { + return HandleMultipleOutputs(hlo); + } return HandleInstruction(hlo); } diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.h b/tensorflow/compiler/xla/service/bfloat16_normalization.h index 2a60fe0af3218484acb95e6c69815d551350764c..30b6346312790f0a199f96f1956ba9ce3e617f72 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization.h +++ b/tensorflow/compiler/xla/service/bfloat16_normalization.h @@ -31,7 +31,7 @@ class BFloat16Normalization : public HloPassInterface { : bfloat16_support_(bfloat16_support) {} ~BFloat16Normalization() override = default; - tensorflow::StringPiece name() const override { return "bf16-normalization"; } + absl::string_view name() const override { return "bf16-normalization"; } // Run BF16 normalization on the given computation. Returns whether the // computation was changed. @@ -54,7 +54,7 @@ class BFloat16MixedPrecisionRemoval : public HloPassInterface { ~BFloat16MixedPrecisionRemoval() override = default; - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "bf16-mixed-precision-removal"; } diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index 1afaefd9df9c5771fb9e134ae9050f3abb00ea4a..b08705d4c2b644fe1a7ba9994876fd6397f8a5df 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -76,7 +76,8 @@ class BFloat16NormalizationTest : public HloTestBase { StatusOr result = normalization.Run(module); EXPECT_IS_OK(result.status()); - HloVerifier verifier(/*allow_mixed_precision=*/true); + HloVerifier verifier(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/true); EXPECT_IS_OK(verifier.Run(module).status()); return result.ValueOrDie(); @@ -228,6 
+229,17 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { + auto module = CreateNewModule(); + HloComputation::Builder sum_builder("sum"); + auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); + auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y")); + sum_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y)); + HloComputation* reduction = + module->AddEmbeddedComputation(sum_builder.Build()); + auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4}); @@ -239,11 +251,12 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateCrossReplicaSum( - ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b})); + ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction, + /*replica_groups=*/{}, /*barrier=*/"", + /*all_reduce_id=*/absl::nullopt)); HloInstruction* gte = builder.AddInstruction( HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1)); - auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(Normalize(module.get())); @@ -254,6 +267,33 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {1}).element_type(), F32); } +TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) { + auto module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + Shape f32_shape = ShapeUtil::MakeShape(F32, {1024}); + Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024}); + Shape s32_shape = 
ShapeUtil::MakeShape(BF16, {1024}); + + HloInstruction* key = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32_shape, "key")); + HloInstruction* value = builder.AddInstruction( + HloInstruction::CreateParameter(1, s32_shape, "value")); + + HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, value)); + HloInstruction* gte = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(bf16_shape, sort, 0)); + + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_TRUE(Normalize(module.get())); + + EXPECT_EQ(computation->root_instruction(), gte); + EXPECT_EQ(gte->shape().element_type(), BF16); + EXPECT_EQ(sort->operand(0)->shape().element_type(), F32); + EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32); +} + // Tests that the normalization should not cause unsupported mixed precision due // to resolving unsupported BF16 operand. TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) { diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index ed0746980f87ac2bea79c308644dc63769f9e309..545a6ecfb1fca88c2c759e820f9d87a38b1941ca 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/bfloat16_propagation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" @@ -85,9 +85,9 @@ void BFloat16Propagation::RevertIfFusionInternalBF16Changes( auto root_changes_it = changes_to_bf16_.find(root); if (root_changes_it != changes_to_bf16_.end()) { - for (const auto& index : root_changes_it->second) { + for (const auto& entry : root_changes_it->second) { for (const HloValue* value : - dataflow_->GetValueSet(root, index).values()) { + dataflow_->GetValueSet(root, entry.second).values()) { changed_root_buffers.insert(value); } } @@ -204,12 +204,23 @@ void BFloat16Propagation::DetermineWhileComputationsPrecision( bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, const ShapeIndex& index) const { + // If the subshape isn't floating point then none of the users will be BF16. + const Shape& subshape = ShapeUtil::GetSubshape(hlo.shape(), index); + if (subshape.element_type() != BF16 && subshape.element_type() != F32) { + return false; + } + auto& value_set = dataflow_->GetValueSet(&hlo, index); for (const HloValue* value : value_set.values()) { if (ContainsKey(values_that_must_be_kept_as_f32_, value)) { return false; } - if (ValueTypeAfterChange(value) == BF16) { + // We use the original type for the value because we are going to examine + // the uses of it, instead of the value itself. If ValueTypeAfterChange() + // were used, it would cause problems when there are aliasing buffers, i.e., + // ResolveInconsistencyOfAliasingBuffers() would fail to revert the + // tentative change to BF16 even if the uses require F32. 
+ if (value->shape().element_type() == BF16) { continue; } for (const HloUse& use : value->uses()) { @@ -257,23 +268,34 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, // If the op propagates precision and it outputs a BF16, then it's OK to // supply BF16 also as the input. In the backward pass, the users shapes // should have already been processed. - PrimitiveType user_output_type = PRIMITIVE_TYPE_INVALID; - if (use.instruction->opcode() == HloOpcode::kTuple || - (use.instruction->opcode() == HloOpcode::kCrossReplicaSum && - ShapeUtil::IsTuple(use.instruction->shape()))) { - ShapeIndex use_output_index{use.operand_number}; - for (int64 i : use.operand_index) { - use_output_index.push_back(i); - } - user_output_type = - OutputTypeAfterChange(use.instruction, use_output_index); - } else { - user_output_type = OutputTypeAfterChange(use.instruction, {}); - } if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision( - *use.instruction, use.operand_number) && - user_output_type == BF16) { - continue; + *use.instruction, use.operand_number)) { + if (use.instruction->opcode() == HloOpcode::kTuple || + (use.instruction->opcode() == HloOpcode::kCrossReplicaSum && + ShapeUtil::IsTuple(use.instruction->shape()))) { + ShapeIndex use_output_index{use.operand_number}; + for (int64 i : use.operand_index) { + use_output_index.push_back(i); + } + if (OutputTypeAfterChange(use.instruction, use_output_index) == + BF16) { + continue; + } + } else if (use.instruction->opcode() == HloOpcode::kGetTupleElement) { + ShapeIndex use_output_index; + for (int64 i = 1; i < use.operand_index.size(); ++i) { + use_output_index.push_back(use.operand_index[i]); + } + if (OutputTypeAfterChange(use.instruction, use_output_index) == + BF16) { + continue; + } + } else { + if (OutputTypeAfterChange(use.instruction, use.operand_index) == + BF16) { + continue; + } + } } return false; } @@ -368,6 +390,7 @@ bool BFloat16Propagation::InstructionIsCandidateForBF16Output( 
if (!bfloat16_support_->SupportsMixedPrecisions(*hlo) && hlo->opcode() != HloOpcode::kTuple && hlo->opcode() != HloOpcode::kGetTupleElement && + hlo->opcode() != HloOpcode::kDomain && hlo->shape().element_type() != BF16) { for (int64 i = 0; i < hlo->operand_count(); ++i) { if (!bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(*hlo, @@ -384,7 +407,7 @@ void BFloat16Propagation::AdjustCalledComputationParameters( HloInstruction* hlo) { auto adjust_computation = [this, hlo](HloComputation* computation, - tensorflow::gtl::ArraySlice operands) { + absl::Span operands) { // Adjust parameters. CHECK_EQ(operands.size(), computation->num_parameters()); for (int64 i = 0; i < operands.size(); ++i) { @@ -548,6 +571,9 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( } visited_computations->insert(visited_in_while.begin(), visited_in_while.end()); + } else if (hlo->opcode() == HloOpcode::kFusion) { + ResolveInconsistencyOfAliasingBuffersHelper( + hlo->fused_instructions_computation(), visited_computations); } } // Now adjust parameters of called computations. @@ -559,7 +585,7 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( void BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( HloModule* module) { - std::list computations_topological_order = + const auto& computations_topological_order = module->MakeComputationPostOrder(); tensorflow::gtl::FlatSet resolved; for (auto comp_it = computations_topological_order.rbegin(); @@ -597,7 +623,6 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) { // (1) a is F32 but tuple is BF16 // (2) after adding conversion // (3) after tuple simplifier and DCE. 
- bool needs_tuple_simplifier = false; for (auto computation : module->MakeComputationPostOrder()) { auto insts = computation->MakeInstructionPostOrder(); for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) { @@ -611,67 +636,25 @@ Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) { continue; } ShapeTree converted_outputs(hlo->shape()); - // Iterate through nodes in the shape tree in pre-order and initialize - // each non-root node with a corresponding get-tuple-element. For a leaf - // node, if its shape does not match the fusion output, create a - // conversion node to overwrite the node value. - for (auto it = converted_outputs.begin(); it != converted_outputs.end(); - ++it) { - ShapeIndex output_index = it->first; - HloInstruction*& output = it->second; - const Shape subshape = - ShapeUtil::GetSubshape(hlo->shape(), output_index); - if (output_index.empty()) { - output = fusion_root; - } else { - ShapeIndex parent_index = output_index; - parent_index.pop_back(); - output = fusion_computation->AddInstruction( - HloInstruction::CreateGetTupleElement( - subshape, converted_outputs.element(parent_index), - output_index.back())); - } - if (ShapeUtil::IsTuple(subshape)) { - continue; - } - if (!ShapeUtil::Compatible( - subshape, - ShapeUtil::GetSubshape(fusion_root->shape(), output_index))) { - output = fusion_computation->AddInstruction( - HloInstruction::CreateConvert(subshape, output)); - } - } - // Iterate through nodes in the shape tree in reverse pre-order and create - // a tuple instruction for each non-leaf node where the elements are the - // values of its child nodes. 
- for (auto it = converted_outputs.rbegin(); it != converted_outputs.rend(); - ++it) { - ShapeIndex output_index = it->first; - HloInstruction*& output = it->second; - const Shape& subshape = - ShapeUtil::GetSubshape(hlo->shape(), output_index); - if (!ShapeUtil::IsTuple(subshape)) { - continue; - } - std::vector elements( - ShapeUtil::TupleElementCount(subshape)); - ShapeIndex child_index = output_index; - for (int64 i = 0; i < elements.size(); ++i) { - child_index.push_back(i); - elements[i] = converted_outputs.element(child_index); - child_index.pop_back(); - } - output = fusion_computation->AddInstruction( - HloInstruction::CreateTuple(elements)); - } - fusion_computation->set_root_instruction(converted_outputs.element({})); - needs_tuple_simplifier |= ShapeUtil::IsTuple(hlo->shape()); + // Deep copy the fusion root, and convert a leaf node only if its shape + // does not match the fusion output. + TF_ASSIGN_OR_RETURN( + HloInstruction * copy, + fusion_computation->DeepCopyInstructionWithCustomCopier( + fusion_root, + [hlo](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + const Shape& hlo_subshape = + ShapeUtil::GetSubshape(hlo->shape(), leaf_index); + if (ShapeUtil::Compatible(leaf->shape(), hlo_subshape)) { + return leaf; + } + return comp->AddInstruction( + HloInstruction::CreateConvert(hlo_subshape, leaf)); + })); + fusion_computation->set_root_instruction(copy); } } - if (needs_tuple_simplifier) { - TupleSimplifier tuple_simplifier; - TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); - } return Status::OK(); } @@ -740,10 +723,38 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { changes_to_bf16_.clear(); changed_ = false; + auto computations_topological_order = module->MakeComputationPostOrder(); + + // Before running the propagation pass, we insert copies (kConvert to the same + // type) of F32 inputs to while loops. 
This prevents other uses of the same + // input from aliasing the while loop input/output, so that there's greater + // chance to use BF16 inside the loop. If some of these added copies do not + // help, they will remain F32 after BF16 propagation and will be removed since + // they are no-ops. + for (auto computation : computations_topological_order) { + for (auto inst : computation->MakeInstructionPostOrder()) { + if (inst->opcode() != HloOpcode::kWhile) { + continue; + } + + auto operand = inst->mutable_operand(0); + TF_ASSIGN_OR_RETURN( + HloInstruction * copy, + computation->DeepCopyInstructionWithCustomCopier( + operand, [](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + if (leaf->shape().element_type() != F32) { + return leaf; + } + return comp->AddInstruction( + HloInstruction::CreateConvert(leaf->shape(), leaf)); + })); + TF_RETURN_IF_ERROR(operand->ReplaceUseWith(inst, copy)); + } + } + TF_ASSIGN_OR_RETURN(dataflow_, HloDataflowAnalysis::Run(*module)); - std::list computations_topological_order = - module->MakeComputationPostOrder(); // The first step is a forward pass (parameters to root), where we determine // the potential candidate instructions to use bfloat16 in the outputs that // are not likely to cause overhead from extra explicit conversions. This is @@ -766,8 +777,7 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { // propagation in reverse topological order. for (auto comp_it = computations_topological_order.rbegin(); comp_it != computations_topological_order.rend(); ++comp_it) { - if ((*comp_it)->IsFusionComputation()) { - // Fusion computations are handled when visiting the fusion instruction. 
+ if (ContainsKey(computations_visited_in_backward_pass_, *comp_it)) { continue; } auto insts = (*comp_it)->MakeInstructionPostOrder(); @@ -775,6 +785,7 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { DetermineInstructionPrecision(*inst_it, /*skip_parameters=*/true); } + computations_visited_in_backward_pass_.insert(*comp_it); } // It's possible that an instruction does not define a buffer, but the @@ -784,39 +795,42 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { // Apply the changes in changes_to_bf16_. for (auto& change : changes_to_bf16_) { - auto shape = change.first->mutable_shape(); - for (const auto& index : change.second) { - auto subshape = ShapeUtil::GetMutableSubshape(shape, index); + for (const auto& entry : change.second) { + auto subshape = entry.first; CHECK_EQ(subshape->element_type(), F32); subshape->set_element_type(BF16); changed_ = true; } } + // Removes redundant HLOs added by this pass, either when inserting + // de-aliasing copies to while loop inputs, or later when converting output + // types. + auto clean_up = [this, module]() { + TF_RETURN_IF_ERROR(SkipNoopConversions(module)); + TupleSimplifier tuple_simplifier; + TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); + HloDCE dce; + TF_RETURN_IF_ERROR(dce.Run(module).status()); + return Status::OK(); + }; + if (!changed_) { + TF_RETURN_IF_ERROR(clean_up()); return false; } TF_RETURN_IF_ERROR(ResolveInconsistentFusions(module)); TF_RETURN_IF_ERROR(ResolveConvertedConstants(module)); - // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 -> - // BF16), so we skip them now. - TF_RETURN_IF_ERROR(SkipNoopConversions(module)); - - { - // We may have dead HLOs after ResolveInconsistentFusions, - // ResolveConvertedConstants and SkipNoopConversions. 
- HloDCE dce; - TF_RETURN_IF_ERROR(dce.Run(module).status()); - } + TF_RETURN_IF_ERROR(clean_up()); return true; } PrimitiveType BFloat16Propagation::OutputTypeAfterChange( HloInstruction* hlo, const ShapeIndex& index) const { - PrimitiveType type_on_hlo = - ShapeUtil::GetSubshape(hlo->shape(), index).element_type(); + Shape* subshape = ShapeUtil::GetMutableSubshape(hlo->mutable_shape(), index); + const PrimitiveType type_on_hlo = subshape->element_type(); if (type_on_hlo != F32) { return type_on_hlo; } @@ -824,7 +838,7 @@ PrimitiveType BFloat16Propagation::OutputTypeAfterChange( if (it == changes_to_bf16_.end()) { return type_on_hlo; } - return ContainsKey(it->second, index) ? BF16 : F32; + return ContainsKey(it->second, subshape) ? BF16 : F32; } PrimitiveType BFloat16Propagation::ValueTypeAfterChange( @@ -838,14 +852,16 @@ void BFloat16Propagation::AddToOrRemoveFromBF16ChangeSet( HloInstruction* hlo, const ShapeIndex& index, PrimitiveType target_type) { if (target_type == BF16) { auto& entry = changes_to_bf16_[hlo]; - entry.insert(index); + entry.emplace(ShapeUtil::GetMutableSubshape(hlo->mutable_shape(), index), + index); } else { CHECK_EQ(target_type, F32); auto it = changes_to_bf16_.find(hlo); if (it == changes_to_bf16_.end()) { return; } - it->second.erase(index); + it->second.erase( + ShapeUtil::GetMutableSubshape(hlo->mutable_shape(), index)); } } diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h index de0355ddfca127753f90d1899b424a8e77c9b291..1ee64971ab53e1775294afde1c779369a838008a 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.h +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h @@ -64,9 +64,7 @@ class BFloat16Propagation : public HloPassInterface { ~BFloat16Propagation() override = default; - tensorflow::StringPiece name() const override { - return "bfloat16-propagation"; - } + absl::string_view name() const override { return "bfloat16-propagation"; } 
// Runs the pass on the given module. Returns whether the module was changed // (precision reductions were added). @@ -194,17 +192,11 @@ class BFloat16Propagation : public HloPassInterface { // are subject to further adjustment, then finally applied to the HLOs. This // avoids setting changed_ to true but all changes are reverted during // adjustment. - struct IndexHasher { - int64 operator()(const ShapeIndex& index) const { - int64 hash = 0; - for (int64 i : index) { - hash = tensorflow::Hash64Combine(hash, std::hash()(i)); - } - return hash; - } - }; + // + // For each HloInstruction, changes_to_bf16_ stores the affected buffers in + // the output as a map from in-place pointers to subshapes to shape indices. tensorflow::gtl::FlatMap> + tensorflow::gtl::FlatMap> changes_to_bf16_; // Whether the last processed HLO module has been changed by this pass. diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 5e1499ee6b6ef397f95f7ed29e808d530777bd07..69b654d30e42b1ed69304206f09120e86831d468 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -133,9 +133,9 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) { array_b.FillUnique(10.0f); HloInstruction* a = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateFromArray(array_a))); + HloInstruction::CreateConstant(LiteralUtil::CreateFromArray(array_a))); HloInstruction* b = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateFromArray(array_b))); + HloInstruction::CreateConstant(LiteralUtil::CreateFromArray(array_b))); HloInstruction* dot = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kDot, a, b)); @@ -150,11 +150,11 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) { EXPECT_EQ(dot->operand(0)->opcode(), HloOpcode::kConstant); EXPECT_EQ(dot->operand(1)->opcode(), 
HloOpcode::kConstant); EXPECT_TRUE(LiteralTestUtil::Equal( - dot->operand(0)->literal(), - *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_a)))); + *LiteralUtil::ConvertF32ToBF16(*LiteralUtil::CreateFromArray(array_a)), + dot->operand(0)->literal())); EXPECT_TRUE(LiteralTestUtil::Equal( - dot->operand(1)->literal(), - *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_b)))); + *LiteralUtil::ConvertF32ToBF16(*LiteralUtil::CreateFromArray(array_b)), + dot->operand(1)->literal())); } // Tests that BF16 can be propagated through nested tuples. @@ -240,12 +240,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) { EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), dot); - EXPECT_TRUE(OutputsBF16(add0)); EXPECT_TRUE(OutputsBF16(add1)); EXPECT_TRUE(OutputsBF16(lhs)); - // rhs is a get-tuple-element, which does not define a buffer, but its shape - // should also be adjusted accordingly. - EXPECT_TRUE(OutputsBF16(rhs)); + + // add0 and rhs have been eliminated by simplification and DCE. } // Tests that a non-fusion computation's root should not be changed. @@ -434,7 +432,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) { HloInstruction* tuple1 = builder.AddInstruction(HloInstruction::CreateTuple({param, add1})); HloInstruction* sel = builder.AddInstruction(HloInstruction::CreateTernary( - tuple0->shape(), HloOpcode::kSelect, pred, tuple0, tuple1)); + tuple0->shape(), HloOpcode::kTupleSelect, pred, tuple0, tuple1)); HloInstruction* gte0 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(shape, sel, 0)); HloInstruction* gte1 = builder.AddInstruction( @@ -510,6 +508,63 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) { EXPECT_FALSE(OutputsBF16(dot)); } +// Tests that if the while condition prevents using BF16, no changes should be +// made to the while body and thus the fusion node inside it. 
+TEST_F(BFloat16PropagationTest, + ConditionPreventsPropagationForFusionInsideWhile) { + auto module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); + + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param0")); + HloInstruction* param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, shape, "param1")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + + auto builder_cond = HloComputation::Builder("cond"); + auto cond_param = builder_cond.AddInstruction( + HloInstruction::CreateParameter(0, shape, "cond_param")); + builder_cond.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, + builder_cond.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {}), cond_param, {0, 0}, {1, 1}, {1, 1})), + builder_cond.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {}), cond_param, {1, 1}, {2, 2}, {1, 1})))); + auto cond = module->AddEmbeddedComputation(builder_cond.Build()); + + auto builder_body = HloComputation::Builder("body"); + auto body_param = builder_body.AddInstruction( + HloInstruction::CreateParameter(0, shape, "body_param")); + auto body_transpose = builder_body.AddInstruction( + HloInstruction::CreateTranspose(shape, body_param, {0, 1})); + + auto builder_f = HloComputation::Builder("fusion"); + HloInstruction* a_f = + builder_f.AddInstruction(HloInstruction::CreateParameter(0, shape, "a")); + builder_f.AddInstruction(HloInstruction::CreateTranspose(shape, a_f, {0, 1})); + auto comp_f = module->AddEmbeddedComputation(builder_f.Build()); + auto body_fusion = builder_body.AddInstruction(HloInstruction::CreateFusion( + shape, HloInstruction::FusionKind::kCustom, {body_transpose}, comp_f)); + auto body = module->AddEmbeddedComputation(builder_body.Build()); + + auto while_hlo = 
builder.AddInstruction( + HloInstruction::CreateWhile(shape, cond, body, add)); + + auto dot = builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kDot, while_hlo, while_hlo)); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_FALSE(PropagatePrecision(module.get())); + EXPECT_EQ(computation->root_instruction(), dot); + EXPECT_FALSE(OutputsBF16(add)); + EXPECT_FALSE(OutputsBF16(body_fusion)); + EXPECT_FALSE(OutputsBF16(body_param)); + EXPECT_FALSE(OutputsBF16(body_transpose)); + EXPECT_FALSE(OutputsBF16(a_f)); +} + // Tests that BF16 is propagated properly through while computations with // tuple-shaped input/output. TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { @@ -555,10 +610,14 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { HloInstruction::CreateGetTupleElement(shape, body_param, 0)); auto body_rhs = builder_body.AddInstruction( HloInstruction::CreateGetTupleElement(shape, body_param, 1)); - auto body_dot = builder_body.AddInstruction( + auto body_dot1 = builder_body.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_lhs, body_rhs)); + auto body_dot2 = builder_body.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_rhs, body_lhs)); + auto body_transpose = builder_body.AddInstruction( + HloInstruction::CreateTranspose(shape, body_dot2, {0, 1})); builder_body.AddInstruction( - HloInstruction::CreateTuple({body_dot, body_rhs})); + HloInstruction::CreateTuple({body_dot1, body_transpose})); auto body = module->AddEmbeddedComputation(builder_body.Build()); auto while_hlo = builder.AddInstruction( @@ -577,9 +636,11 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { EXPECT_EQ(computation->root_instruction(), dot); EXPECT_TRUE(OutputsBF16(lhs)); EXPECT_FALSE(OutputsBF16(rhs)); - EXPECT_TRUE(OutputsBF16(body_dot)); + EXPECT_TRUE(OutputsBF16(body_dot1)); EXPECT_TRUE(OutputsBF16(body_lhs)); EXPECT_FALSE(OutputsBF16(body_rhs)); + 
EXPECT_FALSE(OutputsBF16(body_dot2)); + EXPECT_FALSE(OutputsBF16(body_transpose)); EXPECT_TRUE(OutputsBF16(cond_lhs)); EXPECT_FALSE(OutputsBF16(cond_rhs)); EXPECT_TRUE(OutputsBF16(add0)); @@ -734,12 +795,95 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) { EXPECT_TRUE(PropagatePrecision(module.get())); EXPECT_EQ(computation->root_instruction(), add2); - EXPECT_EQ(add2->operand(0), gte0); - EXPECT_EQ(add2->operand(1), gte1); - EXPECT_EQ(gte0->shape().element_type(), BF16); - EXPECT_EQ(gte1->shape().element_type(), BF16); + EXPECT_EQ(add2->operand(0), add0); + EXPECT_EQ(add2->operand(1), add1); EXPECT_EQ(add0->shape().element_type(), BF16); EXPECT_EQ(add1->shape().element_type(), BF16); } +TEST_F(BFloat16PropagationTest, TupleDomain) { + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); + + HloInstruction* a = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a")); + HloInstruction* b = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b")); + HloInstruction* a_trans = + builder.AddInstruction(HloInstruction::CreateTranspose(shape, a, {0, 1})); + HloInstruction* b_trans = + builder.AddInstruction(HloInstruction::CreateTranspose(shape, b, {0, 1})); + HloInstruction* tuple = + builder.AddInstruction(HloInstruction::CreateTuple({a_trans, b_trans})); + HloInstruction* domain = builder.AddInstruction( + HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr)); + HloInstruction* a_gte = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, domain, 0)); + HloInstruction* b_gte = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, domain, 1)); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kDot, a_gte, b_gte)); + HloInstruction* root = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot)); + + auto module = CreateNewModule(); + auto computation 
= module->AddEntryComputation(builder.Build()); + + EXPECT_TRUE(PropagatePrecision(module.get())); + EXPECT_EQ(computation->root_instruction(), root); + + // test BF16 propagated through domain + EXPECT_EQ(ShapeUtil::GetTupleElementShape(domain->shape(), 0).element_type(), + BF16); + EXPECT_EQ(ShapeUtil::GetTupleElementShape(domain->shape(), 1).element_type(), + BF16); + + EXPECT_TRUE(OutputsBF16(a_trans)); + EXPECT_TRUE(OutputsBF16(b_trans)); + EXPECT_TRUE(OutputsBF16(a_gte)); + EXPECT_TRUE(OutputsBF16(b_gte)); + EXPECT_FALSE(OutputsBF16(a)); + EXPECT_FALSE(OutputsBF16(b)); +} + +// Tests that bf16 is not propagated through a domain in case its input cannot +// be propagated. In the case below the input of the domain is the parameter +// tuple which cannot be propagated, so the domain instruction is not propagated +// either. +TEST_F(BFloat16PropagationTest, TupleDomainNoPropagation) { + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); + Shape tuple_shape = ShapeUtil::MakeTupleShape({shape, shape}); + + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + HloInstruction* domain = builder.AddInstruction( + HloInstruction::CreateDomain(param->shape(), param, nullptr, nullptr)); + HloInstruction* a_gte = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, domain, 0)); + HloInstruction* b_gte = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, domain, 1)); + HloInstruction* a_trans = builder.AddInstruction( + HloInstruction::CreateTranspose(shape, a_gte, {0, 1})); + HloInstruction* b_trans = builder.AddInstruction( + HloInstruction::CreateTranspose(shape, b_gte, {0, 1})); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kDot, a_trans, b_trans)); + HloInstruction* root = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot)); + + auto 
module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_TRUE(PropagatePrecision(module.get())); + + EXPECT_EQ(computation->root_instruction(), root); + EXPECT_TRUE(OutputsBF16(a_trans)); + EXPECT_TRUE(OutputsBF16(b_trans)); + EXPECT_FALSE(OutputsBF16(a_gte)); + EXPECT_FALSE(OutputsBF16(b_gte)); + EXPECT_FALSE(OutputsBF16(domain)); + EXPECT_FALSE(OutputsBF16(param)); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc index 07b4b14b5ec1bdbc01345091105df69368b0b2fb..23645346e6f491beb5171cc839c013ce5f83d789 100644 --- a/tensorflow/compiler/xla/service/bfloat16_support.cc +++ b/tensorflow/compiler/xla/service/bfloat16_support.cc @@ -25,6 +25,7 @@ bool BFloat16Support::SupportsBF16Operand(const HloInstruction& hlo, case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kCustomCall: + case HloOpcode::kDomain: case HloOpcode::kGetTupleElement: case HloOpcode::kTuple: case HloOpcode::kWhile: @@ -43,6 +44,7 @@ bool BFloat16Support::SupportsBF16Output(const HloInstruction& hlo) const { case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kCustomCall: + case HloOpcode::kDomain: case HloOpcode::kGetTupleElement: case HloOpcode::kTuple: case HloOpcode::kWhile: @@ -81,6 +83,7 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision( case HloOpcode::kConcatenate: case HloOpcode::kConvert: case HloOpcode::kCopy: + case HloOpcode::kDomain: case HloOpcode::kGetTupleElement: case HloOpcode::kMaximum: case HloOpcode::kMinimum: @@ -92,11 +95,15 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision( case HloOpcode::kTranspose: case HloOpcode::kTuple: return true; + case HloOpcode::kBitcast: + return hlo.shape().element_type() == + hlo.operand(0)->shape().element_type(); case HloOpcode::kDynamicSlice: return operand_index == 0; case HloOpcode::kDynamicUpdateSlice: return operand_index == 0 || 
operand_index == 1; case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: return operand_index == 1 || operand_index == 2; default: break; diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index c0b8bf903923a327fb1378eafb51a7d493d5e62d..8b8c6bfd269971efa6fcd186e4825e6f13bb4094 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -22,8 +22,10 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/buffer_value_containers.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -36,20 +38,15 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { +namespace { +using absl::StrAppend; +using absl::StrAppendFormat; using ::tensorflow::gtl::FlatMap; using ::tensorflow::gtl::FlatSet; -using ::tensorflow::strings::Appendf; using ::tensorflow::strings::HumanReadableNumBytes; -using ::tensorflow::strings::Printf; -using ::tensorflow::strings::StrAppend; - -namespace { template string ColocatedBufferSetsToString(const T& container, const char* title) { @@ -61,12 +58,65 @@ string ColocatedBufferSetsToString(const T& container, const char* title) { return result; } -// Walk the call graph of the HLO module and place each computation into either -// thread_local_computations or global_computations depending upon whether the -// computation requires 
thread-local allocations or global allocations. The -// elements in thread_local_computations and global_computations are in post -// order (if computation A has an instruction which calls computation B, then A -// will appear after B in the vector). +// Checks that points-to set of 'instruction' is unambiguous and distinct +// (ensured by CopyInsertion), then adds the buffer from the points-to set at +// 'index' to 'colocated_set'. +const LogicalBuffer* AddBufferToColocatedSet( + const HloInstruction* instruction, const ShapeIndex& index, + const TuplePointsToAnalysis& points_to_analysis, + std::vector* colocated_set) { + // CopyInsertion ensures root points-to set is unambiguous and distinct. + const auto& points_to = points_to_analysis.GetPointsToSet(instruction); + DCHECK(!points_to.IsAmbiguous()); + colocated_set->push_back(points_to.element(index)[0]); + return colocated_set->back(); +} + +// Given the interference map of a graph (the list of interfering node indices +// for each node), perform graph coloring such that interfering nodes are +// assigned to different colors. Returns the assigned color of the nodes, where +// the colors are represented as integer values [0, color_count). +std::vector ColorInterferenceGraph( + const std::vector>& interference_map) { + const int64 node_count = interference_map.size(); + + // Sort the nodes such that we assign nodes with more interference first. This + // relies on the common heuristic of assigning the most constrained node + // first, but it would be good to investigate other ordering heuristics too. 
+ std::vector nodes(node_count); + std::iota(nodes.begin(), nodes.end(), 0); + std::sort(nodes.begin(), nodes.end(), + [&interference_map](const int64 i, const int64 j) { + return interference_map[i].size() > interference_map[j].size(); + }); + + const int64 kColorUnassigned = -1; + std::vector assigned_colors(node_count, kColorUnassigned); + for (int64 node : nodes) { + // Mark the colors that are already assigned to the neighbors. + std::vector available_colors(node_count, true); + for (int64 neighbor : interference_map[node]) { + int64 color = assigned_colors[neighbor]; + if (color != kColorUnassigned) { + available_colors[color] = false; + } + } + + // Find the color that is not yet assigned to the neighbors. + int64 color = kColorUnassigned; + for (color = 0; color < available_colors.size(); ++color) { + if (available_colors[color]) { + break; + } + } + CHECK_NE(color, kColorUnassigned); + assigned_colors[node] = color; + } + return assigned_colors; +} + +} // namespace + Status GatherComputationsByAllocationType( const HloModule* module, std::vector* thread_local_computations, @@ -107,7 +157,7 @@ Status GatherComputationsByAllocationType( return InvalidArgument( "computation %s has conflicting allocation requirements (global " "and thread-local)", - computation->name().c_str()); + computation->name()); } if (is_thread_local) { @@ -130,14 +180,16 @@ Status GatherComputationsByAllocationType( return InvalidArgument( "computation %s cannot contain call/while op because it " "requires thread-local buffer allocations", - computation->name().c_str()); + computation->name()); } worklist.push_back(std::make_pair(subcomputation, false)); // Not thread local. break; + case HloOpcode::kCrossReplicaSum: case HloOpcode::kMap: case HloOpcode::kReduce: case HloOpcode::kReduceWindow: + case HloOpcode::kScatter: case HloOpcode::kSelectAndScatter: case HloOpcode::kFusion: // Map/reduce etc computations are always thread-local. 
@@ -145,9 +197,8 @@ Status GatherComputationsByAllocationType( true)); // Thread local. break; default: - return InternalError( - "Unexpected calling opcode: %s", - HloOpcodeString(instruction->opcode()).c_str()); + return InternalError("Unexpected calling opcode: %s", + HloOpcodeString(instruction->opcode())); } } } @@ -167,65 +218,6 @@ Status GatherComputationsByAllocationType( return Status::OK(); } -// Checks that points-to set of 'instruction' is unambiguous and distinct -// (ensured by CopyInsertion), then adds the buffer from the points-to set at -// 'index' to 'colocated_set'. -const LogicalBuffer* AddBufferToColocatedSet( - const HloInstruction* instruction, const ShapeIndex& index, - const TuplePointsToAnalysis& points_to_analysis, - std::vector* colocated_set) { - // CopyInsertion ensures root points-to set is unambiguous and distinct. - const auto& points_to = points_to_analysis.GetPointsToSet(instruction); - DCHECK(!points_to.IsAmbiguous()); - colocated_set->push_back(points_to.element(index)[0]); - return colocated_set->back(); -} - -// Given the interference map of a graph (the list of interfering node indices -// for each node), perform graph coloring such that interfering nodes are -// assigned to different colors. Returns the assigned color of the nodes, where -// the colors are represented as integer values [0, color_count). -std::vector ColorInterferenceGraph( - const std::vector>& interference_map) { - const int64 node_count = interference_map.size(); - - // Sort the nodes such that we assign nodes with more interference first. This - // relies on the common heuristic of assigning the most constrained node - // first, but it would be good to investigate other ordering heuristics too. 
- std::vector nodes(node_count); - std::iota(nodes.begin(), nodes.end(), 0); - std::sort(nodes.begin(), nodes.end(), - [&interference_map](const int64 i, const int64 j) { - return interference_map[i].size() > interference_map[j].size(); - }); - - const int64 kColorUnassigned = -1; - std::vector assigned_colors(node_count, kColorUnassigned); - for (int64 node : nodes) { - // Mark the colors that are already assigned to the neighbors. - std::vector available_colors(node_count, true); - for (int64 neighbor : interference_map[node]) { - int64 color = assigned_colors[neighbor]; - if (color != kColorUnassigned) { - available_colors[color] = false; - } - } - - // Find the color that is not yet assigned to the neighbors. - int64 color = kColorUnassigned; - for (color = 0; color < available_colors.size(); ++color) { - if (available_colors[color]) { - break; - } - } - CHECK_NE(color, kColorUnassigned); - assigned_colors[node] = color; - } - return assigned_colors; -} - -} // namespace - size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const { uint64 h = std::hash()(s.index()); h = tensorflow::Hash64Combine(h, std::hash()(s.offset())); @@ -234,8 +226,8 @@ size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const { } string BufferAllocation::Slice::ToString() const { - return tensorflow::strings::StrCat("{index:", index(), ", offset:", offset_, - ", size:", size_, "}"); + return absl::StrCat("{index:", index(), ", offset:", offset_, + ", size:", size_, "}"); } BufferAllocation::Slice BufferAllocation::GetSlice( @@ -269,7 +261,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { proto.set_index(index_); proto.set_size(size_); proto.set_is_thread_local(is_thread_local_); - proto.set_is_reusable(is_reusable_); + proto.set_is_tuple(is_tuple_); proto.set_color(color_.value()); if (is_entry_computation_parameter_) { proto.set_is_entry_computation_parameter(true); @@ -278,6 +270,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { } 
proto.set_parameter_number(parameter_number_); } + proto.set_is_constant(is_constant_); proto.set_maybe_live_out(maybe_live_out_); for (const auto& buffer_offset_size : assigned_buffers_) { BufferAllocationProto::Assigned* proto_assigned = proto.add_assigned(); @@ -295,7 +288,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { string BufferAllocation::ToString() const { string output; - Appendf(&output, "allocation %lld: %p, size %lld", index_, this, size()); + StrAppendFormat(&output, "allocation %d: %p, size %d", index_, this, size()); if (color().value() != 0) { StrAppend(&output, ", color ", color().value()); } @@ -303,6 +296,9 @@ string BufferAllocation::ToString() const { StrAppend(&output, ", parameter ", parameter_number(), " at ShapeIndex ", param_shape_index().ToString()); } + if (is_constant()) { + StrAppend(&output, ", constant"); + } if (is_thread_local()) { StrAppend(&output, ", thread-local"); } @@ -324,11 +320,10 @@ string BufferAllocation::ToString() const { }); for (const LogicalBuffer* buffer : sorted_buffers) { const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer); - StrAppend(&output, - tensorflow::strings::Printf( - " %s [%lld,%lld]: %s\n", buffer->ToString().c_str(), - offset_size.offset, offset_size.size, - ShapeUtil::HumanStringWithLayout(buffer->shape()).c_str())); + StrAppend(&output, absl::StrFormat( + " %s [%d,%d]: %s\n", buffer->ToString(), + offset_size.offset, offset_size.size, + ShapeUtil::HumanStringWithLayout(buffer->shape()))); } return output; } @@ -421,7 +416,7 @@ StatusOr BufferAssignment::GetUniqueSlice( return FailedPrecondition( "BufferAllocation::Slice for instruction %s at index %s cannot " "be determined at compile-time.", - instruction->name().c_str(), index.ToString().c_str()); + instruction->name(), index.ToString()); } } else { VLOG(3) << "No allocation"; @@ -430,7 +425,7 @@ StatusOr BufferAssignment::GetUniqueSlice( if (result.allocation() == nullptr) { return FailedPrecondition( 
"BufferAllocation::Slice not assigned for instruction %s at index %s", - instruction->name().c_str(), index.ToString().c_str()); + instruction->name(), index.ToString()); } return result; } @@ -490,20 +485,16 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const { } BufferAllocation* BufferAssignment::NewEmptyAllocation( - int64 size, bool is_thread_local, bool is_reusable, - LogicalBuffer::Color color) { + int64 size, LogicalBuffer::Color color) { BufferAllocation::Index index = allocations_.size(); - allocations_.emplace_back(index, size, is_thread_local, is_reusable, color); + allocations_.emplace_back(index, size, color); BufferAllocation* allocation = &allocations_.back(); return allocation; } BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, - int64 size, - bool is_thread_local, - bool is_reusable) { - BufferAllocation* allocation = - NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color()); + int64 size) { + BufferAllocation* allocation = NewEmptyAllocation(size, buffer.color()); AddAssignment(allocation, buffer, /*offset=*/0, size); allocation->peak_buffers_.push_back(&buffer); return allocation; @@ -516,7 +507,8 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation, CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer)) << "LogicalBuffer " << buffer << " already has an allocation."; CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty()) - << "Non-reusable allocation already assigned a buffer"; + << "Non-reusable allocation already assigned a buffer: " + << allocation->ToString(); TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer)); @@ -608,6 +600,10 @@ Status BufferAssignment::ComputeSummaryStats() { stats_.parameter_allocation_count++; stats_.parameter_allocation_bytes += allocation.size(); } + if (allocation.is_constant()) { + stats_.constant_allocation_count++; + stats_.constant_allocation_bytes += allocation.size(); + } if (allocation.maybe_live_out()) { 
stats_.maybe_live_out_allocation_count++; stats_.maybe_live_out_allocation_bytes += allocation.size(); @@ -620,7 +616,7 @@ Status BufferAssignment::ComputeSummaryStats() { stats_.total_allocation_bytes += allocation.size(); } - // Only compute total fragmentation if all computations are sequential. + // Only compute total fragmentation if all computations have schedules. SequentialHloOrdering::HloModuleSequence module_sequence; for (const auto& computation : module_->computations()) { const std::vector* sequence = @@ -632,7 +628,7 @@ Status BufferAssignment::ComputeSummaryStats() { if (module_sequence.size() == module_->computation_count()) { TF_ASSIGN_OR_RETURN( const int64 min_size, - MinimumMemoryForSequence(module_sequence, buffer_size_)); + HeapSimulator::MinimumMemoryForModule(module_sequence, buffer_size_)); stats_.total_fragmentation_bytes = stats_.total_allocation_bytes - min_size; } @@ -641,37 +637,38 @@ Status BufferAssignment::ComputeSummaryStats() { string BufferAssignment::Stats::ToString() const { string s; - Appendf(&s, "BufferAssignment stats:\n"); - Appendf(&s, " parameter allocation: %10s\n", - HumanReadableNumBytes(parameter_allocation_bytes).c_str()); - Appendf(&s, " maybe_live_out allocation: %10s\n", - HumanReadableNumBytes(maybe_live_out_allocation_bytes).c_str()); - Appendf(&s, " preallocated temp allocation: %10s\n", - HumanReadableNumBytes(preallocated_temp_allocation_bytes).c_str()); + StrAppendFormat(&s, "BufferAssignment stats:\n"); + StrAppendFormat(&s, " parameter allocation: %10s\n", + HumanReadableNumBytes(parameter_allocation_bytes)); + StrAppendFormat(&s, " constant allocation: %10s\n", + HumanReadableNumBytes(constant_allocation_bytes)); + StrAppendFormat(&s, " maybe_live_out allocation: %10s\n", + HumanReadableNumBytes(maybe_live_out_allocation_bytes)); + StrAppendFormat(&s, " preallocated temp allocation: %10s\n", + HumanReadableNumBytes(preallocated_temp_allocation_bytes)); if (preallocated_temp_fragmentation_bytes >= 0) { 
const double percent = 100. * preallocated_temp_fragmentation_bytes / preallocated_temp_allocation_bytes; - Appendf( + StrAppendFormat( &s, " preallocated temp fragmentation: %10s (%.2f%%)\n", - HumanReadableNumBytes(preallocated_temp_fragmentation_bytes).c_str(), - percent); + HumanReadableNumBytes(preallocated_temp_fragmentation_bytes), percent); } - Appendf(&s, " total allocation: %10s\n", - HumanReadableNumBytes(total_allocation_bytes).c_str()); + StrAppendFormat(&s, " total allocation: %10s\n", + HumanReadableNumBytes(total_allocation_bytes)); if (total_fragmentation_bytes >= 0) { const double percent = 100. * total_fragmentation_bytes / total_allocation_bytes; - Appendf(&s, " total fragmentation: %10s (%.2f%%)\n", - HumanReadableNumBytes(total_fragmentation_bytes).c_str(), percent); + StrAppendFormat(&s, " total fragmentation: %10s (%.2f%%)\n", + HumanReadableNumBytes(total_fragmentation_bytes), percent); } return s; } string BufferAssignment::ToString() const { string output; - tensorflow::strings::StrAppend(&output, "BufferAssignment:\n"); + absl::StrAppend(&output, "BufferAssignment:\n"); for (auto& allocation : allocations_) { - tensorflow::strings::StrAppend(&output, allocation.ToString()); + absl::StrAppend(&output, allocation.ToString()); } return output; } @@ -721,8 +718,10 @@ StatusOr> BufferAssigner::Run( const HloModule* module, std::unique_ptr hlo_ordering, LogicalBuffer::SizeFunction buffer_size, LogicalBuffer::AlignmentFunction color_alignment, - bool allow_input_output_aliasing, BufferLiveness::Colorer colorer) { - BufferAssigner assigner(allow_input_output_aliasing, std::move(colorer)); + bool allow_input_output_aliasing, bool allocate_buffers_for_constants, + BufferLiveness::Colorer colorer) { + BufferAssigner assigner(allow_input_output_aliasing, + allocate_buffers_for_constants, std::move(colorer)); return assigner.CreateAssignment(module, std::move(hlo_ordering), std::move(buffer_size), std::move(color_alignment)); @@ -750,8 +749,8 @@ bool 
BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, return false; } - if (allocation->is_entry_computation_parameter()) { - VLOG(4) << "Can't assign: allocation holds parameter"; + if (allocation->is_readonly()) { + VLOG(4) << "Can't assign: allocation is readonly"; return false; } @@ -807,8 +806,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, } Status BufferAssigner::AssignBuffersForComputation( - const HloComputation* computation, const DebugOptions& debug_options, - bool is_thread_local, + const HloComputation* computation, bool is_thread_local, const FlatSet& colocated_buffers, const FlatSet& colocated_allocations, FlatMap>* @@ -868,8 +866,8 @@ Status BufferAssigner::AssignBuffersForComputation( // important reuse case where an elementwise instruction reuses one of its // operand's buffer. This improves locality. std::sort(sorted_buffers.begin(), sorted_buffers.end(), - [this, has_sequential_order, &liveness, &post_order_position, - assignment](const LogicalBuffer* a, const LogicalBuffer* b) { + [has_sequential_order, &liveness, &post_order_position, assignment]( + const LogicalBuffer* a, const LogicalBuffer* b) { // Primary sort is by decreasing buffer size. const int64 a_size = assignment->buffer_size_(*a); const int64 b_size = assignment->buffer_size_(*b); @@ -904,15 +902,19 @@ Status BufferAssigner::AssignBuffersForComputation( TF_RET_CHECK(!assignment->HasAllocation(*buffer)); const HloInstruction* instruction = buffer->instruction(); + const int64 buffer_size = assignment->buffer_size_(*buffer); + if (instruction->opcode() == HloOpcode::kConstant) { - // No BufferAllocations for constants. - // TODO(b/32248867): For consistency, constants should get allocations. 
- VLOG(3) << "Skipping constant: " << *buffer; + if (allocate_buffers_for_constants_) { + BufferAllocation* allocation = + assignment->NewAllocation(*buffer, buffer_size); + allocation->set_constant(true); + VLOG(3) << "New allocation #" << allocation->index() << " for constant " + << *buffer; + } continue; } - const int64 buffer_size = assignment->buffer_size_(*buffer); - const bool is_entry_parameter = instruction->opcode() == HloOpcode::kParameter && computation == computation->parent()->entry_computation(); @@ -922,9 +924,7 @@ Status BufferAssigner::AssignBuffersForComputation( // computations do not need special allocations because they live inside // callers. BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size, - /*is_thread_local=*/false, - /*is_reusable=*/false); + assignment->NewAllocation(*buffer, buffer_size); allocation->set_entry_computation_parameter( instruction->parameter_number(), buffer->index()); VLOG(3) << "New allocation #" << allocation->index() @@ -933,20 +933,18 @@ Status BufferAssigner::AssignBuffersForComputation( } if (is_thread_local) { - // We do not reuse thread-local buffers for now, because they are - // dynamically allocated and their lifetimes are hard to compute. - BufferAllocation* allocation = assignment->NewAllocation( - *buffer, buffer_size, is_thread_local, /*is_reusable=*/false); + BufferAllocation* allocation = + assignment->NewAllocation(*buffer, buffer_size); + allocation->set_is_thread_local(true); VLOG(3) << "New allocation #" << allocation->index() << " for thread-local: " << *buffer; continue; } if (ShapeUtil::IsTuple(buffer->shape())) { - // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend - // assumes longer buffer liveness than indicated by the analysis. 
- BufferAllocation* allocation = assignment->NewAllocation( - *buffer, buffer_size, is_thread_local, /*is_reusable=*/false); + BufferAllocation* allocation = + assignment->NewAllocation(*buffer, buffer_size); + allocation->set_is_tuple(true); VLOG(3) << "New allocation #" << allocation->index() << " for tuple-shaped buffer: " << *buffer; continue; @@ -1029,8 +1027,8 @@ Status BufferAssigner::AssignBuffersForComputation( } if (!assignment->HasAllocation(*buffer)) { - BufferAllocation* allocation = assignment->NewAllocation( - *buffer, buffer_size, is_thread_local, /*is_reusable=*/true); + BufferAllocation* allocation = + assignment->NewAllocation(*buffer, buffer_size); allocation_indices.push_back(allocation->index()); VLOG(3) << "New allocation #" << allocation->index() << " for: " << *buffer; @@ -1084,13 +1082,14 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); HeapSimulator::Options options; + options.alloc_constants = allocate_buffers_for_constants_; BufferValueFlatSet buffer_value_set = ToBufferValueFlatSet(single_colored_set.second); options.buffers_to_assign = &buffer_value_set; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique( - MakeUnique(alignment)), + HeapSimulator::Run(absl::make_unique( + absl::make_unique(alignment)), assignment->module(), module_sequence, assignment->points_to_analysis(), assignment->buffer_size_, options)); @@ -1119,11 +1118,12 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( options.buffers_to_assign = &buffer_value_set; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique( - MakeUnique(alignment)), - *computation, *instruction_sequence, - assignment->points_to_analysis(), - assignment->buffer_size_, options)); + HeapSimulator::Run( + absl::make_unique( + absl::make_unique(alignment)), + *computation, *instruction_sequence, + 
assignment->points_to_analysis(), assignment->buffer_size_, + options)); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } @@ -1226,8 +1226,8 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( result.fragmentation_size; } - BufferAllocation* allocation = assignment->NewEmptyAllocation( - result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color); + BufferAllocation* allocation = + assignment->NewEmptyAllocation(result.heap_size, color); for (const auto& buffer_chunk : result.chunk_map) { // TODO(lauj) Remove this down_cast after downstream users of // BufferAllocation::assigned_buffers() are updated to use BufferValue. @@ -1331,11 +1331,25 @@ BufferAssigner::MergeColocatedBufferSets( auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness, &buffer_size, &is_entry_parameter](int64 i, int64 j) { - // Do not merge if one of the sets includes live outs or entry parameters. + // Do not merge if one of the sets includes live outs, entry parameters or + // constants. + // + // Buffer liveness does not report the correct live range for entry + // parameter and live out buffers so we have to special case them here. On + // backends that support constant buffer allocations, constant buffers are + // assigned globals in readonly storage so we can't merge colocated buffer + // sets containing constants with colocated buffer sets containing writing + // instructions or other constants. + // + // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to + // the caller of the executable so we can't write to entry parameters + // either, and the argument for not merging constants also applies to entry + // parameters. 
for (int64 key : {i, j}) { for (auto& buffer : colocated_buffer_sets[key]) { if (buffer_liveness.MaybeLiveOut(*buffer) || - is_entry_parameter(*buffer)) { + is_entry_parameter(*buffer) || + buffer->instruction()->opcode() == HloOpcode::kConstant) { return true; } } @@ -1417,9 +1431,9 @@ void BufferAssigner::BuildColocatedBufferSets( const HloInstruction* while_hlo = instruction; ShapeUtil::ForEachSubshape( while_hlo->shape(), - [this, while_hlo, &points_to_analysis, &buffer_liveness, - buffer_size, computation, colocated_buffer_sets]( - const Shape& /*subshape*/, const ShapeIndex& index) { + [this, while_hlo, &points_to_analysis, buffer_size, + colocated_buffer_sets](const Shape& /*subshape*/, + const ShapeIndex& index) { std::vector colocated_set; // Add while.init. AddBufferToColocatedSet(while_hlo->operand(0), index, @@ -1443,8 +1457,23 @@ void BufferAssigner::BuildColocatedBufferSets( }); } else if (opcode == HloOpcode::kCall) { const HloInstruction* call_hlo = instruction; - const HloInstruction* root_hlo = - call_hlo->to_apply()->root_instruction(); + const HloComputation* callee = call_hlo->to_apply(); + const HloInstruction* root_hlo = callee->root_instruction(); + for (int64 i = 0; i < call_hlo->operand_count(); i++) { + const HloInstruction* call_param = callee->parameter_instruction(i); + const HloInstruction* call_operand = call_hlo->operand(i); + ShapeUtil::ForEachSubshape( + call_operand->shape(), + [&](const Shape& /*subshape*/, const ShapeIndex& index) { + std::vector colocated_set; + AddBufferToColocatedSet(call_param, index, points_to_analysis, + &colocated_set); + AddBufferToColocatedSet(call_operand, index, points_to_analysis, + &colocated_set); + AddSetToColocatedBufferSets(colocated_set, + colocated_buffer_sets); + }); + } ShapeUtil::ForEachSubshape( call_hlo->shape(), [this, call_hlo, root_hlo, &points_to_analysis, @@ -1550,6 +1579,7 @@ void BufferAssigner::AssignColocatedBufferSets( // param in 'colocated_buffer_set'. 
int64 entry_parameter_number = -1; const ShapeIndex* entry_parameter_shape_idx = nullptr; + bool is_constant = false; for (const LogicalBuffer* buffer : colocated_buffer_set) { const HloInstruction* instruction = buffer->instruction(); const HloComputation* computation = instruction->parent(); @@ -1557,10 +1587,14 @@ void BufferAssigner::AssignColocatedBufferSets( computation == computation->parent()->entry_computation()) { entry_parameter_number = instruction->parameter_number(); entry_parameter_shape_idx = &buffer->index(); - break; + } else if (instruction->opcode() == HloOpcode::kConstant) { + is_constant = true; } } + CHECK(!is_constant || entry_parameter_number == -1) + << "Copy insertion should have inserted copies to prevent this."; + for (const LogicalBuffer* buffer : colocated_buffer_set) { const int64 buffer_size = assignment->buffer_size_(*buffer); if (allocation == nullptr) { @@ -1568,18 +1602,14 @@ void BufferAssigner::AssignColocatedBufferSets( // allocations for each colocated buffer set. When liveness has // module-level scope, we can allow buffers to be shared across // computations (in some cases). - allocation = assignment->NewAllocation(*buffer, buffer_size, - /*is_thread_local=*/false, - /*is_reusable=*/true); + allocation = assignment->NewAllocation(*buffer, buffer_size); if (entry_parameter_number >= 0) { - // This colocated buffer set contains an entry parameter and other - // logical buffers which use the parameter as read-only in a while - // body computation (which updates in place). - // Set 'entry_computation_parameter' to indicate that it contains - // an entry parameter, and to prevent reuse in MaybeAssignBuffer. 
allocation->set_entry_computation_parameter( entry_parameter_number, *entry_parameter_shape_idx); } + if (is_constant) { + allocation->set_constant(true); + } colocated_allocations->insert(allocation->index()); } else { CHECK_EQ(buffer_size, allocation->size()) @@ -1605,7 +1635,8 @@ StatusOr> BufferAssigner::CreateAssignment( XLA_VLOG_LINES(3, liveness->ToString()); XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString()); - // Can't use MakeUnique because BufferAssignment constructor is private. + // Can't use absl::make_unique because BufferAssignment constructor is + // private. std::unique_ptr assignment( new BufferAssignment(module, std::move(liveness), std::move(buffer_size), std::move(color_alignment))); @@ -1637,7 +1668,7 @@ StatusOr> BufferAssigner::CreateAssignment( buffers_to_assign_sequentially; for (auto* computation : global_computations) { TF_RETURN_IF_ERROR(AssignBuffersForComputation( - computation, module->config().debug_options(), + computation, /*is_thread_local=*/false, colocated_buffers, colocated_allocations, &buffers_to_assign_sequentially, assignment.get())); } @@ -1658,7 +1689,7 @@ StatusOr> BufferAssigner::CreateAssignment( continue; } TF_RETURN_IF_ERROR(AssignBuffersForComputation( - computation, module->config().debug_options(), + computation, /*is_thread_local=*/true, colocated_buffers, colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr, assignment.get())); } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index ad0b0bf7c25d7194a06801e4ef1c9ee961f6b915..24ba7c16f548c10f58f41d2b88488939ca2d8e4d 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -32,8 +33,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/logging.h" @@ -42,6 +41,17 @@ limitations under the License. namespace xla { +// Walk the call graph of the HLO module and place each computation into either +// thread_local_computations or global_computations depending upon whether the +// computation requires thread-local allocations or global allocations. The +// elements in thread_local_computations and global_computations are in post +// order (if computation A has an instruction which calls computation B, then A +// will appear after B in the vector). +Status GatherComputationsByAllocationType( + const HloModule* module, + std::vector* thread_local_computations, + std::vector* global_computations); + // This class abstracts an allocation of contiguous memory which can hold the // values described by LogicalBuffers. Each LogicalBuffer occupies a sub-range // of the allocation, represented by a Slice. A single BufferAllocation may hold @@ -58,13 +68,8 @@ class BufferAllocation { // contiguously and can be used as array indexes. 
using Index = int64; - BufferAllocation(Index index, int64 size, bool is_thread_local, - bool is_reusable, LogicalBuffer::Color color) - : index_(index), - size_(size), - is_thread_local_(is_thread_local), - is_reusable_(is_reusable), - color_(color) {} + BufferAllocation(Index index, int64 size, LogicalBuffer::Color color) + : index_(index), size_(size), color_(color) {} ~BufferAllocation() {} // Returns the index of this allocation. @@ -74,9 +79,28 @@ class BufferAllocation { // inside of a map or reduce computation. Such allocations need to be thread // local. bool is_thread_local() const { return is_thread_local_; } + void set_is_thread_local(bool is_thread_local) { + is_thread_local_ = is_thread_local; + } // Whether this allocation can be used by more than one logical buffer. - bool is_reusable() const { return is_reusable_; } + bool is_reusable() const { + // We do not reuse thread-local buffers for now, because they are + // dynamically allocated and their lifetimes are hard to compute. + // + // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend + // assumes longer buffer liveness than indicated by the analysis. + return !is_thread_local() && !is_tuple(); + } + + // Whether this allocation is readonly i.e. backed by memory we cannot write + // to. + bool is_readonly() const { + return is_entry_computation_parameter() || is_constant(); + } + + bool is_tuple() const { return is_tuple_; } + void set_is_tuple(bool is_tuple) { is_tuple_ = is_tuple; } // Whether this allocation holds a LogicalBuffer from a parameter of the entry // computation. These buffers have lifetimes which may be longer than the @@ -84,6 +108,13 @@ class BufferAllocation { bool is_entry_computation_parameter() const { return is_entry_computation_parameter_; } + + // Whether this allocation holds a constant. 
On the CPU and GPU backends + // constant allocations are not allocated dynamically, instead we resolve + // references to these buffer allocations to a global in the readonly section + // of the binary. + bool is_constant() const { return is_constant_; } + // If this allocation holds a Buffer from a parameter of the entry // computation, this methods returns the parameter number. CHECKs otherwise. int64 parameter_number() const { @@ -189,7 +220,9 @@ class BufferAllocation { // of the computation. !maybe_live_out() && // Thread-local buffers are allocated using `alloca`s. - !is_thread_local(); + !is_thread_local() && + // Constant buffers are allocated as global values. + !is_constant(); } // Add a heap trace which was used to assign slices to logical buffers in this @@ -245,6 +278,8 @@ class BufferAllocation { parameter_number_ = parameter_number; param_shape_index_ = std::move(param_shape_index); } + + void set_constant(bool is_constant) { is_constant_ = is_constant; } void set_maybe_live_out(bool value) { maybe_live_out_ = value; } void set_index(Index index) { index_ = index; } void set_size(int64 size) { size_ = size; } @@ -256,10 +291,10 @@ class BufferAllocation { int64 size_; // Whether this buffer needs to be thread-local. - bool is_thread_local_; + bool is_thread_local_ = false; - // Whether this buffer is usable by more than one logical buffer. - bool is_reusable_; + // Whether this buffer holds a tuple. + bool is_tuple_ = false; // Color of the allocation. LogicalBuffer::Color color_; @@ -283,6 +318,9 @@ class BufferAllocation { // might not actually escape. bool maybe_live_out_ = false; + // See comment on the is_constant() accessor. + bool is_constant_ = false; + // Mapping from the set of buffers assigned to this allocation to their // logical offsets and sizes. 
tensorflow::gtl::FlatMap assigned_buffers_; @@ -398,6 +436,8 @@ class BufferAssignment { struct Stats { int64 parameter_allocation_count = 0; int64 parameter_allocation_bytes = 0; + int64 constant_allocation_count = 0; + int64 constant_allocation_bytes = 0; int64 maybe_live_out_allocation_count = 0; int64 maybe_live_out_allocation_bytes = 0; int64 preallocated_temp_allocation_count = 0; @@ -426,14 +466,11 @@ class BufferAssignment { // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. - BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local, - bool is_reusable, - LogicalBuffer::Color color); + BufferAllocation* NewEmptyAllocation(int64 size, LogicalBuffer::Color color); // Helper that calls NewEmptyAllocation and AddAssignment in one call, // creating an allocation containing a single LogicalBuffer. - BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size, - bool is_thread_local, bool is_reusable); + BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size); // Adds a LogicalBuffer to the set assigned to the given allocation. void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer, @@ -493,12 +530,15 @@ class BufferAssigner { LogicalBuffer::SizeFunction buffer_size, LogicalBuffer::AlignmentFunction color_alignment, bool allow_input_output_aliasing = false, + bool allocate_buffers_for_constants = false, BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer()); private: BufferAssigner(bool allow_input_output_aliasing, + bool allocate_buffers_for_constants, BufferLiveness::Colorer colorer) : allow_input_output_aliasing_(allow_input_output_aliasing), + allocate_buffers_for_constants_(allocate_buffers_for_constants), colorer_(colorer) {} virtual ~BufferAssigner() = default; @@ -513,8 +553,7 @@ class BufferAssigner { // true, then all assigned buffers have the is_thread_local flag set to // true. 
Status AssignBuffersForComputation( - const HloComputation* computation, const DebugOptions& debug_options, - bool is_thread_local, + const HloComputation* computation, bool is_thread_local, const tensorflow::gtl::FlatSet& colocated_buffers, const tensorflow::gtl::FlatSet& colocated_allocations, @@ -595,6 +634,9 @@ class BufferAssigner { // buffers can be shared if their sizes match. bool allow_input_output_aliasing_; + // If true, allocate buffers for constant instructions. + bool allocate_buffers_for_constants_; + // Functor used to assign colors to newly allocated logical buffers. BufferLiveness::Colorer colorer_; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index a4fb0eefaca094898ed9acad8062484d1a36afe7..8bd1533972413194dec3609829c8cf8df570cc2a 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -21,11 +21,10 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/call_graph.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" @@ -33,12 +32,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" @@ -80,33 +79,46 @@ const std::vector GetInstructions(HloInstruction* root) { return main_list.GetInstructions(); } -class BufferAssignmentTest : public HloTestBase { +class BufferAssignmentTest : public HloVerifiedTestBase { protected: - BufferAssignmentTest() : computation_tracker_() {} ~BufferAssignmentTest() override {} std::unique_ptr RunBufferAssignment(HloModule* module, int64 alignment = 1) { return BufferAssigner::Run( - module, xla::MakeUnique(module), + module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), - [alignment](LogicalBuffer::Color) { return alignment; }) + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true) + .ConsumeValueOrDie(); + } + + std::unique_ptr RunBufferAssignmentNoBuffersForConstants( + HloModule* module, int64 alignment = 1) { + return BufferAssigner::Run( + module, absl::make_unique(module), + backend().compiler()->BufferSizeBytesFunction(), + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/false) .ConsumeValueOrDie(); } std::unique_ptr RunColoredBufferAssignment( HloModule* module, 
BufferLiveness::Colorer colorer, int64 alignment = 1) { return BufferAssigner::Run( - module, xla::MakeUnique(module), + module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), - [alignment](LogicalBuffer::Color) { return alignment; }, false, - std::move(colorer)) + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true, std::move(colorer)) .ConsumeValueOrDie(); } std::unique_ptr RunBufferAssignmentWithInstructionSequence( HloModule* module, - tensorflow::gtl::ArraySlice instruction_sequence, + absl::Span instruction_sequence, int64 alignment = 1) { SequentialHloOrdering::HloModuleSequence module_sequence; module_sequence[module->entry_computation()] = @@ -114,9 +126,12 @@ class BufferAssignmentTest : public HloTestBase { instruction_sequence.end()); return BufferAssigner::Run( module, - xla::MakeUnique(module, module_sequence), + absl::make_unique(module, + module_sequence), backend().compiler()->BufferSizeBytesFunction(), - [alignment](LogicalBuffer::Color) { return alignment; }) + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); } @@ -126,12 +141,23 @@ class BufferAssignmentTest : public HloTestBase { auto param = builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x")); auto value = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, param, value)); return builder.Build(); } + std::unique_ptr BuildReduceComputation(const string& name) { + auto builder = HloComputation::Builder(name); + auto param = + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x")); + auto param2 = + builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, 
"y")); + builder.AddInstruction( + HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, param, param2)); + return builder.Build(); + } + // Builds a simple compare-to-limit (x < 4) computation for a While. // // condition: @@ -143,13 +169,13 @@ class BufferAssignmentTest : public HloTestBase { const string& name) { auto builder = HloComputation::Builder(name); auto const4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4))); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, t_s32_f32v4_, "x")); auto index = builder.AddInstruction( HloInstruction::CreateGetTupleElement(const4->shape(), param, 0)); - builder.AddInstruction( - HloInstruction::CreateBinary(r0f32_, HloOpcode::kLt, index, const4)); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, index, const4)); return builder.Build(); } @@ -168,9 +194,9 @@ class BufferAssignmentTest : public HloTestBase { const string& name) { auto builder = HloComputation::Builder(name); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto constv = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, t_s32_f32v4_, "x")); auto indexc = builder.AddInstruction( @@ -252,9 +278,6 @@ class BufferAssignmentTest : public HloTestBase { return total_size; } - // Computation tracker for nested computations. - ComputationTracker computation_tracker_; - // Shapes for use in the examples. 
Shape s32_ = ShapeUtil::MakeShape(xla::S32, {}); Shape r0f32_ = ShapeUtil::MakeShape(xla::F32, {}); @@ -294,13 +317,19 @@ static bool BuffersDistinct(const std::vector& a, TEST_F(BufferAssignmentTest, ScalarConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); - // Check that the constant does not have a buffer assigned. - EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); + { + auto buffers = RunBufferAssignment(module); + EXPECT_TRUE(buffers->HasTopLevelAllocation(const0)); + } + + { + auto buffers = RunBufferAssignmentNoBuffersForConstants(module); + EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); + } } TEST_F(BufferAssignmentTest, BufferForConst) { @@ -308,20 +337,26 @@ TEST_F(BufferAssignmentTest, BufferForConst) { // no buffers assigned, and their consumer has a buffer. auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({4.1f, 4.2f, 4.3f, 4.4f}))); + LiteralUtil::CreateR1({4.1f, 4.2f, 4.3f, 4.4f}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); - // The two constant nodes have no buffers assigned. - EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); - EXPECT_FALSE(buffers->HasTopLevelAllocation(const1)); - // The add node has an output buffer. 
- GetAssignedOutputAllocation(*buffers, add); + { + auto buffers = RunBufferAssignment(module); + EXPECT_TRUE(buffers->HasTopLevelAllocation(const0)); + EXPECT_TRUE(buffers->HasTopLevelAllocation(const1)); + GetAssignedOutputAllocation(*buffers, add); + } + { + auto buffers = RunBufferAssignmentNoBuffersForConstants(module); + EXPECT_FALSE(buffers->HasTopLevelAllocation(const0)); + EXPECT_FALSE(buffers->HasTopLevelAllocation(const1)); + GetAssignedOutputAllocation(*buffers, add); + } } TEST_F(BufferAssignmentTest, HasAllocationAt) { @@ -331,7 +366,7 @@ TEST_F(BufferAssignmentTest, HasAllocationAt) { auto param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, f32vec100_, "param0")); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto tuple = builder.AddInstruction( @@ -339,7 +374,7 @@ TEST_F(BufferAssignmentTest, HasAllocationAt) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); // Make sure that HasAllocationAt() agrees with what HasTopLevelAllocation() // reports for the instruction directly. EXPECT_EQ(buffers->HasTopLevelAllocation(tuple), @@ -356,13 +391,13 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) { // This computation copies a constant to output. 
auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto copy = builder.AddInstruction( HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); // The copy node now has an output buffer. GetAssignedOutputAllocation(*buffers, copy); } @@ -375,13 +410,15 @@ TEST_F(BufferAssignmentTest, Basic) { // param1[100] --------------/--------/ auto builder = HloComputation::Builder(TestName()); auto paramscalar = - builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(1, f32vec100_, "")); + HloInstruction::CreateParameter(1, f32vec100_, "p1")); auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(2, f32vec100_, "")); + HloInstruction::CreateParameter(2, f32vec100_, "p2")); auto mul = builder.AddInstruction(HloInstruction::CreateBinary( - f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( @@ -389,7 +426,7 @@ TEST_F(BufferAssignmentTest, Basic) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); // Distinct input buffers were assigned for parameters. 
BufferAllocation paramscalar_buffer = @@ -422,13 +459,15 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { // share anything. auto builder = HloComputation::Builder(TestName()); auto paramscalar = - builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(1, f32vec100_, "")); + HloInstruction::CreateParameter(1, f32vec100_, "p1")); auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(2, f32vec100_, "")); + HloInstruction::CreateParameter(2, f32vec100_, "p2")); auto mul = builder.AddInstruction(HloInstruction::CreateBinary( - f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( @@ -448,7 +487,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { return Status::OK(); }; - auto buffers = RunColoredBufferAssignment(module.get(), colorer); + auto buffers = RunColoredBufferAssignment(module, colorer); // Distinct input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -481,13 +520,15 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { // have the color 0, which allows the mul and add to share buffers. 
auto builder = HloComputation::Builder(TestName()); auto paramscalar = - builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(1, f32vec100_, "")); + HloInstruction::CreateParameter(1, f32vec100_, "p1")); auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(2, f32vec100_, "")); + HloInstruction::CreateParameter(2, f32vec100_, "p2")); auto mul = builder.AddInstruction(HloInstruction::CreateBinary( - f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( @@ -515,7 +556,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { return Status::OK(); }; - auto buffers = RunColoredBufferAssignment(module.get(), colorer); + auto buffers = RunColoredBufferAssignment(module, colorer); // Distinct input buffers were assigned for parameters. 
BufferAllocation paramscalar_buffer = @@ -551,13 +592,15 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) { // auto builder = HloComputation::Builder(TestName()); auto paramscalar = - builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(1, f32vec100_, "")); + HloInstruction::CreateParameter(1, f32vec100_, "p1")); auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(2, f32vec100_, "")); + HloInstruction::CreateParameter(2, f32vec100_, "p2")); auto mul = builder.AddInstruction(HloInstruction::CreateBinary( - f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); auto sub = builder.AddInstruction( @@ -565,7 +608,7 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); // Input buffers were assigned for parameters. BufferAllocation paramscalar_buffer = @@ -605,7 +648,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) { // Creates the main kernel and verifies instruction counts. 
auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32a100x10_, "")); + HloInstruction::CreateParameter(0, f32a100x10_, "p")); auto map = builder.AddInstruction( HloInstruction::CreateMap(f32a100x10_, {param0}, map_computation)); module->AddEntryComputation(builder.Build()); @@ -616,7 +659,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) { EXPECT_EQ(3, level1.size()) << "Invalid nested add+1 size"; // Assigns buffers and fetches sizes. - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); int64 size0 = ValidateBuffers(level0, *buffers); int64 size1 = ValidateBuffers(level1, *buffers); @@ -651,20 +694,20 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) { // output. (Reuse is not safe in the general case, as it reshapes and some // out-of-order reductions could overwrite an element before a use.) // - // param0[100] --- (exp1) --- (exp2) --- (reduce x+1) --- (exp3) + // param0[100] --- (exp1) --- (exp2) --- (reduce x+y) --- (exp3) auto module = CreateNewModule(); auto reduce_computation = - module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1")); + module->AddEmbeddedComputation(BuildReduceComputation("f32+f32")); auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32a100x10_, "")); + HloInstruction::CreateParameter(0, f32a100x10_, "p")); auto exp1 = builder.AddInstruction( HloInstruction::CreateUnary(f32a100x10_, HloOpcode::kExp, param0)); auto exp2 = builder.AddInstruction( HloInstruction::CreateUnary(f32a100x10_, HloOpcode::kExp, exp1)); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto reduce = builder.AddInstruction(HloInstruction::CreateReduce( /*shape=*/f32vec10_, /*operand=*/exp2, @@ -675,7 +718,7 @@ TEST_F(BufferAssignmentTest, 
CannotReuseInputBufferOfReduce) { module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); const std::vector instrs = GetInstructions(exp3); ValidateBuffers(instrs, *buffers); @@ -712,9 +755,9 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { // Creates the main kernel and verifies instruction counts. auto builder = HloComputation::Builder(TestName()); auto const3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto const4 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 4.4f}))); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({const3, const4})); auto while_op = builder.AddInstruction(HloInstruction::CreateWhile( @@ -731,7 +774,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) { EXPECT_EQ(8, levelb.size()) << "Invalid nested body size"; // Assigns buffers and fetches sizes. 
- auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); int64 size0 = ValidateBuffers(level0, *buffers); int64 sizec = ValidateBuffers(levelc, *buffers); int64 sizeb = ValidateBuffers(levelb, *buffers); @@ -777,11 +820,11 @@ TEST_F(BufferAssignmentTest, ExampleConditional) { auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(56.4f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(56.4f))); auto const2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(12.4f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(12.4f))); auto conditional = builder.AddInstruction(HloInstruction::CreateConditional( r0f32_, pred, const1, true_computation, const2, false_computation)); module->AddEntryComputation(builder.Build()); @@ -796,7 +839,7 @@ TEST_F(BufferAssignmentTest, ExampleConditional) { EXPECT_EQ(2, true_instrs.size()); EXPECT_EQ(2, false_instrs.size()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); ValidateBuffers(conditional_instrs, *buffers); ValidateBuffers(true_instrs, *buffers); ValidateBuffers(false_instrs, *buffers); @@ -822,7 +865,7 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) { // param0[100] ---> (exp) ---> (tanh) ---> (exp) ---> (neg) auto builder = HloComputation::Builder(TestName()); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32vec100_, "")); + HloInstruction::CreateParameter(0, f32vec100_, "p")); auto exp1 = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kExp, param0)); auto tanh = builder.AddInstruction( @@ -834,7 +877,7 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) { auto module = CreateNewModule(); 
module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // tanh and exp2 can reuse exp1's buffer EXPECT_TRUE(assignment->HasTopLevelAllocation(exp1)); @@ -863,7 +906,7 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // negate and broadcast should share a buffer. EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast)); @@ -896,7 +939,7 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // The instructions should not share buffers. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -933,7 +976,7 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // The instructions should not share buffers. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -968,7 +1011,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // The broadcast output buffer cannot be shared. 
EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1000,7 +1043,7 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // negate and broadcast should share a buffer. EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast)); @@ -1038,7 +1081,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // The broadcast output buffer cannot be shared. EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast), @@ -1082,7 +1125,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { HloInstruction::CreateMap(vec_shape, {call}, map_computation)); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // Allocations for the map computation should be thread-local and not // live-out. @@ -1098,7 +1141,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) { // Allocations for the call computation should not be thread-local. 
auto& call_param_alloc = GetTopLevelAllocation(*assignment, call_param); - EXPECT_FALSE(call_param_alloc.is_entry_computation_parameter()); + EXPECT_TRUE(call_param_alloc.is_entry_computation_parameter()); EXPECT_FALSE(call_param_alloc.maybe_live_out()); EXPECT_FALSE(call_param_alloc.is_thread_local()); @@ -1131,7 +1174,7 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // There should be four allocations: one for vector of pointers, and one for // each tuple element. @@ -1167,7 +1210,7 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // Only some of the elements of the input param are liveout. EXPECT_FALSE( @@ -1200,16 +1243,17 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) { // TODO(b/32248867): Enable when buffer assignment gives allocations to // constants. -TEST_F(BufferAssignmentTest, DISABLED_TupleConstantAsOutput) { +TEST_F(BufferAssignmentTest, TupleConstantAsOutput) { // Test that a tuple constant which is forwarded to the computation output // is properly handled. 
auto builder = HloComputation::Builder(TestName()); - builder.AddInstruction(HloInstruction::CreateConstant(Literal::MakeTuple( - {Literal::CreateR0(0).get(), Literal::CreateR0(1).get()}))); + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::MakeTuple({LiteralUtil::CreateR0(0).get(), + LiteralUtil::CreateR0(1).get()}))); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); EXPECT_EQ(3, assignment->Allocations().size()); } @@ -1223,7 +1267,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) { /*operands=*/{}, /*custom_call_target=*/"foo_function")); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); EXPECT_EQ(3, assignment->Allocations().size()); EXPECT_TRUE( @@ -1254,18 +1298,20 @@ TEST_F(BufferAssignmentTest, TupleCallAsOutput) { HloInstruction::CreateCall(tuple_shape, {param}, sub_computation)); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); - EXPECT_EQ(3, assignment->Allocations().size()); + EXPECT_EQ(2, assignment->Allocations().size()); // Buffers for call are colocated with the sub-computation. EXPECT_EQ(GetAllocation(*assignment, call, /*index=*/{}), GetAllocation(*assignment, sub_tuple, /*index=*/{})); EXPECT_EQ(GetAllocation(*assignment, call, /*index=*/{0}), GetAllocation(*assignment, sub_param, /*index=*/{})); - // The parameter isn't aliased with anything. + + // The parameter isn't aliased with the result tuple, but it is aliased with + // the call operand. 
EXPECT_NE(GetTopLevelAllocation(*assignment, param), GetTopLevelAllocation(*assignment, sub_tuple)); - EXPECT_NE(GetTopLevelAllocation(*assignment, param), + EXPECT_EQ(GetTopLevelAllocation(*assignment, param), GetTopLevelAllocation(*assignment, sub_param)); } @@ -1314,7 +1360,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) { module->AddEntryComputation(std::move(a_computation)); module->AddEmbeddedComputation(std::move(b_computation)); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // Buffers for call are colocated with the sub-computations. EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{}), @@ -1329,13 +1375,15 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) { GetAllocation(*assignment, c_call, /*index=*/{0})); EXPECT_EQ(GetAllocation(*assignment, c_call, /*index=*/{0}), GetAllocation(*assignment, d_param, /*index=*/{0})); - // The parameters aren't aliased with anything. + EXPECT_TRUE(BuffersDistinct({a_param}, {b_param}, *assignment)); EXPECT_TRUE(BuffersDistinct({a_param}, {c_param}, *assignment)); EXPECT_TRUE(BuffersDistinct({a_param}, {d_param}, *assignment)); - EXPECT_TRUE(BuffersDistinct({b_param}, {c_param}, *assignment)); - EXPECT_TRUE(BuffersDistinct({b_param}, {d_param}, *assignment)); - EXPECT_TRUE(BuffersDistinct({c_param}, {d_param}, *assignment)); + + EXPECT_EQ(GetAllocation(*assignment, b_param, /*index=*/{0}), + GetAllocation(*assignment, c_param, /*index=*/{0})); + EXPECT_EQ(GetAllocation(*assignment, c_param, /*index=*/{0}), + GetAllocation(*assignment, d_param, /*index=*/{0})); } TEST_F(BufferAssignmentTest, BitcastAsOutput) { @@ -1348,7 +1396,7 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // Bitcast should get the same allocation as the param. 
EXPECT_EQ(1, assignment->Allocations().size()); @@ -1369,12 +1417,13 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) { HloInstruction::CreateParameter(1, tuple_shape, "param1")); auto pred_param = builder.AddInstruction(HloInstruction::CreateParameter( 2, ShapeUtil::MakeShape(PRED, {}), "param1")); - auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred_param, tuple_param0, tuple_param1)); + auto select = builder.AddInstruction( + HloInstruction::CreateTernary(tuple_shape, HloOpcode::kTupleSelect, + pred_param, tuple_param0, tuple_param1)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // Select shallow copies one of its operands so it defines its own top-level // buffer and receives its own allocation. @@ -1412,7 +1461,7 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(module); // There should be no buffer reuse. The copy should not reuse the tuple // buffer. @@ -1446,12 +1495,12 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) { auto dot_bc = builder.AddInstruction( HloInstruction::CreateDot(shape_3x4, param_b, param_c, dot_dnums)); builder.AddInstruction( - HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 1)); + HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 0)); // Run buffer assignment with alignment=1. auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto assignment = RunBufferAssignment(module.get(), /*alignment=*/1); + auto assignment = RunBufferAssignment(module, /*alignment=*/1); // There are 5 allocations: 3 parameters, 1 output, and 1 temp. 
EXPECT_EQ(5, assignment->Allocations().size()); @@ -1470,7 +1519,7 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) { EXPECT_EQ(80, slice_bc.allocation()->size()); // Re-run buffer assignment with alignment=64. - assignment = RunBufferAssignment(module.get(), /*alignment=*/64); + assignment = RunBufferAssignment(module, /*alignment=*/64); EXPECT_EQ(5, assignment->Allocations().size()); slice_ab = assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie(); slice_bc = assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie(); @@ -1500,13 +1549,15 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { // param1[100] --------------/--------/ auto builder = HloComputation::Builder(TestName()); auto paramscalar = - builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "")); + builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {})); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(1, f32vec100_, "")); + HloInstruction::CreateParameter(1, f32vec100_, "p1")); auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(2, f32vec100_, "")); + HloInstruction::CreateParameter(2, f32vec100_, "p2")); auto mul = builder.AddInstruction(HloInstruction::CreateBinary( - f32vec100_, HloOpcode::kMultiply, paramscalar, param0)); + f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); builder.AddInstruction(HloInstruction::CreateBinary( @@ -1514,16 +1565,13 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); - // Trivially, the set of peak memory logical buffer(s) of an allocation with a - // single logical buffer should be 
exactly the logical buffer in that - // allocation. const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); const std::vector& peak_buffers = mul_buffer.PeakMemoryLogicalBuffers(); ASSERT_EQ(peak_buffers.size(), 1); - EXPECT_EQ(peak_buffers[0]->instruction(), mul); + EXPECT_EQ(peak_buffers[0]->instruction(), broadcast); } TEST_F(BufferAssignmentTest, PeakBuffers) { @@ -1540,7 +1588,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { // be {%rev, %neg, %concat}. This occurs right at the concat itself. auto builder = HloComputation::Builder(TestName()); auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32vec100_, "")); + HloInstruction::CreateParameter(0, f32vec100_, "p")); auto log = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kLog, param)); auto rev = builder.AddInstruction( @@ -1559,7 +1607,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { module->AddEntryComputation(builder.Build()); auto buffers = RunBufferAssignmentWithInstructionSequence( - module.get(), {param, log, rev, neg, concat, root}); + module, {param, log, rev, neg, concat, root}); // The temporary buffer should hold the 4 interior instructions. 
const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, concat); @@ -1587,7 +1635,7 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) { auto b = HloComputation::Builder(TestName() + ".cond"); b.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); condition = module->AddEmbeddedComputation(b.Build()); } HloComputation* body; @@ -1615,7 +1663,7 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) { ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0})); module->AddEntryComputation(builder.Build()); - auto buffers = RunBufferAssignment(module.get()); + auto buffers = RunBufferAssignment(module); const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast); const std::vector& peak_buffers = buffer.PeakMemoryLogicalBuffers(); @@ -1642,7 +1690,65 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) { nonbcast_buffer->instruction() == condition->parameter_instruction(0)); } -class WhileBufferAssignmentTest : public HloTestBase { +TEST_F(BufferAssignmentTest, ConstantBuffersAreNotReused) { + const char* hlo_text = R"( +HloModule Module + +True { + ROOT x.0.1 = f32[] parameter(0) +} + +False { + x.0.0 = f32[] parameter(0) + ROOT copy.1 = f32[] copy(x.0.0) +} + +ENTRY main { + pred.1.0 = pred[] parameter(0) + constant.1.1 = f32[] constant(56) + copy.2 = f32[] copy(constant.1.1) + constant.1.2 = f32[] constant(12) + ROOT conditional.1.3 = f32[] conditional(pred.1.0, copy.2, constant.1.2), + true_computation=True, false_computation=False +} +)"; + + ParseAndVerifyModule(hlo_text); + HloInstruction* constant_1 = + module().entry_computation()->GetInstructionWithName("constant.1.1"); + HloInstruction* constant_2 = + module().entry_computation()->GetInstructionWithName("constant.1.2"); + + auto buffers = RunBufferAssignment(&module()); + + { + const BufferAllocation& allocation_for_const_1 = + 
GetTopLevelAllocation(*buffers, constant_1); + EXPECT_TRUE(allocation_for_const_1.is_constant()); + for (const auto& buffer_offset_pair : + allocation_for_const_1.assigned_buffers()) { + EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(), + HloOpcode::kCopy); + EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(), + HloOpcode::kConditional); + } + } + + { + const BufferAllocation& allocation_for_const_2 = + GetTopLevelAllocation(*buffers, constant_2); + EXPECT_TRUE(allocation_for_const_2.is_constant()); + for (const auto& buffer_offset_pair : + allocation_for_const_2.assigned_buffers()) { + EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(), + HloOpcode::kCopy); + EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(), + HloOpcode::kConditional); + } + } +} + +class WhileBufferAssignmentTest : public HloVerifiedTestBase { protected: std::unique_ptr BuildWhileConditionComputation( const string& name) { @@ -1650,9 +1756,9 @@ class WhileBufferAssignmentTest : public HloTestBase { builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state")); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto ten = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(10))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))); builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten)); return builder.Build(); @@ -1677,11 +1783,14 @@ class WhileBufferAssignmentTest : public HloTestBase { std::unique_ptr RunBufferAssignment(HloModule* module, int64 alignment = 1) { auto sequence = - CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleComputationsInModule(*module, ByteSizeOf).ConsumeValueOrDie(); return BufferAssigner::Run( - module, xla::MakeUnique(module, sequence), + module, + absl::make_unique(module, 
sequence), ByteSizeOf, - [alignment](LogicalBuffer::Color) { return alignment; }) + [alignment](LogicalBuffer::Color) { return alignment; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); } @@ -1711,11 +1820,11 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { HloInstruction::CreateParameter(2, data_shape_, "weights1")); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0))); auto output0 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto output1 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto cond0 = module->AddEmbeddedComputation(BuildWhileConditionComputation("cond")); @@ -1739,8 +1848,8 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module.get()); - auto assignment = RunBufferAssignment(module.get()); + RunCopyInsertion(module); + auto assignment = RunBufferAssignment(module); // Verify 'input0' and read-only use while0{0} alias. EXPECT_EQ(assignment->GetUniqueSlice(input0, {}).ConsumeValueOrDie(), @@ -1796,20 +1905,20 @@ ENTRY %test_module { ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={} })"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseAndVerifyModule(module_str); // Run CopyInsertion and check if the graph constructed above doesn't need // any copies inserted for BufferAssignment to run. 
- int64 instruction_count = module->instruction_count(); + int64 instruction_count = module().instruction_count(); CopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(module.get()).status()); - ASSERT_EQ(instruction_count, module->instruction_count()); + ASSERT_IS_OK(copy_insertion.Run(&module()).status()); + ASSERT_EQ(instruction_count, module().instruction_count()); // Get the instructions in the module. - const HloInstruction* bcast = module->entry_computation()->root_instruction(); + const HloInstruction* bcast = + module().entry_computation()->root_instruction(); const HloInstruction* param = - module->entry_computation()->parameter_instruction(0); + module().entry_computation()->parameter_instruction(0); ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast); const HloInstruction* while1 = bcast->operand(0); ASSERT_EQ(while1->opcode(), HloOpcode::kWhile); @@ -1817,7 +1926,7 @@ ENTRY %test_module { ASSERT_EQ(while0->opcode(), HloOpcode::kWhile); // Run buffer assignment. - auto assignment = RunBufferAssignment(module.get()); + auto assignment = RunBufferAssignment(&module()); TF_ASSERT_OK_AND_ASSIGN(auto slice_param, assignment->GetUniqueSlice(param, {})); TF_ASSERT_OK_AND_ASSIGN(auto slice_while0, @@ -1831,6 +1940,74 @@ ENTRY %test_module { EXPECT_NE(slice_param, slice_while1); } +TEST_F(WhileBufferAssignmentTest, ColocatedBufferWithConstant) { + const Shape r0s32 = ShapeUtil::MakeShape(S32, {}); + + const char* module_str = R"( +HloModule test_module + +%cond.v0 { + %param = s32[] parameter(0) + ROOT %constant = pred[] constant(true) +} + +%cond.v1 { + %param.0 = s32[] parameter(0) + ROOT %constant.0 = pred[] constant(true) +} + +%body.v0 { + ROOT %param.1 = s32[] parameter(0) +} + +%body.v1 { + %param.2 = s32[] parameter(0) + ROOT add = s32[] add(%param.2, %param.2) +} + +ENTRY %test_module { + %constant.42 = s32[] constant(42) + %while.0 = s32[] while(%constant.42), condition=%cond.v0, body=%body.v0 + %mul = s32[] multiply(%while.0, %while.0) + 
%while.1 = s32[] while(%mul), condition=%cond.v1, body=%body.v1 + ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={} +})"; + + ParseAndVerifyModule(module_str); + + // Run CopyInsertion and check if the graph constructed above doesn't need + // any copies inserted for BufferAssignment to run. + int64 instruction_count = module().instruction_count(); + CopyInsertion copy_insertion; + ASSERT_IS_OK(copy_insertion.Run(&module()).status()); + ASSERT_EQ(instruction_count, module().instruction_count()); + + // Get the instructions in the module. + const HloInstruction* bcast = + module().entry_computation()->root_instruction(); + const HloInstruction* constant = + module().entry_computation()->GetInstructionWithName("constant.42"); + ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast); + const HloInstruction* while1 = bcast->operand(0); + ASSERT_EQ(while1->opcode(), HloOpcode::kWhile); + const HloInstruction* while0 = while1->operand(0)->operand(0); + ASSERT_EQ(while0->opcode(), HloOpcode::kWhile); + + // Run buffer assignment. + auto assignment = RunBufferAssignment(&module()); + TF_ASSERT_OK_AND_ASSIGN(auto slice_constant, + assignment->GetUniqueSlice(constant, {})); + TF_ASSERT_OK_AND_ASSIGN(auto slice_while0, + assignment->GetUniqueSlice(while0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto slice_while1, + assignment->GetUniqueSlice(while1, {})); + + // The constant slice is part of the while0's colocation set (init value), but + // not merged into the while1's colocation set. + EXPECT_EQ(slice_constant, slice_while0); + EXPECT_NE(slice_constant, slice_while1); +} + // Tests that the colocated buffers for while instructions are properly assigned // during buffer assignment such that the result tuple elements are not assigned // to the same buffer. 
@@ -1854,7 +2031,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { auto build_cond = [&]() { auto builder = HloComputation::Builder("cond"); auto const4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4))); auto param = builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x")); builder.AddInstruction(HloInstruction::CreateBinary( @@ -1866,7 +2043,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { auto build_body = [&]() { auto builder = HloComputation::Builder("body"); auto const9 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(9))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(9))); auto param = builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x")); builder.AddInstruction( @@ -1878,11 +2055,15 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { auto module = CreateNewModule(); auto builder = HloComputation::Builder("entry"); - auto infeed = builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, "")); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + auto infeed = + builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, token, "")); + auto infeed_data = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(r0s32, infeed, 0)); auto cond0 = module->AddEmbeddedComputation(build_cond()); auto body0 = module->AddEmbeddedComputation(build_body()); auto while0 = builder.AddInstruction( - HloInstruction::CreateWhile(r0s32, cond0, body0, infeed)); + HloInstruction::CreateWhile(r0s32, cond0, body0, infeed_data)); auto cond1 = module->AddEmbeddedComputation(build_cond()); auto body1 = module->AddEmbeddedComputation(build_body()); @@ -1890,7 +2071,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { HloInstruction::CreateWhile(r0s32, cond1, body1, while0)); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0s32, HloOpcode::kAdd, zero, zero)); auto cond2 = module->AddEmbeddedComputation(build_cond()); @@ -1906,22 +2087,23 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { // any copies inserted for BufferAssignment to run. int64 instruction_count = module->instruction_count(); CopyInsertion copy_insertion; - ASSERT_IS_OK(copy_insertion.Run(module.get()).status()); + ASSERT_IS_OK(copy_insertion.Run(module).status()); ASSERT_EQ(instruction_count, module->instruction_count()); // Create a sequential order among all the instructions in the entry // computation, since the issue this test stresses depends on the order the // nodes are traversed during BufferAssignment. SequentialHloOrdering::HloModuleSequence sequence; - sequence[module->entry_computation()] = {infeed, while0, while1, zero, - add, while2, tuple}; + sequence[module->entry_computation()] = { + token, infeed, infeed_data, while0, while1, zero, add, while2, tuple}; TF_ASSERT_OK_AND_ASSIGN( auto assignment, BufferAssigner::Run( - module.get(), - xla::MakeUnique(module.get(), sequence), + module, absl::make_unique(module, sequence), backend().compiler()->BufferSizeBytesFunction(), - [](LogicalBuffer::Color) { return 1; })); + [](LogicalBuffer::Color) { return 1; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true)); // The result tuple elements must be assigned with different buffers. 
TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0})); @@ -1952,9 +2134,9 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { HloInstruction::CreateParameter(1, data_shape_, "weights0")); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0))); auto output0 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto cond0 = module->AddEmbeddedComputation(BuildWhileConditionComputation("cond")); @@ -1975,8 +2157,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module.get()); - auto assignment = RunBufferAssignment(module.get()); + RunCopyInsertion(module); + auto assignment = RunBufferAssignment(module); // while0 and while1 buffers should be completely aligned. 
EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(), @@ -1996,16 +2178,16 @@ TEST_F(BufferAssignmentTest, TwoCalls) { auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "param")); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, constant1)); sub_computation = module->AddEmbeddedComputation(builder.Build(add)); } auto builder = HloComputation::Builder(TestName()); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto call1 = builder.AddInstruction( HloInstruction::CreateCall(r0f32, {constant2}, sub_computation)); auto call2 = builder.AddInstruction( @@ -2018,17 +2200,66 @@ TEST_F(BufferAssignmentTest, TwoCalls) { { FlattenCallGraph flatten; - TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module)); EXPECT_TRUE(result); - std::unique_ptr call_graph = CallGraph::Build(module.get()); + std::unique_ptr call_graph = CallGraph::Build(module); } - RunCopyInsertion(module.get()); - auto assignment = RunBufferAssignment(module.get()); + RunCopyInsertion(module); + auto assignment = RunBufferAssignment(module); EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment)); } +TEST_F(BufferAssignmentTest, CallParamCoAllocation) { + const char* hlo_text = R"( +HloModule CallParamCoAllocation + +Callee { + param0 = (f32[100],(f32[200],f32[300])) parameter(0) + param1 = s32[20] parameter(1) + ROOT constant = f32[] constant(1) +} + +ENTRY Main { + entry_param0 = f32[100] parameter(0) + 
entry_param1 = s32[20] parameter(1) + custom_call = (f32[200],f32[300]) custom-call(), custom_call_target="call-target" + call_op0 = (f32[100],(f32[200],f32[300])) tuple(entry_param0, custom_call) + ROOT call_result = f32[] call(call_op0, entry_param1), to_apply=Callee +} +)"; + + HloModuleConfig config; + config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + ParseAndVerifyModule(hlo_text, config); + + auto buffers = RunBufferAssignment(&module()); + + HloComputation* main = module().entry_computation(); + HloComputation* callee = module().GetComputationWithName("Callee"); + EXPECT_NE(callee, nullptr); + + HloInstruction* param0 = callee->parameter_instruction(0); + HloInstruction* param1 = callee->parameter_instruction(1); + + HloInstruction* entry_param0 = main->parameter_instruction(0); + HloInstruction* entry_param1 = main->parameter_instruction(1); + HloInstruction* custom_call = main->GetInstructionWithName("custom_call"); + + EXPECT_EQ(GetAllocation(*buffers, entry_param0, {}), + GetAllocation(*buffers, param0, {0})); + EXPECT_EQ(GetAllocation(*buffers, entry_param1, {}), + GetAllocation(*buffers, param1, {})); + + EXPECT_EQ(GetAllocation(*buffers, custom_call, {}), + GetAllocation(*buffers, param0, {1})); + EXPECT_EQ(GetAllocation(*buffers, custom_call, {0}), + GetAllocation(*buffers, param0, {1, 0})); + EXPECT_EQ(GetAllocation(*buffers, custom_call, {1}), + GetAllocation(*buffers, param0, {1, 1})); +} + static bool IsPostOrderTraversal( const std::vector& sequence) { tensorflow::gtl::FlatSet seen_so_far; @@ -2057,23 +2288,23 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { auto builder = HloComputation::Builder(TestName()); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0))); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto input0 = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape_, "input0")); auto weights0 = builder.AddInstruction( HloInstruction::CreateParameter(1, data_shape_, "weights0")); auto output0 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto input1 = builder.AddInstruction( HloInstruction::CreateParameter(2, data_shape_, "input1")); auto weights1 = builder.AddInstruction( HloInstruction::CreateParameter(3, data_shape_, "weights1")); auto output1 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, one, {1})); + HloInstruction::CreateBroadcast(data_shape_, one, {})); auto cond = module->AddEmbeddedComputation(BuildWhileConditionComputation("cond")); @@ -2093,21 +2324,21 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { HloInstruction::CreateGetTupleElement(data_shape_, while0, 0)); auto gte1 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(data_shape_, while1, 1)); - auto root_add = builder.AddInstruction(HloInstruction::CreateBinary( - while0->shape(), HloOpcode::kAdd, gte0, gte1)); + auto root_add = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, gte0, gte1)); module->AddEntryComputation(builder.Build()); { FlattenCallGraph flatten; - TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get())); + TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module)); EXPECT_TRUE(result); } - RunCopyInsertion(module.get()); + RunCopyInsertion(module); auto sequence = - CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleComputationsInModule(*module, ByteSizeOf).ConsumeValueOrDie(); // To trigger b/38494731, we want a specific Hlo sequence for the // root computation, so we overwrite that entry with a manually @@ -2123,9 +2354,10 @@ TEST_F(WhileBufferAssignmentTest, 
WhileLoopsInterferingResultRange) { auto assignment = BufferAssigner::Run( - module.get(), - xla::MakeUnique(module.get(), sequence), - ByteSizeOf, [](LogicalBuffer::Color) { return 1; }) + module, absl::make_unique(module, sequence), + ByteSizeOf, [](LogicalBuffer::Color) { return 1; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment)); @@ -2141,11 +2373,11 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) { HloInstruction::CreateParameter(1, data_shape_, "weights0")); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0))); auto output0 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto output1 = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape_, zero, {1})); + HloInstruction::CreateBroadcast(data_shape_, zero, {})); auto cond0 = module->AddEmbeddedComputation(BuildWhileConditionComputation("cond")); @@ -2176,8 +2408,8 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) { HloInstruction::CreateGetTupleElement(data_shape_, while1, 2)); module->AddEntryComputation(builder.Build()); - RunCopyInsertion(module.get()); - auto assignment = RunBufferAssignment(module.get()); + RunCopyInsertion(module); + auto assignment = RunBufferAssignment(module); // Get BufferAllocation for root instruction. 
auto* root_alloc = assignment->GetUniqueTopLevelSlice(while1_out) .ConsumeValueOrDie() diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index 810d597e730c1823668c81598df6138655e58b55..9b2783a214a686f3148723d19bbc94421fc8b4e4 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -28,8 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -75,27 +75,25 @@ Status BufferLiveness::Analyze() { string BufferLiveness::ToString() const { std::vector pieces; - pieces.push_back(tensorflow::strings::Printf("BufferLiveness(module=%s):", - module_->name().c_str())); + pieces.push_back( + absl::StrFormat("BufferLiveness(module=%s):", module_->name())); pieces.push_back("HloOrdering:"); pieces.push_back(hlo_ordering_->ToString()); - pieces.push_back(tensorflow::strings::Printf("Aliased buffers:")); + pieces.push_back("Aliased buffers:"); for (const LogicalBuffer* buffer : aliased_buffers_) { - pieces.push_back( - tensorflow::strings::Printf(" %s", buffer->ToString().c_str())); + pieces.push_back(absl::StrFormat(" %s", buffer->ToString())); } - pieces.push_back(tensorflow::strings::Printf("Live out buffers:")); + pieces.push_back("Live out buffers:"); for (const LogicalBuffer* buffer : maybe_live_out_buffers_) { - pieces.push_back( - 
tensorflow::strings::Printf(" %s", buffer->ToString().c_str())); + pieces.push_back(absl::StrFormat(" %s", buffer->ToString())); } - return tensorflow::str_util::Join(pieces, "\n"); + return absl::StrJoin(pieces, "\n"); } bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a, const LogicalBuffer& b) const { - TF_CHECK_OK(points_to_analysis_->VerifyBuffer(a)); - TF_CHECK_OK(points_to_analysis_->VerifyBuffer(b)); + TF_DCHECK_OK(points_to_analysis_->VerifyBuffer(a)); + TF_DCHECK_OK(points_to_analysis_->VerifyBuffer(b)); if (!hlo_ordering_->ExecutesBefore(a.instruction(), b.instruction())) { return false; diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc index f623aef67a4f98b447a9a15634a78deb60cfe6f1..26e26e316d6281a97f8317f8ed1d7a6f21b0d374 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc @@ -18,8 +18,9 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -119,8 +120,8 @@ TEST_F(BufferLivenessTest, ElementwiseChain) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate)); @@ -167,10 +168,10 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) { SequentialHloOrdering::HloModuleSequence sequence; sequence.insert({entry, {param0, negate, param1, exp, add}}); - auto liveness = - BufferLiveness::Run(module.get(), xla::MakeUnique( - module.get(), sequence)) - .ConsumeValueOrDie(); + auto liveness = BufferLiveness::Run(module.get(), + absl::make_unique( + module.get(), sequence)) + .ConsumeValueOrDie(); // Entry parameters interfere as if they are defined simultaneously at // the very beginning. 
@@ -215,8 +216,8 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp)); @@ -249,8 +250,8 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate)); @@ -293,10 +294,10 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) { SequentialHloOrdering::HloModuleSequence module_sequence; std::vector order = {param, negate, exp, add}; module_sequence.emplace(computation, order); - auto liveness = - BufferLiveness::Run(module.get(), xla::MakeUnique( - module.get(), module_sequence)) - .ConsumeValueOrDie(); + auto liveness = BufferLiveness::Run(module.get(), + absl::make_unique( + module.get(), module_sequence)) + .ConsumeValueOrDie(); EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate)); EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp)); @@ -327,11 +328,12 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) { builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, param, param)); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto recv = builder.AddInstruction( - HloInstruction::CreateRecv(vec_, /*channel_id=*/0)); + HloInstruction::CreateRecv(vec_, token, /*channel_id=*/0)); auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); auto send = builder.AddInstruction( - HloInstruction::CreateSend(recv_done, 
/*channel_id=*/1)); + HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1)); auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); auto module = CreateNewModule(); @@ -341,10 +343,10 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) { std::vector order = {param, add, recv, recv_done, send, send_done}; module_sequence.emplace(computation, order); - auto liveness = - BufferLiveness::Run(module.get(), xla::MakeUnique( - module.get(), module_sequence)) - .ConsumeValueOrDie(); + auto liveness = BufferLiveness::Run(module.get(), + absl::make_unique( + module.get(), module_sequence)) + .ConsumeValueOrDie(); EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add)); // Check the root instruction (add) buffer interferes with the recv buffer. @@ -375,8 +377,8 @@ TEST_F(BufferLivenessTest, TupleLiveOut) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); // All buffers should be live out except the param @@ -411,8 +413,8 @@ TEST_F(BufferLivenessTest, EmbeddedComputation) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); // Buffers in different computations should always interfere. @@ -438,11 +440,13 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) { // computation. The buffer containing {0, 1} is copied by GetTupleElement, and // the buffers containing {3} and 3 are dead. 
auto builder = HloComputation::Builder(TestName()); - auto inner_tuple0 = Literal::MakeTuple( - {Literal::CreateR0(0).get(), Literal::CreateR0(1).get()}); - auto inner_tuple1 = Literal::MakeTuple({Literal::CreateR0(3).get()}); + auto inner_tuple0 = + LiteralUtil::MakeTuple({LiteralUtil::CreateR0(0).get(), + LiteralUtil::CreateR0(1).get()}); + auto inner_tuple1 = + LiteralUtil::MakeTuple({LiteralUtil::CreateR0(3).get()}); auto tuple_constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::MakeTuple({inner_tuple0.get(), inner_tuple1.get()}))); + LiteralUtil::MakeTuple({inner_tuple0.get(), inner_tuple1.get()}))); builder.AddInstruction(HloInstruction::CreateGetTupleElement( inner_tuple0->shape(), tuple_constant, 0)); @@ -450,8 +454,8 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) { module->AddEntryComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); // Only the element buffers of the tuple constant which are pointed to by @@ -490,7 +494,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( tuple_element0_shape, tuple_param0, 0)); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); + LiteralUtil::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( tuple_element0_shape, HloOpcode::kAdd, tuple_element0, const0)); @@ -502,7 +506,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( tuple_element1_shape, tuple_param0, 1)); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 
2.f}))); auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( tuple_element1_shape, HloOpcode::kAdd, tuple_element1, const1)); @@ -515,8 +519,8 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) { module->AddEmbeddedComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); // We compare tuple element pairs that are input/output to the computation: @@ -554,7 +558,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( tuple_element0_shape, tuple_param0, 0)); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); + LiteralUtil::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( tuple_element0_shape, HloOpcode::kAdd, tuple_element0, const0)); @@ -577,8 +581,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { module->AddEmbeddedComputation(builder.Build()); auto liveness = - BufferLiveness::Run(module.get(), - xla::MakeUnique(module.get())) + BufferLiveness::Run( + module.get(), absl::make_unique(module.get())) .ConsumeValueOrDie(); // We compare tuple element pairs that are input/output to the computation: @@ -607,11 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) { class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { protected: // Builds and runs a computation (see test case computation graphs below). - // Runs BufferLiveness on this computation. - // Returns whether buffer interference is detected between tuple-shaped - // parameter and root instructions at tuple element 1. 
- bool Run(const bool update_uses_tuple_element1, - const bool fuse_gte0 = false) { + std::unique_ptr BuildModule(const bool update_uses_tuple_element1, + const bool fuse_gte0) { auto builder = HloComputation::Builder(TestName()); // Create param0 Tuple. Shape data_shape = ShapeUtil::MakeShape(F32, {8}); @@ -626,7 +627,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1)); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); HloInstruction* slice = nullptr; if (update_uses_tuple_element1) { // Create a slice instruction as an additional user of 'gte1'. @@ -637,17 +638,17 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { } // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'. auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); // Create output tuple. - auto tuple_root = builder.AddInstruction( + builder.AddInstruction( HloInstruction::CreateTuple({gte0, dynamic_update_slice})); // Build module and get reference to entry computation. auto module = CreateNewModule(); - module->AddEntryComputation(BuildDummyComputation()); - auto* computation = module->AddEmbeddedComputation(builder.Build()); + module->AddEntryComputation(builder.Build()); + auto* computation = module->entry_computation(); // Create fusion instruction based on number of tuple element 1 users. 
if (update_uses_tuple_element1) { computation->CreateFusionInstruction( @@ -663,16 +664,39 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { computation->CreateFusionInstruction({gte0}, HloInstruction::FusionKind::kLoop); } + return module; + } + // Returns whether buffer interference is detected between tuple-shaped + // parameter and root instructions at tuple element 1. + bool Run(const bool update_uses_tuple_element1, + const bool fuse_gte0 = false) { + auto module = BuildModule(update_uses_tuple_element1, fuse_gte0); // Run BufferLiveness on 'module'. - auto liveness = - BufferLiveness::Run( - module.get(), xla::MakeUnique(module.get())) - .ConsumeValueOrDie(); + auto liveness = BufferLiveness::Run( + module.get(), + absl::make_unique(module.get())) + .ConsumeValueOrDie(); // Return whether or not buffers interference is detected between // 'tuple_param0' and 'tuple_root' at shape index '{1}'. + auto tuple_param0 = FindInstruction(module.get(), "param0"); + auto tuple_root = module->entry_computation()->root_instruction(); return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1}); } + bool RunWithHloDataflowAnalysis(const bool update_uses_tuple_element1, + const bool fuse_gte0 = false) { + auto module = BuildModule(update_uses_tuple_element1, fuse_gte0); + // Run BufferLiveness on 'module'. + auto dataflow = HloDataflowAnalysis::Run(*module).ConsumeValueOrDie(); + auto hlo_ordering = absl::make_unique(module.get()); + // Return whether or not buffers interference is detected between + // 'tuple_param0' and 'tuple_root' at shape index '{1}'. 
+ auto tuple_param0 = FindInstruction(module.get(), "param0"); + auto tuple_root = module->entry_computation()->root_instruction(); + return hlo_ordering->MayInterfere( + dataflow->GetUniqueValueAt(tuple_param0, {1}), + dataflow->GetUniqueValueAt(tuple_root, {1}), *dataflow); + } }; // Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion) @@ -690,6 +714,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { // TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterference) { EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false)); + EXPECT_FALSE( + RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/false)); } // Tests that live ranges of buffers Param0[1] and Tuple[1] (which aliases @@ -709,6 +735,8 @@ TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterference) { // TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterferenceWithUnrelatedFusion) { EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false, /*fuse_gte0=*/true)); + EXPECT_FALSE(RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/false, + /*fuse_gte0=*/true)); } // Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion) @@ -733,6 +761,7 @@ TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterferenceWithUnrelatedFusion) { // TEST_F(FusedDynamicUpdateSliceLivenessTest, WithInterference) { EXPECT_TRUE(Run(/*update_uses_tuple_element1=*/true)); + EXPECT_TRUE(RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/true)); } class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { @@ -756,7 +785,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { HloInstruction::CreateGetTupleElement(data_shape, tuple_param0, 1)); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); if (tuple_element1_has_two_uses) { // Add 'gte0' and 'gte1' to create another user of 'gte1'. 
@@ -765,7 +794,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { } // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'. auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); @@ -777,10 +806,10 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest { module->AddEntryComputation(BuildDummyComputation()); module->AddEmbeddedComputation(builder.Build()); // Run BufferLiveness on 'module'. - auto liveness = - BufferLiveness::Run( - module.get(), xla::MakeUnique(module.get())) - .ConsumeValueOrDie(); + auto liveness = BufferLiveness::Run( + module.get(), + absl::make_unique(module.get())) + .ConsumeValueOrDie(); // Return whether or not buffers interference is detected between // 'tuple_param0' and 'tuple_root' at shape index '{1}'. return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1}); diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc index 2bc556a9e270136f5f3eaf2433f8c96eeeaea0a2..fdf822c666b15afbc7553ca89d4f92ab08201869 100644 --- a/tensorflow/compiler/xla/service/buffer_value.cc +++ b/tensorflow/compiler/xla/service/buffer_value.cc @@ -17,11 +17,10 @@ limitations under the License. 
#include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/types.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h index f4be16e0843f64f41ef27539bf263ae98ce0ebf9..69b36463560a1fad4f62687e9014fb3fbe5bbd13 100644 --- a/tensorflow/compiler/xla/service/buffer_value.h +++ b/tensorflow/compiler/xla/service/buffer_value.h @@ -19,12 +19,12 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc index a8053d15e124319c5c898f0034b9aaa95a007a89..23b2a327096dfdb3c756a4acc5476ec01dcac1b3 100644 --- a/tensorflow/compiler/xla/service/call_graph.cc +++ b/tensorflow/compiler/xla/service/call_graph.cc @@ -17,21 +17,21 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/types.h" namespace xla { -using ::tensorflow::strings::Appendf; -using ::tensorflow::strings::StrCat; +using absl::StrAppendFormat; +using absl::StrCat; string CallContextToString(CallContext context) { switch (context) { @@ -57,9 +57,11 @@ CallContext GetInstructionCallContext(HloOpcode opcode) { case HloOpcode::kConditional: case HloOpcode::kWhile: return CallContext::kSequential; + case HloOpcode::kCrossReplicaSum: case HloOpcode::kMap: case HloOpcode::kReduce: case HloOpcode::kReduceWindow: + case HloOpcode::kScatter: case HloOpcode::kSelectAndScatter: case HloOpcode::kFusion: return CallContext::kParallel; @@ -69,10 +71,10 @@ CallContext GetInstructionCallContext(HloOpcode opcode) { } string CallSite::ToString() const { - return StrCat(instruction()->name(), " calls in context ", - CallContextToString(context()), ": ", - tensorflow::str_util::Join( - called_computations(), ", ", + return StrCat( + instruction()->name(), " calls in context ", + CallContextToString(context()), ": ", + absl::StrJoin(called_computations(), ", ", [](string* out, const HloComputation* computation) { out->append(computation->name()); })); @@ -235,8 +237,8 @@ void CallGraph::SetCallContexts() { /* static */ std::unique_ptr CallGraph::Build(const HloModule* module) { - // Constructor for CallGraph is private so MakeUnique can't be used. 
- auto call_graph = WrapUnique(new CallGraph(module)); + // Constructor for CallGraph is private so absl::make_unique can't be used. + auto call_graph = absl::WrapUnique(new CallGraph(module)); VLOG(2) << "Building call graph for:"; XLA_VLOG_LINES(2, module->ToString()); @@ -354,20 +356,20 @@ CallGraph::NearestAncestorsInSameComputation(HloInstruction* a, string CallGraph::ToString() const { string out; - Appendf(&out, "Call graph for module %s:\n", module_->name().c_str()); + StrAppendFormat(&out, "Call graph for module %s:\n", module_->name()); for (const CallGraphNode& node : nodes()) { - Appendf(&out, "Computation %s:\n", node.computation()->name().c_str()); - Appendf(&out, " calls:\n"); + StrAppendFormat(&out, "Computation %s:\n", node.computation()->name()); + StrAppendFormat(&out, " calls:\n"); for (const HloComputation* callee : node.callees()) { - Appendf(&out, " %s\n", callee->name().c_str()); + StrAppendFormat(&out, " %s\n", callee->name()); } - Appendf(&out, " called by:\n"); + StrAppendFormat(&out, " called by:\n"); for (const HloComputation* caller : node.callers()) { - Appendf(&out, " %s\n", caller->name().c_str()); + StrAppendFormat(&out, " %s\n", caller->name()); } - Appendf(&out, " callsites:\n"); + StrAppendFormat(&out, " callsites:\n"); for (const CallSite& callsite : node.callsites()) { - Appendf(&out, " %s\n", callsite.ToString().c_str()); + StrAppendFormat(&out, " %s\n", callsite.ToString()); } } return out; diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h index 97d3811508adee1bf2d0942bcc69e3e34a41c8c3..3af2ab5edfd9faf4ac5193df4b823c21b55b2f7f 100644 --- a/tensorflow/compiler/xla/service/call_graph.h +++ b/tensorflow/compiler/xla/service/call_graph.h @@ -15,8 +15,8 @@ limitations under the License. // Call graph for an HLO module. 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_ #include @@ -272,4 +272,4 @@ class CallGraph { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_ diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc index 1ea7d538cd515c3098b6a1f03c6146d288330406..cc80b7484313329104eec1ce71a150b47d8330c9 100644 --- a/tensorflow/compiler/xla/service/call_graph_test.cc +++ b/tensorflow/compiler/xla/service/call_graph_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/call_graph.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -82,7 +82,7 @@ class CallGraphTest : public HloTestBase { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, kScalarShape, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero)); return builder.Build(); @@ -247,11 +247,11 @@ TEST_F(CallGraphTest, ComputationWithConditional) { HloComputation::Builder builder(TestName()); HloInstruction* pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloInstruction* const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(56.4f))); + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(56.4f))); HloInstruction* const2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(12.6f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(12.6f))); HloInstruction* conditional = builder.AddInstruction(HloInstruction::CreateConditional( kScalarShape, pred, const1, true_computation, const2, diff --git a/tensorflow/compiler/xla/service/call_inliner.cc b/tensorflow/compiler/xla/service/call_inliner.cc index 482ccc5b67109258f544e5657ecfa0e8f62192c0..1d4214044409ae06239506e610000c839450a030 100644 --- a/tensorflow/compiler/xla/service/call_inliner.cc +++ b/tensorflow/compiler/xla/service/call_inliner.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { @@ -95,7 +96,7 @@ class SubcomputationInsertionVisitor : public DfsHloVisitorWithDefault { if (it == subcomputation_hlo_to_new_hlo_.end()) { return NotFound( "Could not find mapping from subcomputation HLO %s to a cloned HLO.", - subcomputation_hlo->ToString().c_str()); + subcomputation_hlo->ToString()); } return it->second; } @@ -151,6 +152,14 @@ StatusOr CallInliner::Run(HloModule* module) { } return Status::OK(); })); + if (did_mutate) { + // Run DCE to remove called computations which are now becoming unused. + // This can result then in problems if within the called computation, there + // were send/recv instructions, which the module group verifier will flag as + // error finding the same channel ID used for multiple send/recv + // instructions. 
+ TF_RETURN_IF_ERROR(HloDCE().Run(module).status()); + } return did_mutate; } diff --git a/tensorflow/compiler/xla/service/call_inliner.h b/tensorflow/compiler/xla/service/call_inliner.h index a8345a394d46c90a48305313dac0bcd9b06938ac..c5cd88b9ea2a9c308786d4d7476316b1e592d40a 100644 --- a/tensorflow/compiler/xla/service/call_inliner.h +++ b/tensorflow/compiler/xla/service/call_inliner.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_ #include @@ -35,11 +35,11 @@ class CallInliner : public HloPassInterface { static StatusOr Inline(HloInstruction* call); ~CallInliner() override = default; - tensorflow::StringPiece name() const override { return "CallInliner"; } + absl::string_view name() const override { return "CallInliner"; } StatusOr Run(HloModule* module) override; }; } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_ diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc index 738d00881dd057fc13c115006c15e8f5b6d14a1d..5d85a3f173d50a964420e720f5c9b416731d948c 100644 --- a/tensorflow/compiler/xla/service/call_inliner_test.cc +++ b/tensorflow/compiler/xla/service/call_inliner_test.cc @@ -18,9 +18,9 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -32,7 +32,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace op = xla::testing::opcode_matchers; @@ -48,9 +47,9 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) { // the "one" value. HloComputation::Builder inner(TestName() + ".inner"); HloInstruction* zero = inner.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(24.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(24.0f))); HloInstruction* one = inner.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); TF_ASSERT_OK(zero->AddControlDependencyTo(one)); auto module = CreateNewModule(); HloComputation* inner_computation = @@ -87,7 +86,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) { // little trickier. 
HloComputation::Builder just_false(TestName() + ".false"); just_false.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* false_computation = module->AddEmbeddedComputation(just_false.Build()); @@ -99,7 +98,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) { HloComputation::Builder outer(TestName() + ".outer"); HloInstruction* init_value = outer.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); outer.AddInstruction( HloInstruction::CreateWhile(pred, call_false, call_false, init_value)); @@ -123,9 +122,9 @@ TEST_F(CallInlinerTest, InlineWithoutRunningPass) { HloComputation::Builder just_false(TestName() + ".false"); auto* true_constant = just_false.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({true}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({true}))); auto* false_constant = just_false.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); TF_ASSERT_OK(false_constant->AddControlDependencyTo(true_constant)); HloComputation* false_computation = module->AddEmbeddedComputation(just_false.Build()); @@ -147,15 +146,17 @@ TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) { HloComputation::Builder outfeeder(TestName() + ".outfeeder"); auto value = outfeeder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + auto token = outfeeder.AddInstruction(HloInstruction::CreateToken()); outfeeder.AddInstruction( - HloInstruction::CreateOutfeed(f32, value, /*outfeed_config=*/"")); + HloInstruction::CreateOutfeed(f32, value, token, /*outfeed_config=*/"")); auto outfeed_computation = module->AddEmbeddedComputation(outfeeder.Build()); HloComputation::Builder 
outer(TestName() + ".outer"); outer.AddInstruction(HloInstruction::CreateCall( - ShapeUtil::MakeNil(), /*operands=*/{}, outfeed_computation)); + outfeed_computation->root_instruction()->shape(), /*operands=*/{}, + outfeed_computation)); module->AddEntryComputation(outer.Build()); diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc index a5b392cbc33c12c3255f3c06e9842fc116e672e5..3c2d1ae6d82ebc6c10d52194fd1cec5e291025f7 100644 --- a/tensorflow/compiler/xla/service/channel_tracker.cc +++ b/tensorflow/compiler/xla/service/channel_tracker.cc @@ -15,14 +15,14 @@ limitations under the License. #include "tensorflow/compiler/xla/service/channel_tracker.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" @@ -31,16 +31,23 @@ namespace xla { ChannelTracker::ChannelTracker() : next_channel_(1) {} -ChannelHandle ChannelTracker::NewChannel() { +StatusOr ChannelTracker::NewChannel( + ChannelHandle::ChannelType type) { + if (type != ChannelHandle::DEVICE_TO_DEVICE && + type != ChannelHandle::HOST_TO_DEVICE && + type != ChannelHandle::DEVICE_TO_HOST) { + return InvalidArgument("Invalid channel type: %d", type); + } tensorflow::mutex_lock lock(channel_mutex_); // Create a new channel handle with a unique value. - const ChannelHandle new_handle = AllocateHandle(); + ChannelHandle new_handle = AllocateHandle(type); // Register a channel object associated with the handle. 
Channel channel; channel.has_sender = false; channel.receiver_count = 0; + channel.type = type; opaque_to_channel_[new_handle.handle()] = channel; return new_handle; @@ -56,22 +63,30 @@ Status ChannelTracker::RegisterRecv(const ChannelHandle& handle) { return RegisterRecvInternal(handle); } -ChannelHandle ChannelTracker::AllocateHandle() { +ChannelHandle ChannelTracker::AllocateHandle(ChannelHandle::ChannelType type) { int64 handle_value = next_channel_++; ChannelHandle result; result.set_handle(handle_value); + result.set_type(type); return result; } Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) { if (opaque_to_channel_.count(handle.handle()) == 0) { - return NotFound("channel handle not found: %lld", handle.handle()); + return NotFound("channel handle not found: %d", handle.handle()); } Channel& channel = opaque_to_channel_[handle.handle()]; + if (channel.type == ChannelHandle::HOST_TO_DEVICE) { + return FailedPrecondition( + "host-to-device channels cannot be used with a Send operation; " + "channel handle: %d", + handle.handle()); + } + if (channel.has_sender) { return FailedPrecondition( "when registering send, passed a channel handle that is already used " - "by a sender: %lld", + "by a sender: %d", handle.handle()); } channel.has_sender = true; @@ -80,14 +95,21 @@ Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) { Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) { if (opaque_to_channel_.count(handle.handle()) == 0) { - return NotFound("channel handle not found: %lld", handle.handle()); + return NotFound("channel handle not found: %d", handle.handle()); } Channel& channel = opaque_to_channel_[handle.handle()]; + if (channel.type == ChannelHandle::DEVICE_TO_HOST) { + return FailedPrecondition( + "device-to-host channels cannot be used with a Recv operation; " + "channel handle: %d", + handle.handle()); + } + // TODO(b/33942691): Allow more than 1 receivers for broadcast. 
if (channel.receiver_count >= 1) { return FailedPrecondition( "when registering recv, passed a channel handle that is already used " - "by a receiver: %lld", + "by a receiver: %d", handle.handle()); } channel.receiver_count += 1; diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h index c7763f2ca3e68490cd0cd9b4ba4d7bd180134080..52037bf9b52556c6aa2e66dd3209e25cf085cfe3 100644 --- a/tensorflow/compiler/xla/service/channel_tracker.h +++ b/tensorflow/compiler/xla/service/channel_tracker.h @@ -18,15 +18,12 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -51,11 +48,12 @@ class ChannelTracker { struct Channel { bool has_sender; int64 receiver_count; + ChannelHandle::ChannelType type; }; // Creates a new Channel object and returns the corresponding // ChannelHandle for it. - ChannelHandle NewChannel(); + StatusOr NewChannel(ChannelHandle::ChannelType type); // Informs that the given channel handle is used for a Send operation. // Returns an error status if the handle is already used by another Send. @@ -68,7 +66,8 @@ class ChannelTracker { private: // Bumps the next_channel_ number and returns the allocated number // wrapped in a ChannelHandle. 
- ChannelHandle AllocateHandle() EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); + ChannelHandle AllocateHandle(ChannelHandle::ChannelType type) + EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); Status RegisterSendInternal(const ChannelHandle& handle) EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); diff --git a/tensorflow/compiler/xla/service/compilation_cache.cc b/tensorflow/compiler/xla/service/compilation_cache.cc deleted file mode 100644 index b16907da9e9c909d2639f83895db27d724a84a7b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/compilation_cache.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/compilation_cache.h" - -#include - -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/logging.h" - -namespace xla { - -std::shared_ptr CompilationCache::Insert( - std::unique_ptr executable, - const HloModuleConfig& module_config) { - tensorflow::mutex_lock lock(mutex_); - - CacheKey key = - BuildKey(executable->entry_computation_handle(), module_config); - VLOG(2) << "inserting cache key: " << key; - if (cache_.count(key) == 0) { - cache_.emplace(key, std::move(executable)); - } else { - // Executable already exists in the cache. 
This can happen if two Execute - // calls for a new computation are received simultaneously by the - // service. In this case, we discard the Executable given as a parameter and - // return what is in the cache. This is necessary because the service relies - // on the cache to keep ownership of the Executable. We only want to store - // one Executable for a given computation version and we can't discard the - // executable which is in the cache because it may be in use. - executable.reset(); - } - return cache_.at(key); -} - -std::shared_ptr CompilationCache::LookUp( - const VersionedComputationHandle& versioned_handle, - const HloModuleConfig& module_config) const { - tensorflow::mutex_lock lock(mutex_); - - CacheKey key = BuildKey(versioned_handle, module_config); - VLOG(2) << "looking up cache key: " << key; - if (cache_.count(key) == 0) { - VLOG(2) << "cache key not found: " << key; - return nullptr; - } else { - std::shared_ptr result = cache_.at(key); - VLOG(2) << "hit executable with module config: " - << result->module_config().compilation_cache_key(); - return result; - } -} - -CompilationCache::CacheKey CompilationCache::BuildKey( - const VersionedComputationHandle& versioned_handle, - const HloModuleConfig& module_config) const { - // The computation shape is represented entirely by its ProgramShape member, - // so just serialize the proto as part of the key. - return tensorflow::strings::StrCat(versioned_handle.handle.handle(), "::", - versioned_handle.version, "::", - module_config.compilation_cache_key()); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h deleted file mode 100644 index 09989726ae6629aa65cb1dd84c16408a75019fa5..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/compilation_cache.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ - -#include -#include -#include - -#include "tensorflow/compiler/xla/service/executable.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" - -namespace xla { - -// A cache which stores Executables indexed by computation handle and version. -class CompilationCache { - public: - CompilationCache() {} - - // Insert the given Executable into the cache. Return a bare Executable - // pointer for the caller to use. Note: the returned pointer will *not* be the - // same as the given unique pointer if the computation already exists in the - // cache. See comments in the .cc implementation for details of this case. - // - // module_config is provided by the caller, instead of being taken from the - // executable, so that we can insert keys into the compilation cache that are - // devoid of layout (where XLA gets to choose what layout to compile). 
- // - // A shared_ptr is returned so the caller can keep the Executable from being - // destructed in the event that the Executable is evicted from the - // computation cache (and the cache's shared_ptr to the Executable is - // destructed). - std::shared_ptr Insert(std::unique_ptr executable, - const HloModuleConfig& module_config); - - // Lookup the Executable for the specified versioned computation in the cache. - // Return a shared_ptr to the Executable if it exists in the cache. Return - // nullptr otherwise. - std::shared_ptr LookUp( - const VersionedComputationHandle& versioned_handle, - const HloModuleConfig& module_config) const; - - protected: - mutable tensorflow::mutex mutex_; - - // Map from versioned handle with program layout to Executable built - // for that computation version and program layout. - using CacheKey = string; - - CacheKey BuildKey(const VersionedComputationHandle& versioned_handle, - const HloModuleConfig& module_config) const; - std::map> cache_ GUARDED_BY(mutex_); - - private: - TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_ diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index d39fd7307ae1b5bd0c431f98c413011ca081050b..e5a6c28478a7ebf87878c3937069f15cafe12615 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -19,17 +19,16 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -63,8 +62,9 @@ CompileOnlyService::CompileOnlyService(const ServiceOptions& options, StatusOr>> CompileOnlyService::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options) { + const absl::Span computations, + const AotCompilationOptions& options, + std::unique_ptr* metadata) { std::vector> hlo_modules; for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_program_shape()); @@ -76,9 +76,9 @@ CompileOnlyService::CompileAheadOfTime( if (!directory_path.empty()) { HloSnapshot hlo_snapshot; *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = instance.computation; - string filename = tensorflow::strings::StrCat( - "computation_", instance.computation.id(), "__", - instance.computation.entry_computation_name()); + string filename = + absl::StrCat("computation_", instance.computation.id(), "__", + instance.computation.entry_computation_name()); const string& per_host_path = tensorflow::io::JoinPath( directory_path, tensorflow::port::Hostname()); @@ -101,59 +101,8 @@ CompileOnlyService::CompileAheadOfTime( hlo_modules.push_back(std::move(hlo_module)); } - return 
compiler_->CompileAheadOfTime(std::move(hlo_modules), options); -} - -StatusOr>> -CompileOnlyService::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options) { - std::vector> hlo_modules; - for (const AotComputationInstance& instance : computations) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(instance.computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - const DebugOptions& debug_options = options.debug_options(); - - // Dump computation proto state if flag is set. - const string& directory_path = debug_options.xla_dump_computations_to(); - if (!directory_path.empty()) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr session_module, - computation_tracker_.SnapshotComputation(versioned_handle.handle)); - string filename = tensorflow::strings::StrCat( - "computation_", versioned_handle.handle.handle(), "__", - session_module->entry().name(), "__version_", - versioned_handle.version); - const string& per_host_path = tensorflow::io::JoinPath( - directory_path, tensorflow::port::Hostname()); - - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(per_host_path, filename, - *session_module)); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - ExecutionOptions execution_options; - *execution_options.mutable_debug_options() = debug_options; - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, instance.argument_layouts, - &execution_options, user_computation)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, - computation_tracker_.BuildHloModule( - versioned_handle, *module_config, - /*include_unreachable_instructions=*/true)); - TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module)); - hlo_modules.push_back(std::move(hlo_module)); - } - - return compiler_->CompileAheadOfTime(std::move(hlo_modules), options); 
+ return compiler_->CompileAheadOfTime(std::move(hlo_modules), options, + metadata); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h index 7f2ce0e8974c01b09664235d7b9d19555b2705a3..61136a3e11fe15fb74eac257f46292c6cd24ce7d 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.h +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -38,24 +38,7 @@ class CompileOnlyService : public Service { static StatusOr> NewService( const ServiceOptions& options); - // A description of a computation to compile using CompileAheadOfTime. - struct AotComputationInstance { - ComputationHandle computation; - std::vector argument_layouts; - const Shape* result_layout = nullptr; - }; - - // Compiles a list of computations for ahead-of-time execution. This is - // intended for use in static compilation. See - // |CompileOnlyClient::CompileAheadOfTime| for additional details. - StatusOr>> - CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& Options); - // A description of a xla computation to compile using CompileAheadOfTime. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. struct AotXlaComputationInstance { HloModuleProto computation; std::vector argument_layouts; @@ -65,31 +48,21 @@ class CompileOnlyService : public Service { // Compiles a list of xla computations for ahead-of-time execution. This is // intended for use in static compilation. See // |CompileOnlyClient::CompileAheadOfTime| for additional details. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr>> CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, + const absl::Span computations, const AotCompilationOptions& options); - // Override Service methods that require or imply the existence of an - // execute backend. 
Note that this does not include TransferToClient, as - // computing constants produces global data that we may wish to transfer. - Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } - Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } + StatusOr>> + CompileAheadOfTime( + const absl::Span computations, + const AotCompilationOptions& options, + std::unique_ptr* metadata); + Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, GetDeviceHandlesResponse* result) override { return Unimplemented("CompileOnlyService does not support devices."); } - Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } Status WaitForExecution(const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) override { return Unimplemented("CompileOnlyService does not support execution."); diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 31f84e88f826921ada7db60d178cc051d90355fe..687ecafe0c308ecc22857fae650c6998677f605d 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -28,12 +28,34 @@ namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); -std::vector Compiler::ComputeBackendConfigs( - const HloInstruction& hlo, se::StreamExecutor* executor) const { +std::vector> +Compiler::ComputeBackendConfigs(const HloInstruction& hlo, + se::StreamExecutor* executor) const { CHECK(executor != nullptr); return {}; } +std::unique_ptr +Compiler::ComputeDefaultBackendConfig(const HloInstruction& hlo, + se::StreamExecutor* executor) const { + CHECK(executor != nullptr); + 
return nullptr; +} + +// Define a default version where metadata is not used. +StatusOr>> +Compiler::CompileAheadOfTime( + std::vector> modules, + const AotCompilationOptions& options, + std::unique_ptr* metadata) { + if (metadata != nullptr) { + return Unimplemented( + "Populating AotCompilationMetadata is not implemented on this " + "compiler."); + } + return CompileAheadOfTime(std::move(modules), options); +} + /* static */ std::map* Compiler::GetPlatformCompilerFactories() { static auto* r = new std::map; @@ -79,7 +101,7 @@ Compiler::GetPlatformCompilers() { return NotFound( "could not find registered compiler for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } // And then we invoke the factory, placing the result into the mapping. diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index c39db58b78f5279302f9d1d0ae7b34c1b6b61b65..1fdda31c34a17a16f75e1efada542c2c2ea15038 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -26,6 +26,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -34,8 +35,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -47,11 +48,6 @@ namespace xla { // compuation. 
using ObjectFileData = std::vector; -// Contains the buffer sizes information needed to allocate buffers to execute -// an ahead-of-time computation. Entries which contain -1 designate a parameter -// which should be skipped over during allocation. -using BufferSizes = std::vector; - // Abstract superclass describing the result of an ahead-of-time compilation. class AotCompilationResult { public: @@ -93,6 +89,19 @@ class AotCompilationOptions { DebugOptions debug_options_; }; +// Abstract superclass describing metadata produced during ahead-of-time +// compilation. +class AotCompilationMetadata { + public: + AotCompilationMetadata(const AotCompilationMetadata&) = delete; + AotCompilationMetadata& operator=(AotCompilationMetadata const&) = delete; + + virtual ~AotCompilationMetadata() = default; + + protected: + AotCompilationMetadata() = default; +}; + // Abstract compiler interface that is subclassed for compilation on a // particular platform. // @@ -161,8 +170,19 @@ class Compiler { // // The stream executor is passed in to provide information about the hardware // that the backend configurations would be targeting. - virtual std::vector ComputeBackendConfigs( - const HloInstruction& hlo, se::StreamExecutor* executor) const; + virtual std::vector> + ComputeBackendConfigs(const HloInstruction& hlo, + se::StreamExecutor* executor) const; + + // Returns the backend configuration that the backend chooses by default for + // the given HLO. Returns no configuration if the backend does not support + // configurations for the given HLO. + // + // The stream executor is passed in to provide information about the hardware + // that the backend configurations would be targeting. + virtual std::unique_ptr + ComputeDefaultBackendConfig(const HloInstruction& hlo, + se::StreamExecutor* executor) const; // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. 
@@ -170,6 +190,13 @@ class Compiler { CompileAheadOfTime(std::vector> modules, const AotCompilationOptions& options) = 0; + // Similar to CompileAheadOfTime above but AotCompilationMetadata + // has an argument that can be populated during compilation. + virtual StatusOr>> + CompileAheadOfTime(std::vector> modules, + const AotCompilationOptions& options, + std::unique_ptr* metadata); + ///// // The Compiler class also serves as a point to register compiler objects // for the various platforms. diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc index cb61f3da39fb8eef69fd81066d87a1da91a62935..af8f7f1027a40703137d6880a9865449c560a47b 100644 --- a/tensorflow/compiler/xla/service/computation_layout.cc +++ b/tensorflow/compiler/xla/service/computation_layout.cc @@ -17,9 +17,9 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { @@ -52,9 +52,8 @@ string ComputationLayout::ToString() const { for (auto& param_layout : parameter_layouts_) { params.push_back(param_layout.ToString()); } - return tensorflow::strings::StrCat("(", - tensorflow::str_util::Join(params, ", "), - ") => ", result_layout_.ToString()); + return absl::StrCat("(", absl::StrJoin(params, ", "), ") => ", + result_layout_.ToString()); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h index 53c3a3f7b738687db3098acfaef1ae87860d0440..6975f387b4864bf28ea0ad23d7d4602b5b346e08 100644 --- a/tensorflow/compiler/xla/service/computation_layout.h +++ b/tensorflow/compiler/xla/service/computation_layout.h @@ -32,12 +32,21 @@ namespace xla { // mutable layouts. 
class ComputationLayout { public: + // Creates a new ComputationLayout with the given result layout. + explicit ComputationLayout(ShapeLayout result_layout) + : result_layout_(std::move(result_layout)) {} + // Constructs a ComputationLayout from a ProgramShape. The layouts of the // parameters and results are set to the default layout. Layouts in the // ProgramShape are ignored if ignore_layouts is true. explicit ComputationLayout(const ProgramShape& program_shape, bool ignore_layouts = true); + // Adds a new parameter layout to the computation layout. + void add_parameter_layout(ShapeLayout shape_layout) { + parameter_layouts_.push_back(std::move(shape_layout)); + } + // Returns the layout of a particular parameter. const ShapeLayout& parameter_layout(int64 param_no) const { return parameter_layouts_[param_no]; diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc index 7c1bacff92b231661477b9931a3066fd91110445..2210a8578ad73efb27dc9c230b142c55228d2af5 100644 --- a/tensorflow/compiler/xla/service/computation_placer.cc +++ b/tensorflow/compiler/xla/service/computation_placer.cc @@ -19,8 +19,9 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -32,6 +33,9 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +using absl::StrAppend; +using absl::StrCat; + namespace xla { Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const { @@ -56,8 +60,8 @@ DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) { "computation_count=%d", proto.replica_count(), proto.computation_count()); } - auto assignment = MakeUnique(proto.replica_count(), - proto.computation_count()); + auto assignment = absl::make_unique( + proto.replica_count(), proto.computation_count()); for (int computation = 0; computation < proto.computation_count(); ++computation) { const auto& computation_device = proto.computation_devices(computation); @@ -71,6 +75,19 @@ DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) { return std::move(assignment); } +string DeviceAssignment::ToString() const { + string output = StrCat("Computations: ", computation_count(), + " Replicas: ", replica_count(), "\n"); + for (int computation = 0; computation < computation_count(); ++computation) { + StrAppend(&output, "Computation ", computation, ": "); + for (int replica = 0; replica < replica_count(); ++replica) { + StrAppend(&output, operator()(replica, computation), " "); + } + StrAppend(&output, "\n"); + } + return output; +} + StatusOr ComputationPlacer::DeviceId(int replica, int computation, int replica_count, int computation_count) { @@ -115,7 +132,7 @@ StatusOr ComputationPlacer::AssignDevices( return NotFound( "could not find registered computation placer for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } if (it->second.placer == nullptr) { @@ -139,7 +156,7 @@ ComputationPlacer::GetPlatformComputationPlacers() { } // namespace xla static std::unique_ptr CreateComputationPlacer() { - return xla::MakeUnique(); + return absl::make_unique(); } static bool InitModule() { diff --git a/tensorflow/compiler/xla/service/computation_placer.h 
b/tensorflow/compiler/xla/service/computation_placer.h index 737d00e93ecb51a9bd544bbcbe99d93374d108fb..c899ffb9dc562426ef14c0d414469c04debeec70 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -55,6 +55,8 @@ class DeviceAssignment : public Array2D { // due to a StatusOr of an incomplete type (DeviceAssignment). static StatusOr> Deserialize( const DeviceAssignmentProto& proto); + + string ToString() const; }; // A generic implementation of the XLA computation placer, which assigns device diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc deleted file mode 100644 index 70e25eebdb068db893e24aec0f72d09090ac7027..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/computation_tracker.cc +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/computation_tracker.h" - -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/logging.h" - -using ::tensorflow::strings::Appendf; - -namespace xla { - -ComputationTracker::ComputationTracker() : next_computation_(1) {} - -ComputationHandle ComputationTracker::NewComputation( - const string& computation_name) { - tensorflow::mutex_lock lock(computation_mutex_); - ComputationHandle computation_handle; - int64 handle_value = next_computation_++; - computation_handle.set_handle(handle_value); - opaque_to_computation_[handle_value] = - MakeUnique(computation_name, computation_handle); - return computation_handle; -} - -StatusOr ComputationTracker::LoadSessionModule( - const SessionModule& session_module) { - tensorflow::mutex_lock lock(computation_mutex_); - - // For each embedded computation, create a new computation based on its - // serialized data, and place the mapping from the old computation handle to - // the new computation handle. - - // Build a mapping from old embedded computation handles to new computation - // handles. We build the ID mapping first since the embedded computations are - // in no particular order and may refer to each other. 
- std::map old_to_new; - for (const SessionComputation& computation : - session_module.embedded_computations()) { - const int64 old_handle = computation.computation_handle().handle(); - if (!old_to_new.emplace(old_handle, AllocateHandle()).second) { - return InvalidArgument("Duplicate embedded computation handle %lld", - old_handle); - } - } - - // Create a new computation from each serialized embedded computation. - for (const SessionComputation& computation : - session_module.embedded_computations()) { - const int64 old_handle = computation.computation_handle().handle(); - const ComputationHandle& new_handle = old_to_new[old_handle]; - TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()], - UserComputation::MakeWithRemapping( - computation, new_handle, old_to_new)); - } - - // Finally, place the entry computation in the tracker with all of the - // remappings populated from the above. - const int64 old_handle = session_module.entry().computation_handle().handle(); - TF_ASSIGN_OR_RETURN( - old_to_new[old_handle], - LoadSessionComputation(session_module.entry(), &old_to_new)); - return old_to_new[old_handle]; -} - -StatusOr> -ComputationTracker::SnapshotComputation(const ComputationHandle& computation) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, Resolve(computation)); - const VersionedComputationHandle entry_versioned_handle = - user_computation->GetVersionedHandle(); - std::set visited; - std::list post_order; - { - tensorflow::mutex_lock lock(computation_mutex_); - ComputeComputationPostOrder(entry_versioned_handle, &visited, &post_order); - } - auto session_module = MakeUnique(); - *session_module->mutable_entry() = - Resolve(entry_versioned_handle.handle) - .ValueOrDie() - ->CloneSessionComputation(entry_versioned_handle.version); - for (auto it = ++post_order.rbegin(); it != post_order.rend(); ++it) { - *session_module->add_embedded_computations() = - Resolve(it->handle).ValueOrDie()->CloneSessionComputation(it->version); - } - 
return std::move(session_module); -} - -StatusOr ComputationTracker::Resolve( - const ComputationHandle& computation) const { - tensorflow::mutex_lock lock(computation_mutex_); - return ResolveInternal(computation); -} - -ComputationHandle ComputationTracker::AllocateHandle() { - int64 handle_value = next_computation_++; - ComputationHandle result; - result.set_handle(handle_value); - return result; -} - -StatusOr ComputationTracker::LoadSessionComputation( - const SessionComputation& session_computation, - std::map* old_to_new) { - TF_RET_CHECK(old_to_new != nullptr); - const ComputationHandle new_handle = AllocateHandle(); - (*old_to_new)[session_computation.computation_handle().handle()] = new_handle; - TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()], - UserComputation::MakeWithRemapping( - session_computation, new_handle, *old_to_new)); - return new_handle; -} - -StatusOr ComputationTracker::ResolveInternal( - const ComputationHandle& computation) const { - auto it = opaque_to_computation_.find(computation.handle()); - if (it == opaque_to_computation_.end()) { - return NotFound("computation handle not found: %lld", computation.handle()); - } - UserComputation* user_computation = it->second.get(); - return user_computation; -} - -void ComputationTracker::ComputeComputationPostOrder( - const VersionedComputationHandle& versioned_handle, - std::set* visited, - std::list* post_order) const { - if (visited->count(versioned_handle) > 0) { - CHECK_EQ(1, visited->count(versioned_handle)); - return; - } - - UserComputation* computation = - ResolveInternal(versioned_handle.handle).ValueOrDie(); - std::vector embedded_handles = - computation->GetEmbeddedComputations(versioned_handle.version); - - for (const auto& embedded_handle : embedded_handles) { - ComputeComputationPostOrder(embedded_handle, visited, post_order); - } - - visited->insert(versioned_handle); - post_order->push_back(versioned_handle); -} - -StatusOr> ComputationTracker::BuildHloModule( - 
const VersionedComputationHandle& entry_handle, - const HloModuleConfig& config, - bool include_unreachable_instructions) const { - tensorflow::mutex_lock lock(computation_mutex_); - - VLOG(1) << "BuildHloModule(" << entry_handle - << ", include_unreachable_instructions=" - << include_unreachable_instructions << ")"; - XLA_VLOG_LINES(1, ToStringInternal()); - - TF_ASSIGN_OR_RETURN(UserComputation * entry_computation, - ResolveInternal(entry_handle.handle)); - - // Build a topological sort of the entry and any embedded computations as a - // list. The root of the computation will be the last element in the list. - std::set visited; - std::list post_order; - ComputeComputationPostOrder(entry_handle, &visited, &post_order); - - // Map from ComputationHandle value and computation version to HloComputation. - std::map hlo_computations; - - // The resolver lambda resolves VersionedHandles to embedded - // HloComputation*. This is required by UserComputation::BuildHloComputation - // when lowering calling operations (map, reduce etc). - auto resolver = [&hlo_computations]( - const VersionedComputationHandle& versioned_handle) -> HloComputation* { - CHECK_GT(hlo_computations.count(versioned_handle), 0); - return hlo_computations.at(versioned_handle); - }; - - // Print the post-order list for this entry computation. 
- if (VLOG_IS_ON(2)) { - VLOG(2) << "Visiting UserComputations in post order:"; - for (const VersionedComputationHandle& versioned_handle : post_order) { - VLOG(2) << " " << versioned_handle; - } - } - - string module_name = - tensorflow::strings::StrCat(entry_computation->name(), "_module"); - auto module = MakeUnique(module_name, entry_handle, config); - for (auto versioned_handle : post_order) { - UserComputation* computation = - ResolveInternal(versioned_handle.handle).ValueOrDie(); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_computation, - computation->BuildHloComputation(versioned_handle.version, resolver, - config.debug_options(), - include_unreachable_instructions)); - - // Add the newly created computation to VersionedHandle-to-HloComputation - // map. - DCHECK_EQ(0, hlo_computations.count(versioned_handle)); - hlo_computations[versioned_handle] = hlo_computation.get(); - - if (computation == entry_computation) { - module->AddEntryComputation(std::move(hlo_computation)); - } else { - module->AddEmbeddedComputation(std::move(hlo_computation)); - } - } - - return std::move(module); -} - -string ComputationTracker::ToString() const { - tensorflow::mutex_lock lock(computation_mutex_); - return ToStringInternal(); -} - -string ComputationTracker::ToStringInternal() const { - string out; - Appendf(&out, "ComputationTracker(%p):\n", this); - for (const auto& handle_computation : opaque_to_computation_) { - int64 handle = handle_computation.first; - const std::unique_ptr& computation = - handle_computation.second; - Appendf(&out, " %4lld : %s \"%s\"\n", handle, - computation->GetVersionedHandle().ToString().c_str(), - computation->name().c_str()); - } - return out; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/computation_tracker.h b/tensorflow/compiler/xla/service/computation_tracker.h deleted file mode 100644 index d42d66adefe7faa2751da4cd80b392a38917ce70..0000000000000000000000000000000000000000 --- 
a/tensorflow/compiler/xla/service/computation_tracker.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -// Tracks computations for the XLA service; computations can be registered -// with a UserComputation instance and can be resolved from a handle for later -// use. -// -// This class is also capable of serializing/deserializing computations that it -// tracks (and to serialize properly you need to serialize all referred-to -// computations as well). 
-class ComputationTracker { - public: - ComputationTracker(); - - // Creates a new UserComputation object and returns the corresponding - // ComputationHandle for it. - // - // Precondition: user_computation is not already present in the map. - ComputationHandle NewComputation(const string& computation_name); - - // Restores session data for a computation that has been serialized, and - // allocates a new computation handle for it. - StatusOr LoadSessionModule( - const SessionModule& session_module); - - // Snapshots a computation (referenced by the provided handle) at its latest - // version, returning a module where it is the entry, and any referred-to - // computations are entrained as "embedded" (non-entry) computations. - StatusOr> SnapshotComputation( - const ComputationHandle& computation); - - // Resolves a ComputationHandle to a UserComputation that is present in the - // map. - StatusOr Resolve( - const ComputationHandle& computation) const; - - // Builds an HLO module using the specified computation as the entry. The - // module will include the entry computation as well as all computations which - // are called directly or indirectly from the entry computation via operations - // like "map". config is the HLO module configuration to use for the - // constructed module. - // If include_unreachable_instructions is true, then instructions - // which are not reachable from the root are lowered into HloInstructions - // including unreachable parameters. This ensures the entry HloComputation has - // the same program shape (ProgramShape) as the entry UserComputation. - StatusOr> BuildHloModule( - const VersionedComputationHandle& entry_handle, - const HloModuleConfig& config, - bool include_unreachable_instructions = true) const; - - string ToString() const; - - private: - // Bumps the next_computation_ number and returns the allocated number wrapped - // in a ComputationHandle. 
- ComputationHandle AllocateHandle() - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Loads a session computation into a UserComputation, registers it, and - // returns the computation handle of the registered computation. If old_to_new - // is provided, it is used for remapping references to computations present in - // session_computation. - // - // old_to_new will be updated with the mapping from session_computation's old - // handle to the returned handle value, and may not be null. - StatusOr LoadSessionComputation( - const SessionComputation& session_computation, - std::map* old_to_new) - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Internal implementation of Resolve method which requires, but does not - // acquire the mutex. - StatusOr ResolveInternal( - const ComputationHandle& computation) const - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Builds a post order sort of a computation ("entry") and all of its embedded - // computations including all transitively embedded computations. An embedded - // computation (the callee) will always appear in the sort before the - // computation which calls the embedded computation (the caller). Necessarily, - // the entry computation is the last element in the sort. visited and - // post_order should be empty when calling. post_order contains the post order - // sort when the function return. - void ComputeComputationPostOrder( - const VersionedComputationHandle& versioned_handle, - std::set* visited, - std::list* post_order) const - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - string ToStringInternal() const EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Guards the computation mapping. Marked mutable so that the Resolve method - // can remain const; Resolve does't really modify the tracker in any way, but - // it has to lock the mutex for safety. 
- mutable tensorflow::mutex computation_mutex_; - - // The next sequence number to assign to a computation, guarded by the same - // mutex as the mapping as they'll be mutated at the same time. - int64 next_computation_ GUARDED_BY(computation_mutex_); - - // Mapping from ComputationHandle value to the corresponding registered - // UserComputation object. - std::map> opaque_to_computation_ - GUARDED_BY(computation_mutex_); - - TF_DISALLOW_COPY_AND_ASSIGN(ComputationTracker); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index e9ec796121fff223474c3e81a5e973cc37f8caec..4ea3a13f2835c5fef99c274f14d7d683c9ff5fc8 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -19,7 +19,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -28,8 +29,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.h b/tensorflow/compiler/xla/service/conditional_simplifier.h index 063261e26d06e21a297e8e3c405898a17221b7ca..3de50cbd7ff752e8722a103b68f75144c6c889cd 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.h +++ b/tensorflow/compiler/xla/service/conditional_simplifier.h @@ -16,10 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_ +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/core/stringpiece.h" namespace xla { @@ -27,9 +27,7 @@ namespace xla { // with their true or false computation as appropriate. 
class ConditionalSimplifier : public HloPassInterface { public: - tensorflow::StringPiece name() const override { - return "simplify-conditional"; - } + absl::string_view name() const override { return "simplify-conditional"; } StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc index 868348547d9f5cbdc7576c7fc0697d72c3a3e557..c43a31b167d47af3c92ed35fa52594fa5da1e4af 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc @@ -55,7 +55,7 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) { true_computation_builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(S32, {}), "param")); auto one = true_computation_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); true_computation_builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param, one)); @@ -73,7 +73,7 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) { HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(S32, {}), "param")); auto forty_two = false_computation_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); false_computation_builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param, forty_two)); @@ -82,11 +82,11 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) { } auto false_instrn = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto false_param = 
builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(S32, {}), "false_param")); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); builder.AddInstruction(HloInstruction::CreateConditional( ShapeUtil::MakeShape(S32, {}), false_instrn, one, true_computation, @@ -106,7 +106,7 @@ TEST_F(ConditionalSimplifierTest, ConditionalWithControlDependency) { HloComputation* computation = MakeConditional(&module()); auto* true_op = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); TF_ASSERT_OK( true_op->AddControlDependencyTo(computation->root_instruction())); @@ -119,10 +119,11 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) { ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); auto* true_computation = conditional->true_computation(); + auto* token = true_computation->AddInstruction(HloInstruction::CreateToken()); auto* send = true_computation->AddInstruction(HloInstruction::CreateSend( true_computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))), - /*channel_id=*/0)); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))), + token, /*channel_id=*/0)); true_computation->AddInstruction(HloInstruction::CreateSendDone(send)); EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); } @@ -133,8 +134,9 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) { ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); auto* true_computation = conditional->true_computation(); + auto* token = true_computation->AddInstruction(HloInstruction::CreateToken()); auto* recv = true_computation->AddInstruction(HloInstruction::CreateRecv( - ShapeUtil::MakeShape(F32, {1}), /*channel_id=*/0)); + ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0)); 
true_computation->AddInstruction(HloInstruction::CreateRecvDone(recv)); EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); } @@ -144,8 +146,9 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) { auto* conditional = computation->root_instruction(); ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional); auto* false_computation = conditional->false_computation(); - false_computation->AddInstruction( - HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config")); + auto token = false_computation->AddInstruction(HloInstruction::CreateToken()); + false_computation->AddInstruction(HloInstruction::CreateInfeed( + ShapeUtil::MakeShape(F32, {1}), token, "config")); EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie()); } diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c81a86bbb9dc7078237fe200f510a4905cb4d8d --- /dev/null +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc @@ -0,0 +1,249 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h" + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +namespace { + +// ConvolutionVisitor traverses the HLO computation and rewrites Convolution +// operations with feature_group_count > 1 into convolutions with +// feature_group_count = 1. +class ConvolutionVisitor : public DfsHloVisitorWithDefault { + public: + // Default visitor action is to do nothing and return OK. + Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { + return Status::OK(); + } + + Status HandleConvolution(HloInstruction* convolution) override; + + // Runs the visitor on a computation. + static bool Run(HloComputation* computation); + + // Returns whether any convolution ops were rewritten. + const bool changed() const { return changed_; } + + ~ConvolutionVisitor() override = default; + + private: + explicit ConvolutionVisitor(HloComputation* computation) + : computation_(computation) {} + + // Current HloComputation instance the ConvolutionVisitor is traversing. + HloComputation* computation_; + + // Whether rewrite has occurred. 
+ bool changed_ = false; +}; + +bool ConvolutionVisitor::Run(HloComputation* computation) { + ConvolutionVisitor visitor(computation); + TF_CHECK_OK(computation->Accept(&visitor)); + return visitor.changed_; +} + +Shape ExpandedFilterShape(const Shape& shape, int64 group_count, + int64 input_feature_dim) { + int64 num_dims = shape.dimensions_size(); + CHECK_GE(num_dims, 2); + Shape expanded_shape = shape; + expanded_shape.set_dimensions( + input_feature_dim, shape.dimensions(input_feature_dim) * group_count); + return expanded_shape; +} + +// Returns a vector with 'group_count' many groups, where the i-th group +// consists of 'group_size' times the value i. +std::vector GetMaskIds(int64 group_size, int64 group_count) { + std::vector values; + for (int i = 0; i < group_count; ++i) { + for (int j = 0; j < group_size; ++j) { + values.push_back(i); + } + } + return values; +} + +// Create a mask for grouped convolution that will make a normal convolution +// produce the same results as a grouped convolution. For a [2, 1, 6] +// filter this returns a [2, 3, 6] mask +// 1 1 0 0 0 0 +// 0 0 1 1 0 0 +// 0 0 0 0 1 1 +// +// 1 1 0 0 0 0 +// 0 0 1 1 0 0 +// 0 0 0 0 1 1 +// +// The first step is to create a rank 1 constant: +// 0 1 2 +// +// This is broadcasted to +// 0 0 0 0 0 0 +// 1 1 1 1 1 1 +// 2 2 2 2 2 2 +// +// 0 0 0 0 0 0 +// 1 1 1 1 1 1 +// 2 2 2 2 2 2 +// +// Then we create another rank 1 constant +// 0 0 1 1 2 2 +// +// This is broadcasted to +// 0 0 1 1 2 2 +// 0 0 1 1 2 2 +// 0 0 1 1 2 2 +// +// 0 0 1 1 2 2 +// 0 0 1 1 2 2 +// 0 0 1 1 2 2 +// +// Finally we use the Eq op of these two broadcasted constants and get the +// desired mask. 
+HloInstruction* GetExpandedFilterMask( + const Shape& filter_shape, int64 input_feature_dim, + int64 output_feature_dim, int64 group_count, + const std::function)>& + add_instruction) { + Shape expanded_filter_shape = + ExpandedFilterShape(filter_shape, group_count, input_feature_dim); + Shape mask_shape = ShapeUtil::MakeShape( + S32, AsInt64Slice(expanded_filter_shape.dimensions())); + int64 output_feature = filter_shape.dimensions(output_feature_dim); + int64 group_size = filter_shape.dimensions(input_feature_dim); + + // Create a 'input_feature' sized linspace and 'output_feature' sized linspace + // that will be broadcasted into perpendicular dimensions and compared. + const std::vector input_feature_filter_mask = + GetMaskIds(group_size, group_count); + const std::vector output_feature_filter_mask = + GetMaskIds(output_feature / group_count, group_count); + + auto mask1 = add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(input_feature_filter_mask))); + auto broadcasted_mask1 = add_instruction( + HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim})); + auto mask2 = add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(output_feature_filter_mask))); + auto broadcasted_mask2 = add_instruction( + HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim})); + + // Compare the broadcasted output feature linspace to the input feature + // linspace to create a diagonal predicate. 
+ Shape predicate_shape = ShapeUtil::MakeShape( + PRED, AsInt64Slice(expanded_filter_shape.dimensions())); + return add_instruction(HloInstruction::CreateBinary( + predicate_shape, HloOpcode::kEq, broadcasted_mask1, broadcasted_mask2)); +} + +Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { + int64 group_count = convolution->feature_group_count(); + if (group_count == 1) { + return Status::OK(); + } + auto filter = convolution->mutable_operand(1); + changed_ = true; + auto add = [&](std::unique_ptr inst) { + return computation_->AddInstruction(std::move(inst)); + }; + + auto dim_numbers = convolution->convolution_dimension_numbers(); + int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension(); + int64 group_size = filter->shape().dimensions(input_feature_dim); + int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension(); + auto expanded_filter_shape = + ExpandedFilterShape(filter->shape(), group_count, input_feature_dim); + HloInstruction* filter_mask = GetExpandedFilterMask( + filter->shape(), input_feature_dim, output_feature_dim, group_count, add); + HloInstruction* expanded_filter; + // We want to repeat 'filter' in the 'input_feature_dim' dimension + // 'group_count' times. + if (group_size == 1) { + Shape reshaped_filter_shape = + ShapeUtil::DeleteDimension(input_feature_dim, filter->shape()); + auto reshaped_filter = + add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); + std::vector broadcast_dims; + for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) { + if (i == input_feature_dim) { + continue; + } + broadcast_dims.push_back(i); + } + expanded_filter = add(HloInstruction::CreateBroadcast( + expanded_filter_shape, reshaped_filter, broadcast_dims)); + } else { + // We could possibly also use reshape, broadcast, reshape instead of concat + // here, but it would require more complex code, and for depthwise + // convolution we would never end up in this branch. 
+ std::vector concat_operands(group_count, filter); + expanded_filter = add(HloInstruction::CreateConcatenate( + expanded_filter_shape, concat_operands, input_feature_dim)); + } + auto zero = add(HloInstruction::CreateConstant(absl::make_unique( + LiteralUtil::Zero(expanded_filter_shape.element_type())))); + auto zero_filter = + add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {})); + auto new_filter = add( + HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect, + filter_mask, expanded_filter, zero_filter)); + auto new_convolution = HloInstruction::CreateConvolve( + convolution->shape(), convolution->mutable_operand(0), new_filter, + convolution->window(), dim_numbers, /*feature_group_count=*/1); + new_convolution->set_precision_config(convolution->precision_config()); + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(new_convolution))); + return Status::OK(); +} + +} // namespace + +StatusOr ConvolutionFeatureGroupConverter::Run(HloModule* module) { + XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), before:\n" + + module->ToString()); + bool changed = false; + for (auto* comp : module->MakeNonfusionComputations()) { + if (ConvolutionVisitor::Run(comp)) { + changed = true; + } + } + XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), after:\n" + + module->ToString()); + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..498894737fa37a6d8cca6ead2a86c72eb84ababd --- /dev/null +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h @@ -0,0 +1,43 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { + +// A pass which rewrites convolutions with feature_group_count > 1 into +// convolutions with feature_group_count = 1. +class ConvolutionFeatureGroupConverter : public HloPassInterface { + public: + ConvolutionFeatureGroupConverter() {} + + absl::string_view name() const override { + return "convolution-feature-group-converter"; + } + + // Run convolution rewriting on the given module. Returns whether the + // module was changed. + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_ diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..28373ebf636c7b6b3059dcf6cd931901ebc87fc2 --- /dev/null +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc @@ -0,0 +1,100 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h" + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace { + +using ConvolutionFeatureGroupConverterTest = HloTestBase; +namespace op = testing::opcode_matchers; + +TEST_F(ConvolutionFeatureGroupConverterTest, + ConvertFeatureGroupCountEqualToInputFeatureDim) { + string hlo_string = R"(HloModule Convolve1D1Window_0_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2,2] { + %input = f32[1,2,2]{2,1,0} parameter(0) + %copy = f32[1,2,2]{2,0,1} copy(f32[1,2,2]{2,1,0} %input) + %filter = f32[1,1,2]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,2]{2,0,1} %copy, f32[1,1,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2 +})"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + auto computation = 
module->entry_computation(); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); + ConvolutionFeatureGroupConverter converter; + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + root = computation->root_instruction(); + // Make sure the convolution is converted to one with feature_group_count = 1. + EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); + EXPECT_EQ(root->feature_group_count(), 1); + // Verify that the filter operand has been replaced. + EXPECT_THAT(root->operand(1), + op::Select(op::Eq(op::Broadcast(op::Constant()), + op::Broadcast(op::Constant())), + op::Broadcast(op::Reshape(op::Parameter())), + op::Broadcast(op::Constant()))); +} + +TEST_F(ConvolutionFeatureGroupConverterTest, + ConvertFeatureGroupCountDivisorOfInputFeatureDim) { + string hlo_string = R"(HloModule Convolve1D1Window_0_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2,2] { + %input = f32[1,2,4]{2,1,0} parameter(0) + %copy = f32[1,2,4]{2,0,1} copy(f32[1,2,4]{2,1,0} %input) + %filter = f32[1,2,2]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,4]{2,0,1} %copy, f32[1,2,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2 +})"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + auto computation = module->entry_computation(); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); + ConvolutionFeatureGroupConverter converter; + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + root = computation->root_instruction(); + // Make sure the convolution is converted to one with feature_group_count = 1. + EXPECT_EQ(root->opcode(), HloOpcode::kConvolution); + EXPECT_EQ(root->feature_group_count(), 1); + // Verify that the filter operand has been replaced. 
+ EXPECT_THAT(root->operand(1), + op::Select(op::Eq(op::Broadcast(op::Constant()), + op::Broadcast(op::Constant())), + // We expect to see Concatenate here instead of + // Broadcast, because feature_group_count < input + // feature dimension. + op::Concatenate(op::Parameter(), op::Parameter()), + op::Broadcast(op::Constant()))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 33d8338809d4e8c7c4774f062c3dda5494543ca6..b65dfef9c9575b683b2656af2ccc151d87db2cd7 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/copy_insertion.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" @@ -31,18 +33,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { - -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; - namespace { +using absl::StrAppend; + bool IsEntryParameterValue(const HloValue& value) { const HloComputation* computation = value.defining_instruction()->parent(); return value.defining_instruction()->opcode() == HloOpcode::kParameter && @@ -76,15 +73,6 @@ SpecialCaseCopyPolicy GetSpecialCaseCopyPolicy(const CallGraphNode& node, policy.copy_parameters_and_constants = true; policy.copy_root_replicated_buffers = true; } - for (const CallSite& site : node.caller_callsites()) { - // The AddCopiesForConditional() already adds copies, but the copy remover - // removes them, so we re-add them by returning the policy here. But really - // the copy remover should not be removing them. - if (site.instruction()->opcode() == HloOpcode::kConditional) { - policy.copy_parameters_and_constants = true; - policy.copy_root_replicated_buffers = true; - } - } return policy; } @@ -321,7 +309,7 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis, return Status::OK(); } -// We add copies for all the indices of the true and false computaiton roots, +// We add copies for all the indices of the true and false computation roots, // in order to resolve interference. We later rely on the CopyRemover to drop // the unnecessary ones. Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis, @@ -360,26 +348,6 @@ Status StripControlDependenciesFrom(HloInstruction* instruction) { return Status::OK(); } -// Add kCopy instructions to the given module to guarantee there is no -// live-range interference. 
Generally interference can only occur around kWhile -// instructions which have update-in-place semantics. -Status AddCopiesToResolveInterference(HloModule* module) { - TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, - HloAliasAnalysis::Run(module)); - - for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kWhile) { - TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction)); - } else if (instruction->opcode() == HloOpcode::kConditional) { - TF_RETURN_IF_ERROR( - AddCopiesForConditional(*alias_analysis, instruction)); - } - } - } - return Status::OK(); -} - // Class for removing unnecessary copies from the module. // // kCopy instructions are added conservatively to guarantee no live range @@ -410,7 +378,7 @@ class CopyRemover { } string ToString() const { - string out = StrCat("CopyRemover, module ", module_->name(), "\n"); + string out = absl::StrCat("CopyRemover, module ", module_->name(), "\n"); StrAppend(&out, " Buffer values, in dependency order:\n"); for (const HloBuffer& buffer : alias_analysis_.buffers()) { StrAppend(&out, " HloBuffer ", buffer.id(), ":\n"); @@ -472,6 +440,10 @@ class CopyRemover { // between copies added around aliased operations (kWhile) guarantees // this strict order. for (const HloValue* value_a : buffer.values()) { + if (ShapeUtil::IsToken(value_a->shape())) { + // Token values have no representation and cannot interfere. + continue; + } for (const HloValue* value_b : buffer.values()) { if (value_a != value_b) { DCHECK(ordering_.LiveRangeStrictlyBefore(*value_a, *value_b, @@ -507,7 +479,7 @@ class CopyRemover { // 'values' an entry is created in value_to_node which indicates the // respective ValueNode representing that value. 
void AddValueList( - tensorflow::gtl::ArraySlice values, + absl::Span values, tensorflow::gtl::FlatMap* value_to_node) { ValueNode* tail = nullptr; ValueNode* head = nullptr; @@ -613,7 +585,10 @@ class CopyRemover { VLOG(2) << copy->name() << " is not removable"; return false; } - + if (!ShapeUtil::Equal(copy->shape(), copy->operand(0)->shape())) { + VLOG(2) << copy->name() << " is not removable (shape mismatch)"; + return false; + } const CopyNodes& copy_node = copy_map_.at(copy); ValueNode* src = copy_node.src; ValueNode* dest = copy_node.dest; @@ -670,7 +645,12 @@ class CopyRemover { // We can only perform copy elision if the resulting merged values have // totally ordered live ranges; otherwise the merged buffer would have // live range interference. - if (IsHead(*dest)) { + if (src->next == dest) { + // In the process of eliding copies, it's possible for a copy to have the + // same source and destination buffer. In this case, the copy can be + // safely removed. + VLOG(2) << copy->name() << " source and destination buffers are same."; + } else if (IsHead(*dest)) { // The copy copies an arbitrary value in the source buffer (call it s_x) // and defines d_0, the first value in the destination buffer.
After // merging, the values in the combined buffer must be strictly ordered @@ -880,16 +860,16 @@ class CopyRemover { for (const ValueNode* p = head; p != nullptr; p = Next(*p)) { values.push_back(p->value); } - return StrCat("{", - Join(values, ", ", - [](string* s, const HloValue* value) { - StrAppend(s, value->ToShortString()); - }), - "}"); + return absl::StrCat("{", + absl::StrJoin(values, ", ", + [](string* s, const HloValue* value) { + StrAppend(s, value->ToShortString()); + }), + "}"); } string ToString() const { - string out = StrCat("BufferValueTracker:\n"); + string out = absl::StrCat("BufferValueTracker:\n"); StrAppend(&out, " Def-use chains in each buffer:\n"); for (const ValueNode* head : value_lists_) { StrAppend(&out, " Buffer defined by ", head->value->ToShortString(), @@ -897,10 +877,10 @@ class CopyRemover { const ValueNode* p = head; do { StrAppend(&out, " ", p->value->ToShortString(), ", uses: ", - Join(p->uses, "; ", - [](string* s, const HloUse* use) { - StrAppend(s, use->ToString()); - }), + absl::StrJoin(p->uses, "; ", + [](string* s, const HloUse* use) { + StrAppend(s, use->ToString()); + }), "\n"); p = p->next; @@ -947,41 +927,45 @@ class CopyRemover { BufferValueTracker buffer_value_tracker_; }; -// Try to remove as many copies from the module as possible without introducing -// live range interference. Copy instructions (identified by their unique id) in -// the set copies_to_exclude are not considered for removal. -Status RemoveUnnecessaryCopies( - const HloOrdering& ordering, - const tensorflow::gtl::FlatSet& copies_to_exclude, HloModule* module) { +void MaybeDumpModule(const string& message, const HloModule& module) { + if (VLOG_IS_ON(3)) { + VLOG(3) << message; + XLA_VLOG_LINES(3, module.ToString()); + hlo_graph_dumper::MaybeDumpHloModule(module, message); + } +} + +} // namespace + +// Add kCopy instructions to the given module to guarantee there is no +// live-range interference. 
Generally interference can only occur around kWhile +// instructions which have update-in-place semantics. +Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, - HloAliasAnalysis::Run(module)); - CopyRemover copy_remover(*alias_analysis, ordering, module); - XLA_VLOG_LINES(3, copy_remover.ToString()); + HloAliasAnalysis::Run(module, fusion_can_share_buffer_)); for (HloComputation* computation : module->computations()) { for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCopy && - !ContainsKey(copies_to_exclude, instruction->unique_id())) { - TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status()); + if (instruction->opcode() == HloOpcode::kWhile) { + TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction)); + } else if (instruction->opcode() == HloOpcode::kConditional) { + TF_RETURN_IF_ERROR( + AddCopiesForConditional(*alias_analysis, instruction)); } } } return Status::OK(); } -// Add copies to address special constraints on the roots of computations not -// related to live range interference: -// -// (1) Entry computation root must be unambiguous and distinct. -// -// (2) Any computation called by a kCall instruction must have an -// unambiguous root. -// -// (3) Constants and parameters cannot be live out of the entry computation -// -Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) { +Status CopyInsertion::AddSpecialCaseCopies(HloModule* module) { + std::unique_ptr call_graph = CallGraph::Build(module); + return AddSpecialCaseCopies(*call_graph, module); +} + +Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, + HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, - HloAliasAnalysis::Run(module)); + HloAliasAnalysis::Run(module, fusion_can_share_buffer_)); // Identify which shape indices of which instructions need to be copied. 
Store // these results in 'instructions_to_copy'. @@ -1065,10 +1049,11 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) { HloInstruction* instruction = pair.first; const ShapeTree& indices_to_copy = pair.second; + ShapeTree copies_added(indices_to_copy.shape()); std::vector users = instruction->users(); TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy, instruction->parent()->DeepCopyInstruction( - instruction, &indices_to_copy)); + instruction, &indices_to_copy, &copies_added)); for (HloInstruction* user : users) { TF_RETURN_IF_ERROR(instruction->ReplaceUseWith(user, deep_copy)); } @@ -1079,23 +1064,35 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) { return Status::OK(); } -Status VerifyNoLiveRangeInterference(HloModule* module) { +Status CopyInsertion::VerifyNoLiveRangeInterference(const HloOrdering& ordering, + HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, - HloAliasAnalysis::Run(module)); - DependencyHloOrdering ordering(module); + HloAliasAnalysis::Run(module, fusion_can_share_buffer_)); TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering)); return Status::OK(); } -void MaybeDumpModule(const string& message, const HloModule& module) { - if (VLOG_IS_ON(3)) { - VLOG(3) << message; - XLA_VLOG_LINES(3, module.ToString()); - hlo_graph_dumper::MaybeDumpHloModule(module, message); +Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering, + HloModule* module) { + MaybeDumpModule("after adding copies to resolve interference", *module); + + TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, + HloAliasAnalysis::Run(module, fusion_can_share_buffer_)); + CopyRemover copy_remover(*alias_analysis, ordering, module); + XLA_VLOG_LINES(3, copy_remover.ToString()); + + std::unique_ptr call_graph = CallGraph::Build(module); + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if 
(instruction->opcode() == HloOpcode::kCopy) { + TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status()); + } + } } -} + MaybeDumpModule("after removing unnecessary copies", *module); -} // namespace + return Status::OK(); +} StatusOr CopyInsertion::Run(HloModule* module) { // Copy insertion is performed in three steps: @@ -1130,16 +1127,13 @@ StatusOr CopyInsertion::Run(HloModule* module) { "Call graph must be flattened before copy insertion."); } - // Gather Ids of existing kCopy instructions in the module. We avoid removing - // these copies (except via DCE in TupleSimplifier) because they may have been - // added for reasons not considered by copy insertion (eg, layout assignment). - // Instruction id is used instead of HloInstruction* because the pointer - // values may be recycled. - tensorflow::gtl::FlatSet existing_copies; - for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCopy) { - existing_copies.insert(instruction->unique_id()); + int64 num_existing_copies = 0; + if (VLOG_IS_ON(1)) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kCopy) { + ++num_existing_copies; + } } } } @@ -1156,15 +1150,10 @@ StatusOr CopyInsertion::Run(HloModule* module) { TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); - TF_DCHECK_OK(VerifyNoLiveRangeInterference(module)); - - MaybeDumpModule("after adding copies to resolve interference", *module); + DependencyHloOrdering dep_ordering(module); + TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module)); - DependencyHloOrdering ordering(module); - TF_RETURN_IF_ERROR( - RemoveUnnecessaryCopies(ordering, existing_copies, module)); - - MaybeDumpModule("after removing unnecessary copies", *module); + 
TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module)); TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module)); @@ -1172,7 +1161,8 @@ StatusOr CopyInsertion::Run(HloModule* module) { TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); - TF_DCHECK_OK(VerifyNoLiveRangeInterference(module)); + TF_DCHECK_OK( + VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module)); MaybeDumpModule("after copy insertion", *module); @@ -1185,7 +1175,7 @@ StatusOr CopyInsertion::Run(HloModule* module) { } } } - VLOG(1) << "Num copies before copy-insertion: " << existing_copies.size(); + VLOG(1) << "Num copies before copy-insertion: " << num_existing_copies; VLOG(1) << "Num copies after copy-insertion: " << num_total_copies; } diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h index 65e3d31e347e2cb249a072e7d06ca10c55401748..d308f6bc84670b78b9cab476f2893bce267df2cf 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.h +++ b/tensorflow/compiler/xla/service/copy_insertion.h @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" -#include "tensorflow/core/lib/gtl/flatmap.h" namespace xla { @@ -46,7 +45,16 @@ namespace xla { // InstructionAliasSet::IsDistinct return true. class CopyInsertion : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "copy-insertion"; } + absl::string_view name() const override { return "copy-insertion"; } + + // fusion_can_share_buffer: backend specific function that decides whether a + // fusion can share buffer with its operand. + // + // TODO(b/80315712): Find a better way to tell whether a fusion can share + // buffer. 
+ CopyInsertion(const HloDataflowAnalysis::FusionCanShareBufferFunction& + fusion_can_share_buffer = nullptr) + : fusion_can_share_buffer_(fusion_can_share_buffer) {} // Run the pass on the given module. Returns whether the module was changed // (copies were inserted). @@ -62,6 +70,39 @@ class CopyInsertion : public HloPassInterface { // // TODO(b/62548313): Remove this when buffer assignment is module-scoped. static StatusOr AddCopiesForBufferAssignment(HloModule* module); + + // Try to remove as many copies from the module as possible without + // introducing live range interference. Only copy instructions that are + // eligible for copy elision are considered for removal. + Status RemoveUnnecessaryCopies(const HloOrdering& ordering, + HloModule* module); + + // Add copies to address special constraints on the roots of computations not + // related to live range interference: + // + // (1) Entry computation root must be unambiguous and distinct. + // + // (2) Any computation called by a kCall instruction must have an + // unambiguous root. + // + // (3) Constants and parameters cannot be live out of the entry computation. + // + Status AddSpecialCaseCopies(HloModule* module); + + // Verifies that no HLO values have interfering live ranges using the given + // ordering. + Status VerifyNoLiveRangeInterference(const HloOrdering& ordering, + HloModule* module); + + private: + // Overload which requires the caller to pass in a call graph. + Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module); + + Status AddCopiesToResolveInterference(HloModule* module); + + // Backend specific function that decides whether a fusion can share buffer + // with its operand.
+ HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 153f062d015e49db11c4c9ae0a2a61e76c020f02..892d0d7b547aaf1e7f1c55e4163d1e1fd9518def 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -108,7 +108,7 @@ TEST_F(CopyInsertionTest, SingleConstant) { // be copied before entering the tuple. auto builder = HloComputation::Builder(TestName()); HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); HloInstruction* tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant})); @@ -125,21 +125,27 @@ TEST_F(CopyInsertionTest, SingleConstant) { } TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) { - // Verify that an kCopy instructions which exist in the pass before + // Verify that kCopy instructions which change layout and exist before // copy-insertion remain in the graph after copy-insertion. 
auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); - HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - HloInstruction* copy_1 = builder.AddInstruction(HloInstruction::CreateUnary( - constant->shape(), HloOpcode::kCopy, constant)); - HloInstruction* copy_2 = builder.AddInstruction(HloInstruction::CreateUnary( - constant->shape(), HloOpcode::kCopy, constant)); + HloInstruction* constant = + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{0.f, 2.f}, {2.f, 4.f}}))); + auto minor_to_major = LayoutUtil::MinorToMajor(constant->shape()); + Layout reversed_layout = + LayoutUtil::MakeLayoutFromMajorToMinor(minor_to_major); + Shape copy_shape = constant->shape(); + *copy_shape.mutable_layout() = reversed_layout; + HloInstruction* copy_1 = builder.AddInstruction( + HloInstruction::CreateUnary(copy_shape, HloOpcode::kCopy, constant)); + HloInstruction* copy_2 = builder.AddInstruction( + HloInstruction::CreateUnary(copy_shape, HloOpcode::kCopy, constant)); HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary( constant->shape(), HloOpcode::kAdd, copy_1, copy_2)); - HloInstruction* add_copy = builder.AddInstruction( - HloInstruction::CreateUnary(constant->shape(), HloOpcode::kCopy, add)); + builder.AddInstruction( + HloInstruction::CreateUnary(add->shape(), HloOpcode::kCopy, add)); module->AddEntryComputation(builder.Build()); @@ -147,12 +153,11 @@ TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) { InsertCopies(module.get()); - EXPECT_EQ(CountCopies(*module), 3); + EXPECT_EQ(CountCopies(*module), 2); - EXPECT_EQ(module->entry_computation()->root_instruction(), add_copy); - EXPECT_THAT( - module->entry_computation()->root_instruction(), - op::Copy(op::Add(op::Copy(op::Constant()), op::Copy(op::Constant())))); + EXPECT_EQ(module->entry_computation()->root_instruction(), add); + 
EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Add(op::Copy(op::Constant()), op::Copy(op::Constant()))); } TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { @@ -162,9 +167,9 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) { auto builder = HloComputation::Builder(TestName()); HloInstruction* constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); HloInstruction* constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); HloInstruction* x = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "x")); @@ -192,11 +197,11 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { // the computation result. Verify that copies are added properly. auto builder = HloComputation::Builder(TestName()); HloInstruction* constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); HloInstruction* constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); HloInstruction* constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); HloInstruction* tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); @@ -204,9 +209,9 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) { HloInstruction::CreateTuple({constant3, constant2})); HloInstruction* pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, 
tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); EXPECT_THAT(constant1->users(), UnorderedElementsAre(tuple1)); EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2)); @@ -250,8 +255,9 @@ TEST_F(CopyInsertionTest, BitcastConstant) { // The output of a bitcast is its operand (same buffer), so a bitcast // constant feeding the result must have a copy added. auto builder = HloComputation::Builder(TestName()); - HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1.0, 42.0}))); + HloInstruction* constant = + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1.0, 42.0}))); HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); @@ -365,9 +371,9 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) { // copy is added. auto builder = HloComputation::Builder(TestName()); HloInstruction* constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); HloInstruction* constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); HloInstruction* tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); @@ -375,9 +381,9 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) { HloInstruction::CreateTuple({constant2, constant1})); HloInstruction* pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloInstruction* select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); HloInstruction* gte = 
builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::GetSubshape(select->shape(), {0}), select, 0)); @@ -408,7 +414,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { const Shape& loop_state_shape) { auto builder = HloComputation::Builder(TestName() + ".Condition"); auto limit_const = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(10))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))); auto loop_state = builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape, "loop_state")); auto induction_variable = @@ -437,7 +443,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { builder.AddInstruction(HloInstruction::CreateGetTupleElement( induction_variable_shape_, loop_state, 0)); auto inc = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc)); // Update data GTE(1). 
@@ -475,7 +481,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { builder.AddInstruction(HloInstruction::CreateGetTupleElement( induction_variable_shape_, loop_state, 0)); auto inc = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); // add0 = Add(in0, 1) auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( @@ -544,7 +550,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { builder.AddInstruction(HloInstruction::CreateGetTupleElement( induction_variable_shape_, loop_state, 0)); auto inc = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); // add0 = Add(in0, 1) auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc)); @@ -559,8 +565,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest { data = builder.AddInstruction( HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1)); } - auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); + auto update = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); // add1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1}) auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( data_shape_, HloOpcode::kAdd, data, update)); @@ -593,7 +600,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { auto gte0 = builder.AddInstruction(HloInstruction::CreateGetTupleElement( induction_variable_shape_, loop_state, 0)); auto inc = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( gte0->shape(), HloOpcode::kAdd, gte0, inc)); @@ 
-603,8 +610,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest { // GTE(GTE(loop_state, 1), 0) -> Add auto gte10 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(data_shape_, gte1, 0)); - auto update10 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); + auto update10 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); auto add10 = builder.AddInstruction(HloInstruction::CreateBinary( data_shape_, HloOpcode::kAdd, gte10, update10)); @@ -628,10 +636,11 @@ class WhileCopyInsertionTest : public CopyInsertionTest { bool nested = false) { auto builder = HloComputation::Builder(TestName() + ".While"); auto induction_var_init = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); - auto data_init = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); + auto data_init = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); if (nested) { auto inner_init = builder.AddInstruction( @@ -654,8 +663,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest { HloInstruction* BuildWhileInstruction_InitPointsToConstant() { auto builder = HloComputation::Builder(TestName() + ".While"); - auto data_init = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); + auto data_init = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); return BuildWhileInstructionWithCustomInit(loop_state_shape_, data_init, &builder); } @@ -672,11 +682,11 @@ class WhileCopyInsertionTest : public CopyInsertionTest { auto builder = HloComputation::Builder(TestName() + 
".While"); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto v1 = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape_, one, {1})); auto zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto v2 = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape_, zero, {1})); @@ -684,9 +694,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest { auto tuple2 = builder.AddInstruction(HloInstruction::CreateTuple({v2, v1})); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto data_init = builder.AddInstruction(HloInstruction::CreateTernary( - nested_tuple_shape_, HloOpcode::kSelect, pred, tuple1, tuple2)); + nested_tuple_shape_, HloOpcode::kTupleSelect, pred, tuple1, tuple2)); return BuildWhileInstructionWithCustomInit(nested_loop_state_shape_, data_init, &builder); @@ -696,7 +706,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { auto builder = HloComputation::Builder(TestName() + ".While"); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto one_vec = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape_, one, {1})); auto data_init = @@ -709,11 +719,12 @@ class WhileCopyInsertionTest : public CopyInsertionTest { HloInstruction* BuildWhileInstruction_InitPointsToInterfering() { auto builder = HloComputation::Builder(TestName() + ".While"); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto data_init = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape_, one, 
{1})); - auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); + auto one_vec = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}))); // Take a reference to 'data_init' to make it interfere with while result. auto add = builder.AddInstruction(HloInstruction::CreateBinary( data_shape_, HloOpcode::kAdd, data_init, one_vec)); @@ -745,7 +756,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest { const bool nested = ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_); auto induction_var_init = builder->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto condition = module_->AddEmbeddedComputation( BuildConditionComputation(loop_state_shape)); auto body = module_->AddEmbeddedComputation( @@ -1247,7 +1258,6 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) { auto loop_init = builder.AddInstruction( HloInstruction::CreateTuple({iter_param, data_param, data_param})); - // Two while loops shares the same loop init tuple. 
auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile( loop_state_shape, condition1, body1, loop_init)); @@ -1305,7 +1315,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhile) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); cond_builder.AddInstruction(HloInstruction::CreateUnary( cond_constant->shape(), HloOpcode::kNot, cond_constant)); HloComputation* condition = @@ -1313,9 +1323,9 @@ TEST_F(CopyInsertionTest, SwizzlingWhile) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while = builder.AddInstruction( @@ -1370,7 +1380,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); cond_builder.AddInstruction(HloInstruction::CreateUnary( cond_constant->shape(), HloOpcode::kNot, cond_constant)); HloComputation* condition = @@ -1378,9 +1388,9 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while = builder.AddInstruction( @@ -1430,7 +1440,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); cond_builder.AddInstruction(HloInstruction::CreateUnary( cond_constant->shape(), HloOpcode::kNot, cond_constant)); HloComputation* condition = @@ -1438,7 +1448,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant, constant})); builder.AddInstruction( @@ -1515,7 +1525,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, loop_state_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); cond_builder.AddInstruction(HloInstruction::CreateUnary( cond_constant->shape(), HloOpcode::kNot, cond_constant)); HloComputation* condition = @@ -1570,14 +1580,14 @@ TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) { body_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param")); body_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0))); HloComputation* body = 
module->AddEmbeddedComputation(body_builder.Build()); auto cond_builder = HloComputation::Builder("condition"); cond_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module->AddEmbeddedComputation(cond_builder.Build()); @@ -1595,12 +1605,51 @@ TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) { EXPECT_THAT(condition->root_instruction(), op::Constant()); } +TEST_F(CopyInsertionTest, TokensShouldNotBeCopied) { + string module_string = R"( +HloModule TokensShouldNotBeCopied + +%Body (param.1: (s32[], token[])) -> (s32[], token[]) { + %param.1 = (s32[], token[]) parameter(0) + %get-tuple-element.1 = s32[] get-tuple-element((s32[], token[]) %param.1), index=0 + %constant.1 = s32[] constant(1) + %add = s32[] add(s32[] %get-tuple-element.1, s32[] %constant.1) + %get-tuple-element.2 = token[] get-tuple-element((s32[], token[]) %param.1), index=1 + %after-all = token[] after-all(token[] %get-tuple-element.2) + ROOT %tuple = (s32[], token[]) tuple(s32[] %add, token[] %after-all) +} + +%Cond (param: (s32[], token[])) -> pred[] { + %param = (s32[], token[]) parameter(0) + %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0 + %constant = s32[] constant(42) + ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant) +} + +ENTRY %TokensShouldNotBeCopied () -> s32[] { + %one = s32[] constant(1) + %negative_one = s32[] negate(%one) + %init_token = token[] after-all() + %init_tuple = (s32[], token[]) tuple(s32[] %negative_one, token[] %init_token) + %while = (s32[], token[]) while((s32[], token[]) %init_tuple), condition=%Cond, body=%Body + ROOT %root = s32[] get-tuple-element((s32[], token[]) %while), index=0 +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + HloRunner::CreateModuleFromString( + 
module_string, GetDebugOptionsForTest())); + InsertCopies(module.get()); + + // There should be no copies added because tokens should not be copied. + EXPECT_EQ(CountCopies(*module), 0); +} + std::unique_ptr MakeTrivialCondition(const Shape& shape) { auto builder = HloComputation::Builder("trivial_condition"); builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "loop_state")); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kNot, constant)); return builder.Build(); @@ -1636,8 +1685,7 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) { for (int i = 0; i < num_iters; ++i) { HloModuleConfig config; config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - HloModule module("BM_SequentialWhiles", VersionedComputationHandle(), - config); + HloModule module("BM_SequentialWhiles", config); auto builder = HloComputation::Builder("BM_SequentialWhiles"); HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1677,8 +1725,7 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) { for (int i = 0; i < num_iters; ++i) { HloModuleConfig config; config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - HloModule module("BM_SequentialWhiles", VersionedComputationHandle(), - config); + HloModule module("BM_SequentialWhiles", config); auto builder = HloComputation::Builder("BM_ParallelWhiles"); HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( @@ -1750,8 +1797,7 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) { std::vector tuple_params(num_tuple_inputs); for (int i = 0; i < num_iters; ++i) { auto builder = HloComputation::Builder("BM_ParallelWhiles"); - HloModule module("BM_ManyElementTuple", VersionedComputationHandle(), - config); + HloModule 
module("BM_ManyElementTuple", config); for (int j = 0; j < num_tuple_inputs; ++j) { tuple_params[j] = builder.AddInstruction( HloInstruction::CreateParameter(j, element_shape, "")); @@ -1961,5 +2007,46 @@ ENTRY TestComputation { InsertCopies(module.get()); } +TEST_F(CopyInsertionTest, NestedWhiles) { + // Verify that only no unnecessary copies remain after copy insertion for + // trivial nested whiles (b/112472605). + const string& hlo_string = R"( +HloModule TestModule + +cond.inner { + ROOT param.cond.inner = pred[] parameter(0) +} + +body.inner { + param.body.inner = pred[] parameter(0) + ROOT neg = pred[] negate(param.body.inner) +} + +cond.outer { + ROOT param.cond.outer = pred[] parameter(0) +} + +body.outer { + param.cond.outer = pred[] parameter(0) + ROOT while = pred[] while(param.cond.outer), condition=cond.inner, body=body.inner +} + +ENTRY TestComputation { + entry_param = pred[] parameter(0) + ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + InsertCopies(module.get()); + + // There should only be a single copy inserted, and it's in the entry + // computation. 
+ EXPECT_EQ(CountCopies(*module), 1); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::While(op::Copy(op::Parameter()))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index bfd85f257fb9550a6babb2459a7227ca9003a14f..d412578619e5d23db3933af19d665cf8beb4d622 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -20,7 +20,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS") load( "//third_party/mkl:build_defs.bzl", - "if_mkl", + "mkl_deps", ) # Filegroup used to collect source files for dependency checking. @@ -37,6 +37,7 @@ cc_library( srcs = ["cpu_transfer_manager.cc"], hdrs = ["cpu_transfer_manager.h"], deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -49,30 +50,21 @@ cc_library( "//tensorflow/compiler/xla/service/cpu:cpu_runtime", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], alwayslink = True, # Contains per-platform transfer manager registration ) cc_library( - name = "external_constant_pool", - srcs = ["external_constant_pool.cc"], - hdrs = ["external_constant_pool.h"], + name = "buffer_info_util", + srcs = ["buffer_info_util.cc"], + hdrs = ["buffer_info_util.h"], deps = [ - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/tf2xla:cpu_function_runtime", + "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/core:lib", - ], -) - -tf_cc_test( - name = "external_constant_pool_test", - srcs = ["external_constant_pool_test.cc"], - deps = [ - ":external_constant_pool", - 
"//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -82,6 +74,7 @@ cc_library( hdrs = ["cpu_compiler.h"], deps = [ ":compiler_functor", + ":buffer_info_util", ":conv_canonicalization", ":cpu_copy_insertion", ":cpu_executable", @@ -95,7 +88,13 @@ cc_library( ":ir_emitter", ":parallel_task_assignment", ":simple_orc_jit", - "//tensorflow/compiler/xla:literal_util", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ":target_machine_features", + "@com_google_absl//absl/types:span", + "//tensorflow/compiler/tf2xla:cpu_function_runtime", + "//tensorflow/compiler/xla/service:scatter_expander", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -109,10 +108,10 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_liveness", "//tensorflow/compiler/xla/service:call_inliner", "//tensorflow/compiler/xla/service:conditional_simplifier", + "//tensorflow/compiler/xla/service:convolution_feature_group_converter", "//tensorflow/compiler/xla/service:dot_decomposer", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", - "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", @@ -151,7 +150,14 @@ cc_library( "@llvm//:target", # fixdeps: keep "@llvm//:x86_code_gen", # fixdeps: keep "@llvm//:x86_disassembler", # fixdeps: keep - ], + ] + select({ + "//tensorflow:linux_ppc64le": [ + "@llvm//:powerpc_disassembler", + "@llvm//:powerpc_code_gen", + ], + "//conditions:default": [ + ], + }), alwayslink = True, # Contains compiler registration ) @@ -168,7 +174,6 @@ cc_library( ":cpu_runtime", ":custom_call_target_registry", ":disassembler", - 
":external_constant_pool", ":orc_jit_memory_mapper", ":runtime_fp16", ":runtime_conv2d", @@ -180,6 +185,7 @@ cc_library( ":runtime_single_threaded_conv2d", ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", + "@com_google_absl//absl/memory", "@llvm//:execution_engine", "@llvm//:core", "@llvm//:mc", # fixdeps: keep @@ -231,6 +237,9 @@ cc_library( "//tensorflow/compiler/xla/service:tuple_points_to_analysis", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", "@llvm//:orc_jit", ], ) @@ -249,7 +258,6 @@ cc_library( ":cpu_options", ":cpu_runtime", ":dot_op_emitter", - ":external_constant_pool", ":ir_emission_utils", ":ir_function", ":parallel_loop_emitter", @@ -266,17 +274,23 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:name_uniquer", "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", + "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", + "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", - "//tensorflow/compiler/xla/service/llvm_ir:ops", "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", "@llvm//:code_gen", 
"@llvm//:core", "@llvm//:support", @@ -321,6 +335,8 @@ cc_library( "//tensorflow/compiler/xla/service/cpu:cpu_runtime", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) @@ -331,12 +347,12 @@ cc_library( hdrs = ["parallel_loop_emitter.h"], deps = [ ":ir_emission_utils", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", "@llvm//:core", ], ) @@ -363,6 +379,7 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", "@llvm//:core", ], ) @@ -372,7 +389,7 @@ tf_cc_binary( srcs = ["sample_harness.cc"], deps = [ "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", @@ -380,9 +397,10 @@ tf_cc_binary( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", ], ) @@ -396,6 +414,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", "@llvm//:mc", "@llvm//:mc_disassembler", 
"@llvm//:object", @@ -419,6 +438,7 @@ cc_library( "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", "@llvm//:analysis", "@llvm//:core", "@llvm//:ipo", @@ -447,6 +467,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -461,6 +482,7 @@ cc_library( deps = [ ":vector_support_library", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/compiler/xla/service/llvm_ir:math_ops", "//tensorflow/core:lib", "@llvm//:core", "@llvm//:transform_utils", @@ -499,10 +521,7 @@ cc_library( "//tensorflow/core:framework_lite", "//tensorflow/core/kernels:eigen_helpers", "//third_party/eigen3", - ] + if_mkl([ - "@mkl_dnn", - "//third_party/mkl:intel_binary_blob", - ]), + ] + mkl_deps(), ) cc_library( @@ -556,10 +575,7 @@ cc_library( "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/core:framework_lite", "//third_party/eigen3", - ] + if_mkl([ - "//third_party/mkl:intel_binary_blob", - "@mkl_dnn", - ]), + ] + mkl_deps(), ) cc_library( @@ -640,6 +656,8 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//third_party/eigen3", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -649,11 +667,13 @@ tf_cc_test( deps = [ ":cpu_instruction_fusion", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -706,9 +726,9 @@ tf_cc_test( "//tensorflow/compiler/xla:util", 
"//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -734,7 +754,7 @@ tf_cc_test( deps = [ ":cpu_layout_assignment", ":target_machine_features_fake", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -748,6 +768,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -816,6 +837,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -826,7 +849,7 @@ tf_cc_test( ":cpu_executable", ":parallel_task_assignment", ":target_machine_features_fake", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -852,6 +875,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -898,6 +922,9 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", ], @@ -908,7 +935,7 @@ tf_cc_test( srcs = ["cpu_copy_insertion_test.cc"], deps = [ ":cpu_copy_insertion", - "//tensorflow/compiler/xla:literal_util", + 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", @@ -958,7 +985,7 @@ tf_cc_test( ":ir_emission_utils", ":target_machine_features_fake", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..1942ea1a2af8a349de53bafe80977436f9740fc4 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" + +namespace xla { +namespace cpu { + +using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo; + +std::vector CreateBufferInfosFromBufferAssignment( + const BufferAssignment& buffer_assignment) { + std::vector buffer_infos; + for (const BufferAllocation& allocation : buffer_assignment.Allocations()) { + if (allocation.is_thread_local()) { + buffer_infos.push_back(BufferInfo::MakeOnStackBuffer(allocation.size())); + } else if (allocation.is_constant()) { + buffer_infos.push_back(BufferInfo::MakeConstant(allocation.size())); + } else if (allocation.is_entry_computation_parameter()) { + buffer_infos.push_back(BufferInfo::MakeEntryParameter( + /*size=*/allocation.size(), + /*param_number=*/allocation.parameter_number())); + } else { + buffer_infos.push_back(BufferInfo::MakeTempBuffer(allocation.size())); + } + } + return buffer_infos; +} + +std::vector CreateArgIndexTableFromBufferInfos( + absl::Span buffer_infos) { + std::vector result; + for (int64 i = 0; i < buffer_infos.size(); i++) { + if (buffer_infos[i].is_entry_parameter()) { + if (buffer_infos[i].entry_parameter_number() >= result.size()) { + result.resize(buffer_infos[i].entry_parameter_number() + 1); + } + result[buffer_infos[i].entry_parameter_number()] = i; + } + } + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.h b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ee928ab290f2f5338bd7b3804dc43033e2042f --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ + +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" + +namespace xla { +namespace cpu { +// Creates and returns a list of BufferInfo instances containing relevant +// information from `buffer_assignment`. +std::vector<::tensorflow::cpu_function_runtime::BufferInfo> +CreateBufferInfosFromBufferAssignment( + const BufferAssignment& buffer_assignment); + +// Creates and returns a table containing the mapping from entry computation +// parameters to buffer allocation indices. +// +// If this function returns V then entry parameter i has buffer allocation index +// V[i]. +std::vector CreateArgIndexTableFromBufferInfos( + absl::Span + buffer_infos); +} // namespace cpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 6a7eb85e3baec3517b8f3ddef6a8dcfae9c9e614..73b03440cbb936017257b8a92f16dcc25d41e21c 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -35,7 +36,6 @@ limitations under the License. #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" @@ -156,9 +156,26 @@ std::unique_ptr CompilerFunctor::operator()( target_machine_->addPassesToEmitMC(codegen_passes, mc_context, ostream); codegen_passes.run(module); - // Construct ObjectFile from machine code buffer. - return std::unique_ptr( + std::unique_ptr memory_buffer( new llvm::SmallVectorMemoryBuffer(std::move(stream_buffer))); + + if (VLOG_IS_ON(2)) { + llvm::Expected> obj_file = + llvm::object::ObjectFile::createObjectFile(*memory_buffer); + if (obj_file) { + StatusOr disasm_result = + disassembler_->DisassembleObjectFile(*obj_file.get()); + if (disasm_result.ok()) { + XLA_VLOG_LINES(2, disasm_result.ValueOrDie().text); + } else { + LOG(WARNING) << "Could not disassemble object file!"; + } + } else { + LOG(WARNING) << "Could not convert memory buffer to object file!"; + } + } + + return memory_buffer; } static std::vector VectorFunctionsForTargetLibraryInfoImpl() { @@ -188,7 +205,7 @@ void CompilerFunctor::AddTargetInfoPasses( llvm::legacy::PassManagerBase* passes) const { llvm::Triple target_triple(target_machine_->getTargetTriple()); auto target_library_info_impl = - MakeUnique(target_triple); + absl::make_unique(target_triple); target_library_info_impl->addVectorizableFunctions( VectorFunctionsForTargetLibraryInfoImpl()); passes->add( diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 0985b9297fe487f3523826cb0978c17775549735..098ce17a568fd3fb531020e7731100fabda43721 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc @@ -132,6 +132,7 @@ StatusOr ConvCanonicalization::Run(HloModule* module) { HloInstruction* new_conv = module->entry_computation()->AddInstruction( HloInstruction::CreateConvolve(new_conv_shape, new_input, new_kernel, hlo->window(), new_dnums)); + new_conv->set_precision_config(hlo->precision_config()); // Reshape the output back to the shape of the original convolution. TF_RETURN_IF_ERROR(module->entry_computation()->ReplaceWithNewInstruction( diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h index e6fd1499edd0095395194200a5b444ad61e7e39d..59437e88af27528654a0af86baf69ec7a1e91d60 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h @@ -38,7 +38,7 @@ class ConvCanonicalization : public HloPassInterface { : target_machine_features_(*target_machine_features) {} ~ConvCanonicalization() override {} - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "convolution-canonicalization"; } diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc index 375b017b09263c20c1b1ef8329f7e2f6a573dda4..547d4c696da5cfdde3dece03250ae5fa51c92f25 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc @@ -60,11 +60,11 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { auto builder = HloComputation::Builder(TestName()); // The input dimensions are in CNHW order. 
auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D( + LiteralUtil::CreateR4FromArray4D(Array4D( kInputFeatureCount, kBatchSize, kInputSize, kInputSize)))); // The kernel dimensions are in OIHW order. auto kernel = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D( + LiteralUtil::CreateR4FromArray4D(Array4D( kOutputFeatureCount, kInputFeatureCount, kWindowSize, kWindowSize)))); ConvolutionDimensionNumbers dnums; @@ -122,11 +122,11 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { auto builder = HloComputation::Builder(TestName()); // The input dimensions are in NHWC order. auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D( + LiteralUtil::CreateR4FromArray4D(Array4D( kBatchSize, kInputSize, kInputSize, kInputFeatureCount)))); // The kernel dimensions are in HWIO order. auto kernel = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D( + LiteralUtil::CreateR4FromArray4D(Array4D( kWindowSize, kWindowSize, kInputFeatureCount, kOutputFeatureCount)))); ConvolutionDimensionNumbers dnums; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 25b18eff20f901fc34343a12bfbd353ecec49cfb..796f36510e414cde692208cfe0cf9626acae63d3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -26,10 +26,13 @@ limitations under the License. 
// IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc" // IWYU pragma: no_include "llvm/Config/Targets.def.inc" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/Object/ObjectFile.h" @@ -38,10 +41,9 @@ limitations under the License. #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/batch_dot_simplification.h" #include "tensorflow/compiler/xla/service/batchnorm_expander.h" @@ -49,6 +51,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" +#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h" #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h" #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" @@ -66,7 +70,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" @@ -87,6 +90,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "tensorflow/compiler/xla/service/scatter_expander.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" @@ -98,11 +102,10 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace cpu { +using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo; CpuAotCompilationOptions::CpuAotCompilationOptions( string triple, string cpu_name, string features, string entry_point_name, @@ -120,11 +123,11 @@ se::Platform::Id CpuAotCompilationOptions::PlatformId() const { } CpuAotCompilationResult::CpuAotCompilationResult( - ObjectFileData object_file_data, BufferSizes buffer_sizes, + ObjectFileData object_file_data, std::vector buffer_infos, int64 result_buffer_index, std::unique_ptr hlo_profile_printer_data) : object_file_data_(std::move(object_file_data)), - buffer_sizes_(std::move(buffer_sizes)), + buffer_infos_(std::move(buffer_infos)), result_buffer_index_(result_buffer_index), 
hlo_profile_printer_data_(std::move(hlo_profile_printer_data)) {} @@ -231,15 +234,15 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { std::unordered_map* hlo_to_profile_idx_; const std::unordered_map& assigned_indices_; }; -} // namespace -Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, - llvm::TargetMachine* target_machine) { - LLVMTargetMachineFeatures target_machine_features(target_machine); +} // namespace - // Optimization pipeline. - HloPassPipeline pipeline("CPU"); - pipeline.AddInvariantChecker(); +Status CpuCompiler::RunHloPassesThroughLayoutAssn( + HloModule* module, bool /*is_aot_compile*/, + LLVMTargetMachineFeatures* target_machine_features) { + HloPassPipeline pipeline("HLO passes through layout assignment"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); pipeline.AddPass(); ReducePrecisionInsertion::AddPasses( @@ -255,25 +258,27 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); - pipeline.AddPass(&target_machine_features); + pipeline.AddPass(); + pipeline.AddPass(target_machine_features); { auto& pass = pipeline.AddPass>("simplification"); - pass.AddInvariantChecker(); + pass.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); pass.AddPass( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true, - /*use_fusion=*/false); + /*rewrite_grad_op=*/true); pass.AddPass( /*is_layout_sensitive=*/false, [](const Shape&, const Shape&) { return false; }, /*enable_dot_strength_reduction=*/false); + pass.AddPass(); // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. 
- pipeline.AddPass(); + pass.AddPass(); pass.AddPass(); pass.AddPass(); @@ -286,10 +291,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, } pipeline.AddPass(); pipeline.AddPass( - [&target_machine_features]( - const HloInstruction& dot, + [&](const HloInstruction& dot, const TransposeFolding::OperandIndices& candidate_operands) { - return PotentiallyImplementedAsEigenDot(dot, target_machine_features) + return PotentiallyImplementedAsEigenDot(dot, *target_machine_features) ? candidate_operands : TransposeFolding::OperandIndices{}; }, @@ -297,23 +301,45 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); - pipeline.AddPass(); + pipeline.AddPass(); ReducePrecisionInsertion::AddPasses( &pipeline, module->config().debug_options(), ReducePrecisionInsertion::PassTiming::AFTER_FUSION); pipeline.AddPass( - module->mutable_device_entry_computation_layout(), - &target_machine_features); + module->mutable_entry_computation_layout(), target_machine_features); + return pipeline.Run(module).status(); +} + +Status CpuCompiler::RunHloPassesAfterLayoutAssn( + HloModule* module, bool is_aot_compile, + LLVMTargetMachineFeatures* target_machine_features) { + HloPassPipeline pipeline("HLO passes after layout assignment"); + // After layout assignment, use a layout-sensitive verifier. + auto& after_layout_assn = + pipeline.AddPass("after layout assignment"); + after_layout_assn.AddInvariantChecker( + /*layout_sensitive=*/true, + /*allow_mixed_precision=*/false); + // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. 
- pipeline.AddPass>( - /*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return true; }, - /*enable_dot_strength_reduction=*/false); - pipeline.AddPass(/*is_layout_sensitive=*/true); + { + auto& pass = pipeline.AddPass>( + "simplification after layout assignement"); + pass.AddInvariantChecker(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false); + pass.AddPass>( + /*is_layout_sensitive=*/true, + [](const Shape&, const Shape&) { return true; }, + /*enable_dot_strength_reduction=*/false); + pass.AddPass(); + pass.AddPass(/*is_layout_sensitive=*/true); + } + pipeline.AddPass(BF16, F32); + // Outline ops in the entry computation into calls to subcomputations. const int max_parallelism = module->config().intra_op_parallelism_threads() > 0 @@ -326,14 +352,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, // binary size (and most AOT applications are single-threaded). // TODO(b/29630486) Support multi-threaded AOT. pipeline.AddPass( - max_parallelism, ShapeSizeBytesFunction(), &target_machine_features); + max_parallelism, ShapeSizeBytesFunction(), target_machine_features); } - // Copy insertion should be performed immediately before IR emission to avoid - // inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes an - // instruction which materializes a value). DCE must be run immediately before - // (and sometime after) copy insertion, to avoid dead code from interfering - // with the rewrites. + // Copy insertion should be performed immediately before IR emission to + // avoid inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes + // an instruction which materializes a value). DCE must be run immediately + // before (and sometime after) copy insertion, to avoid dead code from + // interfering with the rewrites. 
pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); @@ -341,6 +367,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, return pipeline.Run(module).status(); } +Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, + llvm::TargetMachine* target_machine) { + LLVMTargetMachineFeatures target_machine_features(target_machine); + TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile, + &target_machine_features)); + return RunHloPassesAfterLayoutAssn(module, is_aot_compile, + &target_machine_features); +} + namespace { // Align buffers to 16-byte boundaries. @@ -352,7 +387,7 @@ llvm::TargetOptions CompilerTargetOptions( llvm::TargetOptions target_options; llvm_ir::SetTargetOptions( /*fast_math_enabled=*/module_config.debug_options() - .xla_enable_fast_math(), + .xla_cpu_enable_fast_math(), &target_options); return target_options; } @@ -444,7 +479,7 @@ Status CreateHloProfilingArtifacts( computation_to_profile_idx, std::unique_ptr* hlo_profile_index_map, std::unique_ptr* hlo_profile_printer_data) { - *hlo_profile_index_map = MakeUnique(module); + *hlo_profile_index_map = absl::make_unique(module); const HloComputation& entry_computation = *module.entry_computation(); TF_ASSIGN_OR_RETURN( @@ -511,15 +546,15 @@ StatusOr> CpuCompiler::RunBackend( &pre_optimization_ir_hook, &post_optimization_ir_hook)); // Compile must be thread-safe so create a new LLVM context for the module. 
- auto llvm_context = xla::MakeUnique(); + auto llvm_context = absl::make_unique(); auto llvm_module = - xla::MakeUnique("__compute_module", *llvm_context); + absl::make_unique("__compute_module", *llvm_context); - auto jit = xla::MakeUnique( + auto jit = absl::make_unique( CompilerTargetOptions(module->config()), CodeGenOptLevel(module->config()), options::OptimizeForSizeRequested(module->config()), - module->config().debug_options().xla_enable_fast_math(), + module->config().debug_options().xla_cpu_enable_fast_math(), module->config().debug_options().xla_llvm_disable_expensive_passes(), pre_optimization_ir_hook, post_optimization_ir_hook); llvm_module->setDataLayout(jit->data_layout()); @@ -550,17 +585,18 @@ StatusOr> CpuCompiler::RunBackend( // and reduced memory usage (as compared to using DependencyHloOrdering). TF_ASSIGN_OR_RETURN( SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction(), - DFSMemoryScheduler)); + ScheduleComputationsInModule(*module, BufferSizeBytesFunction(), + DFSMemoryScheduler)); - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. + // Run buffer allocation on the HLO graph. TF_ASSIGN_OR_RETURN( std::unique_ptr assignment, - BufferAssigner::Run( - module.get(), - xla::MakeUnique(module.get(), module_sequence), - BufferSizeBytesFunction(), memory_alignment)); + BufferAssigner::Run(module.get(), + absl::make_unique( + module.get(), module_sequence), + BufferSizeBytesFunction(), memory_alignment, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true)); // BufferAssignment::ToString() includes a header, so no need for us to // print one ourselves. 
XLA_VLOG_LINES(2, assignment->ToString()); @@ -580,7 +616,9 @@ StatusOr> CpuCompiler::RunBackend( IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), std::move(instruction_to_profile_idx), std::move(computation_to_profile_idx), - &target_machine_features, jit->external_constant_pool()); + &target_machine_features); + + TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); for (auto embedded_computation : entry_computation->MakeEmbeddedComputationsList()) { @@ -603,7 +641,13 @@ StatusOr> CpuCompiler::RunBackend( /*is_top_level_computation=*/true, &module_sequence.at(entry_computation))); - string function_name = llvm_ir::AsString(entry_function->getName()); + string function_name = [&]() { + llvm::SmallVector function_name_vector; + llvm::Mangler::getNameWithPrefix( + function_name_vector, entry_function->getName(), jit->data_layout()); + return string(function_name_vector.begin(), function_name_vector.end()); + }(); + string ir_module_string; if (embed_ir_in_executable) { ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); @@ -639,9 +683,9 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, // so we bail if the configs have conflicting flags. At the moment, the only // flag that needs to be consistent is fast-math. 
const bool fast_math_enabled = - modules[0]->config().debug_options().xla_enable_fast_math(); + modules[0]->config().debug_options().xla_cpu_enable_fast_math(); for (const auto& module : modules) { - if (module->config().debug_options().xla_enable_fast_math() != + if (module->config().debug_options().xla_cpu_enable_fast_math() != fast_math_enabled) { return InvalidArgument( "All HLO module configs must have the same value for " @@ -660,8 +704,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, const llvm::Target* target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); if (target == nullptr) { - return InternalError("TargetRegistry::lookupTarget failed: %s", - error.c_str()); + return InternalError("TargetRegistry::lookupTarget failed: %s", error); } llvm::Reloc::Model reloc_model = llvm::Reloc::Static; @@ -697,7 +740,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, llvm::StringRef cpu_name = llvm_ir::AsStringRef(options.cpu_name()); llvm::StringRef features = llvm_ir::AsStringRef(options.features()); llvm::CodeGenOpt::Level opt_level = CodeGenOptLevel(modules[0]->config()); - std::unique_ptr target_machine = WrapUnique( + std::unique_ptr target_machine = absl::WrapUnique( target->createTargetMachine(triple.getTriple(), cpu_name, features, CompilerTargetOptions(modules[0]->config()), reloc_model, llvm::None, opt_level)); @@ -730,7 +773,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, TF_ASSIGN_OR_RETURN( SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); + ScheduleComputationsInModule(*module, BufferSizeBytesFunction())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. 
@@ -738,8 +781,10 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, std::unique_ptr assignment, BufferAssigner::Run( module, - xla::MakeUnique(module, module_sequence), - BufferSizeBytesFunction(), memory_alignment)); + absl::make_unique(module, module_sequence), + BufferSizeBytesFunction(), memory_alignment, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true)); // BufferAssignment::ToString() includes a header, so no need for us to // print one ourselves. XLA_VLOG_LINES(2, assignment->ToString()); @@ -767,8 +812,10 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, IrEmitter ir_emitter(*module, *assignment, &llvm_module, std::move(instruction_to_profile_idx), std::move(computation_to_profile_idx), - &target_machine_features, - /*external_constant_pool=*/nullptr); + &target_machine_features); + + TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); + HloComputation* computation = module->entry_computation(); for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { @@ -814,7 +861,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, CompilerFunctor compiler_functor( target_machine.get(), &disassembler, opt_level, options::OptimizeForSizeRequested(module->config()), - module->config().debug_options().xla_enable_fast_math(), + module->config().debug_options().xla_cpu_enable_fast_math(), module->config().debug_options().xla_llvm_disable_expensive_passes(), pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook); std::unique_ptr object_file = @@ -822,27 +869,14 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, ObjectFileData object_file_data(object_file->getBufferStart(), object_file->getBufferEnd()); - BufferSizes buffer_sizes; - for (const BufferAllocation& allocation : assignment->Allocations()) { - // Callers don't need to allocate temporary buffers for parameters. 
- if (allocation.is_entry_computation_parameter()) { - buffer_sizes.push_back(-1); - continue; - } - // Callers don't need to allocate anything for thread-local temporary - // buffers. They are lowered to allocas. - if (allocation.is_thread_local()) { - buffer_sizes.push_back(-1); - continue; - } - buffer_sizes.push_back(allocation.size()); - } + std::vector buffer_infos = + CreateBufferInfosFromBufferAssignment(*assignment); TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment->GetUniqueTopLevelOutputSlice()); - results.emplace_back(MakeUnique( - std::move(object_file_data), std::move(buffer_sizes), + results.emplace_back(absl::make_unique( + std::move(object_file_data), std::move(buffer_infos), result_slice.index(), std::move(hlo_profile_printer_data))); } @@ -864,7 +898,7 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { static bool InitModule() { xla::Compiler::RegisterCompilerFactory( stream_executor::host::kHostPlatformId, - []() { return xla::MakeUnique(); }); + []() { return absl::make_unique(); }); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index e56f9f01134f84b4698c078b750b0c1fdca7748e..f2af923782df268e3e6da3895ec35579ab6aa51f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -18,12 +18,14 @@ limitations under the License. 
#include +#include "absl/types/span.h" #include "llvm/Target/TargetMachine.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/llvm_compiler.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -78,7 +80,8 @@ class CpuAotCompilationOptions : public AotCompilationOptions { class CpuAotCompilationResult : public AotCompilationResult { public: CpuAotCompilationResult( - ObjectFileData object_file_data, BufferSizes buffer_sizes, + ObjectFileData object_file_data, + std::vector<::tensorflow::cpu_function_runtime::BufferInfo> buffer_infos, int64 result_buffer_index, std::unique_ptr hlo_profile_printer_data); ~CpuAotCompilationResult(); @@ -88,17 +91,20 @@ class CpuAotCompilationResult : public AotCompilationResult { } const ObjectFileData& object_file_data() const { return object_file_data_; } - const BufferSizes& buffer_sizes() const { return buffer_sizes_; } + const std::vector<::tensorflow::cpu_function_runtime::BufferInfo>& + buffer_infos() const { + return buffer_infos_; + } int64 result_buffer_index() const { return result_buffer_index_; } private: // Contains the compiled computation: an object file. const ObjectFileData object_file_data_; - // The list of buffer sizes which should be allocated in order to execute the - // compiled computation. These buffers are used for temporary buffers used - // ephemerally during computation as well as the output result. - const BufferSizes buffer_sizes_; + // A list of BufferInfo objects describing the buffers used by the XLA + // computation. 
+ const std::vector<::tensorflow::cpu_function_runtime::BufferInfo> + buffer_infos_; // Contains which buffer index into |buffer_sizes| was designated to the // result of the computation. This buffer should be passed into the output @@ -152,6 +158,16 @@ class CpuCompiler : public LLVMCompiler { Status RunHloPasses(HloModule* module, bool is_aot_compile, llvm::TargetMachine* target_machine); + // Runs HLO passes up to and including layout assignment. + Status RunHloPassesThroughLayoutAssn( + HloModule* module, bool /*is_aot_compile*/, + LLVMTargetMachineFeatures* target_machine_features); + + // Runs HLO passes after layout assignment. + Status RunHloPassesAfterLayoutAssn( + HloModule* module, bool is_aot_compile, + LLVMTargetMachineFeatures* target_machine_features); + TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler); }; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h index 3313d1e6eb71bff39f509c3d24858568df786422..d49f7d7cc2d9b1d00847feda62fa62dd740820d8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" @@ -32,11 +32,11 @@ namespace xla { // (module-scoped). 
class CpuCopyInsertion : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "copy-insertion"; } + absl::string_view name() const override { return "copy-insertion"; } StatusOr Run(HloModule* module) override; }; } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_ diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc index a05a26941786cbf404c4685abb098c9ac8caaa09..4db7fa446ea9188940f930bcadf753bd3e6b79e3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -74,14 +74,14 @@ TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) { body_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param")); body_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0))); HloComputation* body = module->AddEmbeddedComputation(body_builder.Build()); auto cond_builder = HloComputation::Builder("condition"); cond_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = 
module->AddEmbeddedComputation(cond_builder.Build()); @@ -114,7 +114,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) { auto sub_param = sub_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param")); auto constant = sub_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0))); auto add = sub_builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape_, HloOpcode::kAdd, sub_param, constant)); sub_builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc index d12fa6bb9ad2054bdc052c9d7b3729cc28e11f6d..8727c72b6e42517b1859e98ecadb41bbceed761c 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace cpu { @@ -40,7 +40,7 @@ ENTRY DotOperation { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloInstruction* dot = module->entry_computation()->root_instruction(); @@ -71,7 +71,7 @@ ENTRY ConvOperation { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloInstruction* conv = module->entry_computation()->root_instruction(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 
cf43b74c699ca8cbbef11a0abbaf4d69476f5d77..29abf38e439d919ff93629ed992cb3ff93a929bd 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -22,6 +22,9 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/computation_layout.h" @@ -35,9 +38,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mem.h" @@ -69,12 +69,19 @@ CpuExecutable::CpuExecutable( // guarded by the mutex. 
compute_function_ = reinterpret_cast(cantFail(sym.getAddress())); + VLOG(1) << "compute_function_ at address " + << reinterpret_cast(compute_function_); } -Status CpuExecutable::AllocateBuffers( +StatusOr, + std::vector>> +CpuExecutable::CreateBufferTable( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { - CHECK_EQ(buffers->size(), assignment_->Allocations().size()); + absl::Span arguments) { + std::vector unowning_buffers( + assignment_->Allocations().size()); + std::vector owning_buffers( + assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); @@ -84,64 +91,65 @@ Status CpuExecutable::AllocateBuffers( VLOG(3) << allocation.ToString(); if (allocation.is_entry_computation_parameter()) { + unowning_buffers[i] = arguments[allocation.parameter_number()]->buffer( + allocation.param_shape_index()); VLOG(3) << "allocation #" << i << " is a parameter"; continue; } + if (allocation.is_constant()) { + VLOG(3) << "allocation #" << i << " is a constant"; + continue; + } + if (allocation.is_thread_local()) { VLOG(3) << "buffer #" << i << " is thread-local"; continue; } int64 buffer_size = allocation.size(); - if (!(*buffers)[i].is_null()) { + if (!owning_buffers[i].is_null()) { VLOG(3) << "buffer #" << i << " is in the preallocated result ShapedBuffer"; } else { - TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate( - device_ordinal, buffer_size)); + TF_ASSIGN_OR_RETURN(owning_buffers[i], memory_allocator->Allocate( + device_ordinal, buffer_size)); + unowning_buffers[i] = owning_buffers[i].AsDeviceMemoryBase(); VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes [" - << (*buffers)[i].opaque() << "]"; + << owning_buffers[i].opaque() << "]"; } // Since the output buffer and all the temporary buffers were written into // by the JITed code, msan has 
no way of knowing their memory was // initialized. Mark them initialized so that msan doesn't flag loads from // these buffers. - TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); + TF_ANNOTATE_MEMORY_IS_INITIALIZED(owning_buffers[i].opaque(), buffer_size); } TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment_->GetUniqueTopLevelOutputSlice()); VLOG(3) << "result index: " << result_slice.index(); - return Status::OK(); + return {{std::move(unowning_buffers), std::move(owning_buffers)}}; } Status CpuExecutable::ExecuteComputeFunction( const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice buffers, + absl::Span buffers, HloExecutionProfile* hlo_execution_profile) { // The calling convention for JITed functions is: // // void function(void* result, const void* run_options, void** args_array, - // void** temps_array) + // void** buffer_table) // // result: Points at the result. // run_options: the ExecutableRunOptions object. - // args_array: An array of pointers, each of which points to a parameter. - // The size of this array is determined by the function's arity - // (ProgramShape). - // temps_array: An array of pointers, each of which points to a temporary - // buffer the computation needs. The size of this array is - // determined by buffer analysis. + // args_array: null + // buffer_table: An array of pointers, containing pointers to temporary + // buffers required by the executable and pointers to entry computation + // parameters.
// - std::vector args_array; - for (const ShapedBuffer* argument : arguments) { - args_array.push_back(argument->root_buffer().opaque()); - } uint64 start_micros = tensorflow::Env::Default()->NowMicros(); @@ -163,26 +171,23 @@ Status CpuExecutable::ExecuteComputeFunction( void* result_buffer = buffer_pointers[result_slice.index()]; if (VLOG_IS_ON(3)) { VLOG(3) << "Executing compute function:"; - VLOG(3) << tensorflow::strings::Printf( - " func(void* result, void* params[%zu], void* temps[%zu], " - "uint64 profile_counters[%zu])", - args_array.size(), buffer_pointers.size(), profile_counters_size); - VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer); + VLOG(3) << absl::StrFormat( + " func(void* result, void* params[null], void* buffer_table[%u], " + "uint64 profile_counters[%u])", + buffer_pointers.size(), profile_counters_size); + VLOG(3) << absl::StrFormat(" result = %p", result_buffer); auto ptr_printer = [](string* out, const void* p) { - tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p)); + absl::StrAppend(out, absl::StrFormat("%p", p)); }; - VLOG(3) << tensorflow::strings::Printf( - " params = [%s]", - tensorflow::str_util::Join(args_array, ", ", ptr_printer).c_str()); - VLOG(3) << tensorflow::strings::Printf( - " temps = [%s]", - tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str()); - VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p", - profile_counters); + VLOG(3) << " params = nullptr"; + VLOG(3) << absl::StrFormat( + " buffer_table = [%s]", + absl::StrJoin(buffer_pointers, ", ", ptr_printer)); + VLOG(3) << absl::StrFormat(" profile_counters = %p", profile_counters); } - compute_function_(result_buffer, run_options, args_array.data(), - buffer_pointers.data(), profile_counters); + compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(), + profile_counters); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -203,11 +208,11 @@ Status 
CpuExecutable::ExecuteComputeFunction( StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::MutableArraySlice buffers) { + absl::Span buffers) { se::Stream* stream = run_options->stream(); ScopedShapedBuffer result_buffer( - /*on_host_shape=*/host_result_shape(), - /*on_device_shape=*/host_result_shape(), run_options->allocator(), + /*on_host_shape=*/result_shape(), + /*on_device_shape=*/result_shape(), run_options->allocator(), stream->parent()->device_ordinal()); // Move OwningDeviceMemory values which contain the array(s) of the result @@ -241,55 +246,48 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented("Points-to set of root instruction is ambiguous"); - } - - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - std::vector unowning_buffers; - unowning_buffers.reserve(buffers.size()); - for (auto& buffer : buffers) { - unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); - } - TF_RETURN_IF_ERROR(ExecuteComputeFunction(&run_options->run_options(), - arguments, unowning_buffers, - hlo_execution_profile)); - - return CreateResultShapedBuffer(run_options, &buffers); + TF_ASSIGN_OR_RETURN( + auto result, + ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile)); + TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone()); + return std::move(result); } StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) { 
+ absl::Span arguments) { if (hlo_profiling_enabled()) { return Unimplemented( "Asynchronous execution on stream with hlo profiling is not yet " "supported on CPU."); } + return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr); +} + +StatusOr CpuExecutable::ExecuteAsyncOnStreamImpl( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile) { + if (GetRootPointsToSet().IsAmbiguous()) { + return Unimplemented("Points-to set of root instruction is ambiguous"); + } auto* host_stream = dynamic_cast( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - + std::vector owning_buffers; std::vector unowning_buffers; - unowning_buffers.reserve(buffers.size()); - for (auto& buffer : buffers) { - unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); - } - TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, - CreateResultShapedBuffer(run_options, &buffers)); + TF_ASSIGN_OR_RETURN( + std::tie(unowning_buffers, owning_buffers), + CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(), + arguments)); + + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer result, + CreateResultShapedBuffer(run_options, absl::MakeSpan(owning_buffers))); // At this point, `unowning_buffers` contains unowning pointers to all of our // buffers, and `buffers` contains owning pointers to the non-live-out @@ -302,28 +300,27 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( // // We also need to change the types of some of the variables we capture: // run_options needs to change from a pointer to a value type, and arguments - // needs to change from an ArraySlice into a vector. We use a struct instead + // needs to change from a Span into a vector. 
We use a struct instead // of a lambda to make this explicit. struct AsyncRunTask { CpuExecutable* executable; ServiceExecutableRunOptions run_options; - std::vector arguments; std::vector unowning_buffers; std::shared_ptr> buffers; + HloExecutionProfile* hlo_execution_profile; void operator()() { // Failing a CHECK here is not great, but I don't see an obvious way to // return a failed Status asynchronously. TF_CHECK_OK(executable->ExecuteComputeFunction( - &run_options.run_options(), arguments, unowning_buffers, - /*hlo_execution_profile=*/nullptr)); + &run_options.run_options(), unowning_buffers, hlo_execution_profile)); } }; - host_stream->EnqueueTask(AsyncRunTask{ - this, *run_options, - std::vector(arguments.begin(), arguments.end()), - unowning_buffers, - std::make_shared>(std::move(buffers))}); + host_stream->EnqueueTask( + AsyncRunTask{this, *run_options, std::move(unowning_buffers), + std::make_shared>( + std::move(owning_buffers)), + hlo_execution_profile}); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 8dd47bfb865e8a0552542f510d3365cff0d111e0..3c3c047bfe8ee0d1ad90ede2432a86264f47870b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -33,7 +34,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -57,12 +57,12 @@ class CpuExecutable : public Executable { StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override; StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) override; + absl::Span arguments) override; // This should be called after set_ir_module_string. const string& ir_module_string() const { return ir_module_string_; } @@ -74,9 +74,10 @@ class CpuExecutable : public Executable { static int64 ShapeSizeBytes(const Shape& shape); // Type of the computation function we expect in the JIT. - using ComputeFunctionType = void (*)( - void* /*result*/, const ExecutableRunOptions* /*run_options*/, - const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/); + using ComputeFunctionType = + void (*)(void* /*result*/, const ExecutableRunOptions* /*run_options*/, + const void** /*args*/, void** /*buffer_table*/, + int64* /*profile_counters*/); const ComputeFunctionType& compute_function() const { return compute_function_; @@ -85,29 +86,47 @@ class CpuExecutable : public Executable { const BufferAssignment& buffer_assignment() const { return *assignment_; } private: - // Allocate buffers required for execution and assign them to the elements of - // "buffers". "buffers" should be sized to the number of buffers in buffer - // assignment. Each vector element corresponds to a particular Index. 
If - // a vector element already contains a non-null DeviceMemoryBase, then no - // buffer is assigned for this element. - Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, - int device_ordinal, - std::vector* buffers); + // This is for sharing the code between ExecuteOnStream and + // ExecuteAsyncOnStream. + // + // Notice that it's tricky to use correctly, as the profile object (when it + // exists) must out-live the task. + StatusOr ExecuteAsyncOnStreamImpl( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile); + + // Creates an array suitable for passing as the "buffer_table" argument to the + // JIT compiled function pointer. + // + // Returns (unowning_buffers, owning_buffers) where: + // + // - unowning_buffers.data() can be passed as the buffer_table argument as-is + // and includes pointers to the scratch storage required by the + // computation, the live-out buffer into which the result will be written + // and entry computation parameters. + // + // - owning_buffers contains owning pointers to the buffers that were + // allocated by this routine. This routine allocates buffers for temporary + // storage and the live-out buffer into which the computation writes its + // result. + StatusOr, + std::vector>> + CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal, + absl::Span arguments); // Calls the generated function performing the computation with the given // arguments using the supplied buffers.
- Status ExecuteComputeFunction( - const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice buffers, - HloExecutionProfile* hlo_execution_profile); + Status ExecuteComputeFunction(const ExecutableRunOptions* run_options, + absl::Span buffers, + HloExecutionProfile* hlo_execution_profile); // Creates a ScopedShapedBuffer for holding the result of the computation, // moving buffers out of allocated_buffers and into the result as appropriate. // The addresses are set according to buffer assignment. StatusOr CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::MutableArraySlice buffers); + absl::Span buffers); // Returns the points-to set of the root instruction of the entry // computation. Uses points-to analysis from buffer assignment. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc index 7bd4741a04b1135d9780e0cf765b7b33378526e1..7fbe0fa157c57eb0c274662a1de95cf5328ccfa8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc @@ -34,9 +34,8 @@ StatusOr CpuHloSupportChecker::Run(HloModule* module) { return xla::Unimplemented( "CPU backend does not support HLO instruction %s with shape " "containing a sparse layout: %s", - instruction->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction->shape()) - .c_str()); + instruction->ToString(), + ShapeUtil::HumanStringWithLayout(instruction->shape())); } return Status::OK(); })); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h index 2924b6365943f0a3ec998d7a77767a76cbb576ae..6af724b2a5d71b9c30f3485ffb7e51d1d201cb6b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h +++ 
b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h @@ -28,9 +28,7 @@ class CpuHloSupportChecker : public HloPassInterface { CpuHloSupportChecker() = default; ~CpuHloSupportChecker() override = default; - tensorflow::StringPiece name() const override { - return "cpu_hlo_support_checker"; - } + absl::string_view name() const override { return "cpu_hlo_support_checker"; } // Note: always returns false (no instructions are ever modified by this // pass). diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index b40d264c03aba6e9308e8a621ae86e180e33c335..f9cd61bea3dc86cadff99d4a90eca44c16520823 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -35,7 +35,7 @@ bool CanBeLoopFused(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kDynamicSlice || hlo.opcode() == HloOpcode::kDynamicUpdateSlice || hlo.opcode() == HloOpcode::kGather || - hlo.opcode() == HloOpcode::kPad || + hlo.opcode() == HloOpcode::kIota || hlo.opcode() == HloOpcode::kPad || hlo.opcode() == HloOpcode::kReshape || hlo.opcode() == HloOpcode::kReverse || hlo.opcode() == HloOpcode::kSlice || @@ -78,7 +78,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, } if (!CanBeLoopFused(*producer)) { - VLOG(2) << "Producer is not fusile."; + VLOG(2) << "Producer is not fusible."; return false; } @@ -140,7 +140,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, } if (CanBeLoopFused(*consumer)) { - VLOG(2) << "Fusing: consumer is elementwise or fusile."; + VLOG(2) << "Fusing: consumer is elementwise or fusible."; return true; } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index 46fe060817b0264d90574b45a94cf1f6e5964593..284929ca073ca0d8c5c7cc383f8341a53d0f9e88 100644 --- 
a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -18,11 +18,12 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace op = xla::testing::opcode_matchers; @@ -172,7 +173,7 @@ ENTRY DotOperationFusion_TransposeFusion { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* computation = module->entry_computation(); TransposeFolding transpose_folding( @@ -202,7 +203,7 @@ ENTRY DotOperationFusion_TransposeFusion { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* computation = module->entry_computation(); TransposeFolding transpose_folding( @@ -233,7 +234,7 @@ ENTRY DotOperationFusion_TransposeFusion { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* computation = module->entry_computation(); TransposeFolding transpose_folding( @@ -282,7 +283,7 @@ class OpcodeFusionTest : public InstructionFusionTest { builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {}), "arg0")); HloInstruction* one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, arg0, one)); return module->AddEmbeddedComputation(builder.Build()); @@ 
-501,8 +502,8 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) { HloInstruction* exp = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kExp, param0)); - builder.AddInstruction(HloInstruction::CreateMap( - shape, {exp}, CreateAdderToOne(module.get()), /*static_operands=*/{})); + builder.AddInstruction( + HloInstruction::CreateMap(shape, {exp}, CreateAdderToOne(module.get()))); module->AddEntryComputation(builder.Build()); @@ -525,8 +526,8 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) { HloInstruction* exp1 = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kExp, param1)); - builder.AddInstruction(HloInstruction::CreateMap( - shape, {exp0, exp1}, CreateMax(module.get()), /*static_operands=*/{})); + builder.AddInstruction( + HloInstruction::CreateMap(shape, {exp0, exp1}, CreateMax(module.get()))); module->AddEntryComputation(builder.Build()); @@ -566,7 +567,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) { HloOpcode::kParameter, HloOpcode::kParameter}); } -TEST_F(OpcodeFusionTest, MessOfFusileNodes) { +TEST_F(OpcodeFusionTest, MessOfFusibleNodes) { auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); @@ -595,7 +596,7 @@ TEST_F(OpcodeFusionTest, MessOfFusileNodes) { auto pad = builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(S32, {5}), idx_choice, builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), padding_config)); auto slice = builder.AddInstruction(HloInstruction::CreateDynamicSlice( @@ -697,8 +698,9 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name, HloInstruction::CreateBinary(dot_shape, HloOpcode::kAdd, dot, addend)); if (add_extra_use_for_dot) { + auto* token = builder.AddInstruction(HloInstruction::CreateToken()); builder.AddInstruction( - HloInstruction::CreateOutfeed(dot_shape, dot, "no_config")); + HloInstruction::CreateOutfeed(dot_shape, dot, 
token, "no_config")); } module->AddEntryComputation(builder.Build()); @@ -772,10 +774,10 @@ class GatherLoopFusionTest TEST_P(GatherLoopFusionTest, GatherLoopFusion) { const GatherLoopFusionTestSpec& spec = GetParam(); - string hlo_string = tensorflow::strings::StrCat( - "HloModule ", spec.test_name, "\n\n", spec.hlo_computation_text); + string hlo_string = absl::StrCat("HloModule ", spec.test_name, "\n\n", + spec.hlo_computation_text); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); RunFusionAndCheckOpcodesWereFused( module.get(), @@ -791,11 +793,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) gather = s32[3,2] gather(operand, indices), - output_window_dims={0}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3, 1} + slice_sizes={3, 1} one = s32[] constant(1) one_broadcasted = s32[3,2] broadcast(one), dimensions={} ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted) @@ -807,11 +809,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,3,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3, 1} + slice_sizes={3, 1} one = s32[] constant(1) one_broadcasted = s32[2,3,2] broadcast(one), dimensions={} ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted) @@ -823,11 +825,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, - window_bounds={1, 1} + slice_sizes={1, 1} one = s32[] 
constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) @@ -839,11 +841,11 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1,2} + slice_sizes={1,1,2} one = s32[] constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) @@ -855,11 +857,11 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1,2} + slice_sizes={1,1,2} one = s32[] constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) @@ -871,11 +873,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) gather = s32[1,1] gather(operand, indices), - output_window_dims={0,1}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={0,1}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} one = s32[] constant(1) one_broadcasted = s32[1,1] broadcast(one), dimensions={} ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted) @@ -887,11 +889,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,1,1] gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - 
window_bounds={1,1} + slice_sizes={1,1} one = s32[] constant(1) one_broadcasted = s32[2,1,1] broadcast(one), dimensions={} ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc index aa872d5ec9e7593b8d2f731421c17af590729529..bfecbd6e017893e4f6d3dcbc01d46c899e6060fa 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc @@ -34,8 +34,8 @@ namespace cpu { // instruction stream. namespace { -using ::tensorflow::gtl::nullopt; -using ::tensorflow::gtl::optional; +using absl::nullopt; +using absl::optional; using ShouldMakeOperandColMajorCache = tensorflow::gtl::FlatMap; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc index 429fc7b78608da0e9cd794ac294851b326f5be24..9363af3b8941c68284915d6770188bde4c87f78e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc @@ -20,8 +20,9 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" @@ -39,7 +40,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace op = xla::testing::opcode_matchers; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index e75fcb6bc9719f7453d5f0cb52d1673cef1fd3df..b8ace5702688096822573c7afae234cbcbe77b28 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" #include "tensorflow/core/lib/strings/numbers.h" namespace { @@ -24,6 +26,7 @@ const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; const char* const kXlaEnableExperimentalLlvmIrGemm = "xla_enable_experimental_llvm_ir_gemm"; +const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size"; } // namespace @@ -43,17 +46,16 @@ bool VectorizedReduceDisabled(const HloModuleConfig& config) { return extra_options_map.count(kXlaOptimizeForSizeCpuOption) > 0; } -tensorflow::gtl::optional LlvmIrGemvTilingFactor( - const HloModuleConfig& config) { +absl::optional LlvmIrGemvTilingFactor(const HloModuleConfig& config) { const auto& extra_options_map = config.debug_options().xla_backend_extra_options(); auto it = extra_options_map.find(kLlvmIrDotTilingFactor); int64 tiling_factor; if (it != extra_options_map.end() && - tensorflow::strings::safe_strto64(it->second, &tiling_factor)) { + absl::SimpleAtoi(it->second, &tiling_factor)) { return tiling_factor; } - return tensorflow::gtl::nullopt; + return absl::nullopt; } bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) { @@ -62,6 +64,42 @@ bool EnableExperimentalLlvmIrGemm(const 
HloModuleConfig& config) { return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0; } +static absl::string_view RemoveSuffix(absl::string_view str, + absl::string_view suffix) { + CHECK_GE(str.size(), suffix.size()); + CHECK_EQ(str.substr(str.size() - suffix.size()), suffix); + return str.substr(0, str.size() - suffix.size()); +} + +absl::optional> LlvmIrGemmTileSize( + const HloModuleConfig& config) { + const auto& extra_options_map = + config.debug_options().xla_backend_extra_options(); + auto it = extra_options_map.find(kLlvmIrGemmTileSize); + if (it == extra_options_map.end()) { + return absl::nullopt; + } + + std::vector tile_components = absl::StrSplit(it->second, ':'); + CHECK_EQ(tile_components.size(), 3); + + int64 tile_size_m; + int64 tile_size_k; + int64 tile_size_n_in_vector_width; + + CHECK(absl::SimpleAtoi(tile_components[0], &tile_size_m)); + CHECK(absl::SimpleAtoi(tile_components[1], &tile_size_k)); + + absl::string_view tile_size_n_in_vector_width_str = + RemoveSuffix(tile_components[2], "*vectwidth"); + + CHECK(absl::SimpleAtoi(tile_size_n_in_vector_width_str, + &tile_size_n_in_vector_width)); + + return std::tuple(tile_size_m, tile_size_k, + tile_size_n_in_vector_width); +} + } // namespace options } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index 106dfbbc62dfba8d3de74e0a2ae3bb247bd91d67..47c7eb13b6e4cc05a23f82b8d2a25249f4b82ac0 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -27,7 +27,8 @@ namespace options { bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config); -tensorflow::gtl::optional LlvmIrGemvTilingFactor( +absl::optional LlvmIrGemvTilingFactor(const HloModuleConfig& config); +absl::optional> LlvmIrGemmTileSize( 
const HloModuleConfig& config); } // namespace options diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 54c52bc08f9c53b8c6898689b18c4cb7f4bdcfd0..8a44c384bb0fe6f132c352ca8bd78baa23d093d4 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -92,9 +93,10 @@ tensorflow::string ShapeString(const void* shape_ptr, xla::int32 shape_length) { } // namespace -void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, - const void* shape, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void* +__xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, + const void* shape, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "AcquireInfeedBufferForDequeue: " << ShapeString(shape, shape_length); @@ -111,9 +113,11 @@ void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, return buffer->data(); } -void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue( - xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(xla::int32 buffer_length, + void* buffer_ptr, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "ReleaseInfeedBufferAfterDeque: " << ShapeString(shape_ptr, shape_length); @@ -125,8 +129,10 @@ void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue( std::move(shape)); } -void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation( - xla::int32 buffer_length, const void* shape_ptr, 
xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void* +__xla_cpu_runtime_AcquireOutfeedBufferForPopulation(xla::int32 buffer_length, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "AcquireOutfeedBufferForPopulation: " << ShapeString(shape_ptr, shape_length); @@ -143,9 +149,11 @@ void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation( return buffer->data(); } -void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation( - xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(xla::int32 buffer_length, + void* buffer_ptr, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "ReleaseOutfeedBufferAfterPopulation: " << ShapeString(shape_ptr, shape_length); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc index 2ac950e6d93ade315808f2ca1d0bdd7bc85f53b9..1ae3aa57111e3a3b7ac18b4907c5c282edf89b7e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc @@ -19,16 +19,16 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -46,7 +46,7 @@ std::unique_ptr> MaybeTransposeArray2D(const Array2D& array, if (transpose) { std::swap(output_width, output_height); } - auto output = MakeUnique>(output_height, output_width); + auto output = absl::make_unique>(output_height, output_width); for (int y = 0; y < array.height(); y++) { for (int x = 0; x < array.width(); x++) { if (transpose) { @@ -93,7 +93,7 @@ std::unique_ptr> EigenMatrixMultiply(const Array2D& a, // Since we're going to transpose c before returning it. Swap the order of the // dimension sizes to ensure the returned array is properly dimensioned. - auto c_transpose = MakeUnique>(n, m); + auto c_transpose = absl::make_unique>(n, m); if (single_threaded) { __xla_cpu_runtime_EigenSingleThreadedMatMulF32( nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(), @@ -142,10 +142,10 @@ class EigenMatMulTest : public CpuRuntimeTest, bool transpose_rhs = std::get<2>(info.param); bool single_threaded = std::get<3>(info.param); - return tensorflow::strings::Printf( - "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n, - transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "", - single_threaded ? 
"single" : "multi"); + return absl::StrFormat("EigenMatMul_%d_%d_%d_%s%s%s_threaded", shape.m, + shape.k, shape.n, transpose_lhs ? "Tlhs_" : "", + transpose_rhs ? "Trhs_" : "", + single_threaded ? "single" : "multi"); } }; @@ -178,10 +178,10 @@ class MKLMatMulTest : public CpuRuntimeTest, bool transpose_rhs = std::get<2>(info.param); bool single_threaded = std::get<3>(info.param); - return tensorflow::strings::Printf( - "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n, - transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "", - single_threaded ? "single" : "multi"); + return absl::StrFormat("MKLMatMul_%d_%d_%d_%s%s%s_threaded", shape.m, + shape.k, shape.n, transpose_lhs ? "Tlhs_" : "", + transpose_rhs ? "Trhs_" : "", + single_threaded ? "single" : "multi"); } }; @@ -204,7 +204,7 @@ std::unique_ptr> MKLMatrixMultiply(const Array2D& a, // Since we're going to transpose c before returning it, swap the order of the // dimension sizes to ensure the returned array is properly dimensioned. - auto c_transpose = MakeUnique>(n, m); + auto c_transpose = absl::make_unique>(n, m); if (single_threaded) { __xla_cpu_runtime_MKLSingleThreadedMatMulF32( nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(), diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index d97802ee45d6add3c466577d7624d9ca74e2f380..5519a43b2f6bc3a7df9a58823e43fae42f7f94df 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -102,7 +104,7 @@ Status CpuTransferManager::TransferLiteralToInfeed( if (ShapeUtil::IsNestedTuple(shape)) { return Unimplemented( "Infeed with a nested tuple shape is not supported: %s", - ShapeUtil::HumanString(literal.shape()).c_str()); + ShapeUtil::HumanString(literal.shape())); } // For a tuple, we transfer each of its elements to the device and @@ -150,19 +152,18 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %lld bytes", size); + return InvalidArgument("Infeed shape is too large: needs %d bytes", size); } if (size <= 0) { - return InvalidArgument("Infeed shape must have positive size; got %lld", + return InvalidArgument("Infeed shape must have positive size; got %d", size); } int32 size_32 = static_cast(size); CpuInfeedBuffer* queued_buffer = new CpuInfeedBuffer(size_32); - Status s = - TransferBufferToDevice(executor, /*size=*/size, - /*source=*/source, queued_buffer->device_memory()); + Status s = executor->SynchronousMemcpyH2D( + /*host_src=*/source, /*size=*/size, queued_buffer->device_memory()); if (!s.ok()) { queued_buffer->Done(s); @@ -173,26 +174,24 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, Status CpuTransferManager::TransferLiteralFromOutfeed( se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) { + MutableBorrowingLiteral literal) { if (!ShapeUtil::IsTuple(literal_shape)) { int64 size = GetByteSizeRequirement(literal_shape); // Note: OSS build didn't like implicit conversion from // literal_shape.dimensions() to the array slice on 2017-07-10. 
- tensorflow::gtl::ArraySlice dimensions( + absl::Span dimensions( tensorflow::bit_cast(literal_shape.dimensions().data()), literal_shape.dimensions().size()); - *literal = std::move(*Literal::CreateFromDimensions( - literal_shape.element_type(), dimensions)); - TF_ASSIGN_OR_RETURN(Shape received_shape, - TransferArrayBufferFromOutfeed( - executor, literal->untyped_data(), size)); - TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal->shape())) + TF_ASSIGN_OR_RETURN( + Shape received_shape, + TransferArrayBufferFromOutfeed(executor, literal.untyped_data(), size)); + TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal.shape())) << "Shape received from outfeed " << ShapeUtil::HumanString(received_shape) << " did not match the shape that was requested for outfeed: " << ShapeUtil::HumanString(literal_shape); TF_RET_CHECK(size == GetByteSizeRequirement(received_shape)); - *literal->mutable_shape_do_not_use() = received_shape; + *literal.mutable_shape_do_not_use() = received_shape; return Status::OK(); } @@ -201,22 +200,12 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( "Nested tuple outfeeds are not yet implemented on CPU."); } - std::vector> elements; std::vector> buffer_data; for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) { const Shape& tuple_element_shape = ShapeUtil::GetTupleElementShape(literal_shape, i); - // Note: OSS build didn't like implicit conversion from - // literal_shape.dimensions() to the array slice on 2017-07-10. 
- tensorflow::gtl::ArraySlice dimensions( - tensorflow::bit_cast( - tuple_element_shape.dimensions().data()), - tuple_element_shape.dimensions().size()); - auto empty = Literal::CreateFromDimensions( - tuple_element_shape.element_type(), dimensions); int64 size = GetByteSizeRequirement(tuple_element_shape); - buffer_data.push_back({empty->untyped_data(), size}); - elements.push_back(std::move(empty)); + buffer_data.push_back({literal.untyped_data({i}), size}); } TF_ASSIGN_OR_RETURN(Shape received_shape, @@ -230,17 +219,13 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( TF_RET_CHECK(GetByteSizeRequirement(literal_shape) == GetByteSizeRequirement(received_shape)); - for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) { - *elements[i]->mutable_shape_do_not_use() = received_shape.tuple_shapes(i); - } - *literal = std::move(*Literal::MakeTupleOwned(std::move(elements))); - TF_RET_CHECK(ShapeUtil::Equal(literal->shape(), literal_shape)); + TF_RET_CHECK(ShapeUtil::Equal(literal.shape(), literal_shape)); return Status::OK(); } StatusOr CpuTransferManager::TransferTupleBuffersFromOutfeed( se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice> buffer_data) { + absl::Span> buffer_data) { return TransferBuffersFromOutfeedInternal(executor, buffer_data, /*is_tuple=*/true); } @@ -253,18 +238,17 @@ StatusOr CpuTransferManager::TransferArrayBufferFromOutfeed( StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice> buffer_data, - bool is_tuple) { + absl::Span> buffer_data, bool is_tuple) { std::vector> buffers; for (auto b : buffer_data) { int64 size = b.second; if (size > std::numeric_limits::max()) { - return InvalidArgument("Outfeed shape is too large: needs %lld bytes", + return InvalidArgument("Outfeed shape is too large: needs %d bytes", size); } if (size <= 0) { - return InvalidArgument("Outfeed shape must have positive size; got %lld", + return InvalidArgument("Outfeed shape must 
have positive size; got %d", size); } @@ -272,7 +256,7 @@ StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( VLOG(2) << "Enqueueing outfeed buffer (for the device to populate) of length " << size_32 << "B"; - buffers.emplace_back(MakeUnique(b.first, size_32)); + buffers.emplace_back(absl::make_unique(b.first, size_32)); } std::vector buffer_pointers; @@ -299,7 +283,7 @@ StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( } // namespace xla static std::unique_ptr CreateCpuTransferManager() { - return xla::MakeUnique(); + return absl::make_unique(); } static bool InitModule() { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index 6dfc666f09dfa6df740cd54bea0957e3144181bc..361d4b9c8422fff6afe53e56e0bb10a484c9becc 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -13,17 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_ #include +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h" #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -39,13 +40,14 @@ class CpuTransferManager : public GenericTransferManager { Status TransferLiteralToInfeed(se::StreamExecutor* executor, const LiteralSlice& literal) override; - Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, - const void* source) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + MutableBorrowingLiteral literal) override; private: + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source); + // Transfers infeed data to device. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. StatusOr TransferBufferToInfeedInternal( @@ -54,7 +56,7 @@ class CpuTransferManager : public GenericTransferManager { // Helper that transfers a tuple of element buffers from the device's outfeed. 
StatusOr TransferTupleBuffersFromOutfeed( se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice> buffer_data); + absl::Span> buffer_data); // Helper that transfers an array buffer from the device's outfeed. StatusOr TransferArrayBufferFromOutfeed(se::StreamExecutor* executor, @@ -66,12 +68,11 @@ class CpuTransferManager : public GenericTransferManager { // for the given buffers. StatusOr TransferBuffersFromOutfeedInternal( se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice> buffer_data, - bool is_tuple); + absl::Span> buffer_data, bool is_tuple); TF_DISALLOW_COPY_AND_ASSIGN(CpuTransferManager); }; } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc index e4c674e227ffc6725ca929f720b9aa7cf7c4c032..3ae64142cd7e32d3aa8d50870efaf94698c06440 100644 --- a/tensorflow/compiler/xla/service/cpu/disassembler.cc +++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc @@ -21,13 +21,13 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -151,7 +151,7 @@ StatusOr Disassembler::DisassembleObjectFile( size = 1; } - ostream << tensorflow::strings::Printf("0x%08lx", index) << " "; + ostream << absl::StrFormat("0x%08lx", index) << " "; if (decode_status == llvm::MCDisassembler::Success) { // For branches, try to determine the actual address and emit it as an @@ -163,7 +163,7 @@ StatusOr Disassembler::DisassembleObjectFile( uint64_t target; if (inst_analysis_->evaluateBranch( instruction, section_address + index, size, target)) { - annotation = tensorflow::strings::Printf("[0x%08lx]", target); + annotation = absl::StrFormat("[0x%08lx]", target); } } inst_printer_->printInst(&instruction, ostream, annotation.c_str(), diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index d77076546f404afc1292bc4b5e902b59e24a1246..99fa707c959854e50c6d954fe92b87e93e267dc6 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -49,15 +50,15 @@ class MemoryTile { // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at // `major_dim_offset` in the major dimension. The tile size along the minor // dimension is the vector size, and that is implicitly determined by `vsl`. 
- MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder, + MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b, llvm::Value* matrix, int64 matrix_size_along_minor_dim, llvm::Value* major_dim_offset, int64 tile_size_along_major_dim) - : vsl_(vsl), ir_builder_(ir_builder) { + : vsl_(vsl), b_(b) { pointers_.reserve(tile_size_along_major_dim); for (int64 i = 0; i < tile_size_along_major_dim; i++) { - llvm::Value* total_offset = ir_builder->CreateMul( - ir_builder->getInt64(matrix_size_along_minor_dim), - ir_builder->CreateAdd(ir_builder->getInt64(i), major_dim_offset)); + llvm::Value* total_offset = + b->CreateMul(b->getInt64(matrix_size_along_minor_dim), + b->CreateAdd(b->getInt64(i), major_dim_offset)); pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset)); } } @@ -79,7 +80,7 @@ class MemoryTile { // `minor_dim_offset`}. // // Note: `major_dim_offset` is a parameter to the constructor. - void StoreTile(tensorflow::gtl::ArraySlice tile, + void StoreTile(absl::Span tile, llvm::Value* minor_dim_offset) const { CHECK_EQ(tile.size(), pointers_.size()); for (int64 i = 0; i < pointers_.size(); i++) { @@ -101,8 +102,7 @@ class MemoryTile { for (int64 i = 0; i < pointers_.size(); i++) { for (int64 j = 0; j < tile_size_along_middle_dim; j++) { result[i].push_back(vsl_->LoadBroadcast( - pointers_[i], ir_builder_->CreateAdd(minor_dim_offset, - ir_builder_->getInt64(j)))); + pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j)))); } } return result; @@ -110,7 +110,7 @@ class MemoryTile { private: VectorSupportLibrary* vsl_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; std::vector pointers_; }; @@ -147,9 +147,9 @@ class GemvConfig { bool has_addend() const { return has_addend_; } string GetCacheKey() const { - return tensorflow::strings::StrCat( - name_, "_", PrimitiveType_Name(scalar_type()), "_", tile_rows(), "_", - tile_cols(), "_", m(), "_", k(), has_addend() ? 
"_with_addend" : ""); + return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_", + tile_rows(), "_", tile_cols(), "_", m(), "_", k(), + has_addend() ? "_with_addend" : ""); } protected: @@ -249,16 +249,15 @@ class ColumnMajorMatrixVectorProductEmitter ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend, llvm::Value* result, - llvm::IRBuilder<>* ir_builder) + llvm::IRBuilder<>* b) : config_(config), lhs_(lhs), rhs_(rhs), addend_(addend), result_(result), - ir_builder_(ir_builder), - ksl_(ir_builder_), - vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), - ir_builder_, "") { + b_(b), + ksl_(b_), + vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") { CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast(tile_rows()))); CHECK(!has_addend() || addend != nullptr); } @@ -272,7 +271,7 @@ class ColumnMajorMatrixVectorProductEmitter bool is_first_column); MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) { - return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_, + return MemoryTile(&vsl_, b_, /*matrix=*/lhs_, /*matrix_size_along_minor_dim=*/m(), /*major_dim_offset=*/column_start, /*tile_size_along_major_dim=*/column_count); @@ -302,7 +301,7 @@ class ColumnMajorMatrixVectorProductEmitter llvm::Value* rhs_; llvm::Value* addend_; llvm::Value* result_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; KernelSupportLibrary ksl_; VectorSupportLibrary vsl_; }; @@ -324,14 +323,14 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() { int64 column_remainder = k() % tile_cols(); int64 column_limit = k() - column_remainder; - ksl_.For("dot.outer.tiled", - /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(), - [&](llvm::Value* column, bool is_first_column) { - EmitOuterLoopBody(column, tile_cols(), is_first_column); - }); + ksl_.ForReturnVoid("dot.outer.tiled", + /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(), + [&](llvm::Value* 
column, bool is_first_column) { + EmitOuterLoopBody(column, tile_cols(), is_first_column); + }); if (column_remainder != 0) { - EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder, + EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder, column_limit == 0); } } @@ -341,19 +340,20 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( int64 columns, bool is_first_column) { int64 row_limit = m() - (m() % tile_rows()); - ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit, - /*step=*/tile_rows(), [&](llvm::Value* row) { - std::vector lhs_tile = - lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row); - llvm::Value* accumulator = - is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row) - : vsl_.GetZeroVector()) - : vsl_.LoadVector(result_, row); - for (int i = 0; i < columns; i++) { - accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator); - } - vsl_.StoreVector(accumulator, result_, row); - }); + ksl_.ForReturnVoid( + "dot.inner.tiled", /*start=*/0, /*end=*/row_limit, + /*step=*/tile_rows(), [&](llvm::Value* row) { + std::vector lhs_tile = + lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row); + llvm::Value* accumulator = + is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row) + : vsl_.GetZeroVector()) + : vsl_.LoadVector(result_, row); + for (int i = 0; i < columns; i++) { + accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator); + } + vsl_.StoreVector(accumulator, result_, row); + }); } void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( @@ -363,7 +363,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( return; } - llvm::Value* columns_llvm = ir_builder_->getInt64(columns); + llvm::Value* columns_llvm = b_->getInt64(columns); // for (col = current_tile_col; col < (columns + current_tile_col); col++) // for (row = row_start, row < m_; row++) { @@ -372,25 +372,23 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( // // initialized. 
// } - ksl_.For( + ksl_.ForReturnVoid( "dot.inner.epilg.outer", /*start=*/current_tile_col, - /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col), + /*end=*/b_->CreateAdd(columns_llvm, current_tile_col), /*step=*/1, /*peel_first_iteration=*/false, [&](llvm::Value* col, llvm::Value* is_first_scalar_col) { llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col); - llvm::Value* total_offset = - ir_builder_->CreateMul(col, ir_builder_->getInt64(m())); + llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m())); llvm::Value* lhs_base_pointer = vsl_.ComputeOffsetPointer(lhs_, total_offset); - ksl_.For( + ksl_.ForReturnVoid( "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(), /*step=*/1, [&](llvm::Value* scalar_row) { llvm::Value* product = vsl_.Mul( vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element); - llvm::Value* setting_result_first_time = ir_builder_->CreateAnd( - is_first_scalar_col, - ir_builder_->getInt1(is_first_tiled_column)); - ksl_.If( + llvm::Value* setting_result_first_time = b_->CreateAnd( + is_first_scalar_col, b_->getInt1(is_first_tiled_column)); + ksl_.IfReturnVoid( setting_result_first_time, /*true_block_generator=*/ [&]() { @@ -477,16 +475,15 @@ class RowMajorMatrixVectorProductEmitter RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend, - llvm::Value* result, - llvm::IRBuilder<>* ir_builder) + llvm::Value* result, llvm::IRBuilder<>* b) : config_(config), lhs_(lhs), rhs_(rhs), addend_(addend), result_(result), - ir_builder_(ir_builder), - ksl_(ir_builder_), - vsl_(scalar_type(), /*vector_size=*/tile_cols(), ir_builder_, "") { + b_(b), + ksl_(b_), + vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") { CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast(tile_cols()))); CHECK(!has_addend() || addend != nullptr); } @@ -497,7 +494,7 @@ class RowMajorMatrixVectorProductEmitter private: MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) { - 
return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_, + return MemoryTile(&vsl_, b_, /*matrix=*/lhs_, /*matrix_size_along_minor_dim=*/k(), /*major_dim_offset=*/row_start, /*tile_size_along_major_dim=*/row_count); @@ -516,7 +513,7 @@ class RowMajorMatrixVectorProductEmitter llvm::Value* rhs_; llvm::Value* addend_; llvm::Value* result_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; KernelSupportLibrary ksl_; VectorSupportLibrary vsl_; }; @@ -558,7 +555,7 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row, for (int i = 0; i < row_count; i++) { llvm::Value* result_value = vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get()); - llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row); + llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row); if (addend_ && row_count != vsl_.vector_size()) { result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value); } @@ -571,12 +568,13 @@ void RowMajorMatrixVectorProductEmitter::Emit() { int64 row_remainder = m() % tile_rows(); int64 row_limit = m() - row_remainder; - ksl_.For("dot.outer.tiled", - /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(), - [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); }); + ksl_.ForReturnVoid( + "dot.outer.tiled", + /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(), + [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); }); if (row_remainder != 0) { - EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder); + EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder); } } @@ -585,17 +583,17 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( std::vector* vector_accumulators) { int64 column_limit = k() - (k() % tile_cols()); - ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit, - /*step=*/tile_cols(), [&](llvm::Value* col) { - std::vector lhs_tile = - lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col); - llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col); - for (int 
i = 0; i < rows; i++) { - llvm::Value* old_sum = (*vector_accumulators)[i].Get(); - (*vector_accumulators)[i].Set( - vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i]))); - } - }); + ksl_.ForReturnVoid("dot.inner.tiled", /*start=*/0, /*end=*/column_limit, + /*step=*/tile_cols(), [&](llvm::Value* col) { + std::vector lhs_tile = + lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col); + llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col); + for (int i = 0; i < rows; i++) { + llvm::Value* old_sum = (*vector_accumulators)[i].Get(); + (*vector_accumulators)[i].Set(vsl_.Add( + old_sum, vsl_.Mul(rhs_value, lhs_tile[i]))); + } + }); } void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( @@ -607,36 +605,36 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( } for (int r = 0; r < rows; r++) { - llvm::Value* total_offset = ir_builder_->CreateMul( - ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row), - ir_builder_->getInt64(k())); + llvm::Value* total_offset = b_->CreateMul( + b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k())); llvm::Value* lhs_base_pointer = vsl_.ComputeOffsetPointer(lhs_, total_offset); - ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(), - /*step=*/1, [&](llvm::Value* scalar_col) { - llvm::Value* product = - vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col), - vsl_.LoadScalar(rhs_, scalar_col)); - llvm::Value* old_value = (*scalar_accumulators)[r].Get(); - (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product)); - }); + ksl_.ForReturnVoid( + "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(), + /*step=*/1, [&](llvm::Value* scalar_col) { + llvm::Value* product = + vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col), + vsl_.LoadScalar(rhs_, scalar_col)); + llvm::Value* old_value = (*scalar_accumulators)[r].Get(); + (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product)); + }); } } // This class implements a tiled matrix multiplication algorithm, intended 
for -// use as the innermost GEBP loop in a GEMM kernel (GEBP is described in "Goto, -// Kazushige, and Robert Van De Geijn. "High-performance implementation of the -// level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 (2008): -// 4). +// multiplying small matrices that don't need cache tiling. +// +// In the future this can be used as the innermost GEBP loop in a GEMM kernel as +// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of +// high-performance matrix multiplication." ACM Transactions on Mathematical +// Software (TOMS) 34.3 (2008): 12.". // // This only supports canonical dot operations (i.e. where the lhs contraction // dimension is 1 and the rhs contraction dimension is 0) over row major // matrices. -class MatrixMatrixBlockPanelEmitter { +class TiledSmallGemmEmitter { public: - // Describe the dimensions of the GEBP kernel. These will usually not be the - // dimensions of the GEMM itself, the GEMM will usually be broken up into GEBP - // kernels with smaller dimensions. + // Describe the dimensions of the kernel. class Dimensions { public: explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {} @@ -645,9 +643,7 @@ class MatrixMatrixBlockPanelEmitter { int64 k() const { return k_; } int64 n() const { return n_; } - string ToString() const { - return tensorflow::strings::StrCat(m(), "x", k(), "x", n()); - } + string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); } private: const int64 m_; @@ -655,9 +651,9 @@ class MatrixMatrixBlockPanelEmitter { const int64 n_; }; - // Represents the configuration of the GEBP emitter. The LLVM IR emitted by - // the emitter, modulo the LLVM values holding the input and output buffers, - // must be a function of the instance of `Config` passed to it. + // Represents the configuration of the emitter. 
The LLVM IR emitted by the + // emitter, modulo the LLVM values holding the input and output buffers, must + // be a function of the instance of `Config` passed to it. // // `dims` holds the matrix multiplication dimensions. // @@ -665,6 +661,10 @@ class MatrixMatrixBlockPanelEmitter { // the largest vector register we will use). This can be larger than the // largest vector register supported by the machine -- LLVM will legalize // these large vector widths into legally sized vectors. + // + // `max_vector_count` is the maximum number of vectors of size + // `max_vectorization_width` that we will attempt to process at once. + // // `min_vectorization_width` is the smallest vector width the emitter will use // -- below that it will devolve to using a scalar loop. // @@ -674,26 +674,28 @@ class MatrixMatrixBlockPanelEmitter { class Config { public: explicit Config(PrimitiveType scalar_type, Dimensions dims, - int64 max_vectorization_width, + int64 max_vectorization_width, int64 max_vector_count, int64 min_vectorization_width, int64 tile_size_m, int64 tile_size_k) : scalar_type_(scalar_type), dims_(dims), max_vectorization_width_(max_vectorization_width), + max_vector_count_(max_vector_count), min_vectorization_width_(min_vectorization_width), tile_size_m_(tile_size_m), tile_size_k_(tile_size_k) {} string GetCacheKey() const { - return tensorflow::strings::StrCat( - "gebp_", PrimitiveType_Name(scalar_type()), "_", dims().ToString(), - "_", max_vectorization_width(), "_", min_vectorization_width(), "_", - tile_size_m(), "_", tile_size_k()); + return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_", + dims().ToString(), "_", max_vectorization_width(), + "_", min_vectorization_width(), "_", tile_size_m(), + "_", tile_size_k()); } PrimitiveType scalar_type() const { return scalar_type_; } Dimensions dims() const { return dims_; } int64 max_vectorization_width() const { return max_vectorization_width_; } + int64 max_vector_count() const { return 
max_vector_count_; } int64 min_vectorization_width() const { return min_vectorization_width_; } int64 tile_size_m() const { return tile_size_m_; } @@ -703,59 +705,56 @@ class MatrixMatrixBlockPanelEmitter { PrimitiveType scalar_type_; Dimensions dims_; int64 max_vectorization_width_; + int64 max_vector_count_; int64 min_vectorization_width_; int64 tile_size_m_; int64 tile_size_k_; }; - // Creates an instance of MatrixMatrixBlockPanelEmitter that matrix-multiplies + // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies // `lhs` with `rhs` and stores the result in `result`. - explicit MatrixMatrixBlockPanelEmitter(Config config, llvm::Value* lhs, - llvm::Value* rhs, llvm::Value* result, - llvm::IRBuilder<>* ir_builder) + explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs, + llvm::Value* rhs, llvm::Value* result, + llvm::IRBuilder<>* b) : lhs_(lhs), rhs_(rhs), result_(result), config_(config), - ir_builder_(ir_builder), - ksl_(ir_builder_) { + b_(b), + ksl_(b_) { CHECK(max_vectorization_width() > 0 && IsPowerOfTwo(static_cast(max_vectorization_width()))); + CHECK_GT(max_vector_count(), 0); CHECK(min_vectorization_width() > 0 && IsPowerOfTwo(static_cast(min_vectorization_width()))); + CHECK_GE(max_vectorization_width(), min_vectorization_width()); CHECK_GT(tile_size_k(), 0); } void Emit(); private: - // This emits a loop that loops over the `n` dimension in multiples of - // `max_vectorization_width` as much as possible and then emits a remainder - // epilogue. - void EmitLoopOverN(); - - // This emits a loop that loops over the `k` dimension in multiples of - // `tile_size_k` as much as possible and then emits a remainder epilogue. - void EmitLoopOverK(VectorSupportLibrary* vsl, llvm::Value* n_start, - llvm::Value* n_end); - - // This emits a loop that loops over the `m` dimension in multiples of - // `tile_size_m` as much as possible and then emits a remainder epilogue. 
- void EmitLoopOverM(VectorSupportLibrary* vsl, int64 tile_size_k, + // The HandleResiduesOnX helpers split the iteration space for dimension X + // into a multiple of the tile size on dimension X and an epilogue. These + // helpers ultimately call into `EmitTiledGemm` for emitting the + // tiled GEMM kernel. + + void HandleResiduesOnN(); + void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start, + llvm::Value* n_end); + void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k, + llvm::Value* k_start, llvm::Value* k_end, + llvm::Value* n_start, llvm::Value* n_end); + + // This emits a tiled GEMM kernel. For a detailed description see the comment + // on the implementation. + void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start, llvm::Value* k_end, - llvm::Value* n_start, llvm::Value* n_end); + llvm::Value* n_start, llvm::Value* n_end, + int64 tile_size_m, llvm::Value* m_start, + llvm::Value* m_end); - // This emits the inner reduction loop. This inner reduction loop multiplies - // a tile from the LHS of size [tile_size_m,tile_size_k] and a tile from the - // RHS of size [`tile_size_k`, vls->vector_width()] to update a tile of size - // [`tile_size_m`, vls->vector_width()] in the result. 
- void EmitTiledReductionLoop(VectorSupportLibrary* vsl, int64 tile_size_k, - llvm::Value* k_start, llvm::Value* k_end, - llvm::Value* n_start, llvm::Value* n_end, - int64 tile_size_m, llvm::Value* m_start, - llvm::Value* m_end); - - llvm::Value* GetInt64(int64 value) { return ir_builder_->getInt64(value); } + llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); } Config config() const { return config_; } Dimensions dims() const { return config().dims(); } @@ -763,6 +762,7 @@ class MatrixMatrixBlockPanelEmitter { int64 max_vectorization_width() const { return config().max_vectorization_width(); } + int64 max_vector_count() const { return config().max_vector_count(); } int64 min_vectorization_width() const { return config().min_vectorization_width(); } @@ -775,74 +775,90 @@ class MatrixMatrixBlockPanelEmitter { llvm::Value* result_; Config config_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; KernelSupportLibrary ksl_; }; -void MatrixMatrixBlockPanelEmitter::Emit() { EmitLoopOverN(); } +void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); } -void MatrixMatrixBlockPanelEmitter::EmitLoopOverN() { +void TiledSmallGemmEmitter::HandleResiduesOnN() { // We can only iterate the `n` dimension for an extent that is divisible by // the vectorization width. So we emit an outer loop that first processes the // largest extent in `n` that is divisible by max_vectorization_width, then // the largest remaining extent that is divisible by max_vectorization_width / // 2 etc. 
- int64 current_vectorization_width = max_vectorization_width(); + int64 current_vectorization_width = + max_vector_count() * max_vectorization_width(); + int64 current_vector_count = max_vector_count(); + int64 n_start = 0; while (n_start != dims().n() && current_vectorization_width >= min_vectorization_width()) { int64 n_end = dims().n() - (dims().n() % current_vectorization_width); if (n_start != n_end) { - VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, - ir_builder_, "gebp"); - EmitLoopOverK(&vsl, GetInt64(n_start), GetInt64(n_end)); + VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_, + "gemm"); + HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end)); n_start = n_end; } - current_vectorization_width /= 2; + if (current_vector_count == 1) { + current_vectorization_width /= 2; + } else { + current_vector_count--; + current_vectorization_width = + current_vector_count * max_vectorization_width(); + } } if (n_start != dims().n()) { - VectorSupportLibrary vsl(scalar_type(), 1, ir_builder_, "gebp"); - ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) { - llvm::Value* n_i_next = - ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1)); - EmitLoopOverK(&vsl, n_i, n_i_next); + VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm"); + ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) { + llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1)); + HandleResiduesOnK(&vsl, n_i, n_i_next); }); } } -void MatrixMatrixBlockPanelEmitter::EmitLoopOverK(VectorSupportLibrary* vsl, - llvm::Value* n_start, - llvm::Value* n_end) { +void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl, + llvm::Value* n_start, + llvm::Value* n_end) { int64 k_start = 0; int64 k_end = dims().k() - (dims().k() % tile_size_k()); if (k_end != k_start) { - EmitLoopOverM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end), - n_start, n_end); + HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), 
GetInt64(k_end), + n_start, n_end); k_start = k_end; } if (k_start != dims().k()) { - EmitLoopOverM(vsl, dims().k() - k_start, GetInt64(k_start), - GetInt64(dims().k()), n_start, n_end); + HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start), + GetInt64(dims().k()), n_start, n_end); } } -void MatrixMatrixBlockPanelEmitter::EmitLoopOverM( +void TiledSmallGemmEmitter::HandleResiduesOnM( VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start, llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) { const int64 m_end = dims().m() - dims().m() % tile_size_m(); - EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end, - tile_size_m(), GetInt64(0), GetInt64(m_end)); + EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(), + GetInt64(0), GetInt64(m_end)); if (m_end != dims().m()) { - EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end, - dims().m() - m_end, GetInt64(m_end), - GetInt64(dims().m())); + EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, + dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m())); } } +// The loop structure is: +// +// Iterate over dimension M as m: +// Iterate over dimension N as n: +// Iterate over dimension K as k: +// OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n]) +// +// I.e. a just a tiled version of a "naive" GEMM. +// // The tiling scheme is as follows: // // Let the LHS be: @@ -904,41 +920,48 @@ void MatrixMatrixBlockPanelEmitter::EmitLoopOverM( // +-------------------+-------------------+-------------------+--------- // | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 | ... 
// +-------------------+-------------------+-------------------+--------- -void MatrixMatrixBlockPanelEmitter::EmitTiledReductionLoop( +void TiledSmallGemmEmitter::EmitTiledGemm( VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start, llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end, int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) { - ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) { - MemoryTile result_memory_tile(vsl, ir_builder_, /*matrix=*/result_, - /*matrix_size_along_minor_dim=*/dims().n(), - /*major_dim_offset=*/m_i, - /*tile_size_along_major_dim=*/tile_size_m); - MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_, - /*matrix_size_along_minor_dim=*/dims().k(), - /*major_dim_offset=*/m_i, - /*tile_size_along_major_dim=*/tile_size_m); - - ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) { - MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_, dims().n(), k_i, - tile_size_k); - std::vector> lhs_tile = - lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k); - ksl_.For( - "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) { - std::vector rhs_tile = rhs_memory_tile.LoadTile(n_i); - std::vector result_tile = - result_memory_tile.LoadTile(n_i); - for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) { - for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) { - result_tile[r_m_i] = - vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i], - result_tile[r_m_i]); - } - } - result_memory_tile.StoreTile(result_tile, n_i); - }); - }); - }); + ksl_.ForReturnVoid( + "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) { + MemoryTile result_memory_tile( + vsl, b_, /*matrix=*/result_, + /*matrix_size_along_minor_dim=*/dims().n(), + /*major_dim_offset=*/m_i, + /*tile_size_along_major_dim=*/tile_size_m); + MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_, + /*matrix_size_along_minor_dim=*/dims().k(), + /*major_dim_offset=*/m_i, + /*tile_size_along_major_dim=*/tile_size_m); + 
ksl_.ForReturnVoid( + "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) { + TileVariable result_tile_var(vsl, + result_memory_tile.LoadTile(n_i)); + ksl_.ForReturnVoid( + "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) { + MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i, + tile_size_k); + std::vector> lhs_tile = + lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k); + std::vector rhs_tile = + rhs_memory_tile.LoadTile(n_i); + std::vector result_tile = + result_tile_var.Get(); + for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) { + for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) { + result_tile[r_m_i] = + vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i], + result_tile[r_m_i]); + } + } + result_tile_var.Set(result_tile); + }); + + result_memory_tile.StoreTile(result_tile_var.Get(), n_i); + }); + }); } } // namespace @@ -949,7 +972,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* ir_builder, + llvm::IRBuilder<>* b, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features) : dot_(dot), @@ -958,7 +981,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, rhs_array_(rhs_array), addend_array_(addend_array), executable_run_options_value_(executable_run_options_value), - ir_builder_(ir_builder), + b_(b), hlo_module_config_(hlo_module_config), target_machine_features_(target_machine_features) {} @@ -966,24 +989,33 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, const HloInstruction& dot, const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, - llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder, + llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, const HloModuleConfig& hlo_module_config, const 
TargetMachineFeatures& target_machine_features) { PrimitiveType type = target_array.GetShape().element_type(); TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type); DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array, - addend_array, executable_run_options_value, - ir_builder, hlo_module_config, - target_machine_features); + addend_array, executable_run_options_value, b, + hlo_module_config, target_machine_features); return dot_emitter.Emit(); } -bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled( +bool DotOpEmitter::EmitSmallGemmIfProfitable( const DotOpEmitter::MatMultDims& mat_mult_dims) { - if (!EnableExperimentalLlvmIrGemm() || ShouldUseMultiThreadedEigen()) { + if (ShouldUseMultiThreadedEigen()) { return false; } + if (!EnableExperimentalLlvmIrGemm()) { + // TODO(sanjoy): We should make these numbers micro-arch specific. + bool small_gemm = mat_mult_dims.k <= 128 && + ((mat_mult_dims.m <= 32 && mat_mult_dims.n <= 128) || + (mat_mult_dims.m <= 128 && mat_mult_dims.n <= 32)); + if (!small_gemm) { + return false; + } + } + if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) { return false; } @@ -1019,38 +1051,43 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled( } int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type); - ir_builder_->CreateMemSet( - target, ir_builder_->getInt8(0), size_bytes, + b_->CreateMemSet( + target, b_->getInt8(0), size_bytes, target_machine_features_.minimum_alignment_for_allocation(size_bytes)); - int64 max_vector_width = + int64 max_target_vector_width = target_machine_features_.vector_register_num_elements( - *ir_builder_->GetInsertBlock()->getParent(), primitive_type); + *b_->GetInsertBlock()->getParent(), primitive_type); + + int64 tile_size_m, tile_size_k, tile_size_n_in_vector_width; + std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) = + GetGemmTileSize(); - MatrixMatrixBlockPanelEmitter::Config config( + TiledSmallGemmEmitter::Config 
config( /*scalar_type=*/primitive_type, - MatrixMatrixBlockPanelEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n}, - /*max_vectorization_width=*/max_vector_width, - /*min_vectorization_width=*/std::min(4, max_vector_width), - /*tile_size_m=*/3, /*tile_size_k=*/5); + TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n}, + /*max_vectorization_width=*/max_target_vector_width, + /*max_vector_count=*/tile_size_n_in_vector_width, + /*min_vectorization_width=*/std::min(4, max_target_vector_width), + /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k); - VLOG(2) << "Emitting GEBP kernel in LLVM IR with config " + VLOG(2) << "Emitting GEMM kernel in LLVM IR with config " << config.GetCacheKey(); const bool enable_fast_math = - hlo_module_config_.debug_options().xla_enable_fast_math(); + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(); const bool optimize_for_size = options::OptimizeForSizeRequested(hlo_module_config_); KernelSupportLibrary::EmitAndCallOutlinedKernel( /*enable_fast_math=*/enable_fast_math, - /*optimize_for_size=*/optimize_for_size, ir_builder_, - config.GetCacheKey(), lhs, rhs, target, + /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs, + rhs, target, [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) { - MatrixMatrixBlockPanelEmitter gebp_emitter( - config, /*lhs=*/lhs, /*rhs=*/rhs, - /*result=*/target, ir_builder_); - gebp_emitter.Emit(); + TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs, + /*rhs=*/rhs, + /*result=*/target, b_); + small_gemm_emitter.Emit(); }); return true; @@ -1108,7 +1145,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { } if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) { - return EmitExperimentalGebpDotIfEnabled(mat_mult_dims); + return EmitSmallGemmIfProfitable(mat_mult_dims); } int64 tiling_factor = GetGemvTilingFactor(); @@ -1121,13 +1158,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { swap_operands ? 
lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer(); const bool enable_fast_math = - hlo_module_config_.debug_options().xla_enable_fast_math(); + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(); const bool optimize_for_size = options::OptimizeForSizeRequested(hlo_module_config_); const int target_vector_register_element_size = target_machine_features_.vector_register_num_elements( - *ir_builder_->GetInsertBlock()->getParent(), primitive_type); + *b_->GetInsertBlock()->getParent(), primitive_type); // We may not always know the vector register size for the target we're // compiling against, in which case target_vector_register_element_size is 0. @@ -1148,13 +1185,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { KernelSupportLibrary::EmitAndCallOutlinedKernel( /*enable_fast_math=*/enable_fast_math, - /*optimize_for_size=*/optimize_for_size, ir_builder_, - config.GetCacheKey(), lhs_op, rhs_op, + /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), + lhs_op, rhs_op, addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op, [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op, llvm::Value* result_op) { ColumnMajorMatrixVectorProductEmitter emitter( - config, lhs_op, rhs_op, addend_op, result_op, ir_builder_); + config, lhs_op, rhs_op, addend_op, result_op, b_); emitter.Emit(); }); } else { @@ -1167,13 +1204,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { KernelSupportLibrary::EmitAndCallOutlinedKernel( /*enable_fast_math=*/enable_fast_math, - /*optimize_for_size=*/optimize_for_size, ir_builder_, - config.GetCacheKey(), lhs_op, rhs_op, + /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), + lhs_op, rhs_op, addend_array_ ? 
addend_array_->GetBasePointer() : nullptr, result_op, [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op, llvm::Value* result_op) { - RowMajorMatrixVectorProductEmitter emitter( - config, lhs_op, rhs_op, addend_op, result_op, ir_builder_); + RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op, + addend_op, result_op, b_); emitter.Emit(); }); } @@ -1249,11 +1286,11 @@ Status DotOpEmitter::Emit() { // Create loop nests which loop through the LHS operand dimensions and the RHS // operand dimensions. The reduction dimension of the LHS and RHS are handled // in a separate innermost loop which performs the sum of products. - llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), ir_builder_); - llvm_ir::IrArray::Index lhs_index = EmitOperandArrayLoopNest( - &loop_nest, lhs_array_, lhs_reduction_dimension, "lhs"); - llvm_ir::IrArray::Index rhs_index = EmitOperandArrayLoopNest( - &loop_nest, rhs_array_, rhs_reduction_dimension, "rhs"); + llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_); + llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest( + lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs"); + llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest( + rhs_array_, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs"); // Create the loop which does the sum of products reduction. // @@ -1265,8 +1302,11 @@ Status DotOpEmitter::Emit() { // from messing up the vectorization. std::unique_ptr reduction_loop = loop_nest.AddLoop( 0, lhs_shape.dimensions(lhs_reduction_dimension), "reduction", - /*prevent_unrolling=*/lhs_reduction_along_minor_dimension && - rhs_reduction_along_minor_dimension); + /*unroll_mode=*/ + (lhs_reduction_along_minor_dimension && + rhs_reduction_along_minor_dimension) + ? xla::llvm_ir::UnrollMode::kNoUnroll + : xla::llvm_ir::UnrollMode::kDefaultUnroll); // The final entry in the rhs and lhs indexes is the indvar of the // reduction loop. 
@@ -1280,68 +1320,61 @@ Status DotOpEmitter::Emit() { // Function entry basic block. // - Emit alloca for accumulator llvm::Function* func = reduction_loop->GetPreheaderBasicBlock()->getParent(); - SetToFirstInsertPoint(&func->getEntryBlock(), ir_builder_); + SetToFirstInsertPoint(&func->getEntryBlock(), b_); llvm::Type* accum_type = target_array_.GetElementLlvmType(); - llvm::Value* accum_address = ir_builder_->CreateAlloca( - accum_type, /*ArraySize=*/nullptr, "accum_address"); + llvm::Value* accum_address = + b_->CreateAlloca(accum_type, /*ArraySize=*/nullptr, "accum_address"); // Preheader basic block of reduction loop: // - Initialize accumulator to zero. llvm::BasicBlock* preheader_bb = reduction_loop->GetPreheaderBasicBlock(); - ir_builder_->SetInsertPoint(preheader_bb->getTerminator()); + b_->SetInsertPoint(preheader_bb->getTerminator()); - ir_builder_->CreateStore(llvm::Constant::getNullValue(accum_type), - accum_address); + b_->CreateStore(llvm::Constant::getNullValue(accum_type), accum_address); // Body basic block of reduction loop: // - Load elements from lhs and rhs array. // - Multiply lhs-element and rhs-element. // - Load accumulator and add to product. // - Store sum back into accumulator. 
- SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), ir_builder_); + SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), b_); - llvm::Value* lhs_element = - lhs_array_.EmitReadArrayElement(lhs_index, ir_builder_); - llvm::Value* rhs_element = - rhs_array_.EmitReadArrayElement(rhs_index, ir_builder_); + llvm::Value* lhs_element = lhs_array_.EmitReadArrayElement(lhs_index, b_); + llvm::Value* rhs_element = rhs_array_.EmitReadArrayElement(rhs_index, b_); - llvm::Value* accum = ir_builder_->CreateLoad(accum_address); + llvm::Value* accum = b_->CreateLoad(accum_address); llvm::Value* updated_accum; if (ShapeUtil::ElementIsComplex(lhs_shape)) { - auto real = [&](llvm::Value* x) { - return ir_builder_->CreateExtractValue(x, {0}); - }; - auto imag = [&](llvm::Value* x) { - return ir_builder_->CreateExtractValue(x, {1}); - }; - llvm::Value* product_real = ir_builder_->CreateFSub( - ir_builder_->CreateFMul(real(lhs_element), real(rhs_element)), - ir_builder_->CreateFMul(imag(lhs_element), imag(rhs_element))); - llvm::Value* product_imag = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(real(lhs_element), imag(rhs_element)), - ir_builder_->CreateFMul(imag(lhs_element), real(rhs_element))); - updated_accum = ir_builder_->CreateInsertValue( - accum, ir_builder_->CreateFAdd(real(accum), product_real), {0}); - updated_accum = ir_builder_->CreateInsertValue( - updated_accum, ir_builder_->CreateFAdd(imag(accum), product_imag), {1}); + auto real = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {0}); }; + auto imag = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {1}); }; + llvm::Value* product_real = + b_->CreateFSub(b_->CreateFMul(real(lhs_element), real(rhs_element)), + b_->CreateFMul(imag(lhs_element), imag(rhs_element))); + llvm::Value* product_imag = + b_->CreateFAdd(b_->CreateFMul(real(lhs_element), imag(rhs_element)), + b_->CreateFMul(imag(lhs_element), real(rhs_element))); + updated_accum = b_->CreateInsertValue( + accum, 
b_->CreateFAdd(real(accum), product_real), {0}); + updated_accum = b_->CreateInsertValue( + updated_accum, b_->CreateFAdd(imag(accum), product_imag), {1}); } else { - llvm::Value* product = ir_builder_->CreateFMul(lhs_element, rhs_element); - updated_accum = ir_builder_->CreateFAdd(accum, product); + llvm::Value* product = b_->CreateFMul(lhs_element, rhs_element); + updated_accum = b_->CreateFAdd(accum, product); } - ir_builder_->CreateStore(updated_accum, accum_address); + b_->CreateStore(updated_accum, accum_address); // Exit basic block of reduction loop. // - Load accumulator value (the result). // - Store into output array. - SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), ir_builder_); + SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), b_); - llvm::Value* result = ir_builder_->CreateLoad(accum_address); + llvm::Value* result = b_->CreateLoad(accum_address); // Create index into target address. The target index is the concatenation of // the rhs and lhs indexes with the reduction dimensions removed. The terms // from the rhs index are the lower dimensions in the index so we add them // first. - llvm_ir::IrArray::Index target_index; + llvm_ir::IrArray::Index target_index(lhs_index.GetType()); for (int dimension = 0; dimension < lhs_index.size(); ++dimension) { if (dimension != lhs_reduction_dimension) { target_index.push_back(lhs_index[dimension]); @@ -1353,11 +1386,11 @@ Status DotOpEmitter::Emit() { } } - target_array_.EmitWriteArrayElement(target_index, result, ir_builder_); + target_array_.EmitWriteArrayElement(target_index, result, b_); // Set the IR builder insert point to the exit basic block of the outer most // loop. - ir_builder_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); + b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); return Status::OK(); } @@ -1365,28 +1398,31 @@ Status DotOpEmitter::Emit() { Status DotOpEmitter::EmitScalarDot() { // A scalar dot is just a scalar multiply. 
llvm::Value* result; + // Use the same index_type for all tensor accesses in the same kernel. + llvm::Type* index_type = b_->getInt64Ty(); + llvm_ir::IrArray::Index element_index(index_type); llvm::Value* lhs_value = - lhs_array_.EmitReadArrayElement(/*index=*/{}, ir_builder_); + lhs_array_.EmitReadArrayElement(/*index=*/element_index, b_); llvm::Value* rhs_value = - rhs_array_.EmitReadArrayElement(/*index=*/{}, ir_builder_); + rhs_array_.EmitReadArrayElement(/*index=*/element_index, b_); if (ShapeUtil::ElementIsComplex(lhs_array_.GetShape())) { -#define REAL(x) ir_builder_->CreateExtractValue(x, {0}) -#define IMAG(x) ir_builder_->CreateExtractValue(x, {1}) - llvm::Value* real = ir_builder_->CreateFSub( - ir_builder_->CreateFMul(REAL(lhs_value), REAL(rhs_value)), - ir_builder_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value))); - llvm::Value* imag = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)), - ir_builder_->CreateFMul(IMAG(lhs_value), REAL(rhs_value))); +#define REAL(x) b_->CreateExtractValue(x, {0}) +#define IMAG(x) b_->CreateExtractValue(x, {1}) + llvm::Value* real = + b_->CreateFSub(b_->CreateFMul(REAL(lhs_value), REAL(rhs_value)), + b_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value))); + llvm::Value* imag = + b_->CreateFAdd(b_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)), + b_->CreateFMul(IMAG(lhs_value), REAL(rhs_value))); #undef IMAG #undef REAL result = llvm::ConstantAggregateZero::get(lhs_array_.GetElementLlvmType()); - result = ir_builder_->CreateInsertValue(result, real, {0}); - result = ir_builder_->CreateInsertValue(result, imag, {1}); + result = b_->CreateInsertValue(result, real, {0}); + result = b_->CreateInsertValue(result, imag, {1}); } else { - result = ir_builder_->CreateFMul(lhs_value, rhs_value); + result = b_->CreateFMul(lhs_value, rhs_value); } - target_array_.EmitWriteArrayElement(/*index=*/{}, result, ir_builder_); + target_array_.EmitWriteArrayElement(/*index=*/element_index, result, b_); return 
Status::OK(); } @@ -1409,7 +1445,7 @@ Status DotOpEmitter::EmitCallToRuntime() { fn_name = multi_threaded ? runtime::kEigenMatMulF16SymbolName : runtime::kEigenSingleThreadedMatMulF16SymbolName; - float_type = ir_builder_->getHalfTy(); + float_type = b_->getHalfTy(); break; case F32: fn_name = multi_threaded @@ -1418,7 +1454,7 @@ Status DotOpEmitter::EmitCallToRuntime() { : (use_mkl_dnn ? runtime::kMKLSingleThreadedMatMulF32SymbolName : runtime::kEigenSingleThreadedMatMulF32SymbolName); - float_type = ir_builder_->getFloatTy(); + float_type = b_->getFloatTy(); break; case F64: fn_name = multi_threaded @@ -1427,24 +1463,24 @@ Status DotOpEmitter::EmitCallToRuntime() { : (use_mkl_dnn ? runtime::kMKLSingleThreadedMatMulF64SymbolName : runtime::kEigenSingleThreadedMatMulF64SymbolName); - float_type = ir_builder_->getDoubleTy(); + float_type = b_->getDoubleTy(); break; default: return Unimplemented("Invalid type %s for dot operation", - PrimitiveType_Name(type).c_str()); + PrimitiveType_Name(type)); } llvm::Type* float_ptr_type = float_type->getPointerTo(); - llvm::Type* int64_type = ir_builder_->getInt64Ty(); - llvm::Type* int32_type = ir_builder_->getInt32Ty(); - llvm::Type* int8_ptr_type = ir_builder_->getInt8Ty()->getPointerTo(); + llvm::Type* int64_type = b_->getInt64Ty(); + llvm::Type* int32_type = b_->getInt32Ty(); + llvm::Type* int8_ptr_type = b_->getInt8Ty()->getPointerTo(); llvm::FunctionType* matmul_type = llvm::FunctionType::get( - ir_builder_->getVoidTy(), + b_->getVoidTy(), {int8_ptr_type, float_ptr_type, float_ptr_type, float_ptr_type, int64_type, int64_type, int64_type, int32_type, int32_type}, /*isVarArg=*/false); - llvm::Function* function = ir_builder_->GetInsertBlock()->getParent(); + llvm::Function* function = b_->GetInsertBlock()->getParent(); llvm::Module* module = function->getParent(); llvm::Function* matmul_func = llvm::cast( @@ -1479,18 +1515,15 @@ Status DotOpEmitter::EmitCallToRuntime() { std::swap(transpose_lhs, transpose_rhs); } - 
ir_builder_->CreateCall( + b_->CreateCall( matmul_func, - {ir_builder_->CreateBitCast(executable_run_options_value_, int8_ptr_type), - ir_builder_->CreateBitCast(target_array_.GetBasePointer(), - float_ptr_type), - ir_builder_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type), - ir_builder_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type), - ir_builder_->getInt64(mat_mult_dims.m), - ir_builder_->getInt64(mat_mult_dims.n), - ir_builder_->getInt64(mat_mult_dims.k), - ir_builder_->getInt32(transpose_lhs), - ir_builder_->getInt32(transpose_rhs)}); + {b_->CreateBitCast(executable_run_options_value_, int8_ptr_type), + b_->CreateBitCast(target_array_.GetBasePointer(), float_ptr_type), + b_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type), + b_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type), + b_->getInt64(mat_mult_dims.m), b_->getInt64(mat_mult_dims.n), + b_->getInt64(mat_mult_dims.k), b_->getInt32(transpose_lhs), + b_->getInt32(transpose_rhs)}); return Status::OK(); } @@ -1513,36 +1546,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const { LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0}; } -llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest( - llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array, - int64 reduction_dimension, tensorflow::StringPiece name_suffix) { - // Prepares the dimension list we will use to emit the loop nest. Outermost - // loops are added first. Add loops in major-to-minor order, and skip the - // reduction dimension. - std::vector dimensions; - const Shape& shape = operand_array.GetShape(); - for (int i = LayoutUtil::MinorToMajor(shape).size() - 1; i >= 0; --i) { - int64 dimension = LayoutUtil::Minor(shape.layout(), i); - if (dimension != reduction_dimension) { - dimensions.push_back(dimension); - } - } - - // Create loop nest with one for-loop for each dimension of the - // output. 
- llvm_ir::IrArray::Index index = - loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix); - // Verify every dimension except the reduction dimension was set in the index. - for (int dimension = 0; dimension < index.size(); ++dimension) { - if (dimension == reduction_dimension) { - DCHECK_EQ(nullptr, index[dimension]); - } else { - DCHECK_NE(nullptr, index[dimension]); - } - } - return index; -} - // Return whether the given shape is a matrix with no padding. static bool IsRank2WithNoPadding(const Shape& shape) { return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape); @@ -1588,8 +1591,8 @@ bool PotentiallyImplementedAsEigenDot( const Shape& lhs_shape = hlo.operand(0)->shape(); const Shape& rhs_shape = hlo.operand(1)->shape(); - if (ShapeUtil::HasZeroElements(lhs_shape) || - ShapeUtil::HasZeroElements(rhs_shape)) { + if (ShapeUtil::IsZeroElementArray(lhs_shape) || + ShapeUtil::IsZeroElementArray(rhs_shape)) { return false; } @@ -1616,7 +1619,7 @@ bool PotentiallyImplementedAsEigenDot( // For vector-matrix dot products, it is always profitable to make the Rhs // column major. -tensorflow::gtl::optional ProfitableToMakeDotOperandColumnMajor( +absl::optional ProfitableToMakeDotOperandColumnMajor( const HloInstruction& hlo) { if (hlo.opcode() == HloOpcode::kDot && hlo.shape().dimensions_size() == 2 && hlo.shape().dimensions(0) == 1) { diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h index d88ccea0dbc845c0d9a580a5b118c57c888fb557..4c2041b556aa8bf8fe8fb8e0674c0f4f04f0acae 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_ +#include "absl/strings/string_view.h" #include "llvm/IR/IRBuilder.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" @@ -25,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -38,7 +38,7 @@ bool PotentiallyImplementedAsEigenDot( // Returns the index for an operand to `hlo` that should ideally be column // major. Returns nullopt if there is no such operand or if `hlo` is not a dot // or a fusion containing a dot. -tensorflow::gtl::optional ProfitableToMakeDotOperandColumnMajor( +absl::optional ProfitableToMakeDotOperandColumnMajor( const HloInstruction& hlo); // Returns true to indicate that we can generate a tiled LLVM IR implementation @@ -61,7 +61,7 @@ class DotOpEmitter { const HloInstruction& dot, const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, - llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder, + llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features); @@ -70,8 +70,7 @@ class DotOpEmitter { const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, - llvm::Value* executable_run_options_value, - llvm::IRBuilder<>* ir_builder, + llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features); @@ -89,17 +88,6 @@ class DotOpEmitter { // 
Emits a call to the CPU runtime to perform the matrix multiply. Status EmitCallToRuntime(); - // Emits a series of nested loops for iterating over an operand array in the - // dot operation. Loops are constructed in major to minor dimension layout - // order. No loop is emitted for the given reduction_dimension. The function - // returns an IrArray index for the given operand_array containing the indvars - // of the loops. All dimensions of the index are filled except for the - // reduction dimension. name_suffix is the string to append to the names of - // LLVM constructs (eg, basic blocks) constructed by this method. - llvm_ir::IrArray::Index EmitOperandArrayLoopNest( - llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array, - int64 reduction_dimension, tensorflow::StringPiece name_suffix); - // Represents the dimensions of a matrix-matrix multiply operation. struct MatMultDims { // The number of rows in the LHS. @@ -133,7 +121,7 @@ class DotOpEmitter { // of rank 2 as well). MatMultDims GetMatMultDims() const; - bool EmitExperimentalGebpDotIfEnabled(const MatMultDims& mat_mult_dims); + bool EmitSmallGemmIfProfitable(const MatMultDims& mat_mult_dims); // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector // registers. @@ -143,6 +131,17 @@ class DotOpEmitter { .value_or(kDefaultTilingFactor); } + std::tuple GetGemmTileSize() const { + // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz + // + // TODO(b/80093688): Tune for other architectures and centralize this + // information in one place. + const std::tuple kDefaultTileSize = + std::tuple(11, 9, 1); + return options::LlvmIrGemmTileSize(hlo_module_config_) + .value_or(kDefaultTileSize); + } + // Returns true if we should use an experimental implementation of GEMM // (general matrix matrix multiplication) if possible. 
bool EnableExperimentalLlvmIrGemm() const { @@ -160,7 +159,7 @@ class DotOpEmitter { const llvm_ir::IrArray& rhs_array_; const llvm_ir::IrArray* addend_array_; llvm::Value* executable_run_options_value_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; const HloModuleConfig& hlo_module_config_; const TargetMachineFeatures& target_machine_features_; }; diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index e97113dfa0f59e791d614c0093d0781e49c48ee4..c8312d80bd5012e5bcb42a410db18a7fa77a2eb6 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -19,6 +19,8 @@ limitations under the License. #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" @@ -28,57 +30,16 @@ limitations under the License. namespace xla { namespace cpu { -StatusOr CpuElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { - switch (op->opcode()) { - case HloOpcode::kTanh: { - PrimitiveType element_type = op->shape().element_type(); - bool cast_result_to_fp16 = false; - string function_name; - switch (element_type) { - case F16: - cast_result_to_fp16 = true; - operand_value = ir_builder_->CreateFPCast(operand_value, - ir_builder_->getFloatTy()); - TF_FALLTHROUGH_INTENDED; - case F32: - function_name = "tanhf"; - break; - case F64: - function_name = "tanh"; - break; - default: - return Unimplemented("tanh"); - } - // Create a function declaration. 
- llvm::Function* function = - llvm::cast(module_->getOrInsertFunction( - llvm_ir::AsStringRef(function_name), operand_value->getType(), - operand_value->getType())); - function->setCallingConv(llvm::CallingConv::C); - function->setDoesNotThrow(); - function->setDoesNotAccessMemory(); - // Create an instruction to call the function. - llvm::Value* result = ir_builder_->CreateCall(function, operand_value); - if (cast_result_to_fp16) { - result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy()); - } - return result; - } - default: - return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value); - } -} - -StatusOr CpuElementalIrEmitter::EmitAtan2( - PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { +StatusOr CpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, + llvm::Value* lhs, + llvm::Value* rhs) { string function_name; bool cast_result_to_fp16 = false; switch (prim_type) { case F16: cast_result_to_fp16 = true; - lhs = ir_builder_->CreateFPCast(lhs, ir_builder_->getFloatTy()); - rhs = ir_builder_->CreateFPCast(rhs, ir_builder_->getFloatTy()); + lhs = FPCast(lhs, b_->getFloatTy()); + rhs = FPCast(rhs, b_->getFloatTy()); TF_FALLTHROUGH_INTENDED; case F32: function_name = "atan2f"; @@ -98,16 +59,49 @@ StatusOr CpuElementalIrEmitter::EmitAtan2( function->setDoesNotThrow(); function->setDoesNotAccessMemory(); // Create an instruction to call the function. 
- llvm::Value* result = ir_builder_->CreateCall(function, {lhs, rhs}); + llvm::Value* result = Call(function, {lhs, rhs}); + if (cast_result_to_fp16) { + result = FPCast(result, b_->getHalfTy()); + } + return result; +} + +StatusOr CpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) { + bool cast_result_to_fp16 = false; + string function_name; + switch (prim_type) { + case F16: + cast_result_to_fp16 = true; + value = FPCast(value, b_->getFloatTy()); + TF_FALLTHROUGH_INTENDED; + case F32: + function_name = "tanhf"; + break; + case F64: + function_name = "tanh"; + break; + default: + return Unimplemented("tanh"); + } + // Create a function declaration. + llvm::Function* function = llvm::cast( + module_->getOrInsertFunction(llvm_ir::AsStringRef(function_name), + value->getType(), value->getType())); + function->setCallingConv(llvm::CallingConv::C); + function->setDoesNotThrow(); + function->setDoesNotAccessMemory(); + // Create an instruction to call the function. 
+ llvm::Value* result = Call(function, value); if (cast_result_to_fp16) { - result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy()); + result = FPCast(result, b_->getHalfTy()); } return result; } llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const { + const HloToElementGeneratorMap& operand_to_generator) { if (hlo->opcode() == HloOpcode::kMap) { return [this, hlo, &operand_to_generator]( const llvm_ir::IrArray::Index& index) -> StatusOr { @@ -118,9 +112,8 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( ElementwiseSourceIndex(index, *hlo, i))); operands.push_back(operand_value); } - return ir_emitter_->EmitScalarCall(hlo->shape().element_type(), - hlo->to_apply(), operands, - llvm_ir::IrName(hlo)); + return ir_emitter_->EmitElementalMap(*Cast(hlo), + operands, llvm_ir::IrName(hlo)); }; } return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index 4446dfd2821fb4b6e75f33694367392ecbcdd8bf..e3fba9306b72904803259047fafea245a8e183db 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -31,18 +31,18 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { public: CpuElementalIrEmitter(const HloModuleConfig& module_config, IrEmitter* ir_emitter, llvm::Module* module) - : ElementalIrEmitter(module_config, module, ir_emitter->ir_builder()), + : ElementalIrEmitter(module_config, module, ir_emitter->b()), ir_emitter_(ir_emitter) {} llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const override; + const HloToElementGeneratorMap& operand_to_generator) override; protected: - StatusOr EmitFloatUnaryOp( - const 
HloInstruction* op, llvm::Value* operand_value) const override; StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; + StatusOr EmitTanh(PrimitiveType prim_type, + llvm::Value* value) override; IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc deleted file mode 100644 index c56286559158758ca6db5ae097729286bde346f0..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h" - -#include -#include -#include - -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/core/lib/gtl/flatset.h" - -namespace xla { -namespace cpu { -void ExternalConstantPool::Insert(string name, const LiteralSlice& literal, - int64 alignment) { - CHECK(!ShapeUtil::IsTuple(literal.shape())); - CHECK(alignment > 0 && IsPowerOfTwo(static_cast(alignment))); - CHECK(entries_.find(name) == entries_.end()); - - const int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape()); - void* raw_pointer = tensorflow::port::AlignedMalloc( - literal_size, std::max(alignment, sizeof(void*))); - CHECK(raw_pointer != nullptr) << "failed to allocate " << literal_size - << " bytes with alignment of " << alignment; - - std::memcpy(raw_pointer, literal.untyped_data(), literal_size); - entries_.emplace(std::move(name), static_cast(raw_pointer)); -} - -const uint8* ExternalConstantPool::Find(const string& name) { - auto it = entries_.find(name); - return it == entries_.end() ? nullptr : it->second.get(); -} -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h deleted file mode 100644 index 0677f5f0b58005079890052a426e5f48c5d09ed1..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_ - -#include - -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/core/lib/gtl/flatmap.h" -#include "tensorflow/core/platform/mem.h" - -namespace xla { -namespace cpu { -// An ExternalConstantPool maintains a set of constants kept external to -// generated LLVM IR. These constants are accessed from the IR via globals with -// extern linkage. This current incarnation of ExternalConstantPool only -// supports the JIT CPU backend; the AOT backend is not supported. -// -// Implementation-wise, this is a simple wrapper around a map of strings to byte -// buffers. This simply implementation works in a JIT scenario. This class -// will have to become smarter if we decide to support external constant pools -// on AOT compiles in the future. -class ExternalConstantPool { - public: - // Inserts a buffer with the contents of `literal` into the constant pool with - // the name `name`. It is an error to try to insert two constants with the - // same `name` into the same constant pool. The buffer for literal is aligned - // to `aligment` bytes, and `alignment` must be a power of 2. - // - // The constant pool copies out the contents of `literal` into a buffer it - // owns -- it does not keep pointers to `literal`, or to memory owned by - // `literal`. 
- void Insert(string name, const LiteralSlice& literal, int64 alignment); - - // Find the constant with name `name` in this constant pool. If there isn't - // such constant, return nullptr. - const uint8* Find(const string& name); - - private: - // We need to `AlignedFree` pointers allocated into `entries_` since we - // allocate them with `AlignedMalloc`. - struct FreeDeleter { - void operator()(void* ptr) { tensorflow::port::AlignedFree(ptr); } - }; - - tensorflow::gtl::FlatMap> - entries_; -}; -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_ diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool_test.cc b/tensorflow/compiler/xla/service/cpu/external_constant_pool_test.cc deleted file mode 100644 index 9290a4e5dfc03ddb86e9d82f1f0f4f9a8ceebb88..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/external_constant_pool_test.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h" -#include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/core/platform/test.h" - -namespace xla { -namespace cpu { -namespace { -class ExternalConstantPoolTest : public ::testing::Test {}; - -template -T GetFromBuffer(const uint8* buffer, int64 index) { - T result; - std::memcpy(&result, buffer + index * sizeof(T), sizeof(T)); - return result; -} - -TEST(ExternalConstantPoolTest, Basic) { - ExternalConstantPool constant_pool; - EXPECT_EQ(constant_pool.Find("name-0"), nullptr); - const auto literal = Literal::CreateR2({{1, 2}, {3, 4}}); - constant_pool.Insert("name-0", *literal, 4); - const uint8* constant = constant_pool.Find("name-0"); - ASSERT_NE(constant, nullptr); - - EXPECT_EQ(GetFromBuffer(constant, 0), 1); - EXPECT_EQ(GetFromBuffer(constant, 1), 2); - EXPECT_EQ(GetFromBuffer(constant, 2), 3); - EXPECT_EQ(GetFromBuffer(constant, 3), 4); - - EXPECT_EQ(constant_pool.Find("name-1"), nullptr); -} - -TEST(ExternalConstantPoolTest, RowMinorLayout) { - ExternalConstantPool constant_pool; - EXPECT_EQ(constant_pool.Find("name-0"), nullptr); - const auto literal = Literal::CreateR2WithLayout( - {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1})); - constant_pool.Insert("name-0", *literal, 4); - const uint8* constant = constant_pool.Find("name-0"); - ASSERT_NE(constant, nullptr); - - EXPECT_EQ(GetFromBuffer(constant, 0), 1); - EXPECT_EQ(GetFromBuffer(constant, 1), 3); - EXPECT_EQ(GetFromBuffer(constant, 2), 2); - EXPECT_EQ(GetFromBuffer(constant, 3), 4); -} - -TEST(ExternalConstantPoolTest, Alignment) { - ExternalConstantPool constant_pool; - EXPECT_EQ(constant_pool.Find("name-0"), nullptr); - - for (int i = 0; i < 8; i++) { - int64 alignment = 1 << i; - string name = tensorflow::strings::StrCat("name-", i); - - const auto literal = Literal::CreateR2({{1, 2}, {3, 4}}); - constant_pool.Insert(name, 
*literal, alignment); - - const uint8* constant = constant_pool.Find(name); - ASSERT_NE(constant, nullptr); - EXPECT_EQ(reinterpret_cast(constant) % alignment, 0); - } -} - -} // namespace -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc index b560b7531c0d24e6f670e61a15dce295d9fa2a49..1a8bedfe6afb4f096ddd4703c312b84d521a7ba5 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc @@ -64,8 +64,8 @@ bool PotentiallyImplementedAsEigenConvolution( return false; } - if (ShapeUtil::HasZeroElements(input_shape) || - ShapeUtil::HasZeroElements(kernel_shape)) { + if (ShapeUtil::IsZeroElementArray(input_shape) || + ShapeUtil::IsZeroElementArray(kernel_shape)) { return false; } // Make sure input and kernel has the same data type. diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc index abb2471e6ae6b2f2949ab2e91235e5047ae404f8..530ebce854fedf4e4db12139d5b56087b1176a6c 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -35,7 +35,7 @@ ENTRY Conv { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* entry_computation = module->entry_computation(); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 13bd5e73db500e20b0e8c33bf921ee2457e126e5..e5cf15c686157d837901fa912bdde2a7a5d501d9 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -27,6 +27,9 @@ limitations under the License. #include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/platform/logging.h" // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/types/span.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" @@ -48,11 +51,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/shape_partition.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/service/llvm_ir/ops.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -61,11 +67,8 @@ limitations under the License. #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -83,22 +86,25 @@ IrEmitter::IrEmitter( llvm::Module* llvm_module, std::unordered_map instruction_to_profile_idx, std::unordered_map computation_to_profile_idx, - const TargetMachineFeatures* target_machine_features, - ExternalConstantPool* external_constant_pool) + const TargetMachineFeatures* target_machine_features) : assignment_(assignment), module_(llvm_module), arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()), - ir_builder_(llvm_module->getContext()), + b_(llvm_module->getContext()), instruction_to_profile_idx_(std::move(instruction_to_profile_idx)), 
computation_to_profile_idx_(std::move(computation_to_profile_idx)), alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), hlo_module_config_(hlo_module.config()), is_top_level_computation_(false), - target_machine_features_(*target_machine_features), - external_constant_pool_(external_constant_pool) { - ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags( + target_machine_features_(*target_machine_features) { + b_.setFastMathFlags(llvm_ir::GetFastMathFlags( /*fast_math_enabled=*/hlo_module_config_.debug_options() - .xla_enable_fast_math())); + .xla_cpu_enable_fast_math())); + Status s = GatherComputationsByAllocationType( + &hlo_module, &thread_local_computations_, &global_computations_); + absl::c_sort(thread_local_computations_); + absl::c_sort(global_computations_); + TF_CHECK_OK(s) << "Should have failed buffer assignment."; } StatusOr IrEmitter::EmitComputation( @@ -115,6 +121,19 @@ StatusOr IrEmitter::EmitComputation( computation->root_instruction()->outer_dimension_partitions().size(); } + if (computation->root_instruction()->opcode() != HloOpcode::kOutfeed) { + TF_ASSIGN_OR_RETURN( + computation_root_allocation_, + assignment_.GetUniqueTopLevelSlice(computation->root_instruction())); + } + + for (const HloInstruction* param : computation->parameter_instructions()) { + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice param_slice, + assignment_.GetUniqueTopLevelSlice(param)); + computation_parameter_allocations_[param_slice.allocation()->index()] = + param->parameter_number(); + } + InitializeIrFunction(function_name); // The rdtscp instruction is x86 specific. We will fallback to LLVM's generic // readcyclecounter if it is unavailable. @@ -131,6 +150,8 @@ StatusOr IrEmitter::EmitComputation( // Delete 'compute_function', finalizing 'ir_function' and restoring caller // IR insert point. 
compute_function_.reset(); + computation_root_allocation_ = BufferAllocation::Slice(); + computation_parameter_allocations_.clear(); return ir_function; } @@ -142,11 +163,11 @@ void IrEmitter::InitializeIrFunction(const string& function_name) { is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage : llvm::GlobalValue::InternalLinkage; // Create and initialize new IrFunction. - compute_function_.reset( - new IrFunction(function_name, linkage, - options::OptimizeForSizeRequested(hlo_module_config_), - hlo_module_config_.debug_options().xla_enable_fast_math(), - module_, &ir_builder_, num_dynamic_loop_bounds_)); + compute_function_.reset(new IrFunction( + function_name, linkage, + options::OptimizeForSizeRequested(hlo_module_config_), + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(), module_, + &b_, num_dynamic_loop_bounds_)); } IrEmitter::~IrEmitter() {} @@ -154,66 +175,55 @@ IrEmitter::~IrEmitter() {} Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { VLOG(2) << "HandleBitcast: " << bitcast->ToString(); emitted_value_[bitcast] = - ir_builder_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)), - IrShapeType(bitcast->shape())->getPointerTo(), - AsStringRef(IrName(bitcast))); + BitCast(GetEmittedValueFor(bitcast->operand(0)), + IrShapeType(bitcast->shape())->getPointerTo(), + AsStringRef(IrName(bitcast))); return Status::OK(); } -llvm::GlobalVariable* IrEmitter::EmitGlobalForLiteral(const Literal& literal) { - llvm::GlobalVariable* result; - - // We avoid creating large constants in the LLVM IR since LLVM is not - // efficient for large constant arrays. We still emit "small enough" constant - // arrays into the Ir, in the off chance the LLVM optimizer can do something - // interesting with it. 
- const int kMaxInternalConstantSizeInBytes = 128; - if (external_constant_pool_ && - ByteSizeOf(literal.shape()) >= kMaxInternalConstantSizeInBytes) { - string global_name = tensorflow::strings::StrCat( - "constant_global_", external_global_constant_counter_++); - result = new llvm::GlobalVariable( - /*Module=*/*module_, - /*Type=*/IrShapeType(literal.shape()), - /*isConstant=*/true, - /*Linkage=*/llvm::GlobalValue::ExternalLinkage, - /*Initializer=*/nullptr, - /*Name=*/AsStringRef(global_name)); - result->setAlignment(MinimumAlignmentForShape(literal.shape())); - external_constant_pool_->Insert(global_name, literal, - MinimumAlignmentForShape(literal.shape())); - } else { - llvm::Constant* initializer = - llvm_ir::ConvertLiteralToIrConstant(literal, module_); - result = new llvm::GlobalVariable( - /*Module=*/*module_, - /*Type=*/initializer->getType(), - /*isConstant=*/true, - /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/initializer, - /*Name=*/""); - result->setAlignment(MinimumAlignmentForShape(literal.shape())); +llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) { + llvm::Constant* initializer = + llvm_ir::ConvertLiteralToIrConstant(literal, module_); + llvm::GlobalVariable* result_global = new llvm::GlobalVariable( + /*Module=*/*module_, + /*Type=*/initializer->getType(), + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/initializer, + /*Name=*/""); + result_global->setAlignment(MinimumAlignmentForShape(literal.shape())); + return llvm::ConstantExpr::getBitCast( + result_global, IrShapeType(literal.shape())->getPointerTo()); +} + +Status IrEmitter::EmitConstantGlobals() { + for (const BufferAllocation& allocation : assignment_.Allocations()) { + if (!allocation.is_constant()) { + continue; + } + + const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation); + llvm::Constant* global_for_const; + auto it = emitted_literals_.find(&literal); + if (it != 
emitted_literals_.end()) { + global_for_const = it->second; + } else { + global_for_const = EmitGlobalForLiteral(literal); + InsertOrDie(&emitted_literals_, &literal, global_for_const); + } + + InsertOrDie(&constant_buffer_to_global_, allocation.index(), + global_for_const); } - return result; + + return Status::OK(); } Status IrEmitter::HandleConstant(HloInstruction* constant) { VLOG(2) << "HandleConstant: " << constant->ToString(); - const Literal& literal = constant->literal(); - llvm::GlobalVariable* global_for_const; - - auto it = emitted_literals_.find(&literal); - if (it != emitted_literals_.end()) { - global_for_const = it->second; - } else { - global_for_const = EmitGlobalForLiteral(literal); - emitted_literals_[&literal] = global_for_const; - } - emitted_value_[constant] = global_for_const; - VLOG(2) << " emitted value: " << llvm_ir::DumpToString(*global_for_const); - VLOG(2) << " its type: " - << llvm_ir::DumpToString(*global_for_const->getType()); - return Status::OK(); + // IrEmitter::EmitConstantGlobals has already taken care of emitting the body + // of the constant. + return EmitTargetAddressForOp(constant); } Status IrEmitter::HandleCopy(HloInstruction* copy) { @@ -221,10 +231,12 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) { // kCopy shallow copies a tuple so just memcpy the top-level buffer. TF_RETURN_IF_ERROR(EmitTargetAddressForOp(copy)); return EmitMemcpy(*(copy->operand(0)), *copy); - } else { - // Use the elemental emitter for non-tuple shapes. + } else if (ShapeUtil::IsArray(copy->shape())) { + // Use the elemental emitter for array shapes. return DefaultAction(copy); } + return Unimplemented("unsupported operand type %s for copy instruction", + PrimitiveType_Name(copy->shape().element_type())); } // Calculate the alignment of a buffer allocated for a given primitive type. 
@@ -292,58 +304,73 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) { const Shape& shape = get_tuple_element->shape(); emitted_value_[get_tuple_element] = llvm_ir::EmitGetTupleElement( shape, get_tuple_element->tuple_index(), MinimumAlignmentForShape(shape), - GetEmittedValueFor(operand), &ir_builder_, module_); + GetEmittedValueFor(operand), &b_, module_); return Status::OK(); } Status IrEmitter::HandleSelect(HloInstruction* select) { auto pred = select->operand(0); - auto on_true = select->operand(1); - auto on_false = select->operand(2); TF_RET_CHECK(pred->shape().element_type() == PRED); - - if (ShapeUtil::IsTuple(select->shape())) { - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(select)); - llvm_ir::EmitTupleSelect( - GetIrArrayFor(select), GetIrArrayFor(pred), GetEmittedValueFor(on_true), - GetEmittedValueFor(on_false), &ir_builder_, module_); - return Status::OK(); - } - return DefaultAction(select); } -Status IrEmitter::HandleInfeed(HloInstruction* infeed) { - VLOG(2) << "HandleInfeed: " << infeed->ToString(); +Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) { + auto pred = tuple_select->operand(0); + auto on_true = tuple_select->operand(1); + auto on_false = tuple_select->operand(2); + TF_RET_CHECK(pred->shape().element_type() == PRED); + TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape())); + TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape())); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select)); + llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred), + GetEmittedValueFor(on_true), + GetEmittedValueFor(on_false), &b_, module_); + return Status::OK(); +} - const Shape& shape = infeed->shape(); +Status IrEmitter::HandleInfeed(HloInstruction* instruction) { + HloInfeedInstruction* infeed = Cast(instruction); + VLOG(2) << "HandleInfeed: " << infeed->ToString(); - // The infeed operation produces data (dequeued from the infeed queue) at this - // address, which has been provided by buffer 
assignment. + // The infeed operation produces a two-element tuple containing data and a + // token value. HloInfeedInstruction::infeed_shape gives us the data shape. + const Shape& data_shape = infeed->infeed_shape(); + DCHECK(ShapeUtil::Equal(data_shape, + ShapeUtil::GetTupleElementShape(infeed->shape(), 0))); TF_RETURN_IF_ERROR(EmitTargetAddressForOp(infeed)); - llvm_ir::IrArray infeed_array = GetIrArrayFor(infeed); - if (ShapeUtil::IsTuple(shape)) { - TF_RET_CHECK(!ShapeUtil::IsNestedTuple(shape)); + // Write the tuple index table. + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice data_slice, + assignment_.GetUniqueSlice(infeed, {0})); + llvm::Value* data_address = EmitBufferPointer(data_slice, data_shape); + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice token_slice, + assignment_.GetUniqueSlice(infeed, {1})); + llvm::Value* token_address = EmitBufferPointer( + token_slice, ShapeUtil::GetTupleElementShape(infeed->shape(), 1)); + llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_, + module_); + + if (ShapeUtil::IsTuple(data_shape)) { + TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape)); // For a tuple, we first copy each of the internal elements to // their corresponding target locations. We then construct the // tuple outer buffer containing pointers to the internal // elements. std::vector tuple_element_addresses; - for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) { + for (int64 i = 0; i < data_shape.tuple_shapes_size(); ++i) { TF_ASSIGN_OR_RETURN(BufferAllocation::Slice buffer, - assignment_.GetUniqueSlice(infeed, {i})); + assignment_.GetUniqueSlice(infeed, {0, i})); const Shape& tuple_element_shape = - ShapeUtil::GetTupleElementShape(shape, i); + ShapeUtil::GetTupleElementShape(data_shape, i); // Only the outer tuple buffer's target address is obtained from // GetEmittedValueFor, to handle the case when Infeed is the root // instruction. Target addresses for internal elements can be obtained - // from EmitTempBufferPointer. 
+ // from EmitBufferPointer. llvm::Value* tuple_element_address = - EmitTempBufferPointer(buffer, tuple_element_shape); + EmitBufferPointer(buffer, tuple_element_shape); TF_RETURN_IF_ERROR(EmitXfeedTransfer( XfeedKind::kInfeed, tuple_element_shape, tuple_element_address)); @@ -351,11 +378,11 @@ Status IrEmitter::HandleInfeed(HloInstruction* infeed) { tuple_element_addresses.push_back(tuple_element_address); } - llvm_ir::EmitTuple(infeed_array, tuple_element_addresses, &ir_builder_, - module_); + llvm_ir::EmitTuple(llvm_ir::IrArray(data_address, data_shape), + tuple_element_addresses, &b_, module_); } else { - TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kInfeed, shape, - GetEmittedValueFor(infeed))); + TF_RETURN_IF_ERROR( + EmitXfeedTransfer(XfeedKind::kInfeed, data_shape, data_address)); } return Status::OK(); @@ -366,21 +393,21 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, int64 length = ByteSizeOf(shape); if (length <= 0 || length > std::numeric_limits::max()) { return InvalidArgument( - "xfeed (infeed or outfeed) buffer length %lld is outside the valid " + "xfeed (infeed or outfeed) buffer length %d is outside the valid " "size range", length); } int32 length_32 = static_cast(length); int32 shape_length; - TF_ASSIGN_OR_RETURN(llvm::Value * shape_ptr, - llvm_ir::EncodeSelfDescribingShapeConstant( - shape, &shape_length, &ir_builder_)); + TF_ASSIGN_OR_RETURN( + llvm::Value * shape_ptr, + llvm_ir::EncodeSelfDescribingShapeConstant(shape, &shape_length, &b_)); // The signature of the acquire infeed buffer function is: // // (void*)(int32 length); - llvm::Type* int32_type = ir_builder_.getInt32Ty(); + llvm::Type* int32_type = b_.getInt32Ty(); llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext()); llvm::FunctionType* acquire_type = llvm::FunctionType::get( i8_ptr_type, {int32_type, i8_ptr_type, int32_type}, @@ -400,8 +427,7 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, // // (void)(int32 
length, void* buffer); llvm::FunctionType* release_type = llvm::FunctionType::get( - ir_builder_.getVoidTy(), - {int32_type, i8_ptr_type, i8_ptr_type, int32_type}, + b_.getVoidTy(), {int32_type, i8_ptr_type, i8_ptr_type, int32_type}, /*isVarArg=*/false); llvm::Function* release_func; @@ -418,30 +444,33 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, // of size exactly 'length_32', and the runtime is responsible for // check-failing the process if there is a mismatch, versus passing us back a // buffer that we might overrun. - llvm::Value* acquired_pointer = ir_builder_.CreateCall( - acquire_func, {ir_builder_.getInt32(length_32), shape_ptr, - ir_builder_.getInt32(shape_length)}); + llvm::Value* acquired_pointer = + Call(acquire_func, + {b_.getInt32(length_32), shape_ptr, b_.getInt32(shape_length)}); if (kind == XfeedKind::kInfeed) { // Copy to the program buffer address from the acquired buffer. - ir_builder_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1, - acquired_pointer, - /*SrcAlign=*/1, length_32); + MemCpy(program_buffer_address, /*DstAlign=*/1, acquired_pointer, + /*SrcAlign=*/1, length_32); } else { // Outfeed -- copy from the in-program address to the acquired buffer. - ir_builder_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1, - program_buffer_address, - /*SrcAlign=*/1, length_32); + MemCpy(acquired_pointer, /*DstAlign=*/1, program_buffer_address, + /*SrcAlign=*/1, length_32); } - ir_builder_.CreateCall(release_func, - {ir_builder_.getInt32(length_32), acquired_pointer, - shape_ptr, ir_builder_.getInt32(shape_length)}); + Call(release_func, {b_.getInt32(length_32), acquired_pointer, shape_ptr, + b_.getInt32(shape_length)}); return Status::OK(); } Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) { + // Outfeed produces no useful result, but it does return a token[] that can be + // threaded through to other side effecting operations to ensure ordering. 
In + // the IR emitter we treat this token as a normal u8[] and thus need to insert + // an entry for it in emitted_value_. + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(outfeed)); + HloInstruction* operand = outfeed->operands()[0]; const Shape& operand_shape = operand->shape(); @@ -457,7 +486,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) { ShapeUtil::GetTupleElementShape(operand_shape, i); llvm::Value* tuple_element = llvm_ir::EmitGetTupleElement( tuple_element_shape, i, MinimumAlignmentForShape(tuple_element_shape), - value, &ir_builder_, module_); + value, &b_, module_); TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kOutfeed, tuple_element_shape, tuple_element)); } @@ -476,46 +505,94 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) { for (auto operand : tuple->operands()) { base_ptrs.push_back(GetEmittedValueFor(operand)); } - llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &ir_builder_, module_); + llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &b_, module_); return Status::OK(); } -Status IrEmitter::HandleMap(HloInstruction* map) { - gtl::ArraySlice operands(map->operands()); - HloComputation* function = map->to_apply(); - // The called computation should have been emitted previously. 
- llvm::Function* mapped_ir_function = FindOrDie(emitted_functions_, function); +llvm::Value* IrEmitter::EmitElementalMap( + const HloMapInstruction& map_instr, + absl::Span elemental_operands, absl::string_view name) { + return EmitThreadLocalCall(*map_instr.to_apply(), elemental_operands, name); +} + +StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( + HloReduceWindowInstruction* reduce_window, + const llvm_ir::IrArray::Index& index) { + const HloInstruction* operand = reduce_window->operand(0); + const Window& window = reduce_window->window(); - return EmitTargetElementLoop(map, [this, map, operands, mapped_ir_function]( - const llvm_ir::IrArray::Index& index) { - std::vector parameter_addresses; - for (const HloInstruction* operand : operands) { - const llvm_ir::IrArray& array = GetIrArrayFor(operand); - parameter_addresses.push_back( - array.EmitArrayElementAddress(index, &ir_builder_)); + // We fold inputs into the accumulator and initialize it to + // the initial value on the reduce_window. 
+ PrimitiveType operand_element_type = operand->shape().element_type(); + llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), + "reduce_window_accumulator_address", &b_, + MinimumAlignmentForPrimitiveType(operand_element_type)); + Store(Load(GetEmittedValueFor(reduce_window->operand(1))), + accumulator_address); + + llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); + std::vector window_size; + for (const auto& dim : window.dimensions()) { + window_size.push_back(dim.size()); + } + const llvm_ir::IrArray::Index window_index = loops.AddLoopsForShape( + ShapeUtil::MakeShape(operand_element_type, window_size), "window"); + CHECK_EQ(window_index.size(), index.size()); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); + + llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), index.size()); + llvm::Value* in_bounds_condition = nullptr; + for (size_t i = 0; i < index.size(); ++i) { + llvm::Value* strided_index = + NSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); + input_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]), + b_.getInt64(window.dimensions(i).padding_low())); + + // We need to check if 0 <= input_index[i] < bound, as otherwise we are in + // the padding so that we can skip the computation. That is equivalent to + // input_index[i] < bound as an *unsigned* comparison, since a negative + // value will wrap to a large positive value. 
+ llvm::Value* index_condition = + ICmpULT(input_index[i], + b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); + if (in_bounds_condition == nullptr) { + in_bounds_condition = index_condition; + } else { + in_bounds_condition = And(in_bounds_condition, index_condition); } - return EmitElementFunctionCall(mapped_ir_function, map->shape(), - parameter_addresses, "map_function"); - }); + } + CHECK(in_bounds_condition != nullptr); + + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); + SetToFirstInsertPoint(if_data.true_block, &b_); + + // We are not in the padding, so carry out the computation. + llvm_ir::IrArray input_array(GetIrArrayFor(operand)); + llvm::Value* input_value = input_array.EmitReadArrayElement(input_index, &b_); + llvm::Value* result = EmitThreadLocalCall( + *reduce_window->to_apply(), {Load(accumulator_address), input_value}, + "reducer_function"); + Store(result, accumulator_address); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); + return Load(accumulator_address); } Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { - auto operand = reduce_window->operand(0); - const Window& window = reduce_window->window(); - HloComputation* function = reduce_window->to_apply(); TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( - /*instruction=*/*reduce_window, /*operands=*/{operand}, - /*supported_types=*/{F32, BF16, S32})); + /*instruction=*/*reduce_window, + /*operands=*/{reduce_window->operand(0)}, + /*supported_types=*/{F32, BF16, S32, F16})); // TODO(b/31410564): Implement dilation for reduce-window. - if (window_util::HasDilation(window)) { + if (window_util::HasDilation(reduce_window->window())) { return Unimplemented( "Dilation for ReduceWindow is not implemented on CPU."); } - // The called computation should have been emitted previously. 
- llvm::Function* reducer_function = FindOrDie(emitted_functions_, function); - // Pseudo code for reduce window: // // for (coordinates O in the output) @@ -530,72 +607,9 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // This is completely un-optimized and just here to have something // that works. return EmitTargetElementLoop( - reduce_window, [this, reduce_window, operand, window, - reducer_function](const llvm_ir::IrArray::Index& index) { - // We fold inputs into the accumulator and initialize it to - // the initial value on the reduce_window. - PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accumulator_address", &ir_builder_, - MinimumAlignmentForPrimitiveType(operand_element_type)); - ir_builder_.CreateStore(ir_builder_.CreateLoad(GetEmittedValueFor( - reduce_window->operand(1))), - accumulator_address); - - llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), - &ir_builder_); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const llvm_ir::IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); - - llvm_ir::IrArray::Index input_index(index.size()); - llvm::Value* in_bounds_condition = nullptr; - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* strided_index = ir_builder_.CreateNSWMul( - index[i], ir_builder_.getInt64(window.dimensions(i).stride())); - input_index[i] = ir_builder_.CreateNSWSub( - ir_builder_.CreateNSWAdd(strided_index, window_index[i]), - ir_builder_.getInt64(window.dimensions(i).padding_low())); - - // We need to check if 0 <= input_index[i] < bound, as - // otherwise we 
are in the padding so that we can skip the - // computation. That is equivalent to input_index[i] < bound - // as an *unsigned* comparison, since a negative value will - // wrap to a large positive value. - llvm::Value* index_condition = ir_builder_.CreateICmpULT( - input_index[i], ir_builder_.getInt64(ShapeUtil::GetDimension( - operand->shape(), i))); - if (in_bounds_condition == nullptr) { - in_bounds_condition = index_condition; - } else { - in_bounds_condition = - ir_builder_.CreateAnd(in_bounds_condition, index_condition); - } - } - CHECK(in_bounds_condition != nullptr); - - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - in_bounds_condition, "in-bounds", &ir_builder_); - SetToFirstInsertPoint(if_data.true_block, &ir_builder_); - - // We are not in the padding, so carry out the computation. - llvm_ir::IrArray input_array(GetIrArrayFor(operand)); - llvm::Value* input_value_address = - input_array.EmitArrayElementAddress(input_index, &ir_builder_); - llvm::Value* result = EmitElementFunctionCall( - reducer_function, reduce_window->shape(), - {accumulator_address, input_value_address}, "reducer_function"); - ir_builder_.CreateStore(result, accumulator_address); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); - return ir_builder_.CreateLoad(accumulator_address); + reduce_window, [&](const llvm_ir::IrArray::Index& index) { + return EmitTargetElementLoopBodyForReduceWindow( + Cast(reduce_window), index); }); } @@ -616,12 +630,6 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { "Dilation for SelectAndScatter is not implemented on CPU. "); } - // The select and scatter computations should have been emitted previously. 
- llvm::Function* select_function = - FindOrDie(emitted_functions_, select_and_scatter->select()); - llvm::Function* scatter_function = - FindOrDie(emitted_functions_, select_and_scatter->scatter()); - // Pseudo code for select-and-scatter: // // initialized_flag is initially off for every window, and is turned on after @@ -647,140 +655,126 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { select_and_scatter, /*desc=*/IrName(select_and_scatter, "init"), [this, init_value](const llvm_ir::IrArray::Index& target_index) { llvm::Value* init_value_addr = GetEmittedValueFor(init_value); - return ir_builder_.CreateLoad(init_value_addr); + return Load(init_value_addr); })); // Create a loop to iterate over the source array to scatter to the output. - llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &ir_builder_); + llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &b_); const llvm_ir::IrArray::Index source_index = source_loops.AddLoopsForShape(source->shape(), "source"); - SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(), - &ir_builder_); + SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(), &b_); // Allocate space to keep the currently selected value, its index, and // the boolean initialized_flag, which is initially set to false. 
llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "selected_value_address", &ir_builder_, + "selected_value_address", &b_, MinimumAlignmentForPrimitiveType(operand_element_type)); llvm::Value* selected_index_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount( - ir_builder_.getInt64Ty(), ir_builder_.getInt32(rank), - "selected_index_address", &ir_builder_); + b_.getInt64Ty(), b_.getInt32(rank), "selected_index_address", &b_); llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry( - ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_); - ir_builder_.CreateStore(ir_builder_.getInt1(false), initialized_flag_address); + b_.getInt1Ty(), "initialized_flag_address", &b_); + Store(b_.getInt1(false), initialized_flag_address); // Create the inner loop to iterate over the window. - llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"), - &ir_builder_); + llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"), &b_); std::vector window_size; for (const auto& dim : window.dimensions()) { window_size.push_back(dim.size()); } const llvm_ir::IrArray::Index window_index = window_loops.AddLoopsForShape( ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(), - &ir_builder_); + SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(), &b_); // Compute the operand index to visit and evaluate the condition whether the // operand index is within the bounds. The unsigned comparison includes // checking whether the operand index >= 0. 
- llvm_ir::IrArray::Index operand_index(source_index.size()); - llvm::Value* in_bounds_condition = ir_builder_.getTrue(); + llvm_ir::IrArray::Index operand_index(b_.getInt64Ty(), source_index.size()); + llvm::Value* in_bounds_condition = b_.getTrue(); for (int64 i = 0; i < rank; ++i) { - llvm::Value* strided_index = ir_builder_.CreateNSWMul( - source_index[i], ir_builder_.getInt64(window.dimensions(i).stride())); - operand_index[i] = ir_builder_.CreateNSWSub( - ir_builder_.CreateNSWAdd(strided_index, window_index[i]), - ir_builder_.getInt64(window.dimensions(i).padding_low())); - llvm::Value* index_condition = ir_builder_.CreateICmpULT( - operand_index[i], - ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - in_bounds_condition = - ir_builder_.CreateAnd(in_bounds_condition, index_condition); + llvm::Value* strided_index = + NSWMul(source_index[i], b_.getInt64(window.dimensions(i).stride())); + operand_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]), + b_.getInt64(window.dimensions(i).padding_low())); + llvm::Value* index_condition = + ICmpULT(operand_index[i], + b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); + in_bounds_condition = And(in_bounds_condition, index_condition); } CHECK(in_bounds_condition != nullptr); // Only need to do something if the operand index is within the bounds. First // check if the initialized_flag is set. 
llvm_ir::LlvmIfData if_in_bounds = - llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_); - SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_); - llvm_ir::LlvmIfData if_initialized = - llvm_ir::EmitIfThenElse(ir_builder_.CreateLoad(initialized_flag_address), - "initialized", &ir_builder_); + llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); + SetToFirstInsertPoint(if_in_bounds.true_block, &b_); + llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse( + Load(initialized_flag_address), "initialized", &b_); // If the initialized_flag is false, initialize the selected value and index // with the currently visiting operand. - SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_); + SetToFirstInsertPoint(if_initialized.false_block, &b_); const auto save_operand_index = [&](const llvm_ir::IrArray::Index& operand_index) { for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - ir_builder_.CreateInBoundsGEP(selected_index_address, - {ir_builder_.getInt32(i)}); - ir_builder_.CreateStore(operand_index[i], - selected_index_address_slot); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + Store(operand_index[i], selected_index_address_slot); } }; llvm_ir::IrArray operand_array(GetIrArrayFor(operand)); llvm::Value* operand_data = - operand_array.EmitReadArrayElement(operand_index, &ir_builder_); - ir_builder_.CreateStore(operand_data, selected_value_address); + operand_array.EmitReadArrayElement(operand_index, &b_); + Store(operand_data, selected_value_address); save_operand_index(operand_index); - ir_builder_.CreateStore(ir_builder_.getInt1(true), initialized_flag_address); + Store(b_.getInt1(true), initialized_flag_address); // If the initialized_flag is true, call the `select` function to potentially // update the selected value and index with the currently visiting operand. 
- SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_); - const Shape output_shape = ShapeUtil::MakeShape(PRED, {}); + SetToFirstInsertPoint(if_initialized.true_block, &b_); llvm::Value* operand_address = - operand_array.EmitArrayElementAddress(operand_index, &ir_builder_); - llvm::Value* result = EmitElementFunctionCall( - select_function, output_shape, {selected_value_address, operand_address}, - "select_function"); + operand_array.EmitArrayElementAddress(operand_index, &b_); + llvm::Value* operand_element = Load(operand_address); + llvm::Value* result = EmitThreadLocalCall( + *select_and_scatter->select(), + {Load(selected_value_address), operand_element}, "select_function"); // If the 'select' function returns false, update the selected value and the // index to the currently visiting operand. - llvm::Value* cond = ir_builder_.CreateICmpNE( + llvm::Value* cond = ICmpNE( result, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0), "boolean_predicate"); llvm_ir::LlvmIfData if_select_lhs = - llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_); - SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_); - ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address), - selected_value_address); + llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_); + SetToFirstInsertPoint(if_select_lhs.false_block, &b_); + Store(Load(operand_address), selected_value_address); save_operand_index(operand_index); // After iterating over the window elements, scatter the source element to // the selected index of the output. The value we store at the output // location is computed by calling the `scatter` function with the source // value and the current output value. 
- SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), - &ir_builder_); - llvm_ir::IrArray::Index selected_index; + SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), &b_); + llvm_ir::IrArray::Index selected_index(source_index.GetType()); for (int64 i = 0; i < rank; ++i) { - llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP( - selected_index_address, {ir_builder_.getInt32(i)}); - selected_index.push_back( - ir_builder_.CreateLoad(selected_index_address_slot)); + llvm::Value* selected_index_address_slot = + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + selected_index.push_back(Load(selected_index_address_slot)); } llvm_ir::IrArray source_array(GetIrArrayFor(source)); - llvm::Value* source_value_address = - source_array.EmitArrayElementAddress(source_index, &ir_builder_); + llvm::Value* source_value = + source_array.EmitReadArrayElement(source_index, &b_); llvm_ir::IrArray output_array(GetIrArrayFor(select_and_scatter)); - llvm::Value* output_value_address = - output_array.EmitArrayElementAddress(selected_index, &ir_builder_); - llvm::Value* scatter_value = EmitElementFunctionCall( - scatter_function, source->shape(), - {output_value_address, source_value_address}, "scatter_function"); - output_array.EmitWriteArrayElement(selected_index, scatter_value, - &ir_builder_); - - SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(), - &ir_builder_); + llvm::Value* output_value = + output_array.EmitReadArrayElement(selected_index, &b_); + llvm::Value* scatter_value = + EmitThreadLocalCall(*select_and_scatter->scatter(), + {output_value, source_value}, "scatter_function"); + output_array.EmitWriteArrayElement(selected_index, scatter_value, &b_); + + SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(), &b_); return Status::OK(); } @@ -819,21 +813,154 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { // Dot operation is complicated so we delegate to a helper class. 
return DotOpEmitter::EmitDotOperation( *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr, - GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_, + GetExecutableRunOptionsArgument(), &b_, hlo_module_config_, target_machine_features_); } +StatusOr IrEmitter::EmitTargetElementLoopBodyForConvolution( + HloConvolutionInstruction* convolution, + const llvm_ir::IrArray::Index& index) { + const HloInstruction* lhs = convolution->operand(0); + const HloInstruction* rhs = convolution->operand(1); + const Window& window = convolution->window(); + + const ConvolutionDimensionNumbers& dnums = + convolution->convolution_dimension_numbers(); + int num_spatial_dims = dnums.output_spatial_dimensions_size(); + std::vector output_spatial(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + output_spatial[i] = index[dnums.output_spatial_dimensions(i)]; + } + llvm::Value* output_feature = index[dnums.output_feature_dimension()]; + llvm::Value* batch = index[dnums.output_batch_dimension()]; + + // We will accumulate the products into this sum to calculate the output entry + // at the given index. 
+ PrimitiveType lhs_element_type = lhs->shape().element_type(); + llvm::Type* lhs_llvm_type = + llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_); + llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry( + lhs_llvm_type, "convolution_sum_address", &b_, + MinimumAlignmentForPrimitiveType(lhs_element_type)); + llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type); + Store(constant_zero, sum_address); + + llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_); + std::vector kernel_spatial(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + kernel_spatial[i] = + loops + .AddLoop( + 0, rhs->shape().dimensions(dnums.kernel_spatial_dimensions(i)), + absl::StrCat("k", i)) + ->GetIndVarValue(); + } + llvm::Value* input_feature = + loops + .AddLoop(0, lhs->shape().dimensions(dnums.input_feature_dimension()), + "iz") + ->GetIndVarValue(); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); + + // Calculate the spatial index in the input array, taking striding, dilation + // and padding into account. An index in the padding will be out of the bounds + // of the array. + const auto calculate_input_index = [this](llvm::Value* output_index, + llvm::Value* kernel_index, + const WindowDimension& window_dim) { + llvm::Value* strided_index = + NSWMul(output_index, b_.getInt64(window_dim.stride())); + llvm::Value* dilated_kernel_index = + NSWMul(kernel_index, b_.getInt64(window_dim.window_dilation())); + return NSWSub(NSWAdd(strided_index, dilated_kernel_index), + b_.getInt64(window_dim.padding_low())); + }; + std::vector input_spatial(num_spatial_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + input_spatial[i] = calculate_input_index( + output_spatial[i], kernel_spatial[i], window.dimensions(i)); + } + + // We need to check if 0 <= input dim < bound, as otherwise we are in the + // padding so that we can skip the computation. 
That is equivalent to input + // dim < bound as an *unsigned* comparison, since a negative value will wrap + // to a large positive value. The input dim is dilated, so we need to dilate + // the bound as well to match. + + // Also need to check that the input coordinates are not in one of the + // holes created by base dilation. + const auto not_in_hole = [&](llvm::Value* input_index, int64 base_dilation) { + llvm::Value* remainder = SRem(input_index, b_.getInt64(base_dilation)); + return ICmpEQ(remainder, b_.getInt64(0)); + }; + + llvm::Value* in_bounds_condition = b_.getInt1(true); + for (int i = 0; i < num_spatial_dims; ++i) { + llvm::ConstantInt* input_bound = b_.getInt64(window_util::DilatedBound( + lhs->shape().dimensions(dnums.input_spatial_dimensions(i)), + window.dimensions(i).base_dilation())); + llvm::Value* dim_in_bound = ICmpULT(input_spatial[i], input_bound); + llvm::Value* dim_not_in_hole = + not_in_hole(input_spatial[i], window.dimensions(i).base_dilation()); + llvm::Value* dim_ok = And(dim_in_bound, dim_not_in_hole); + in_bounds_condition = And(in_bounds_condition, dim_ok); + } + + // Now we need to map the dilated base coordinates back to the actual + // data indices on the lhs. + const auto undilate = [&](llvm::Value* input_index, int64 base_dilation) { + return SDiv(input_index, b_.getInt64(base_dilation)); + }; + for (int i = 0; i < num_spatial_dims; ++i) { + input_spatial[i] = + undilate(input_spatial[i], window.dimensions(i).base_dilation()); + } + + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); + SetToFirstInsertPoint(if_data.true_block, &b_); + + // We are not in the padding, so carry out the computation. 
+ int num_dims = num_spatial_dims + 2; + llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), num_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i]; + } + input_index[dnums.input_feature_dimension()] = input_feature; + input_index[dnums.input_batch_dimension()] = batch; + + llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs)); + llvm_ir::IrArray::Index kernel_index(b_.getInt64Ty(), num_dims); + for (int i = 0; i < num_spatial_dims; ++i) { + kernel_index[dnums.kernel_spatial_dimensions(i)] = + window.dimensions(i).window_reversal() + ? NSWSub(b_.getInt64(window.dimensions(i).size() - 1), + kernel_spatial[i]) + : kernel_spatial[i]; + } + + kernel_index[dnums.kernel_input_feature_dimension()] = input_feature; + kernel_index[dnums.kernel_output_feature_dimension()] = output_feature; + + llvm_ir::IrArray input_array(GetIrArrayFor(lhs)); + llvm::Value* product = + FMul(input_array.EmitReadArrayElement(input_index, &b_), + kernel_array.EmitReadArrayElement(kernel_index, &b_)); + llvm::Value* sum = FAdd(Load(sum_address), product); + Store(sum, sum_address); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); + return Load(sum_address); +} + Status IrEmitter::HandleConvolution(HloInstruction* convolution) { auto lhs = convolution->operand(0); auto rhs = convolution->operand(1); - const auto& window = convolution->window(); TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*convolution, /*operands=*/{lhs, rhs}, /*supported_types=*/{F16, F32, C64})); - const ConvolutionDimensionNumbers& dnums = - convolution->convolution_dimension_numbers(); - // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support // different data layouts. 
if (PotentiallyImplementedAsEigenConvolution(*convolution, @@ -913,12 +1040,12 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { PrimitiveType primitive_type = lhs->shape().element_type(); llvm::Type* ir_ptr_type = primitive_type == F16 - ? ir_builder_.getHalfTy()->getPointerTo() - : ir_builder_.getFloatTy()->getPointerTo(); - llvm::Type* int64_type = ir_builder_.getInt64Ty(); - llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo(); + ? b_.getHalfTy()->getPointerTo() + : b_.getFloatTy()->getPointerTo(); + llvm::Type* int64_type = b_.getInt64Ty(); + llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo(); llvm::FunctionType* conv_type = llvm::FunctionType::get( - ir_builder_.getVoidTy(), + b_.getVoidTy(), {int8_ptr_type, ir_ptr_type, ir_ptr_type, ir_ptr_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, @@ -950,34 +1077,32 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { conv_func->setCallingConv(llvm::CallingConv::C); conv_func->setDoesNotThrow(); conv_func->setOnlyAccessesArgMemory(); - ir_builder_.CreateCall( - conv_func, { - GetExecutableRunOptionsArgument(), - ir_builder_.CreateBitCast( - GetEmittedValueFor(convolution), ir_ptr_type), - ir_builder_.CreateBitCast(lhs_address, ir_ptr_type), - ir_builder_.CreateBitCast(rhs_address, ir_ptr_type), - ir_builder_.getInt64(input_batch), - ir_builder_.getInt64(input_rows), - ir_builder_.getInt64(input_cols), - ir_builder_.getInt64(input_channels), - ir_builder_.getInt64(kernel_rows), - ir_builder_.getInt64(kernel_cols), - ir_builder_.getInt64(kernel_channels), - ir_builder_.getInt64(kernel_filters), - ir_builder_.getInt64(output_rows), - ir_builder_.getInt64(output_cols), - ir_builder_.getInt64(row_stride), - ir_builder_.getInt64(col_stride), - ir_builder_.getInt64(padding_top), - ir_builder_.getInt64(padding_bottom), - ir_builder_.getInt64(padding_left), - 
ir_builder_.getInt64(padding_right), - ir_builder_.getInt64(lhs_row_dilation), - ir_builder_.getInt64(lhs_col_dilation), - ir_builder_.getInt64(rhs_row_dilation), - ir_builder_.getInt64(rhs_col_dilation), - }); + Call(conv_func, { + GetExecutableRunOptionsArgument(), + BitCast(GetEmittedValueFor(convolution), ir_ptr_type), + BitCast(lhs_address, ir_ptr_type), + BitCast(rhs_address, ir_ptr_type), + b_.getInt64(input_batch), + b_.getInt64(input_rows), + b_.getInt64(input_cols), + b_.getInt64(input_channels), + b_.getInt64(kernel_rows), + b_.getInt64(kernel_cols), + b_.getInt64(kernel_channels), + b_.getInt64(kernel_filters), + b_.getInt64(output_rows), + b_.getInt64(output_cols), + b_.getInt64(row_stride), + b_.getInt64(col_stride), + b_.getInt64(padding_top), + b_.getInt64(padding_bottom), + b_.getInt64(padding_left), + b_.getInt64(padding_right), + b_.getInt64(lhs_row_dilation), + b_.getInt64(lhs_col_dilation), + b_.getInt64(rhs_row_dilation), + b_.getInt64(rhs_col_dilation), + }); return Status::OK(); } @@ -990,149 +1115,9 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { // See the description of convolution in the XLA documentation for the pseudo // code for convolution. return EmitTargetElementLoop( - convolution, [this, convolution, lhs, rhs, window, - dnums](const llvm_ir::IrArray::Index& index) { - int num_spatial_dims = dnums.output_spatial_dimensions_size(); - std::vector output_spatial(num_spatial_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - output_spatial[i] = index[dnums.output_spatial_dimensions(i)]; - } - llvm::Value* output_feature = index[dnums.output_feature_dimension()]; - llvm::Value* batch = index[dnums.output_batch_dimension()]; - - // We will accumulate the products into this sum to calculate - // the output entry at the given index. 
- PrimitiveType lhs_element_type = lhs->shape().element_type(); - llvm::Type* lhs_llvm_type = - llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_); - llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry( - lhs_llvm_type, "convolution_sum_address", &ir_builder_, - MinimumAlignmentForPrimitiveType(lhs_element_type)); - llvm::Value* constant_zero = - llvm::Constant::getNullValue(lhs_llvm_type); - ir_builder_.CreateStore(constant_zero, sum_address); - - llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &ir_builder_); - std::vector kernel_spatial(num_spatial_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - kernel_spatial[i] = - loops - .AddLoop(0, - rhs->shape().dimensions( - dnums.kernel_spatial_dimensions(i)), - tensorflow::strings::StrCat("k", i)) - ->GetIndVarValue(); - } - llvm::Value* input_feature = - loops - .AddLoop( - 0, lhs->shape().dimensions(dnums.input_feature_dimension()), - "iz") - ->GetIndVarValue(); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); - - // Calculate the spatial index in the input array, taking striding, - // dilation and padding into account. An index in the padding will be - // out of the bounds of the array. 
- const auto calculate_input_index = - [this](llvm::Value* output_index, llvm::Value* kernel_index, - const WindowDimension& window_dim) { - llvm::Value* strided_index = ir_builder_.CreateNSWMul( - output_index, ir_builder_.getInt64(window_dim.stride())); - llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul( - kernel_index, - ir_builder_.getInt64(window_dim.window_dilation())); - return ir_builder_.CreateNSWSub( - ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index), - ir_builder_.getInt64(window_dim.padding_low())); - }; - std::vector input_spatial(num_spatial_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - input_spatial[i] = calculate_input_index( - output_spatial[i], kernel_spatial[i], window.dimensions(i)); - } - - // We need to check if 0 <= input dim < bound, as otherwise we are in - // the padding so that we can skip the computation. That is equivalent - // to input dim < bound as an *unsigned* comparison, since a negative - // value will wrap to a large positive value. The input dim is dilated, - // so we need to dilate the bound as well to match. - - // Also need to check that the input coordinates are not in one of the - // holes created by base dilation. 
- const auto not_in_hole = [&](llvm::Value* input_index, - int64 base_dilation) { - llvm::Value* remainder = ir_builder_.CreateSRem( - input_index, ir_builder_.getInt64(base_dilation)); - return ir_builder_.CreateICmpEQ(remainder, ir_builder_.getInt64(0)); - }; - - llvm::Value* in_bounds_condition = ir_builder_.getInt1(true); - for (int i = 0; i < num_spatial_dims; ++i) { - llvm::ConstantInt* input_bound = - ir_builder_.getInt64(window_util::DilatedBound( - lhs->shape().dimensions(dnums.input_spatial_dimensions(i)), - window.dimensions(i).base_dilation())); - llvm::Value* dim_in_bound = - ir_builder_.CreateICmpULT(input_spatial[i], input_bound); - llvm::Value* dim_not_in_hole = not_in_hole( - input_spatial[i], window.dimensions(i).base_dilation()); - llvm::Value* dim_ok = - ir_builder_.CreateAnd(dim_in_bound, dim_not_in_hole); - in_bounds_condition = - ir_builder_.CreateAnd(in_bounds_condition, dim_ok); - } - - // Now we need to map the dilated base coordinates back to the actual - // data indices on the lhs. - const auto undilate = [&](llvm::Value* input_index, - int64 base_dilation) { - return ir_builder_.CreateSDiv(input_index, - ir_builder_.getInt64(base_dilation)); - }; - for (int i = 0; i < num_spatial_dims; ++i) { - input_spatial[i] = - undilate(input_spatial[i], window.dimensions(i).base_dilation()); - } - - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - in_bounds_condition, "in-bounds", &ir_builder_); - SetToFirstInsertPoint(if_data.true_block, &ir_builder_); - - // We are not in the padding, so carry out the computation. 
- int num_dims = num_spatial_dims + 2; - llvm_ir::IrArray::Index input_index(num_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i]; - } - input_index[dnums.input_feature_dimension()] = input_feature; - input_index[dnums.input_batch_dimension()] = batch; - - llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs)); - llvm_ir::IrArray::Index kernel_index(num_dims); - for (int i = 0; i < num_spatial_dims; ++i) { - kernel_index[dnums.kernel_spatial_dimensions(i)] = - window.dimensions(i).window_reversal() - ? ir_builder_.CreateNSWSub( - ir_builder_.getInt64(window.dimensions(i).size() - 1), - kernel_spatial[i]) - : kernel_spatial[i]; - } - - kernel_index[dnums.kernel_input_feature_dimension()] = input_feature; - kernel_index[dnums.kernel_output_feature_dimension()] = output_feature; - - llvm_ir::IrArray input_array(GetIrArrayFor(lhs)); - llvm::Value* product = ir_builder_.CreateFMul( - input_array.EmitReadArrayElement(input_index, &ir_builder_), - kernel_array.EmitReadArrayElement(kernel_index, &ir_builder_)); - llvm::Value* sum = ir_builder_.CreateFAdd( - ir_builder_.CreateLoad(sum_address), product); - ir_builder_.CreateStore(sum, sum_address); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); - return ir_builder_.CreateLoad(sum_address); + convolution, [&](const llvm_ir::IrArray::Index& index) { + return EmitTargetElementLoopBodyForConvolution( + Cast(convolution), index); }); } @@ -1156,11 +1141,11 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { } // Args have been computed, make the call. 
- llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo(); - llvm::Type* int32_type = ir_builder_.getInt32Ty(); - llvm::Type* int64_type = ir_builder_.getInt64Ty(); + llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo(); + llvm::Type* int32_type = b_.getInt32Ty(); + llvm::Type* int64_type = b_.getInt64Ty(); llvm::FunctionType* fft_type = llvm::FunctionType::get( - ir_builder_.getVoidTy(), + b_.getVoidTy(), {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); @@ -1177,16 +1162,14 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { fft_func->setDoesNotThrow(); fft_func->setOnlyAccessesInaccessibleMemOrArgMem(); const int fft_rank = fft_length.size(); - ir_builder_.CreateCall( - fft_func, - {GetExecutableRunOptionsArgument(), - ir_builder_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type), - ir_builder_.CreateBitCast(operand_address, int8_ptr_type), - ir_builder_.getInt32(fft->fft_type()), ir_builder_.getInt32(fft_rank), - ir_builder_.getInt64(input_batch), - ir_builder_.getInt64(fft_rank > 0 ? fft_length[0] : 0), - ir_builder_.getInt64(fft_rank > 1 ? fft_length[1] : 0), - ir_builder_.getInt64(fft_rank > 2 ? fft_length[2] : 0)}); + Call(fft_func, + {GetExecutableRunOptionsArgument(), + BitCast(GetEmittedValueFor(fft), int8_ptr_type), + BitCast(operand_address, int8_ptr_type), b_.getInt32(fft->fft_type()), + b_.getInt32(fft_rank), b_.getInt64(input_batch), + b_.getInt64(fft_rank > 0 ? fft_length[0] : 0), + b_.getInt64(fft_rank > 1 ? fft_length[1] : 0), + b_.getInt64(fft_rank > 2 ? 
fft_length[2] : 0)}); return Status::OK(); } @@ -1222,14 +1205,13 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { const Shape& operand_shape = crs->operand(i)->shape(); CHECK(ShapeUtil::IsArray(operand_shape)) << "Operands to cross-replica-sum must be arrays: " << crs->ToString(); - operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape)); + operand_ptrs.push_back(EmitBufferPointer(out_slice, operand_shape)); // TODO(b/63762267): Be more aggressive about specifying alignment. - ir_builder_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr, - /*SrcAlign=*/1, - ShapeUtil::ByteSizeOf(operand_shape)); + MemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr, + /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape)); } - llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &ir_builder_, module_); + llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_, module_); return Status::OK(); } @@ -1262,47 +1244,7 @@ static llvm_ir::IrArray::Index FillReducedDimensionIndex( Status IrEmitter::HandleParameter(HloInstruction* parameter) { VLOG(2) << "HandleParameter: " << parameter->ToString(); - auto param_number = parameter->parameter_number(); - auto param_shape = parameter->shape(); - - // We have to access the parameter at offset param_number in the params - // array. The code generated here is equivalent to this C code: - // - // i8* param_address_untyped = params[param_number]; - // Param* param_address_typed = (Param*)param_address_untyped; - // - // Where Param is the actual element type of the underlying buffer (for - // example, float for an XLA F32 element type). 
- llvm::Value* params = compute_function_->parameters_arg(); - llvm::Value* param_address_offset = - llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_); - llvm::LoadInst* param_address_untyped = - ir_builder_.CreateLoad(param_address_offset); - param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped"))); - if (is_top_level_computation_ && - hlo_module_config_.debug_options() - .xla_llvm_enable_invariant_load_metadata()) { - // In the entry computation the parameter slots in the %params argument are - // invariant through program execution. In computations that are called - // from the entry computation (via kWhile, kCall and kConditional) the - // parameter slots are *not* invariant since they're written to by their - // callers. - param_address_untyped->setMetadata( - llvm::LLVMContext::MD_invariant_load, - llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{})); - } - - llvm::Value* param_address_typed = ir_builder_.CreateBitCast( - param_address_untyped, IrShapeType(param_shape)->getPointerTo()); - emitted_value_[parameter] = param_address_typed; - - if (!ShapeUtil::IsOpaque(param_shape)) { - AttachAlignmentMetadataForLoad(param_address_untyped, param_shape); - AttachDereferenceableMetadataForLoad(param_address_untyped, param_shape); - } - - VLOG(2) << " emitted value: " << llvm_ir::DumpToString(*param_address_typed); - return Status::OK(); + return EmitTargetAddressForOp(parameter); } // Returns true if the relative order of the unreduced dimensions stays the same @@ -1400,58 +1342,61 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator( return nullptr; case HloOpcode::kAdd: - return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, + return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { - return root_is_integral ? ir_builder->CreateAdd(lhs, rhs) - : ir_builder->CreateFAdd(lhs, rhs); + return root_is_integral ? 
b->CreateAdd(lhs, rhs) + : b->CreateFAdd(lhs, rhs); }; case HloOpcode::kMultiply: - return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, + return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { - return root_is_integral ? ir_builder->CreateMul(lhs, rhs) - : ir_builder->CreateFMul(lhs, rhs); + return root_is_integral ? b->CreateMul(lhs, rhs) + : b->CreateFMul(lhs, rhs); }; case HloOpcode::kAnd: - return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, - llvm::Value* rhs) { return ir_builder->CreateAnd(lhs, rhs); }; + return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { + return b->CreateAnd(lhs, rhs); + }; case HloOpcode::kOr: - return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, - llvm::Value* rhs) { return ir_builder->CreateOr(lhs, rhs); }; + return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { + return b->CreateOr(lhs, rhs); + }; + + case HloOpcode::kXor: + return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { + return b->CreateXor(lhs, rhs); + }; case HloOpcode::kMaximum: return [root_is_floating_point, root_is_signed]( - llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, - llvm::Value* rhs) { + llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { if (root_is_floating_point) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum, - {lhs, rhs}, {lhs->getType()}, - ir_builder); + {lhs, rhs}, {lhs->getType()}, b); } - return ir_builder->CreateSelect( - ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE - : llvm::ICmpInst::ICMP_UGE, - lhs, rhs), + return b->CreateSelect( + b->CreateICmp(root_is_signed ? 
llvm::ICmpInst::ICMP_SGE + : llvm::ICmpInst::ICMP_UGE, + lhs, rhs), lhs, rhs); }; case HloOpcode::kMinimum: return [root_is_floating_point, root_is_signed]( - llvm::IRBuilder<>* ir_builder, llvm::Value* lhs, - llvm::Value* rhs) { + llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) { if (root_is_floating_point) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum, - {lhs, rhs}, {lhs->getType()}, - ir_builder); + {lhs, rhs}, {lhs->getType()}, b); } - return ir_builder->CreateSelect( - ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE - : llvm::ICmpInst::ICMP_ULE, - lhs, rhs), + return b->CreateSelect( + b->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE + : llvm::ICmpInst::ICMP_ULE, + lhs, rhs), lhs, rhs); }; } @@ -1514,40 +1459,37 @@ IrEmitter::EmitInnerLoopForVectorizedReduction( const ReductionGenerator& reduction_generator, const llvm_ir::IrArray::Index& output_index, const ShardedVectorType& accumulator_type, HloInstruction* init_value, - HloInstruction* arg, gtl::ArraySlice dimensions, + HloInstruction* arg, absl::Span dimensions, unsigned element_alignment) { ShardedVector accumulator; accumulator.reserve(accumulator_type.size()); for (auto accumulator_shard_type : accumulator_type) { accumulator.push_back(llvm_ir::EmitAllocaAtFunctionEntry( - accumulator_shard_type, "accumulator", &ir_builder_, 0)); + accumulator_shard_type, "accumulator", &b_, 0)); } - llvm::Value* init_value_ssa = - ir_builder_.CreateLoad(GetEmittedValueFor(init_value)); + llvm::Value* init_value_ssa = Load(GetEmittedValueFor(init_value)); for (llvm::Value* accumulator_shard : accumulator) { llvm::Value* initial_value; auto shard_type = accumulator_shard->getType()->getPointerElementType(); if (auto vector_type = llvm::dyn_cast(shard_type)) { - initial_value = ir_builder_.CreateVectorSplat( - vector_type->getNumElements(), init_value_ssa); + initial_value = + VectorSplat(vector_type->getNumElements(), init_value_ssa); } else { initial_value = 
init_value_ssa; } - ir_builder_.CreateAlignedStore(initial_value, accumulator_shard, - element_alignment); + AlignedStore(initial_value, accumulator_shard, element_alignment); } llvm_ir::ForLoopNest reduction_loop_nest(IrName(arg, "vectorized_inner"), - &ir_builder_); + &b_); llvm_ir::IrArray::Index reduced_dims_index = reduction_loop_nest.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, "reduction_dim"); - SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(), - &ir_builder_); + SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(), &b_); llvm_ir::IrArray arg_array(GetIrArrayFor(arg)); llvm_ir::IrArray::Index input_index = reduced_dims_index; @@ -1560,38 +1502,33 @@ IrEmitter::EmitInnerLoopForVectorizedReduction( } CHECK(output_index.end() == it); - llvm::Value* input_address = ir_builder_.CreateBitCast( - arg_array.EmitArrayElementAddress(input_index, &ir_builder_), - ir_builder_.getInt8PtrTy()); + llvm::Value* input_address = BitCast( + arg_array.EmitArrayElementAddress(input_index, &b_), b_.getInt8PtrTy()); for (int i = 0; i < accumulator.size(); i++) { auto input_address_typed = - ir_builder_.CreateBitCast(input_address, accumulator[i]->getType()); + BitCast(input_address, accumulator[i]->getType()); auto current_accumulator_value = - ir_builder_.CreateAlignedLoad(accumulator[i], element_alignment); - auto addend = - ir_builder_.CreateAlignedLoad(input_address_typed, element_alignment); + AlignedLoad(accumulator[i], element_alignment); + auto addend = AlignedLoad(input_address_typed, element_alignment); arg_array.AnnotateLoadStoreInstructionWithMetadata(addend); auto reduced_result = - reduction_generator(&ir_builder_, current_accumulator_value, addend); - ir_builder_.CreateAlignedStore(reduced_result, accumulator[i], - element_alignment); + reduction_generator(&b_, current_accumulator_value, addend); + AlignedStore(reduced_result, accumulator[i], element_alignment); if (i != (accumulator.size() - 1)) { - input_address = 
ir_builder_.CreateConstInBoundsGEP1_32( - reduced_result->getType(), input_address_typed, 1); + input_address = ConstInBoundsGEP1_32(reduced_result->getType(), + input_address_typed, 1); } } - SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(), - &ir_builder_); + SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(), &b_); ShardedVector result_ssa; result_ssa.reserve(accumulator.size()); for (auto accumulator_shard : accumulator) { - result_ssa.push_back( - ir_builder_.CreateAlignedLoad(accumulator_shard, element_alignment)); + result_ssa.push_back(AlignedLoad(accumulator_shard, element_alignment)); } return result_ssa; } @@ -1600,25 +1537,25 @@ void IrEmitter::EmitShardedVectorStore( llvm::Value* store_address, const std::vector& value_to_store, const int alignment, const llvm_ir::IrArray& containing_array) { for (int i = 0; i < value_to_store.size(); i++) { - auto store_address_typed = ir_builder_.CreateBitCast( - store_address, - llvm::PointerType::getUnqual(value_to_store[i]->getType())); + auto store_address_typed = + BitCast(store_address, + llvm::PointerType::getUnqual(value_to_store[i]->getType())); - auto store_instruction = ir_builder_.CreateAlignedStore( - value_to_store[i], store_address_typed, alignment); + auto store_instruction = + AlignedStore(value_to_store[i], store_address_typed, alignment); containing_array.AnnotateLoadStoreInstructionWithMetadata( store_instruction); if (i != (value_to_store.size() - 1)) { - store_address = ir_builder_.CreateConstInBoundsGEP1_32( - value_to_store[i]->getType(), store_address_typed, 1); + store_address = ConstInBoundsGEP1_32(value_to_store[i]->getType(), + store_address_typed, 1); } } } StatusOr IrEmitter::EmitVectorizedReduce( HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value, - gtl::ArraySlice dimensions, HloComputation* function, + absl::Span dimensions, HloComputation* function, string* failure_reason) { if (!ReductionPreservesLayout(*reduce)) { 
return false; @@ -1676,16 +1613,16 @@ StatusOr IrEmitter::EmitVectorizedReduce( // } // } - llvm_ir::ForLoopNest loop_nest(IrName(reduce), &ir_builder_); - llvm_ir::IrArray::Index array_index(reduce->shape().dimensions_size()); + llvm_ir::ForLoopNest loop_nest(IrName(reduce), &b_); + llvm_ir::IrArray::Index array_index(b_.getInt64Ty(), + reduce->shape().dimensions_size()); for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0; --i) { int64 dimension = LayoutUtil::Minor(reduce->shape().layout(), i); int64 start_index = 0; int64 end_index = reduce->shape().dimensions(dimension); - std::unique_ptr loop = - loop_nest.AddLoop(start_index, end_index, - tensorflow::strings::Printf("dim.%lld", dimension)); + std::unique_ptr loop = loop_nest.AddLoop( + start_index, end_index, absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } @@ -1695,7 +1632,7 @@ StatusOr IrEmitter::EmitVectorizedReduce( if (llvm::BasicBlock* innermost_body_bb = loop_nest.GetInnerLoopBodyBasicBlock()) { - SetToFirstInsertPoint(innermost_body_bb, &ir_builder_); + SetToFirstInsertPoint(innermost_body_bb, &b_); } auto outermost_loop_exit_block = loop_nest.GetOuterLoopExitBasicBlock(); @@ -1704,12 +1641,12 @@ StatusOr IrEmitter::EmitVectorizedReduce( int64 start_index = 0; int64 end_index = (innermost_dimension_size / vectorization_factor) * vectorization_factor; - std::unique_ptr loop = loop_nest.AddLoop( - start_index, end_index, vectorization_factor, - tensorflow::strings::Printf("dim.%lld", innermost_dimension)); + std::unique_ptr loop = + loop_nest.AddLoop(start_index, end_index, vectorization_factor, + absl::StrFormat("dim.%d", innermost_dimension)); array_index[innermost_dimension] = loop->GetIndVarValue(); - SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &b_); ShardedVectorType vector_type = CreateShardedVectorType( reduce->shape().element_type(), vectorization_factor); @@ 
-1720,16 +1657,16 @@ StatusOr IrEmitter::EmitVectorizedReduce( llvm_ir::IrArray target_array = GetIrArrayFor(reduce); llvm::Value* output_address = - target_array.EmitArrayElementAddress(array_index, &ir_builder_); + target_array.EmitArrayElementAddress(array_index, &b_); EmitShardedVectorStore(output_address, accumulator, element_alignment, target_array); if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) { CHECK_GT(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1); - ir_builder_.SetInsertPoint(exit_terminator); + b_.SetInsertPoint(exit_terminator); } else { CHECK_EQ(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1); - ir_builder_.SetInsertPoint(loop->GetExitBasicBlock()); + b_.SetInsertPoint(loop->GetExitBasicBlock()); } } @@ -1739,8 +1676,8 @@ StatusOr IrEmitter::EmitVectorizedReduce( if (innermost_dimension_size % vectorization_factor) { // TODO(b/63775531): Consider using a scalar loop here to save on code size. array_index[innermost_dimension] = - ir_builder_.getInt64(innermost_dimension_size - - (innermost_dimension_size % vectorization_factor)); + b_.getInt64(innermost_dimension_size - + (innermost_dimension_size % vectorization_factor)); ShardedVectorType vector_type = CreateShardedVectorType( reduce->shape().element_type(), @@ -1752,22 +1689,80 @@ StatusOr IrEmitter::EmitVectorizedReduce( llvm_ir::IrArray target_array = GetIrArrayFor(reduce); llvm::Value* output_address = - target_array.EmitArrayElementAddress(array_index, &ir_builder_); + target_array.EmitArrayElementAddress(array_index, &b_); EmitShardedVectorStore(output_address, accumulator, element_alignment, target_array); } if (outermost_loop_exit_block) { - ir_builder_.SetInsertPoint(outermost_loop_exit_block); + b_.SetInsertPoint(outermost_loop_exit_block); } return true; } +StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( + HloReduceInstruction* reduce, const llvm_ir::IrArray::Index& index) { + const HloInstruction* arg = reduce->mutable_operand(0); + const 
HloInstruction* init_value = reduce->mutable_operand(1); + absl::Span dimensions(reduce->dimensions()); + + // Initialize an accumulator with init_value. + PrimitiveType accumulator_type = reduce->shape().element_type(); + llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), "accumulator", + &b_, MinimumAlignmentForPrimitiveType(accumulator_type)); + llvm::Value* init_value_addr = GetEmittedValueFor(init_value); + llvm::Value* load_init_value = Load(init_value_addr); + Store(load_init_value, accumulator_addr); + + // The enclosing loops go over all the target elements. Now we have to compute + // the actual target element. For this, we build a new loop nest to iterate + // over all the reduction dimensions in the argument. + // AddLoopsForShapeOnDimensions will return an Index where induction Value*s + // are placed for each dimension in dimensions, and all the rest are nullptrs. + llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); + const llvm_ir::IrArray::Index reduced_dims_index = + loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, + "reduction_dim"); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); + + // Build a full index for the input argument, using reduced_dims_index as the + // base. In reduced_dims_index only the reduction dimensions are filled in. We + // fill in the rest of the dimensions with induction Value*s taken from + // 'index' which iterates over the target array. See the high-level + // description in the XLA documentation for details. + llvm_ir::IrArray arg_array(GetIrArrayFor(arg)); + llvm_ir::IrArray::Index input_index = reduced_dims_index; + llvm_ir::IrArray::Index::const_iterator it = index.begin(); + + for (size_t i = 0; i < input_index.size(); ++i) { + if (input_index[i] == nullptr) { + input_index[i] = *it++; + } + } + CHECK(index.end() == it); + + // Apply the reduction function to the loaded value. 
+ llvm::Value* input_element = arg_array.EmitReadArrayElement(input_index, &b_); + llvm::Value* result = EmitThreadLocalCall( + *reduce->to_apply(), {Load(accumulator_addr), input_element}, + "reduce_function"); + Store(result, accumulator_addr); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); + return Load(accumulator_addr); +} + Status IrEmitter::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support variadic reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Variadic reduce is not supported on CPU"); + } auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); - gtl::ArraySlice dimensions(reduce->dimensions()); + absl::Span dimensions(reduce->dimensions()); HloComputation* function = reduce->to_apply(); if (!options::VectorizedReduceDisabled(hlo_module_config_)) { string vectorization_failure_reason; @@ -1785,61 +1780,11 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { } } - // The called computation should have been emitted previously. - llvm::Function* reducer_function = FindOrDie(emitted_functions_, function); - return EmitTargetElementLoop( - reduce, [this, reduce, arg, init_value, dimensions, - reducer_function](const llvm_ir::IrArray::Index& index) { - // Initialize an accumulator with init_value. - PrimitiveType accumulator_type = reduce->shape().element_type(); - llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), - "accumulator", &ir_builder_, - MinimumAlignmentForPrimitiveType(accumulator_type)); - llvm::Value* init_value_addr = GetEmittedValueFor(init_value); - llvm::Value* load_init_value = ir_builder_.CreateLoad(init_value_addr); - ir_builder_.CreateStore(load_init_value, accumulator_addr); - - // The enclosing loops go over all the target elements. Now we have to - // compute the actual target element. 
For this, we build a new loop nest - // to iterate over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction - // Value*s are placed for each dimension in dimensions, and all the rest - // are nullptrs. - llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_); - const llvm_ir::IrArray::Index reduced_dims_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); - - // Build a full index for the input argument, using reduced_dims_index - // as the base. In reduced_dims_index only the reduction dimensions are - // filled in. We fill in the rest of the dimensions with induction - // Value*s taken from 'index' which iterates over the target array. - // See the high-level description in the XLA documentation for details. - llvm_ir::IrArray arg_array(GetIrArrayFor(arg)); - llvm_ir::IrArray::Index input_index = reduced_dims_index; - llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (size_t i = 0; i < input_index.size(); ++i) { - if (input_index[i] == nullptr) { - input_index[i] = *it++; - } - } - CHECK(index.end() == it); - - // Apply the reduction function to the loaded value. 
- llvm::Value* input_address = - arg_array.EmitArrayElementAddress(input_index, &ir_builder_); - llvm::Value* result = EmitElementFunctionCall( - reducer_function, reduce->shape(), - {accumulator_addr, input_address}, "reduce_function"); - ir_builder_.CreateStore(result, accumulator_addr); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); - return ir_builder_.CreateLoad(accumulator_addr); - }); + return EmitTargetElementLoop(reduce, + [&](const llvm_ir::IrArray::Index& index) { + return EmitTargetElementLoopBodyForReduce( + Cast(reduce), index); + }); } Status IrEmitter::HandleSend(HloInstruction* send) { @@ -1852,6 +1797,10 @@ Status IrEmitter::HandleSendDone(HloInstruction* send_done) { return Unimplemented("Send-done is not implemented on CPU."); } +Status IrEmitter::HandleScatter(HloInstruction*) { + return Unimplemented("Scatter is not implemented on CPUs."); +} + Status IrEmitter::HandleSlice(HloInstruction* slice) { VLOG(2) << "HandleSlice: " << slice->ToString(); auto operand = slice->operand(0); @@ -1868,7 +1817,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(slice)); - if (ShapeUtil::HasZeroElements(slice->shape())) { + if (ShapeUtil::IsZeroElementArray(slice->shape())) { return Status::OK(); } @@ -1941,7 +1890,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) { llvm_ir::IrArray target_array = GetIrArrayFor(slice); const int64 num_outer_loops = outer_dims.size(); - llvm_ir::ForLoopNest loops(IrName(slice), &ir_builder_); + llvm_ir::ForLoopNest loops(IrName(slice), &b_); llvm_ir::IrArray::Index target_index = loops.AddLoopsForShapeOnDimensions(slice->shape(), outer_dims, "slice"); @@ -1950,21 +1899,21 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) { // for the rest of the dimensions the copy writes to the full dimension. 
std::replace(target_index.begin(), target_index.end(), static_cast(nullptr), - static_cast(ir_builder_.getInt64(0))); + static_cast(b_.getInt64(0))); if (num_outer_loops > 0) { - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); } llvm_ir::IrArray source_array = GetIrArrayFor(operand); const llvm_ir::IrArray::Index source_index = target_index.SourceIndexOfSlice( /*shape=*/slice->shape(), /*starts=*/slice->slice_starts(), - /*strides=*/slice->slice_strides(), /*builder=*/&ir_builder_); + /*strides=*/slice->slice_strides(), /*builder=*/&b_); - llvm::Value* memcpy_dest = target_array.EmitArrayElementAddress( - target_index, &ir_builder_, "slice.dest"); - llvm::Value* memcpy_source = source_array.EmitArrayElementAddress( - source_index, &ir_builder_, "slice.source"); + llvm::Value* memcpy_dest = + target_array.EmitArrayElementAddress(target_index, &b_, "slice.dest"); + llvm::Value* memcpy_source = + source_array.EmitArrayElementAddress(source_index, &b_, "slice.source"); const int64 memcpy_elements = primitive_elements_per_logical_element * memcpy_logical_elements; @@ -1981,7 +1930,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) { } if (num_outer_loops > 0) { - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); } return Status::OK(); @@ -2007,7 +1956,7 @@ Status IrEmitter::HandleDynamicUpdateSlice( auto operands = GetIrArraysForOperandsOf(dynamic_update_slice); return llvm_ir::EmitDynamicUpdateSliceInPlace( operands, GetIrArrayFor(dynamic_update_slice), - IrName(dynamic_update_slice, "in_place"), &ir_builder_); + IrName(dynamic_update_slice, "in_place"), &b_); } return DefaultAction(dynamic_update_slice); } @@ -2041,43 +1990,41 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { [this, pad](const llvm_ir::IrArray::Index& target_index) { const HloInstruction* padding_value = 
pad->operand(1); llvm::Value* padding_value_addr = GetEmittedValueFor(padding_value); - return ir_builder_.CreateLoad(padding_value_addr); + return Load(padding_value_addr); })); // Create a loop to iterate over the operand elements and update the output // locations where the operand elements should be stored. - llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &ir_builder_); + llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &b_); const HloInstruction* operand = pad->operand(0); const llvm_ir::IrArray::Index operand_index = loops.AddLoopsForShape(operand->shape(), "operand"); - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); // Load an element from the operand. llvm_ir::IrArray operand_array(GetIrArrayFor(operand)); llvm::Value* operand_data = - operand_array.EmitReadArrayElement(operand_index, &ir_builder_); + operand_array.EmitReadArrayElement(operand_index, &b_); // Compute the output index the operand element should be assigned to. // output_index := edge_padding_low + operand_index * (interior_padding + 1) const PaddingConfig& padding_config = pad->padding_config(); - llvm_ir::IrArray::Index output_index; + llvm_ir::IrArray::Index output_index(operand_index.GetType()); for (size_t i = 0; i < operand_index.size(); ++i) { - llvm::Value* offset = ir_builder_.CreateMul( - operand_index[i], - ir_builder_.getInt64(padding_config.dimensions(i).interior_padding() + - 1)); - llvm::Value* index = ir_builder_.CreateAdd( - offset, - ir_builder_.getInt64(padding_config.dimensions(i).edge_padding_low())); + llvm::Value* offset = + Mul(operand_index[i], + b_.getInt64(padding_config.dimensions(i).interior_padding() + 1)); + llvm::Value* index = Add( + offset, b_.getInt64(padding_config.dimensions(i).edge_padding_low())); output_index.push_back(index); } // Store the operand element to the computed output location. 
llvm_ir::IrArray output_array(GetIrArrayFor(pad)); - output_array.EmitWriteArrayElement(output_index, operand_data, &ir_builder_); + output_array.EmitWriteArrayElement(output_index, operand_data, &b_); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); return Status::OK(); } @@ -2099,8 +2046,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { // Delegate to common implementation of fused in-place dynamic-update-slice. auto operands = GetIrArraysForOperandsOf(fusion); return llvm_ir::EmitFusedDynamicUpdateSliceInPlace( - fusion, operands, GetIrArrayFor(fusion), &elemental_emitter, - &ir_builder_); + fusion, operands, GetIrArrayFor(fusion), &elemental_emitter, &b_); } else if (fusion->fusion_kind() == HloInstruction::FusionKind::kLoop) { VLOG(3) << "HandleFusion kLoop"; CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_); @@ -2135,7 +2081,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation( *dot, target_array, lhs_array, rhs_array, &addend_array, - GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_, + GetExecutableRunOptionsArgument(), &b_, hlo_module_config_, target_machine_features_)); return Status::OK(); } else { @@ -2147,65 +2093,57 @@ Status IrEmitter::HandleCall(HloInstruction* call) { HloComputation* computation = call->to_apply(); llvm::Function* call_ir_function = FindOrDie(emitted_functions_, computation); - std::vector parameter_addresses; - for (const HloInstruction* operand : call->operands()) { - parameter_addresses.push_back(GetEmittedValueFor(operand)); - } - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call)); if (!computation->root_instruction()->outer_dimension_partitions().empty()) { // ParallelTaskAssignment assigned partitions, emit call to // ParallelForkJoin. 
std::vector call_args = GetArrayFunctionCallArguments( - parameter_addresses, &ir_builder_, computation->name(), + {}, &b_, computation->name(), /*return_value_buffer=*/emitted_value_[call], /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), - /*temp_buffers_arg=*/GetTempBuffersArgument(), + /*buffer_table_arg=*/GetBufferTableArgument(), /*profile_counters_arg=*/GetProfileCountersArgument()); HloInstruction* root = computation->root_instruction(); TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin( - call_args, root->shape(), root->outer_dimension_partitions(), - &ir_builder_, call_ir_function, computation->name())); + call_args, root->shape(), root->outer_dimension_partitions(), &b_, + call_ir_function, computation->name())); } else { - EmitArrayFunctionCallInto(call_ir_function, parameter_addresses, - emitted_value_[call], computation->name()); + EmitGlobalCall(*computation, computation->name()); } return Status::OK(); } Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { - gtl::ArraySlice operands(custom_call->operands()); - tensorflow::StringPiece custom_call_target(custom_call->custom_call_target()); - llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy(); + absl::Span operands(custom_call->operands()); + absl::string_view custom_call_target(custom_call->custom_call_target()); + llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = llvm_ir::EmitAllocaAtFunctionEntryWithCount( - i8_ptr_type, ir_builder_.getInt32(operands.size()), - "cc_operands_alloca", &ir_builder_); + i8_ptr_type, b_.getInt32(operands.size()), "cc_operands_alloca", &b_); for (size_t i = 0; i < operands.size(); ++i) { const HloInstruction* operand = operands[i]; llvm::Value* operand_as_i8ptr = - ir_builder_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type); - llvm::Value* slot_in_operands_alloca = ir_builder_.CreateInBoundsGEP( - operands_alloca, {ir_builder_.getInt64(i)}); - ir_builder_.CreateStore(operand_as_i8ptr, 
slot_in_operands_alloca); + PointerCast(GetEmittedValueFor(operand), i8_ptr_type); + llvm::Value* slot_in_operands_alloca = + InBoundsGEP(operands_alloca, {b_.getInt64(i)}); + Store(operand_as_i8ptr, slot_in_operands_alloca); } auto* custom_call_ir_function = llvm::cast(module_->getOrInsertFunction( AsStringRef(custom_call_target), llvm::FunctionType::get( - /*Result=*/ir_builder_.getVoidTy(), + /*Result=*/b_.getVoidTy(), /*Params=*/{i8_ptr_type, operands_alloca->getType()}, /*isVarArg=*/false))); TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call)); - auto* output_address_arg = ir_builder_.CreatePointerCast( - GetEmittedValueFor(custom_call), i8_ptr_type); + auto* output_address_arg = + PointerCast(GetEmittedValueFor(custom_call), i8_ptr_type); - ir_builder_.CreateCall(custom_call_ir_function, - {output_address_arg, operands_alloca}); + Call(custom_call_ir_function, {output_address_arg, operands_alloca}); return Status::OK(); } @@ -2232,8 +2170,8 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { return InternalError( "instruction %s %s does not share slice with " "instruction %s %s", - a->ToString().c_str(), slice_a.ToString().c_str(), - b->ToString().c_str(), slice_b.ToString().c_str()); + a->ToString(), slice_a.ToString(), b->ToString(), + slice_b.ToString()); } return Status::OK(); }; @@ -2253,12 +2191,6 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { const HloInstruction* init = xla_while->operand(0); emitted_value_[xla_while] = GetEmittedValueFor(init); - // The called computation should have been emitted previously. 
- llvm::Function* condition_ir_function = - FindOrDie(emitted_functions_, condition); - llvm::Function* body_ir_function = - FindOrDie(emitted_functions_, xla_while->while_body()); - // Generating: // while (Condition(while_result)) { // // CopyInsertion pass inserts copies which enable 'while_result' to @@ -2270,17 +2202,14 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { llvm::BasicBlock* header_bb = llvm::BasicBlock::Create( module_->getContext(), AsStringRef(IrName(xla_while, "header")), compute_function_->function()); - ir_builder_.CreateBr(header_bb); - ir_builder_.SetInsertPoint(header_bb); + Br(header_bb); + b_.SetInsertPoint(header_bb); // Calls the condition function to determine whether to proceed with the // body. It must return a bool, so use the scalar call form. - llvm::Value* while_result = GetEmittedValueFor(xla_while); - llvm::Value* while_condition = EmitElementFunctionCall( - condition_ir_function, condition->root_instruction()->shape(), - {while_result}, IrName(xla_while, "cond")); - llvm::Value* while_predicate = ir_builder_.CreateICmpNE( - while_condition, + EmitGlobalCall(*xla_while->while_condition(), IrName(xla_while, "cond")); + llvm::Value* while_predicate = ICmpNE( + Load(GetBufferForGlobalCallReturnValue(*xla_while->while_condition())), llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0)); // Branches to the body or to the while exit depending on the condition. @@ -2289,26 +2218,26 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { compute_function_->function()); llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create( module_->getContext(), AsStringRef(IrName(xla_while, "exit"))); - ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb); + CondBr(while_predicate, body_bb, exit_bb); // Calls the body function from the body block. - ir_builder_.SetInsertPoint(body_bb); + b_.SetInsertPoint(body_bb); // Calls the body function. 
- EmitArrayFunctionCallInto(body_ir_function, {while_result}, while_result, - IrName(xla_while, "body")); + EmitGlobalCall(*xla_while->while_body(), IrName(xla_while, "body")); + // Finishes with a branch back to the header. - ir_builder_.CreateBr(header_bb); + Br(header_bb); // Adds the exit block to the function and sets the insert point there. compute_function_->function()->getBasicBlockList().push_back(exit_bb); - ir_builder_.SetInsertPoint(exit_bb); + b_.SetInsertPoint(exit_bb); return Status::OK(); } StatusOr IrEmitter::EmitFastConcatenate( - HloInstruction* concatenate, gtl::ArraySlice operands, + HloInstruction* concatenate, absl::Span operands, string* failure_reason) { if (ShouldEmitParallelLoopFor(*concatenate)) { *failure_reason = @@ -2344,21 +2273,20 @@ StatusOr IrEmitter::EmitFastConcatenate( std::vector outer_dims(std::next(concat_dim_layout_itr), output_min2maj.end()); - llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy(); - llvm::Type* i8_type = ir_builder_.getInt8Ty(); + llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); TF_RETURN_IF_ERROR(EmitTargetAddressForOp(concatenate)); llvm_ir::IrArray target_array = GetIrArrayFor(concatenate); - llvm_ir::ForLoopNest loops(IrName(concatenate), &ir_builder_); + llvm_ir::ForLoopNest loops(IrName(concatenate), &b_); llvm_ir::IrArray::Index outer_dims_index = loops.AddLoopsForShapeOnDimensions(output_shape, outer_dims, "concat"); std::replace(outer_dims_index.begin(), outer_dims_index.end(), static_cast(nullptr), - static_cast(ir_builder_.getInt64(0))); + static_cast(b_.getInt64(0))); if (!outer_dims.empty()) { - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); } PrimitiveType primitive_type = output_shape.element_type(); @@ -2367,10 +2295,10 @@ StatusOr IrEmitter::EmitFastConcatenate( // Contiguous subregions from each operand to the concatenate contribute to a // contiguous subregion in the target buffer starting at 
target_region_begin. - llvm::Value* target_region_begin = ir_builder_.CreateBitCast( - target_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_, - "target_region"), - i8_ptr_type); + llvm::Value* target_region_begin = + BitCast(target_array.EmitArrayElementAddress(outer_dims_index, &b_, + "target_region"), + i8_ptr_type); int64 byte_offset_into_target_region = 0; int64 inner_dims_product = @@ -2384,14 +2312,12 @@ StatusOr IrEmitter::EmitFastConcatenate( for (HloInstruction* operand : operands) { const Shape& input_shape = operand->shape(); llvm_ir::IrArray source_array = GetIrArrayFor(operand); - llvm::Value* copy_source_address = ir_builder_.CreateBitCast( - source_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_, - "src_addr"), + llvm::Value* copy_source_address = BitCast( + source_array.EmitArrayElementAddress(outer_dims_index, &b_, "src_addr"), i8_ptr_type); - llvm::Value* copy_target_address = ir_builder_.CreateGEP( - i8_type, target_region_begin, - ir_builder_.getInt64(byte_offset_into_target_region)); + llvm::Value* copy_target_address = + GEP(target_region_begin, b_.getInt64(byte_offset_into_target_region)); EmitTransferElements( copy_target_address, copy_source_address, @@ -2404,7 +2330,7 @@ StatusOr IrEmitter::EmitFastConcatenate( } if (!outer_dims.empty()) { - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); } return true; @@ -2423,16 +2349,15 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source, llvm_ir::PrimitiveTypeToIrType(primitive_type, module_)); if (element_count == 1) { - auto* load_instruction = ir_builder_.CreateAlignedLoad( - ir_builder_.CreateBitCast(source, primitive_ptr_type), - element_alignment); + auto* load_instruction = + AlignedLoad(BitCast(source, primitive_ptr_type), element_alignment); source_array.AnnotateLoadStoreInstructionWithMetadata(load_instruction); - auto* store_instruction = 
ir_builder_.CreateAlignedStore( - load_instruction, ir_builder_.CreateBitCast(target, primitive_ptr_type), - element_alignment); + auto* store_instruction = + AlignedStore(load_instruction, BitCast(target, primitive_ptr_type), + element_alignment); target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction); } else { - auto* memcpy_instruction = ir_builder_.CreateMemCpy( + auto* memcpy_instruction = MemCpy( target, /*DstAlign=*/element_alignment, source, /*SrcAlign=*/element_alignment, element_count * primitive_type_size); @@ -2448,7 +2373,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source, } Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) { - gtl::ArraySlice operands(concatenate->operands()); + absl::Span operands(concatenate->operands()); string failure_reason; TF_ASSIGN_OR_RETURN( bool successful, @@ -2466,8 +2391,6 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) { Status IrEmitter::HandleConditional(HloInstruction* conditional) { auto pred = conditional->operand(0); - auto true_arg = conditional->operand(1); - auto false_arg = conditional->operand(2); TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) && pred->shape().element_type() == PRED) << "Predicate on a Conditional must be bool; got: " @@ -2489,37 +2412,55 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { << " and " << ShapeUtil::HumanString(false_computation->root_instruction()->shape()); - llvm::Function* true_function = - FindOrDie(emitted_functions_, true_computation); - llvm::Function* false_function = - FindOrDie(emitted_functions_, false_computation); - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional)); - llvm::Value* conditional_result = GetEmittedValueFor(conditional); // Generating: // if (pred) // cond_result = true_computation(true_operand) // else // cond_result = false_computation(false_operand) - llvm::LoadInst* pred_value = ir_builder_.CreateLoad( - 
GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value"); - llvm::Value* pred_cond = ir_builder_.CreateICmpNE( + llvm::LoadInst* pred_value = + Load(GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value"); + llvm::Value* pred_cond = ICmpNE( pred_value, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0), "boolean_predicate"); llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_); + llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_); + + SetToFirstInsertPoint(if_data.true_block, &b_); + EmitGlobalCall(*conditional->true_computation(), + IrName(conditional, "_true")); - SetToFirstInsertPoint(if_data.true_block, &ir_builder_); - EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)}, - conditional_result, IrName(conditional, "_true")); + SetToFirstInsertPoint(if_data.false_block, &b_); + EmitGlobalCall(*conditional->false_computation(), + IrName(conditional, "_false")); - SetToFirstInsertPoint(if_data.false_block, &ir_builder_); - EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)}, - conditional_result, IrName(conditional, "_false")); + SetToFirstInsertPoint(if_data.after_block, &b_); + return Status::OK(); +} + +Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) { + TF_RET_CHECK(ByteSizeOf(gen_token->shape()) == 0); + // No code to generate, but we need to emit an address for book-keeping. 
+ TF_RETURN_IF_ERROR(EmitTargetAddressForOp(gen_token)); + return Status::OK(); +} + +Status IrEmitter::HandleRng(HloInstruction* rng) { + ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator; + for (const HloInstruction* operand : rng->operands()) { + operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) { + return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_); + }; + } + + CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_); + TF_RETURN_IF_ERROR(EmitTargetElementLoop( + rng, elemental_emitter.MakeElementGenerator(rng, operand_to_generator))); + + llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_); - SetToFirstInsertPoint(if_data.after_block, &ir_builder_); return Status::OK(); } @@ -2540,7 +2481,7 @@ Status IrEmitter::FinishVisit(HloInstruction* root) { auto record_complete_computation = [&](llvm::Value* prof_counter) { if (prof_counter) { - profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter); + profiling_state_.RecordCompleteComputation(&b_, prof_counter); } }; @@ -2562,54 +2503,51 @@ llvm::Value* IrEmitter::GetProfileCounterCommon( int64 prof_counter_idx = it->second; string counter_name = IrName("prof_counter", hlo.name()); - return ir_builder_.CreateGEP(GetProfileCountersArgument(), - ir_builder_.getInt64(prof_counter_idx), - AsStringRef(counter_name)); + return GEP(GetProfileCountersArgument(), b_.getInt64(prof_counter_idx), + AsStringRef(counter_name)); } -void IrEmitter::ProfilingState::UpdateProfileCounter( - llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter, - llvm::Value* cycle_end, llvm::Value* cycle_start) { - auto* cycle_diff = ir_builder->CreateSub(cycle_end, cycle_start); +void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b, + llvm::Value* prof_counter, + llvm::Value* cycle_end, + llvm::Value* cycle_start) { + auto* cycle_diff = b->CreateSub(cycle_end, cycle_start); llvm::LoadInst* old_cycle_count = - 
ir_builder->CreateLoad(prof_counter, "old_cycle_count"); + b->CreateLoad(prof_counter, "old_cycle_count"); auto* new_cycle_count = - ir_builder->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count"); - ir_builder->CreateStore(new_cycle_count, prof_counter); + b->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count"); + b->CreateStore(new_cycle_count, prof_counter); } -llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter( - llvm::IRBuilder<>* ir_builder) { - llvm::Module* module = ir_builder->GetInsertBlock()->getModule(); +llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(llvm::IRBuilder<>* b) { + llvm::Module* module = b->GetInsertBlock()->getModule(); if (use_rdtscp_) { llvm::Function* func_llvm_readcyclecounter = llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::readcyclecounter); - return ir_builder->CreateCall(func_llvm_readcyclecounter); + return b->CreateCall(func_llvm_readcyclecounter); } llvm::Function* func_llvm_x86_rdtscp = llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::x86_rdtscp); if (!aux_i8ptr_) { - llvm::AllocaInst* rdtscp_aux = llvm_ir::EmitAllocaAtFunctionEntry( - ir_builder->getInt32Ty(), "rdtscp_aux", ir_builder); - aux_i8ptr_ = - ir_builder->CreateBitCast(rdtscp_aux, ir_builder->getInt8PtrTy()); + llvm::AllocaInst* rdtscp_aux = + llvm_ir::EmitAllocaAtFunctionEntry(b->getInt32Ty(), "rdtscp_aux", b); + aux_i8ptr_ = b->CreateBitCast(rdtscp_aux, b->getInt8PtrTy()); } - llvm::ConstantInt* alloca_size = ir_builder->getInt64(4); + llvm::ConstantInt* alloca_size = b->getInt64(4); llvm::Function* func_llvm_lifetime_start = llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_start); - ir_builder->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_}); - llvm::Value* rdtscp_call = - ir_builder->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_); + b->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_}); + llvm::Value* rdtscp_call = b->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_); 
llvm::Function* func_llvm_lifetime_end = llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_end); - ir_builder->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_}); + b->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_}); return rdtscp_call; } -void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder, +void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo) { - auto* cycle_start = ReadCycleCounter(ir_builder); + auto* cycle_start = ReadCycleCounter(b); cycle_start->setName(AsStringRef(IrName(hlo, "cycle_start"))); cycle_starts_[hlo] = cycle_start; if (first_read_cycle_start_ == nullptr) { @@ -2617,20 +2555,20 @@ void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder, } } -void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* ir_builder, +void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* b, HloInstruction* hlo, llvm::Value* prof_counter) { - auto* cycle_end = ReadCycleCounter(ir_builder); + auto* cycle_end = ReadCycleCounter(b); cycle_end->setName(AsStringRef(IrName(hlo, "cycle_end"))); auto* cycle_start = cycle_starts_[hlo]; - UpdateProfileCounter(ir_builder, prof_counter, cycle_end, cycle_start); + UpdateProfileCounter(b, prof_counter, cycle_end, cycle_start); last_read_cycle_end_ = cycle_end; } void IrEmitter::ProfilingState::RecordCompleteComputation( - llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter) { + llvm::IRBuilder<>* b, llvm::Value* prof_counter) { if (last_read_cycle_end_ && first_read_cycle_start_) { - UpdateProfileCounter(ir_builder, prof_counter, last_read_cycle_end_, + UpdateProfileCounter(b, prof_counter, last_read_cycle_end_, first_read_cycle_start_); } } @@ -2638,14 +2576,14 @@ void IrEmitter::ProfilingState::RecordCompleteComputation( Status IrEmitter::Preprocess(HloInstruction* hlo) { VLOG(3) << "Visiting: " << hlo->ToString(); if (instruction_to_profile_idx_.count(hlo)) { - 
profiling_state_.RecordCycleStart(&ir_builder_, hlo); + profiling_state_.RecordCycleStart(&b_, hlo); } return Status::OK(); } Status IrEmitter::Postprocess(HloInstruction* hlo) { if (auto* prof_counter = GetProfileCounterFor(*hlo)) { - profiling_state_.RecordCycleDelta(&ir_builder_, hlo, prof_counter); + profiling_state_.RecordCycleDelta(&b_, hlo, prof_counter); } return Status::OK(); } @@ -2684,50 +2622,81 @@ llvm::Value* IrEmitter::GetProfileCountersArgument() { return compute_function_->profile_counters_arg(); } -llvm::Value* IrEmitter::GetTempBuffersArgument() { - return compute_function_->temp_buffers_arg(); +llvm::Value* IrEmitter::GetBufferTableArgument() { + return compute_function_->buffer_table_arg(); } llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() { return compute_function_->exec_run_options_arg(); } -llvm::Value* IrEmitter::EmitTempBufferPointer( +llvm::Value* IrEmitter::EmitThreadLocalBufferPointer( const BufferAllocation::Slice& slice, const Shape& target_shape) { - llvm::Type* element_type = IrShapeType(target_shape); - // The alignment and number of bytes within the temporary buffer is determined - // by the maximal shape as determined by buffer assignment. 
- const BufferAllocation& allocation = assignment_.GetAllocation(slice.index()); - if (allocation.is_thread_local()) { + const BufferAllocation& allocation = *slice.allocation(); + llvm::Value* tempbuf_address = [&]() -> llvm::Value* { + if (slice == computation_root_allocation_) { + llvm::Argument* retval = compute_function_->result_arg(); + llvm::AttrBuilder attr_builder; + attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape)); + attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape)); + retval->addAttrs(attr_builder); + return retval; + } + + auto param_it = + computation_parameter_allocations_.find(slice.allocation()->index()); + if (param_it != computation_parameter_allocations_.end()) { + int64 param_number = param_it->second; + // We have to access the parameter at offset param_number in the params + // array. The code generated here is equivalent to this C code: + // + // i8* param_address_untyped = params[param_number]; + // Param* param_address_typed = (Param*)param_address_untyped; + // + // Where Param is the actual element type of the underlying buffer (for + // example, float for an XLA F32 element type). + llvm::Value* params = compute_function_->parameters_arg(); + llvm::Value* param_address_offset = + llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_); + llvm::LoadInst* param_address_untyped = Load(param_address_offset); + + if (!ShapeUtil::IsOpaque(target_shape)) { + AttachAlignmentMetadataForLoad(param_address_untyped, target_shape); + AttachDereferenceableMetadataForLoad(param_address_untyped, + target_shape); + } + return param_address_untyped; + } + // Thread-local allocations should only be assigned a single buffer. 
const auto& assigned_buffers = allocation.assigned_buffers(); CHECK_EQ(1, assigned_buffers.size()); const Shape& shape = assigned_buffers.begin()->first->shape(); - llvm::AllocaInst*& tempbuf_address = thread_local_buffers_[{ - ir_builder_.GetInsertBlock()->getParent(), slice}]; - if (tempbuf_address == nullptr) { - tempbuf_address = llvm_ir::EmitAllocaAtFunctionEntry( - IrShapeType(shape), - tensorflow::strings::StrCat("thread_local", slice.ToString()), - &ir_builder_, MinimumAlignmentForShape(target_shape)); + std::pair key = { + compute_function_->function(), slice}; + auto buf_it = thread_local_buffers_.find(key); + if (buf_it == thread_local_buffers_.end()) { + llvm::Value* buffer = llvm_ir::EmitAllocaAtFunctionEntry( + IrShapeType(shape), absl::StrCat("thread_local", slice.ToString()), + &b_, MinimumAlignmentForShape(target_shape)); + auto it_inserted_pair = thread_local_buffers_.insert({key, buffer}); + CHECK(it_inserted_pair.second); + buf_it = it_inserted_pair.first; } - return ir_builder_.CreateBitCast(tempbuf_address, - element_type->getPointerTo()); - } + return buf_it->second; + }(); + return BitCast(tempbuf_address, IrShapeType(target_shape)->getPointerTo()); +} +llvm::Value* IrEmitter::EmitGlobalBufferPointer( + const BufferAllocation::Slice& slice, const Shape& target_shape) { + const BufferAllocation& allocation = *slice.allocation(); llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP( - GetTempBuffersArgument(), slice.index(), &ir_builder_); - llvm::LoadInst* tempbuf_address_base = - ir_builder_.CreateLoad(tempbuf_address_ptr); - if (is_top_level_computation_ && - hlo_module_config_.debug_options() + GetBufferTableArgument(), slice.index(), &b_); + llvm::LoadInst* tempbuf_address_base = Load(tempbuf_address_ptr); + if (hlo_module_config_.debug_options() .xla_llvm_enable_invariant_load_metadata()) { - // In the entry computation the parameter slots in the %params argument are - // invariant through program execution. 
In computations that are called - // from the entry computation (via kWhile, kCall and kConditional) the - // parameter slots are *not* invariant since they're written to by their - // callers. tempbuf_address_base->setMetadata( llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{})); @@ -2738,87 +2707,29 @@ llvm::Value* IrEmitter::EmitTempBufferPointer( llvm::Value* tempbuf_address_untyped = tempbuf_address_base; if (slice.offset() > 0) { // Adjust the address to account for the slice offset. - tempbuf_address_untyped = ir_builder_.CreateInBoundsGEP( - tempbuf_address_base, ir_builder_.getInt64(slice.offset())); + tempbuf_address_untyped = + InBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset())); } - return ir_builder_.CreateBitCast(tempbuf_address_untyped, - element_type->getPointerTo()); + return BitCast(tempbuf_address_untyped, + IrShapeType(target_shape)->getPointerTo()); } -// Emits a function call returning a single array element. Allocates space -// for a single element_type value, and loads it after call. -llvm::Value* IrEmitter::EmitElementFunctionCall( - llvm::Function* function, const Shape& return_shape, - gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name) { - llvm::Value* return_value_buffer = EmitArrayFunctionCall( - function, return_shape, 1, parameter_addresses, name); - return ir_builder_.CreateLoad( - return_value_buffer, - AsStringRef(tensorflow::strings::StrCat(name, "_return_value"))); -} - -// Emits a core function call based on the following pseudo-code. -// -// char** parameter_addresses_buffer = -// allocate buffer with a pointer for each parameter to the function -// for each parameter index, i.e. for i = 0, ..., #parameters: -// parameter_addresses_buffer[i] = parameter_addresses[i] -// call function(return_value_buffer, -// parameter_addresses_buffer, -// temps) -// return return_value_buffer -- address of the return value. 
-void IrEmitter::EmitArrayFunctionCallInto( - llvm::Function* function, gtl::ArraySlice parameter_addresses, - llvm::Value* return_value_buffer, tensorflow::StringPiece name) { - ir_builder_.CreateCall( - function, GetArrayFunctionCallArguments( - parameter_addresses, &ir_builder_, name, - /*return_value_buffer=*/return_value_buffer, - /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), - /*temp_buffers_arg=*/GetTempBuffersArgument(), - /*profile_counters_arg=*/GetProfileCountersArgument())); -} - -llvm::Value* IrEmitter::EmitArrayFunctionCall( - llvm::Function* function, const Shape& return_shape, int64 element_count, - gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name) { - llvm::Value* elements = - llvm::ConstantInt::get(ir_builder_.getInt64Ty(), element_count); - PrimitiveType return_type = return_shape.element_type(); - llvm::Value* return_value_buffer = - llvm_ir::EmitAllocaAtFunctionEntryWithCount( - llvm_ir::PrimitiveTypeToIrType(return_type, module_), elements, - tensorflow::strings::StrCat(name, "_return_value_address"), - &ir_builder_, MinimumAlignmentForPrimitiveType(return_type)); - EmitArrayFunctionCallInto(function, parameter_addresses, return_value_buffer, - name); - return return_value_buffer; +llvm::Value* IrEmitter::EmitBufferPointer(const BufferAllocation::Slice& slice, + const Shape& target_shape) { + if (slice.allocation()->is_thread_local()) { + return EmitThreadLocalBufferPointer(slice, target_shape); + } else if (slice.allocation()->is_constant()) { + return FindOrDie(constant_buffer_to_global_, slice.allocation()->index()); + } else { + return EmitGlobalBufferPointer(slice, target_shape); + } } Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) { - llvm::Value* addr; const Shape& target_shape = op->shape(); - if (op == op->parent()->root_instruction()) { - // For the root node, we write directly to the output buffer of the - // function. 
- llvm::Argument* retval = compute_function_->result_arg(); - if (!ShapeUtil::IsNil(target_shape)) { - llvm::AttrBuilder attr_builder; - attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape)); - attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape)); - retval->addAttrs(attr_builder); - } - addr = ir_builder_.CreateBitCast(retval, - IrShapeType(target_shape)->getPointerTo()); - } else { - // For other nodes, we need the temporary buffer allocated for this node to - // write the result into. - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, - assignment_.GetUniqueTopLevelSlice(op)); - addr = EmitTempBufferPointer(slice, target_shape); - } + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + assignment_.GetUniqueTopLevelSlice(op)); + llvm::Value* addr = EmitBufferPointer(slice, target_shape); addr->setName(AsStringRef(IrName(op))); emitted_value_[op] = addr; return Status::OK(); @@ -2831,7 +2742,7 @@ Status IrEmitter::EmitTargetElementLoop( } Status IrEmitter::EmitTargetElementLoop( - HloInstruction* target_op, tensorflow::StringPiece desc, + HloInstruction* target_op, absl::string_view desc, const llvm_ir::ElementGenerator& element_generator) { VLOG(2) << "EmitTargetElementLoop: " << target_op->ToString(); @@ -2847,20 +2758,19 @@ Status IrEmitter::EmitTargetElementLoop( TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice, assignment_.GetUniqueSlice(target_op, {i})); const Shape& element_shape = ShapeUtil::GetSubshape(target_shape, {i}); - llvm::Value* op_target_address = - EmitTempBufferPointer(slice, element_shape); + llvm::Value* op_target_address = EmitBufferPointer(slice, element_shape); output_arrays.push_back( llvm_ir::IrArray(op_target_address, element_shape)); } TF_RETURN_IF_ERROR( - llvm_ir::LoopEmitter(element_generator, output_arrays, &ir_builder_) + llvm_ir::LoopEmitter(element_generator, output_arrays, &b_) .EmitLoop(IrName(target_op))); std::vector tuple_operand_ptrs; for (int64 i = 0; i < output_arrays.size(); ++i) 
{ tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer()); } - llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &ir_builder_, module_); + llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &b_, module_); } else { if (ShouldEmitParallelLoopFor(*target_op)) { @@ -2869,11 +2779,11 @@ Status IrEmitter::EmitTargetElementLoop( compute_function_->GetDynamicLoopBounds(); // Emit parallel loop with dynamic loop bounds for most-major dimensions. TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array, - &dynamic_loop_bounds, &ir_builder_) + &dynamic_loop_bounds, &b_) .EmitLoop(IrName(target_op))); } else { TF_RETURN_IF_ERROR( - llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_) + llvm_ir::LoopEmitter(element_generator, target_array, &b_) .EmitLoop(IrName(target_op))); } } @@ -2886,15 +2796,15 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source, llvm::Value* destination_value = GetEmittedValueFor(&destination); int64 source_size = ByteSizeOf(source.shape()); // TODO(b/63762267): Be more aggressive about specifying alignment. 
- ir_builder_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value, - /*SrcAlign=*/1, source_size); + MemCpy(destination_value, /*DstAlign=*/1, source_value, + /*SrcAlign=*/1, source_size); return Status::OK(); } Status IrEmitter::ElementTypesSameAndSupported( const HloInstruction& instruction, - gtl::ArraySlice operands, - gtl::ArraySlice supported_types) { + absl::Span operands, + absl::Span supported_types) { for (auto operand : operands) { TF_RET_CHECK( ShapeUtil::SameElementType(operands[0]->shape(), operand->shape())); @@ -2905,8 +2815,8 @@ Status IrEmitter::ElementTypesSameAndSupported( if (std::find(supported_types.begin(), supported_types.end(), primitive_type) == supported_types.end()) { return Unimplemented("unsupported operand type %s in op %s", - PrimitiveType_Name(primitive_type).c_str(), - HloOpcodeString(instruction.opcode()).c_str()); + PrimitiveType_Name(primitive_type), + HloOpcodeString(instruction.opcode())); } return Status::OK(); } @@ -2915,7 +2825,7 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) { ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator; for (const HloInstruction* operand : hlo->operands()) { operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) { - return GetIrArrayFor(operand).EmitReadArrayElement(index, &ir_builder_); + return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_); }; } CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_); @@ -2923,20 +2833,71 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) { hlo, elemental_emitter.MakeElementGenerator(hlo, operand_to_generator)); } -StatusOr IrEmitter::EmitScalarCall( - PrimitiveType return_type, HloComputation* computation, - const std::vector& arguments, tensorflow::StringPiece name) { - llvm::Function* llvm_function = FindOrDie(emitted_functions_, computation); - std::vector argument_addrs; - for (auto argument : arguments) { - llvm::Value* argument_addr = llvm_ir::EmitAllocaAtFunctionEntry( 
- argument->getType(), "arg_addr", &ir_builder_); - ir_builder_.CreateStore(argument, argument_addr); - argument_addrs.push_back(argument_addr); +llvm::Value* IrEmitter::EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) { + CHECK(absl::c_binary_search(thread_local_computations_, &callee)); + + const Shape& return_shape = callee.root_instruction()->shape(); + + // Lifting this restriction to allow "small" arrays should be easy. Allowing + // larger arrays is difficult because we allocate the buffer for this return + // value on the stack. + CHECK(ShapeUtil::IsScalar(return_shape)); + + PrimitiveType return_type = return_shape.element_type(); + + std::vector parameter_addrs; + for (llvm::Value* parameter : parameters) { + CHECK(!parameter->getType()->isPointerTy()); + llvm::Value* parameter_addr = llvm_ir::EmitAllocaAtFunctionEntry( + parameter->getType(), "arg_addr", &b_); + Store(parameter, parameter_addr); + parameter_addrs.push_back(parameter_addr); + } + + llvm::Value* return_value_buffer = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(return_type, module_), + absl::StrCat(name, "_retval_addr"), &b_, + MinimumAlignmentForPrimitiveType(return_type)); + + Call(FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + parameter_addrs, &b_, name, + /*return_value_buffer=*/return_value_buffer, + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*buffer_table_arg=*/ + llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo()), + /*profile_counters_arg=*/GetProfileCountersArgument())); + + return Load(return_value_buffer); +} + +void IrEmitter::EmitGlobalCall(const HloComputation& callee, + absl::string_view name) { + CHECK(absl::c_binary_search(global_computations_, &callee)); + + Call(FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + /*parameter_addresses=*/{}, &b_, name, + /*return_value_buffer=*/ + 
llvm::Constant::getNullValue(b_.getInt8PtrTy()), + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*buffer_table_arg=*/GetBufferTableArgument(), + /*profile_counters_arg=*/GetProfileCountersArgument())); +} + +llvm::Value* IrEmitter::GetBufferForGlobalCallReturnValue( + const HloComputation& callee) { + const HloInstruction* root_inst = callee.root_instruction(); + if (root_inst->opcode() == HloOpcode::kOutfeed) { + return llvm::Constant::getNullValue(b_.getInt8PtrTy()); } - return EmitElementFunctionCall(llvm_function, - ShapeUtil::MakeShape(return_type, {}), - argument_addrs, name); + + const BufferAllocation::Slice root_buffer = + assignment_.GetUniqueTopLevelSlice(root_inst).ValueOrDie(); + return EmitBufferPointer(root_buffer, root_inst->shape()); } + } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index f49cfc1dc378bb80da3ddf995363acfa2081067b..58a333b8fb2dc46868b04fec0d7d87788a809d06 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -23,6 +23,8 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -30,22 +32,21 @@ limitations under the License. 
#include "llvm/IR/Value.h" #include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h" #include "tensorflow/compiler/xla/service/cpu/ir_function.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -55,29 +56,26 @@ namespace cpu { // This class is the top-level API for the XLA HLO --> LLVM IR compiler. It // implements the DfsHloVisitor interface and emits HLO computations as LLVM IR // functions. -class IrEmitter : public DfsHloVisitorWithDefault { +class IrEmitter : public DfsHloVisitorWithDefault, + public IrBuilderMixin { public: // Create a new LLVM IR emitter. // // hlo_module: the HLO module we are emitting IR for. - // assignment: a BufferAssignment from which we know which temporary buffers - // are used by the HLO nodes. 
+ // assignment: a BufferAssignment from which we know which buffers are used by + // the HLO nodes. // llvm_module: the LLVM module to emit IR into. // instruction_to_profile_idx: the mapping from HLO instructions to their // index in the profiling array. // computation_to_profile_idx: the mapping from HLO computations to their // index in the profiling array. - // external_constant_pool: if non-null, points to an ExternalConstantPool - // instance into which the Ir emitter can spill - // constants. IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment, llvm::Module* llvm_module, std::unordered_map instruction_to_profile_idx, std::unordered_map computation_to_profile_idx, - const TargetMachineFeatures* target_machine, - ExternalConstantPool* external_constant_pool); + const TargetMachineFeatures* target_machine); ~IrEmitter() override; // Emit and return the given HLO computation as an LLVM IR @@ -102,12 +100,19 @@ class IrEmitter : public DfsHloVisitorWithDefault { bool is_top_level_computation, std::vector* instruction_order); - llvm::IRBuilder<>* ir_builder() { return &ir_builder_; } + llvm::IRBuilder<>* b() { return &b_; } - // Emits a call to `computation` with scalar arguments `arguments`. - StatusOr EmitScalarCall( - PrimitiveType return_type, HloComputation* computation, - const std::vector& arguments, tensorflow::StringPiece name); + // builder() is for IrBuilderMixin. + llvm::IRBuilder<>* builder() { return &b_; } + + // Emit an LLVM global variable for every constant buffer allocation. + Status EmitConstantGlobals(); + + // Emit code to map one element according to `map_instr`. 
+ llvm::Value* EmitElementalMap( + const HloMapInstruction& map_instr, + absl::Span elemental_operands, + absl::string_view name); protected: // @@ -122,6 +127,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleCopy(HloInstruction* copy) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element) override; Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleDot(HloInstruction* dot) override; Status HandleConvolution(HloInstruction* convolution) override; Status HandleFft(HloInstruction* fft) override; @@ -143,13 +149,15 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandlePad(HloInstruction* pad) override; Status HandleTuple(HloInstruction* tuple) override; - Status HandleMap(HloInstruction* map) override; Status HandleFusion(HloInstruction* fusion) override; Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction* custom_call) override; Status HandleWhile(HloInstruction* xla_while) override; Status HandleConcatenate(HloInstruction* concatenate) override; Status HandleConditional(HloInstruction* conditional) override; + Status HandleScatter(HloInstruction* scatter) override; + Status HandleAfterAll(HloInstruction* gen_token) override; + Status HandleRng(HloInstruction* rng) override; Status FinishVisit(HloInstruction* root) override; Status Preprocess(HloInstruction* hlo) override; @@ -211,69 +219,56 @@ class IrEmitter : public DfsHloVisitorWithDefault { // argument of the computation function being emitted by this emitter. llvm::Value* GetExecutableRunOptionsArgument(); - // Get the llvm::Value* that represents the "temps" argument of the + // Get the llvm::Value* that represents the "buffer_table" argument of the // computation function being emitted by this emitter. 
- llvm::Value* GetTempBuffersArgument(); + llvm::Value* GetBufferTableArgument(); + + // Helper for EmitBufferPointer. + llvm::Value* EmitGlobalBufferPointer(const BufferAllocation::Slice& slice, + const Shape& target_shape); - // Emits code that computes the address of the given temporary buffer to the - // function. target_shape is the shape of this temporary buffer. - // The returned Value's type is a pointer to element_type. - llvm::Value* EmitTempBufferPointer(const BufferAllocation::Slice& slice, - const Shape& target_shape); + // Helper for EmitBufferPointer. + llvm::Value* EmitThreadLocalBufferPointer( + const BufferAllocation::Slice& slice, const Shape& target_shape); + + // Emits code that computes the address of the given buffer allocation slice. + llvm::Value* EmitBufferPointer(const BufferAllocation::Slice& slice, + const Shape& target_shape); // Emits a function into the current module. This can be used for // computations embedded inside other computations, such as the // function that a map operation applies. StatusOr EmitFunction( HloComputation* function, // The function to emit. - tensorflow::StringPiece + absl::string_view function_name_suffix); // Used for LLVM IR register names. - // Methods that emit a function call. - // Parameters: - // function - The LLVM function to call. - // return_shape - The return shape of the HLO computation that was used to - // make the function. Not the same as the return type of the function - // in LLVM, since we use output parameters for the return type. - // element_count - number of elements to return (array form only). - // parameter_addresses - pointers to be passed to the function as - // parameters. - // name - used for LLVM IR register names. - - // Emits a function call, returning a scalar, often an element of a larger - // array. Returns a Value for the scalar element returned by the function. 
- llvm::Value* EmitElementFunctionCall( - llvm::Function* function, const Shape& return_shape, - tensorflow::gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name); - - // Array function call emitter. Stores the function's result into a supplied - // buffer. - // Parameters: - // function - The LLVM function to call. - // parameter_addresses - pointers to be passed to the function as - // parameters. - // return_value - pointer to a buffer where the call result is stored. - - void EmitArrayFunctionCallInto( - llvm::Function* function, - tensorflow::gtl::ArraySlice parameter_addresses, - llvm::Value* return_value_buffer, tensorflow::StringPiece name); - - // Array function call emitter. Returns a Value for the function's return - // value buffer address. The return value buffer is alloca'ed by this - // function. - llvm::Value* EmitArrayFunctionCall( - llvm::Function* function, const Shape& return_shape, int64 element_count, - tensorflow::gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name); + // Emits a call to a thread local function (e.g. to the computation nested + // within a reduce or a map). Thread local callees (by definition) only write + // to and read from thread local allocations. + // + // `parameters` holds the *scalar values* that need to be passed to the + // callee. The return value is the scalar returned by the callee. + llvm::Value* EmitThreadLocalCall(const HloComputation& callee, + absl::Span parameters, + absl::string_view name); + + // Emits a call to a "global" function (e.g. to the computation nested within + // a kWhile or a kCall). Buffer assignment unabiguously assignes buffers to + // the parameters and return values for these computations so there is no need + // to explicitly pass parameters or return results. + void EmitGlobalCall(const HloComputation& callee, absl::string_view name); + + // Returns the buffer to which a global call to `callee` would have written + // its result. 
+ llvm::Value* GetBufferForGlobalCallReturnValue(const HloComputation& callee); // Verifies that the element types of all of the given operand instructions // match and are of one of the given supported types. Status ElementTypesSameAndSupported( const HloInstruction& instruction, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice supported_types); + absl::Span operands, + absl::Span supported_types); // Emit IR to perform a computation for every element in the given target op. // This produces a series of nested loops (one for each dimension of the op's @@ -289,7 +284,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { HloInstruction* target_op, const llvm_ir::ElementGenerator& element_generator); Status EmitTargetElementLoop( - HloInstruction* target_op, tensorflow::StringPiece desc, + HloInstruction* target_op, absl::string_view desc, const llvm_ir::ElementGenerator& element_generator); // Emits a memcpy from the source instruction's result value to the @@ -320,10 +315,12 @@ class IrEmitter : public DfsHloVisitorWithDefault { // concepts that generalize over other vectorizable operations. We should // consider pulling out these abstractions into a VectorizingIrEmitter or // something similar. - StatusOr EmitVectorizedReduce( - HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value, - tensorflow::gtl::ArraySlice dimensions, HloComputation* function, - string* failure_reason); + StatusOr EmitVectorizedReduce(HloInstruction* reduce, + HloInstruction* arg, + HloInstruction* init_value, + absl::Span dimensions, + HloComputation* function, + string* failure_reason); // We'd like to keep one or two one cache-line's worth of data in registers // without generating IR with illegal (e.g. 
excessively large or @@ -373,16 +370,15 @@ class IrEmitter : public DfsHloVisitorWithDefault { const ReductionGenerator& reduction_generator, const llvm_ir::IrArray::Index& output_index, const ShardedVectorType& accumulator_type, HloInstruction* init_value, - HloInstruction* arg, tensorflow::gtl::ArraySlice dimensions, + HloInstruction* arg, absl::Span dimensions, unsigned element_alignment); // Tries to emit a fast concatenate operation using memcpy. Returns true if // successful, and false on failure. On failure, sets "failure_reason" to a // string describing why it could not emit a fast concatenate. - StatusOr EmitFastConcatenate( - HloInstruction* concatenate, - tensorflow::gtl::ArraySlice operands, - string* failure_reason); + StatusOr EmitFastConcatenate(HloInstruction* concatenate, + absl::Span operands, + string* failure_reason); // Emits LLVM IR to transfer "element_count" elements of type "primitive_type" // from the address "source" to the address "target". @@ -391,8 +387,8 @@ class IrEmitter : public DfsHloVisitorWithDefault { const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& source_array); - // Assignment of the temporary buffers needed by the computation and their - // shape information. + // Assignment of the buffers needed by the computation and their shape + // information. const BufferAssignment& assignment_; // The LLVM module into which IR will be emitted. @@ -405,11 +401,10 @@ class IrEmitter : public DfsHloVisitorWithDefault { NameUniquer name_uniquer_; // Map containing all previously emitted computations. - std::map emitted_functions_; + std::map emitted_functions_; // Map containing all previously emitted thread-local temporary buffers. - std::map, - llvm::AllocaInst*> + std::map, llvm::Value*> thread_local_buffers_; // The following fields track the IR emission state. According to LLVM memory @@ -417,7 +412,17 @@ class IrEmitter : public DfsHloVisitorWithDefault { // creates the encapsulated llvm::Function s.t. 
it is added to the llvm // module's function list). std::unique_ptr compute_function_; - llvm::IRBuilder<> ir_builder_; + llvm::IRBuilder<> b_; + + // The buffer allocation slice for the root of the computation being compiled. + // Only relevant for thread local computations. + BufferAllocation::Slice computation_root_allocation_; + + // Maps the buffer allocation slices for the parameters to the computation + // being compiled to their parameter numbers. Only relevant for thread local + // computations. + tensorflow::gtl::FlatMap + computation_parameter_allocations_; // Maps HLO instructions to their index into the profile counter array. const std::unordered_map @@ -453,23 +458,22 @@ class IrEmitter : public DfsHloVisitorWithDefault { : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {} // Record the cycle counter before an HLO executes. - void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo); + void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo); // Record the number of cycles it took for an HLO to execute. - void RecordCycleDelta(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo, + void RecordCycleDelta(llvm::IRBuilder<>* b, HloInstruction* hlo, llvm::Value* prof_counter); // Record the number of cycles it took for the entire computation to // execute. - void RecordCompleteComputation(llvm::IRBuilder<>* ir_builder, + void RecordCompleteComputation(llvm::IRBuilder<>* b, llvm::Value* prof_counter); // Convenience function to generate a call to an intrinsic which reads the // CPU cycle counter. - llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* ir_builder); + llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* b); // Store the cycle counter delta to the per-HLO profile counter. 
- void UpdateProfileCounter(llvm::IRBuilder<>* ir_builder, - llvm::Value* prof_counter, llvm::Value* cycle_end, - llvm::Value* cycle_start); + void UpdateProfileCounter(llvm::IRBuilder<>* b, llvm::Value* prof_counter, + llvm::Value* cycle_end, llvm::Value* cycle_start); private: // Should we use the x86-specific rdtscp or the generic readcyclecounter @@ -517,6 +521,17 @@ class IrEmitter : public DfsHloVisitorWithDefault { // Returns the number of bytes within the shape. int64 ByteSizeOf(const Shape& shape) const; + StatusOr EmitTargetElementLoopBodyForMap( + HloMapInstruction* map, const llvm_ir::IrArray::Index& index); + StatusOr EmitTargetElementLoopBodyForReduceWindow( + HloReduceWindowInstruction* reduce_window, + const llvm_ir::IrArray::Index& index); + StatusOr EmitTargetElementLoopBodyForConvolution( + HloConvolutionInstruction* convolution, + const llvm_ir::IrArray::Index& index); + StatusOr EmitTargetElementLoopBodyForReduce( + HloReduceInstruction* reduce, const llvm_ir::IrArray::Index& index); + enum class XfeedKind { kInfeed, kOutfeed, @@ -527,7 +542,8 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status EmitXfeedTransfer(XfeedKind kind, const Shape& shape, llvm::Value* program_buffer_address); - llvm::GlobalVariable* EmitGlobalForLiteral(const Literal& literal); + // Returns a ConstExpr bitcast. 
+ llvm::Constant* EmitGlobalForLiteral(const Literal& literal); const HloModuleConfig& hlo_module_config_; @@ -535,9 +551,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { const TargetMachineFeatures& target_machine_features_; - int64 external_global_constant_counter_ = 0; - ExternalConstantPool* external_constant_pool_; - struct LiteralPtrHashFunctor { size_t operator()(const Literal* literal) const { return literal->Hash(); } }; @@ -548,10 +561,16 @@ class IrEmitter : public DfsHloVisitorWithDefault { } }; - tensorflow::gtl::FlatMap emitted_literals_; + tensorflow::gtl::FlatMap + constant_buffer_to_global_; + + std::vector thread_local_computations_; + std::vector global_computations_; + TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter); }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc index 2d6f2f3818a7bd4424aaa7d918ca86abef15c0e9..adfb8392bf6fa356f0a5cdab3ff74036eca8918e 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_function.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/ir_function.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/cpu/shape_partition.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" @@ -49,11 +50,10 @@ IrFunction::IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage, const bool optimize_for_size_requested, const bool enable_fast_math, llvm::Module* llvm_module, - llvm::IRBuilder<>* ir_builder, - int64 num_dynamic_loop_bounds) - : ir_builder_(ir_builder), + llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds) + : b_(b), llvm_module_(llvm_module), - caller_insert_point_guard_(*ir_builder), + caller_insert_point_guard_(*b), num_dynamic_loop_bounds_(num_dynamic_loop_bounds) { Initialize(function_name, linkage, optimize_for_size_requested, enable_fast_math); @@ -61,7 +61,7 @@ IrFunction::IrFunction(const string& function_name, IrFunction::~IrFunction() { // Emit function return value. - ir_builder_->CreateRetVoid(); + b_->CreateRetVoid(); } DynamicLoopBounds IrFunction::GetDynamicLoopBounds() { @@ -78,12 +78,20 @@ void IrFunction::Initialize(const string& function_name, const bool optimize_for_size_requested, const bool enable_fast_math) { // The function signature is: - // void function(i8* retval, i8* run_options, i8** params, i8** temps, + // void function(i8* retval, i8* run_options, i8** params, i8** + // buffer_table, // i64* dynamic_loop_bounds, i64* prof_counters) // - // retval: points to the returned value. - // params: address of an array with pointers to parameters. - // temps: address of an array with pointers to temporary buffers. + // For thread local functions: + // retval: points to the returned value. + // params: address of an array with pointers to parameters. 
+ // buffer_table: is null + // + // For global functions: + // retval: is null + // params: is null + // buffer_table: address of an array with pointers to temporary buffers and + // entry computation parameters (but not to constant buffers). // // Therefore, the generated function's signature (FunctionType) is statically // determined - parameter unpacking is done in code generated into the @@ -109,7 +117,7 @@ void IrFunction::Initialize(const string& function_name, // \---------/ \---------/ \-----------/ // // /---------------------------------------------\ - // temps ---------> | temp 0 | temp 1 | ..... | temp N-1 | + // buffer_table---> | buff 0 | buff 1 | ..... | buff N-1 | // | addr | addr | | addr | // \---------------------------------------------/ // | | | @@ -127,9 +135,9 @@ void IrFunction::Initialize(const string& function_name, // prof counters -> | counter 0 | counter 1 | ..... | counter N-1 | // \---------------------------------------------/ - // Even though the type of params and temps is void** in the host's view, in - // LLVM IR this is represented by i8*, similarly to void*. It's up to the code - // to use GEPs to unravel the indirection layers. + // Even though the type of params and buffer_table is void** in the host's + // view, in LLVM IR this is represented by i8*, similarly to void*. It's up to + // the code to use GEPs to unravel the indirection layers.
llvm::FunctionType* function_type = llvm::FunctionType::get( /*Result=*/llvm::Type::getVoidTy(llvm_module_->getContext()), /*Params=*/ @@ -153,8 +161,8 @@ void IrFunction::Initialize(const string& function_name, exec_run_options_arg_ = &*arg_iter; (++arg_iter)->setName("params"); parameters_arg_ = &*arg_iter; - (++arg_iter)->setName("temps"); - temp_buffers_arg_ = &*arg_iter; + (++arg_iter)->setName("buffer_table"); + buffer_table_arg_ = &*arg_iter; if (num_dynamic_loop_bounds_ > 0) { (++arg_iter)->setName("dynamic_loop_bounds"); dynamic_loop_bounds_arg_ = &*arg_iter; @@ -174,7 +182,7 @@ void IrFunction::Initialize(const string& function_name, function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias); } - ir_builder_->SetInsertPoint(llvm::BasicBlock::Create( + b_->SetInsertPoint(llvm::BasicBlock::Create( /*Context=*/llvm_module_->getContext(), /*Name=*/"entry", /*Parent=*/function_)); @@ -183,10 +191,9 @@ void IrFunction::Initialize(const string& function_name, llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) { CHECK_GT(num_dynamic_loop_bounds_, 0); CHECK_LT(offset, num_dynamic_loop_bounds_ * 2); - string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset); - return ir_builder_->CreateLoad( - ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_), - ir_builder_->getInt64(offset), AsStringRef(name))); + string name = absl::StrCat("dynamic_loop_bound_", offset); + return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_), + b_->getInt64(offset), AsStringRef(name))); } // Emits code to allocate an array of parameter address pointers, and store @@ -194,32 +201,37 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) { // Returns an array of compute function call arguments (including parameter // address buffer). 
std::vector GetArrayFunctionCallArguments( - tensorflow::gtl::ArraySlice parameter_addresses, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name, - llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg, - llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) { - llvm::Value* parameter_addresses_buffer = - llvm_ir::EmitAllocaAtFunctionEntryWithCount( - ir_builder->getInt8PtrTy(), - ir_builder->getInt32(parameter_addresses.size()), - tensorflow::strings::StrCat(name, "_parameter_addresses"), - ir_builder); - for (size_t i = 0; i < parameter_addresses.size(); ++i) { - llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast( - parameter_addresses[i], ir_builder->getInt8PtrTy(), - AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i, - "_address_as_i8ptr"))); - llvm::Value* slot_in_param_addresses = ir_builder->CreateInBoundsGEP( - parameter_addresses_buffer, {ir_builder->getInt64(i)}); - ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_addresses); + absl::Span parameter_addresses, llvm::IRBuilder<>* b, + absl::string_view name, llvm::Value* return_value_buffer, + llvm::Value* exec_run_options_arg, llvm::Value* buffer_table_arg, + llvm::Value* profile_counters_arg) { + llvm::Value* parameter_addresses_buffer; + + if (parameter_addresses.empty()) { + parameter_addresses_buffer = + llvm::Constant::getNullValue(b->getInt8PtrTy()->getPointerTo()); + } else { + parameter_addresses_buffer = llvm_ir::EmitAllocaAtFunctionEntryWithCount( + b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()), + absl::StrCat(name, "_parameter_addresses"), b); + + for (size_t i = 0; i < parameter_addresses.size(); ++i) { + llvm::Value* parameter_as_i8ptr = + b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(), + AsStringRef(absl::StrCat(name, "_parameter_", i, + "_address_as_i8ptr"))); + llvm::Value* slot_in_param_addresses = + b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)}); + 
b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses); + } } const auto to_int8_ptr = [=](llvm::Value* ptr) { - return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy()); + return b->CreatePointerCast(ptr, b->getInt8PtrTy()); }; std::vector arguments{ to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg), - parameter_addresses_buffer, temp_buffers_arg}; + parameter_addresses_buffer, buffer_table_arg}; if (profile_counters_arg != nullptr) { arguments.push_back(profile_counters_arg); } @@ -230,22 +242,21 @@ std::vector GetArrayFunctionCallArguments( // calls to 'parallel_function' (and joins threads before returning). Status EmitCallToParallelForkJoin( const std::vector& arguments, const Shape& shape, - const std::vector& dimension_partition_counts, - llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function, - const string& name) { - llvm::Module* module = ir_builder->GetInsertBlock()->getModule(); + const std::vector& dimension_partition_counts, llvm::IRBuilder<>* b, + llvm::Function* parallel_function, const string& name) { + llvm::Module* module = b->GetInsertBlock()->getModule(); // Build ParallelForkJoin function type. std::vector compute_function_params = GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0); // Number of parallel compute functions. - compute_function_params.push_back(ir_builder->getInt32Ty()); + compute_function_params.push_back(b->getInt32Ty()); // Array of partitions. There is an array element for each // partition x partition_dim x 2 (for dimension start and limit). compute_function_params.push_back( llvm::Type::getInt64PtrTy(module->getContext())); // Number of partitioned most-major dimensions in 'shape'. - compute_function_params.push_back(ir_builder->getInt32Ty()); + compute_function_params.push_back(b->getInt32Ty()); // Function pointer for compute function to be dispatched in parallel. 
compute_function_params.push_back( llvm::Type::getInt8PtrTy(module->getContext())); @@ -268,7 +279,7 @@ Status EmitCallToParallelForkJoin( ShapePartitionIterator partition_iterator(shape, dimension_partition_counts); const int64 num_partitions = partition_iterator.GetTotalPartitionCount(); // Add argument specifying the number of parallel partitions. - fork_join_arguments.push_back(ir_builder->getInt32(num_partitions)); + fork_join_arguments.push_back(b->getInt32(num_partitions)); // The number of partitioned most-major dimensions in 'shape'. const int32 num_partitioned_dims = dimension_partition_counts.size(); @@ -293,15 +304,15 @@ Status EmitCallToParallelForkJoin( const std::pair& dim_partition = dim_partitions[j]; const int32 index = partition_index + j * dim_partition_size; // Store partition [dim_start, dim_limit) intervals for each dimension. - partitions[index] = ir_builder->getInt64(dim_partition.first); + partitions[index] = b->getInt64(dim_partition.first); partitions[index + 1] = - ir_builder->getInt64(dim_partition.first + dim_partition.second); + b->getInt64(dim_partition.first + dim_partition.second); } } // Create global variable out of dimension partitions in 'partitions'. llvm::ArrayType* partitions_array_type = - llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size); + llvm::ArrayType::get(b->getInt64Ty(), partition_array_size); llvm::Constant* partitions_array = llvm::ConstantArray::get(partitions_array_type, partitions); llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable( @@ -311,20 +322,19 @@ Status EmitCallToParallelForkJoin( /*Linkage=*/llvm::GlobalValue::PrivateLinkage, /*Initializer=*/partitions_array, /*Name=*/ - AsStringRef( - tensorflow::strings::StrCat(name, "_parallel_dimension_partitions"))); + AsStringRef(absl::StrCat(name, "_parallel_dimension_partitions"))); // Add argument specifying parallel dimension partitions. 
- fork_join_arguments.push_back(ir_builder->CreateBitCast( - global_partitions_array, - llvm::Type::getInt64PtrTy(module->getContext()))); + fork_join_arguments.push_back( + b->CreateBitCast(global_partitions_array, + llvm::Type::getInt64PtrTy(module->getContext()))); // Add argument specifying the number of partitioned most-major dimensions. - fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims)); + fork_join_arguments.push_back(b->getInt32(num_partitioned_dims)); // Add argument for parallel compute function pointer. fork_join_arguments.push_back( - ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy())); + b->CreateBitCast(parallel_function, b->getInt8PtrTy())); // Emit call to parallel fork/join. - ir_builder->CreateCall(fork_join_func, fork_join_arguments); + b->CreateCall(fork_join_func, fork_join_arguments); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h index 2e55181eed867aca762f2b9b8310624ea12c7487..623a5f185fa1fd0526bc8664e2ba11c9dde79b1d 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_function.h +++ b/tensorflow/compiler/xla/service/cpu/ir_function.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_ +#include "absl/types/span.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" @@ -24,7 +25,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace cpu { @@ -54,7 +54,7 @@ class IrFunction { IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage, const bool optimize_for_size_requested, const bool enable_fast_math, llvm::Module* llvm_module, - llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds); + llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds); ~IrFunction(); // Emit ir to read and return the set of ir values representing the dynamic @@ -80,8 +80,9 @@ class IrFunction { // Get the llvm::Value* that represents this functions parameters argument. llvm::Value* parameters_arg() { return parameters_arg_; } - // Get the llvm::Value* that represents this functions "temps" argument. - llvm::Value* temp_buffers_arg() { return temp_buffers_arg_; } + // Get the llvm::Value* that represents this functions "buffer_table" + // argument. + llvm::Value* buffer_table_arg() { return buffer_table_arg_; } // Get the llvm::Value* that represents this functions "prof_counters" // argument. @@ -97,7 +98,7 @@ class IrFunction { // 'offset' from the "dynamic_loop_bounds" argument of this function. llvm::Value* GetDynamicLoopBound(int64 offset); - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; llvm::Module* llvm_module_; llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_; @@ -108,25 +109,24 @@ class IrFunction { llvm::Argument* result_arg_; llvm::Value* exec_run_options_arg_; llvm::Value* parameters_arg_; - llvm::Value* temp_buffers_arg_; + llvm::Value* buffer_table_arg_; llvm::Value* dynamic_loop_bounds_arg_ = nullptr; llvm::Value* profile_counters_arg_; }; // Returns an array of compute function call argument ir values. 
std::vector GetArrayFunctionCallArguments( - tensorflow::gtl::ArraySlice parameter_addresses, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name, - llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg, - llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg); + absl::Span parameter_addresses, llvm::IRBuilder<>* b, + absl::string_view name, llvm::Value* return_value_buffer, + llvm::Value* exec_run_options_arg, llvm::Value* buffer_table_arg, + llvm::Value* profile_counters_arg); // Emits a call to a runtime fork/join function which dispatches parallel // calls to 'parallel_function' (and joins threads before returning). Status EmitCallToParallelForkJoin( const std::vector& arguments, const Shape& shape, - const std::vector& dimension_partition_counts, - llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function, - const string& name); + const std::vector& dimension_partition_counts, llvm::IRBuilder<>* b, + llvm::Function* parallel_function, const string& name); } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 2e5cc96098241415b82f225afc81981f3e1069e0..cef5e57b0b12b7ae93af0d2508b2b9d6a592d390 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "llvm/IR/Verifier.h" #include "llvm/Transforms/Utils/Cloning.h" #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h" +#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h" #include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/platform/logging.h" @@ -52,46 +53,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module, llvm::BasicBlock* vector_tanh_body = llvm::BasicBlock::Create(*context, "body", vector_tanh_function); - llvm::IRBuilder<> ir_builder(vector_tanh_body); + llvm::IRBuilder<> b(vector_tanh_body); llvm::FastMathFlags fast_math_flags; - fast_math_flags.setFast(); - ir_builder.setFastMathFlags(fast_math_flags); - - VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "tanh_f32"); + fast_math_flags.setFast(enable_fast_math); + b.setFastMathFlags(fast_math_flags); llvm::Value* input = &*vector_tanh_function->arg_begin(); - CHECK_EQ(input->getType(), vsl.vector_type()); - - // This implements the same rational interpolant as implemented in Eigen3. 
- llvm::Value* input_clamped = - vsl.Clamp(input, /*low=*/GetIeeeF32(-9.0), /*high=*/GetIeeeF32(9.0)); - - std::array numerator_coeffs{ - -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, - 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, - 4.89352455891786e-03f}; - - std::array denominator_coeffs{ - 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, - 4.89352518554385e-03f}; - - llvm::Value* input_squared = vsl.Mul(input_clamped, input_clamped); - llvm::Value* numerator = vsl.SplatFloat(GetIeeeF32(numerator_coeffs[0])); - for (int i = 1; i < numerator_coeffs.size(); i++) { - numerator = - vsl.MulAdd(input_squared, numerator, GetIeeeF32(numerator_coeffs[i])); - } - - numerator = vsl.Mul(input_clamped, numerator); - - llvm::Value* denominator = vsl.SplatFloat(GetIeeeF32(denominator_coeffs[0])); - for (int i = 1; i < denominator_coeffs.size(); i++) { - denominator = vsl.MulAdd(input_squared, denominator, - GetIeeeF32(denominator_coeffs[i])); - } - - llvm::Value* result = vsl.Div(numerator, denominator); - ir_builder.CreateRet(result); + CHECK_EQ(vector_width, input->getType()->getVectorNumElements()); + b.CreateRet(llvm_ir::EmitFastTanh(&b, input)); DCHECK(!llvm::verifyFunction(*vector_tanh_function)); return vector_tanh_function; @@ -113,12 +82,12 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module, llvm::BasicBlock* vector_exp_body = llvm::BasicBlock::Create(*context, "body", vector_exp_function); - llvm::IRBuilder<> ir_builder(vector_exp_body); + llvm::IRBuilder<> b(vector_exp_body); llvm::FastMathFlags fast_math_flags; fast_math_flags.setFast(); - ir_builder.setFastMathFlags(fast_math_flags); + b.setFastMathFlags(fast_math_flags); - VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "exp_f32"); + VectorSupportLibrary vsl(F32, vector_width, &b, "exp_f32"); // This implements the same polynomial approximation as implemented in Eigen3. 
@@ -160,21 +129,21 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module, // VectorSupportLibrary (intentionally) can't juggle more than one type at a // time so drop down to IRBuilder for this bit. llvm::Value* vector_constant_0x7f = - ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f)); + b.CreateVectorSplat(vector_width, b.getInt32(0x7f)); llvm::Value* vector_constant_23 = - ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23)); + b.CreateVectorSplat(vector_width, b.getInt32(23)); llvm::Type* i32_vector_type = - llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width); + llvm::VectorType::get(b.getInt32Ty(), vector_width); // fx is clamped so we don't have to worry about it being out of range for // i32. - llvm::Value* emm0 = ir_builder.CreateFPToSI(fx, i32_vector_type); - emm0 = ir_builder.CreateAdd(emm0, vector_constant_0x7f); - emm0 = ir_builder.CreateShl(emm0, vector_constant_23); - llvm::Value* emm0_f32 = ir_builder.CreateBitCast(emm0, vsl.vector_type()); + llvm::Value* emm0 = b.CreateFPToSI(fx, i32_vector_type); + emm0 = b.CreateAdd(emm0, vector_constant_0x7f); + emm0 = b.CreateShl(emm0, vector_constant_23); + llvm::Value* emm0_f32 = b.CreateBitCast(emm0, vsl.vector_type()); llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input); - ir_builder.CreateRet(result); + b.CreateRet(result); DCHECK(!llvm::verifyFunction(*vector_exp_function)); return vector_exp_function; @@ -196,13 +165,13 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module, llvm::BasicBlock* vector_log_body = llvm::BasicBlock::Create(*context, "body", vector_log_function); - llvm::IRBuilder<> ir_builder(vector_log_body); + llvm::IRBuilder<> b(vector_log_body); llvm::FastMathFlags fast_math_flags; fast_math_flags.setFast(); - ir_builder.setFastMathFlags(fast_math_flags); + b.setFastMathFlags(fast_math_flags); llvm::Value* input = &*vector_log_function->arg_begin(); - VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "log_f32"); 
+ VectorSupportLibrary vsl(F32, vector_width, &b, "log_f32"); const llvm::APFloat half = GetIeeeF32(0.5); const llvm::APFloat one = GetIeeeF32(1.0); @@ -238,22 +207,21 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module, // VectorSupportLibrary (intentionally) can't juggle more than one type at a // time so drop down to IRBuilder for this bit. llvm::Value* vector_constant_0x7f = - ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f)); + b.CreateVectorSplat(vector_width, b.getInt32(0x7f)); llvm::Value* vector_constant_23 = - ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23)); + b.CreateVectorSplat(vector_width, b.getInt32(23)); llvm::Type* i32_vector_type = - llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width); + llvm::VectorType::get(b.getInt32Ty(), vector_width); - llvm::Value* emm0 = ir_builder.CreateLShr( - ir_builder.CreateBitCast(input, i32_vector_type), vector_constant_23); + llvm::Value* emm0 = + b.CreateLShr(b.CreateBitCast(input, i32_vector_type), vector_constant_23); // Keep only the fractional part. 
input = vsl.FloatAnd(input, inv_mant_mask); input = vsl.FloatOr(input, half); - emm0 = ir_builder.CreateSub(emm0, vector_constant_0x7f); - llvm::Value* e = - vsl.Add(one, ir_builder.CreateSIToFP(emm0, vsl.vector_type())); + emm0 = b.CreateSub(emm0, vector_constant_0x7f); + llvm::Value* e = vsl.Add(one, b.CreateSIToFP(emm0, vsl.vector_type())); // part2: // if( x < SQRTHF ) { @@ -294,7 +262,7 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module, llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf); llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs); - ir_builder.CreateRet(result); + b.CreateRet(result); DCHECK(!llvm::verifyFunction(*vector_log_function)); return vector_log_function; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc index 54af40506dab48b3c2a3a44eb0b5f5fb213a32ec..f8441c3e345504616485c6b34b4302acd5cc23a3 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc @@ -15,9 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace cpu { @@ -25,19 +25,21 @@ namespace cpu { ParallelLoopEmitter::ParallelLoopEmitter( const llvm_ir::ElementGenerator& target_element_generator, const llvm_ir::IrArray& target_array, - const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder) - : LoopEmitter(target_element_generator, target_array, ir_builder), + const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* b) + : LoopEmitter(target_element_generator, target_array, b), dynamic_loop_bounds_(dynamic_loop_bounds) {} std::vector -ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name) { +ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, + llvm::Type* index_type) { + CHECK_NE(index_type, nullptr); + CHECK(!ShapeUtil::IsTuple(shape_)); CHECK(!ShapeUtil::IsScalar(shape_)); - llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_); + llvm_ir::ForLoopNest loop_nest(loop_name, b_); const int64 num_dims = shape_.dimensions_size(); - llvm_ir::IrArray::Index array_index(num_dims); + llvm_ir::IrArray::Index array_index(index_type, num_dims); // Add loops from outer-most to inner-most dimensions. for (int i = LayoutUtil::MinorToMajor(shape_).size() - 1; i >= 0; --i) { @@ -50,21 +52,20 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock( llvm::Value* end_index = (*dynamic_loop_bounds_)[bounds_index].second; std::unique_ptr loop = loop_nest.AddLoop( - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension), - start_index, end_index); + /*suffix=*/absl::StrFormat("dim.%d", dimension), start_index, + end_index); array_index[dimension] = loop->GetIndVarValue(); } else { // Emit static loop bounds for this dimension. 
std::unique_ptr loop = loop_nest.AddLoop( /*start_index=*/0, /*end_index=*/shape_.dimensions(dimension), - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension)); + /*suffix=*/absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } } // Point IR builder at inner loop BB. - llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), - ir_builder_); + llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b_); // Set exit_bb_ to the exit block of the loop nest. exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock(); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h index 755715634aa70a822b21d25dcae20a8fe053477a..a604e1db222139c239a2a89359a7359463e0def7 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h @@ -54,14 +54,14 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator, const llvm_ir::IrArray& target_array, const DynamicLoopBounds* dynamic_loop_bounds, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); ParallelLoopEmitter(const ParallelLoopEmitter&) = delete; ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete; ~ParallelLoopEmitter() override = default; std::vector EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name) override; + absl::string_view loop_name, llvm::Type* index_type) override; private: const DynamicLoopBounds* dynamic_loop_bounds_; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 4fa5984b0466b178a587e97cbced97deac749f74..b4c0c09ec06bac9b5e228428c072948afdd4a547 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ 
b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/shape_partition.h" @@ -109,7 +111,7 @@ ParallelTaskAssignment::ParallelTaskAssignment( : target_machine_features_(*target_machine_features) { VLOG(1) << "ParallelTaskAssignment max_parallelism: " << max_parallelism; // Run cost analysis on 'module'. - auto cost_analysis = MakeUnique(shape_size); + auto cost_analysis = absl::make_unique(shape_size); HloComputation* computation = module->entry_computation(); Status status = computation->root_instruction()->Accept(cost_analysis.get()); if (status.ok()) { @@ -216,8 +218,7 @@ bool ParallelTaskAssigner::AssignParallelTasksHelper( // Outline 'instruction' in 'computation' for parallel task assignment. auto* call = module->OutlineExpressionFromComputation( - {instruction}, - tensorflow::strings::StrCat("parallel_", instruction->name()), + {instruction}, absl::StrCat("parallel_", instruction->name()), computation); // Set assigned dimension partitioning to 'instruction'. 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h index 8becc8fa23424d7454cc783eb9d853aecb5d053b..a99cd99c14abb66fc426c43656520e01f34a1700 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h @@ -73,7 +73,7 @@ class ParallelTaskAssigner : public HloPassInterface { target_machine_features_(*target_machine_features) {} ~ParallelTaskAssigner() override {} - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "cpu-parallel-task-assigner"; } diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index fc2efbaf9a22b02cd729da2f367d53bc15506836..a84ee78b19981e480858320e445de7f5dae27d61 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -19,7 +19,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace { @@ -36,7 +35,9 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase { cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_; ParallelTaskAssignmentTest() - : target_machine_features_([](int64 shape_size) { + : HloVerifiedTestBase(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false), + target_machine_features_([](int64 shape_size) { return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; }) {} @@ -110,8 +111,10 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) { const string hlo_string = R"( HloModule TestTaskParallel_infeed_outfeed ENTRY InfeedOutfeed { - infeed0 = u32[12345678,2]{1,0} infeed() - ROOT outfeed0 = u32[12345678,2]{1,0} outfeed(infeed0) + token = token[] after-all() + infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token) + infeed0.data = u32[12345678,2]{1,0} get-tuple-element((u32[12345678,2]{1,0}, token[]) infeed0), index=0 + ROOT outfeed0 = token[] outfeed(infeed0.data, token) } )"; diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc index d03da46575b331de113cc5f33c2b4267504e8308..2d9492eacfea34bec3b0f1115e171a5328b7cdc3 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/core/lib/core/blocking_counter.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -58,13 +59,14 @@ using ComputeFunctionType = void (*)(void*, const void*, const void**, void**, // [partition1_dim2_start] // [partition1_dim2_limit] // -void __xla_cpu_runtime_ParallelForkJoin( +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_ParallelForkJoin( void* result_ptr, const void* run_options_ptr, const void** params, - void** temps, uint64* prof_counters, int32 num_partitions, + void** buffer_table, uint64* prof_counters, int32 num_partitions, int64* partitions, int32 num_partitioned_dims, void* function_ptr) { VLOG(2) << "ParallelForkJoin ENTRY" << " num_partitions: " << num_partitions << " num_partitioned_dims: " << num_partitioned_dims; + CHECK_EQ(params, nullptr); CHECK_GT(num_partitions, 1); CHECK_GT(num_partitioned_dims, 0); const xla::ExecutableRunOptions* run_options = @@ -79,9 +81,9 @@ void __xla_cpu_runtime_ParallelForkJoin( for (int32 i = 1; i < num_partitions; ++i) { const int64 offset = i * stride; run_options->intra_op_thread_pool()->enqueueNoNotification( - [i, function, result_ptr, run_options_ptr, params, temps, prof_counters, + [i, function, result_ptr, run_options_ptr, buffer_table, prof_counters, partitions, offset, &bc]() { - function(result_ptr, run_options_ptr, params, temps, + function(result_ptr, run_options_ptr, nullptr, buffer_table, &partitions[offset], prof_counters); bc.DecrementCount(); VLOG(3) << "ParallelForkJoin partition " << i << " done."; @@ -89,7 +91,7 @@ void __xla_cpu_runtime_ParallelForkJoin( } // Call first compute function inline. 
- function(result_ptr, run_options_ptr, params, temps, &partitions[0], + function(result_ptr, run_options_ptr, params, buffer_table, &partitions[0], prof_counters); VLOG(3) << "ParallelForkJoin partition 0 done."; bc.Wait(); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h index 1cf0ec6e3df400e35fa4e755a0b25b4ce7966e8f..a279c7d2d61bdd138f5285a8c8ccc89d22db9692 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h @@ -24,7 +24,7 @@ extern "C" { // threads before returning. See comments in runtime_fork_join.cc for details. extern void __xla_cpu_runtime_ParallelForkJoin( void* result_ptr, const void* run_options_ptr, const void** params, - void** temps, tensorflow::uint64* prof_counters, + void** buffer_table, tensorflow::uint64* prof_counters, tensorflow::int32 num_partitions, tensorflow::int64* partitions, tensorflow::int32 num_partitioned_dims, void* function_ptr); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 39b13183ff093611a42b3931d45f64eadb420622..a71a85913cfef271bc2a226cb0cf2dd4204499a4 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matvec.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" using tensorflow::int32; @@ -77,27 +78,24 @@ void MatMulImpl(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, } // namespace -void __xla_cpu_runtime_EigenMatMulF16(const void* run_options_ptr, - Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF16( + const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, + Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, + int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenMatMulF32(const void* run_options_ptr, float* out, - float* lhs, float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenMatMulF64(const void* run_options_ptr, double* out, - double* lhs, double* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64( + const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc 
b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc index 92da5f71c23d5e1450b39ea8b7bb8345f6fabb3b..8dc5f3c93b6ba1a722ea7b23b4b5190ac0600cd6 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifdef INTEL_MKL +#if defined(INTEL_MKL) && !defined(INTEL_MKL_DNN_ONLY) #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "third_party/intel_mkl_ml/include/mkl_cblas.h" #include "third_party/intel_mkl_ml/include/mkl_service.h" @@ -23,6 +23,7 @@ limitations under the License. #define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool" +#include "tensorflow/core/platform/dynamic_annotations.h" using tensorflow::int32; using tensorflow::int64; @@ -74,10 +75,9 @@ void MatMulF64(const void* run_options_ptr, double* out, double* lhs, } // namespace -void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out, - float* lhs, float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLMatMulF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread @@ -88,11 +88,11 @@ void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out, // Set thread number back to the previous number. 
mkl_set_num_threads_local(prev_num_threads); } + // BLAS GEMM API for 64-bit Matrix Multiplication -void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out, - double* lhs, double* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLMatMulF64( + const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread @@ -103,22 +103,26 @@ void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out, // Set thread number back to the previous number. mkl_set_num_threads_local(prev_num_threads); } -void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr, - float* out, float* lhs, - float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr, + float* out, float* lhs, float* rhs, + int64 m, int64 n, int64 k, + int32 transpose_lhs, + int32 transpose_rhs) { // Set the thread number to 1 for single threaded excution. int prev_num_threads = mkl_set_num_threads_local(1); MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); // Set thread number back to the previous number. 
mkl_set_num_threads_local(prev_num_threads); } -void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr, - double* out, double* lhs, - double* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr, + double* out, double* lhs, + double* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { // Set the thread number to 1 for single threaded excution. int prev_num_threads = mkl_set_num_threads_local(1); MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index 17303e2f0d34e531a3a56aa147608b949e0f43ae..16692e7f2e6145b2649b67987eef47916e958be2 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/service/cpu/runtime_matvec.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" using tensorflow::int32; @@ -71,7 +72,8 @@ void SingleThreadedMatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, } // namespace -void __xla_cpu_runtime_EigenSingleThreadedMatMulF16( +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { @@ -79,16 +81,22 @@ void __xla_cpu_runtime_EigenSingleThreadedMatMulF16( transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenSingleThreadedMatMulF32( - const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF32(const void* run_options_ptr, + float* out, float* lhs, + float* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { SingleThreadedMatMul(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenSingleThreadedMatMulF64( - const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, + double* out, double* lhs, + double* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { SingleThreadedMatMul(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc index 
167aa4adda995a259190a932a76a34ca5883444c..942e2ddd3940fffd5d87518f059beaced3cdc925 100644 --- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc +++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc @@ -16,18 +16,18 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -38,20 +38,21 @@ int main(int argc, char** argv) { // Transfer parameters. std::unique_ptr param0_literal = - xla::Literal::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); + xla::LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); std::unique_ptr param0_data = client->TransferToServer(*param0_literal).ConsumeValueOrDie(); - std::unique_ptr param1_literal = xla::Literal::CreateR2( - {{3.1f, 4.2f, 7.3f, 9.5f}, {1.1f, 2.2f, 3.3f, 4.4f}}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR2( + {{3.1f, 4.2f, 7.3f, 9.5f}, {1.1f, 2.2f, 3.3f, 4.4f}}); std::unique_ptr param1_data = client->TransferToServer(*param1_literal).ConsumeValueOrDie(); // Build computation. 
xla::XlaBuilder builder(""); - auto p0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto p1 = builder.Parameter(1, param1_literal->shape(), "param1"); - auto add = builder.Add(p1, p0, {0}); + auto p0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto p1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Add(p1, p0, {0}); xla::StatusOr computation_status = builder.Build(); xla::XlaComputation computation = computation_status.ConsumeValueOrDie(); @@ -66,8 +67,8 @@ int main(int argc, char** argv) { /*execution_profile=*/&profile); std::unique_ptr actual = result.ConsumeValueOrDie(); - LOG(INFO) << tensorflow::strings::Printf("computation took %lldns", - profile.compute_time_ns()); + LOG(INFO) << absl::StrFormat("computation took %dns", + profile.compute_time_ns()); LOG(INFO) << actual->ToString(); return 0; diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc index ae80a6f4977f85cfd9f872734fd0a69432a1f382..7d8e51f909e3db699b745f94a6c625407bc4a6e3 100644 --- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc +++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc @@ -102,22 +102,22 @@ TEST_F(ShapePartitionIteratorTest, Shape53WithLayout10) { { ShapePartitionIterator iterator(shape, {1}); EXPECT_EQ(1, iterator.GetTotalPartitionCount()); - EXPECT_TRUE(ContainersEqual(Partition({{0, 5}}), iterator.GetPartition(0))); + EXPECT_TRUE(absl::c_equal(Partition({{0, 5}}), iterator.GetPartition(0))); } { ShapePartitionIterator iterator(shape, {2}); EXPECT_EQ(2, iterator.GetTotalPartitionCount()); - EXPECT_TRUE(ContainersEqual(Partition({{0, 2}}), iterator.GetPartition(0))); - EXPECT_TRUE(ContainersEqual(Partition({{2, 3}}), iterator.GetPartition(1))); + EXPECT_TRUE(absl::c_equal(Partition({{0, 2}}), iterator.GetPartition(0))); + EXPECT_TRUE(absl::c_equal(Partition({{2, 3}}), iterator.GetPartition(1))); } { ShapePartitionIterator 
iterator(shape, {3}); EXPECT_EQ(3, iterator.GetTotalPartitionCount()); - EXPECT_TRUE(ContainersEqual(Partition({{0, 1}}), iterator.GetPartition(0))); - EXPECT_TRUE(ContainersEqual(Partition({{1, 1}}), iterator.GetPartition(1))); - EXPECT_TRUE(ContainersEqual(Partition({{2, 3}}), iterator.GetPartition(2))); + EXPECT_TRUE(absl::c_equal(Partition({{0, 1}}), iterator.GetPartition(0))); + EXPECT_TRUE(absl::c_equal(Partition({{1, 1}}), iterator.GetPartition(1))); + EXPECT_TRUE(absl::c_equal(Partition({{2, 3}}), iterator.GetPartition(2))); } } @@ -128,20 +128,20 @@ TEST_F(ShapePartitionIteratorTest, Shape532WithLayout210) { ShapePartitionIterator iterator(shape, {1, 1}); EXPECT_EQ(1, iterator.GetTotalPartitionCount()); EXPECT_TRUE( - ContainersEqual(Partition({{0, 5}, {0, 3}}), iterator.GetPartition(0))); + absl::c_equal(Partition({{0, 5}, {0, 3}}), iterator.GetPartition(0))); } { ShapePartitionIterator iterator(shape, {2, 2}); EXPECT_EQ(4, iterator.GetTotalPartitionCount()); EXPECT_TRUE( - ContainersEqual(Partition({{0, 2}, {0, 1}}), iterator.GetPartition(0))); + absl::c_equal(Partition({{0, 2}, {0, 1}}), iterator.GetPartition(0))); EXPECT_TRUE( - ContainersEqual(Partition({{0, 2}, {1, 2}}), iterator.GetPartition(1))); + absl::c_equal(Partition({{0, 2}, {1, 2}}), iterator.GetPartition(1))); EXPECT_TRUE( - ContainersEqual(Partition({{2, 3}, {0, 1}}), iterator.GetPartition(2))); + absl::c_equal(Partition({{2, 3}, {0, 1}}), iterator.GetPartition(2))); EXPECT_TRUE( - ContainersEqual(Partition({{2, 3}, {1, 2}}), iterator.GetPartition(3))); + absl::c_equal(Partition({{2, 3}, {1, 2}}), iterator.GetPartition(3))); } } diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index c4c90515ac7ec2721cb9ea48d42e3c5080e249af..bf98064647f4c29ba689902da4d737e1922391d3 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -20,13 +20,13 @@ 
limitations under the License. #include #include +#include "absl/memory/memory.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/Mangler.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Host.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h" @@ -127,13 +127,6 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options, } llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) { - if (const uint8* from_constant_pool = - external_constant_pool_.Find(string(name))) { - return llvm::JITEvaluatedSymbol( - reinterpret_cast(from_constant_pool), - llvm::JITSymbolFlags::None); - } - void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name); if (func_addr == nullptr) { return nullptr; @@ -177,15 +170,14 @@ namespace { bool RegisterKnownJITSymbols() { CustomCallTargetRegistry* registry = CustomCallTargetRegistry::Global(); -#define REGISTER_CPU_RUNTIME_SYMBOL(base_name) \ - do { \ - auto* function_address = \ - reinterpret_cast(__xla_cpu_runtime_##base_name); \ - registry->Register(xla::cpu::runtime::k##base_name##SymbolName, \ - function_address); \ - CHECK_EQ( \ - tensorflow::StringPiece(xla::cpu::runtime::k##base_name##SymbolName), \ - "__xla_cpu_runtime_" #base_name); \ +#define REGISTER_CPU_RUNTIME_SYMBOL(base_name) \ + do { \ + auto* function_address = \ + reinterpret_cast(__xla_cpu_runtime_##base_name); \ + registry->Register(xla::cpu::runtime::k##base_name##SymbolName, \ + function_address); \ + CHECK_EQ(absl::string_view(xla::cpu::runtime::k##base_name##SymbolName), \ + "__xla_cpu_runtime_" #base_name); \ } while (false) REGISTER_CPU_RUNTIME_SYMBOL(AcquireInfeedBufferForDequeue); diff --git 
a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h index 1851a3ee0bb97b4860605d7211a6ae70ac88686b..d74b63fcf45bd70cd18ee41f1e9714ba6a222abd 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h @@ -29,7 +29,6 @@ limitations under the License. #include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h" #include "tensorflow/compiler/xla/service/cpu/disassembler.h" -#include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h" #include "tensorflow/compiler/xla/types.h" namespace xla { @@ -91,10 +90,6 @@ class SimpleOrcJIT { llvm::TargetMachine* target_machine() const { return target_machine_.get(); } - ExternalConstantPool* external_constant_pool() { - return &external_constant_pool_; - } - // Creates an llvm::TargetMachine suitable for JITting code that will run on // the current machine. static std::unique_ptr InferTargetMachineForJIT( @@ -112,7 +107,6 @@ class SimpleOrcJIT { std::shared_ptr symbol_resolver_; ObjLayerT object_layer_; CompileLayerT compile_layer_; - ExternalConstantPool external_constant_pool_; }; } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 67f776e7b5883f425b41c05342b74bebe223e17f..2384166fd2002a67a8aa785ad5fb341d037ee01f 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -40,7 +40,7 @@ tf_cc_test( name = "cpu_fusion_test", srcs = ["cpu_fusion_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", @@ -51,6 +51,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:test", "//tensorflow/core:test_main", + 
"@com_google_absl//absl/memory", ], ) @@ -82,7 +83,7 @@ tf_cc_test( name = "cpu_noalias_test", srcs = ["cpu_noalias_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", @@ -94,6 +95,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:filecheck", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", "@llvm//:core", ], ) @@ -108,6 +110,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -121,6 +124,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -128,16 +132,16 @@ tf_cc_test( name = "cpu_infeed_test", srcs = ["cpu_infeed_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -152,9 +156,9 @@ tf_cc_test( srcs = ["cpu_literal_caching_test.cc"], deps = [ "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", 
"//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -166,9 +170,9 @@ tf_cc_test( srcs = ["cpu_outfeed_test.cc"], deps = [ "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h b/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h index 7c8d07a10baf55dba8cbd347ebe1459b78e268e0..77b3a0301f2f90b577b7eaad86064dc30e2d9456 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h @@ -22,7 +22,7 @@ namespace xla { namespace cpu { // Tests that verify IR emitted by the CPU backend is as expected. -class CpuCodegenTest : public LLVMIRGenTestBase {}; +class CpuCodegenTest : public LlvmIrGenTestBase {}; } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc index 6fcce42eaa4599eb8a6dacc1bd39eefd39aa5e50..fcd87b36b32915773546c211d7d2c447a69bef49 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -19,10 +19,10 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc index ed8f375bd6186e4805fe9ded5be9ae7c9f4d5c84..00a7aa2ad2f6bac4877302296ccb76222557535c 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc @@ -40,7 +40,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest { HloInstruction* constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2FromArray2D(backing_array))); + LiteralUtil::CreateR2FromArray2D(backing_array))); HloInstruction* param = builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); builder.AddInstruction( @@ -56,7 +56,8 @@ class CpuExternalConstantsTest : public CpuCodegenTest { TEST_F(CpuExternalConstantsTest, Basic) { TestWithArray(/*rows=*/1024, /*cols=*/1024, R"( -CHECK: @constant_global_0 = external constant [1024 x [1024 x float]], align 16 +CHECK-NOT: @constant_global_0 = external constant [1024 x [1024 x float]], align 16 +CHECK: @0 = private constant [4194304 x i8] {{.*}}, align 16 )"); } @@ -64,8 +65,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) { // The constant array in this test case is small enough that there is no need // to externalize it. 
TestWithArray(/*rows=*/4, /*cols=*/4, R"( -CHECK-NOT: @constant_global_0 = external constant [4 x [4 x float]], align 8 -CHECK: @0 = private constant [4 x [4 x float]] {{.*}}, align 8 +CHECK-NOT: @constant_global_0 = external constant [16 x float], align 8 +CHECK: @0 = private constant [64 x i8] {{.*}}, align 8 )"); } } // namespace diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc index 23e7a3de4d8188a3add259582e11030539e154c1..22721051e54e2cf9590b60333c51d1d028bb28e9 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc @@ -17,8 +17,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -43,8 +43,8 @@ class CpuFusionTest : public HloTestBase { TEST_F(CpuFusionTest, FuseTwoElementwiseOps) { auto builder = HloComputation::Builder(TestName()); - auto input_literal1 = Literal::CreateR1({1.0, 2.0, 3.0}); - auto input_literal2 = Literal::CreateR1({-2.0, -42.0, 2.0}); + auto input_literal1 = LiteralUtil::CreateR1({1.0, 2.0, 3.0}); + auto input_literal2 = LiteralUtil::CreateR1({-2.0, -42.0, 2.0}); Shape vshape = input_literal1->shape(); auto input1 = builder.AddInstruction( @@ -83,7 +83,7 @@ TEST_F(CpuFusionTest, FuseTwoElementwiseOps) { TEST_F(CpuFusionTest, FuseElementwiseOpChain) { auto builder = HloComputation::Builder(TestName()); - auto input_literal = Literal::CreateR1({-1.5, -2.5, -3.0}); + auto input_literal = LiteralUtil::CreateR1({-1.5, -2.5, -3.0}); Shape vshape = input_literal->shape(); auto input = 
builder.AddInstruction( @@ -96,8 +96,11 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { HloInstruction::CreateUnary(vshape, HloOpcode::kExp, ceil)); auto floor = builder.AddInstruction( HloInstruction::CreateUnary(vshape, HloOpcode::kFloor, exp)); - auto two = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + auto two = builder.AddInstruction(HloInstruction::CreateBroadcast( + vshape, + builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))), + {})); builder.AddInstruction( HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor)); @@ -114,9 +117,9 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { EXPECT_EQ(HloOpcode::kFusion, fusion_instruction->opcode()); EXPECT_EQ(HloOpcode::kMultiply, fusion_instruction->fused_expression_root()->opcode()); - // There should be 7 fused instructions: 2 parameters and the fused + // There should be 8 fused instructions: 2 parameters and the fused // operations. - EXPECT_EQ(7, fusion_instruction->fused_instruction_count()); + EXPECT_EQ(8, fusion_instruction->fused_instruction_count()); // Compile and execute the computation. auto result = ExecuteAndTransfer(std::move(module), {}); @@ -126,12 +129,12 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) { error_spec_); } -TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) { - // Test a chain of fusable ops with a non-fusable op (a reduce) thrown in the +TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) { + // Test a chain of fusible ops with a non-fusible op (a reduce) thrown in the // middle. 
auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); - auto input_literal = Literal::CreateR1({-1.5, -2.5, -3.0}); + auto input_literal = LiteralUtil::CreateR1({-1.5, -2.5, -3.0}); Shape vshape = input_literal->shape(); auto input = builder.AddInstruction( @@ -163,15 +166,18 @@ TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) { ShapeUtil::MakeShape(F32, {6, 1}), concatenate)), /*init_value=*/ builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), /*dimensions_to_reduce=*/{1}, add_f32)); auto exp = builder.AddInstruction( HloInstruction::CreateUnary(cshape, HloOpcode::kExp, reduce)); auto floor = builder.AddInstruction( HloInstruction::CreateUnary(cshape, HloOpcode::kFloor, exp)); - auto two = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + auto two = builder.AddInstruction(HloInstruction::CreateBroadcast( + cshape, + builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))), + {})); builder.AddInstruction( HloInstruction::CreateBinary(cshape, HloOpcode::kMultiply, two, floor)); @@ -188,9 +194,9 @@ TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) { EXPECT_EQ(HloOpcode::kFusion, fusion_instruction1->opcode()); EXPECT_EQ(HloOpcode::kMultiply, fusion_instruction1->fused_expression_root()->opcode()); - // There should be 5 fused instructions in the root fusion instruction: 2 + // There should be 6 fused instructions in the root fusion instruction: 2 // parameters, multiply, floor, and exp. - EXPECT_EQ(5, fusion_instruction1->fused_instruction_count()) + EXPECT_EQ(6, fusion_instruction1->fused_instruction_count()) << fusion_instruction1->fused_instructions_computation()->ToString(); auto fusion_instruction2 = reduce->operand(0); @@ -225,7 +231,7 @@ TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) { // operand vectors. 
Test for this problem by counting the number of nodes in // each fusion instruction to ensure that negate is not duplicated. auto builder = HloComputation::Builder(TestName()); - auto input_literal = Literal::CreateR1({1.0, 2.0, 3.0}); + auto input_literal = LiteralUtil::CreateR1({1.0, 2.0, 3.0}); Shape vshape = input_literal->shape(); auto constant = builder.AddInstruction( @@ -286,10 +292,10 @@ TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) { // computation. The duplication is caused by the other use of exp2 in the // tuple. auto builder = HloComputation::Builder(TestName()); - auto input_literal1 = Literal::CreateR1({1.0, 2.0, 3.0}); - auto input_literal2 = Literal::CreateR1({-2.0, -42.0, 2.0}); + auto input_literal1 = LiteralUtil::CreateR1({1.0, 2.0, 3.0}); + auto input_literal2 = LiteralUtil::CreateR1({-2.0, -42.0, 2.0}); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); Shape shape = constant->shape(); auto exp1 = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc index dd63b998e9b6d04981ec6f7300c883c9b23b154f..c35569c6619ba5b534c5d8bb7ad683d84b6ecf4b 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -47,7 +47,7 @@ class InfeedTest : public ClientLibraryTestBase { // don't use ResetDevice since it is not implemented on CPU. ASSERT_IS_OK(client_->TransferToInfeed(literal)); XlaBuilder builder(TestName()); - builder.Infeed(literal.shape()); + Infeed(&builder, literal.shape()); if (ShapeUtil::IsTuple(literal.shape())) { // TODO(b/30609564): Use ComputeAndCompareLiteral instead. 
ComputeAndCompareTuple(&builder, literal, {}); @@ -58,52 +58,52 @@ class InfeedTest : public ClientLibraryTestBase { }; TEST_F(InfeedTest, SingleInfeedR0Bool) { - TestInfeedRoundTrip(*Literal::CreateR0(true)); + TestInfeedRoundTrip(*LiteralUtil::CreateR0(true)); } TEST_F(InfeedTest, SingleInfeedR1U32) { - TestInfeedRoundTrip(*Literal::CreateR1({1, 2, 3})); + TestInfeedRoundTrip(*LiteralUtil::CreateR1({1, 2, 3})); } TEST_F(InfeedTest, SingleInfeedR2F32) { - TestInfeedRoundTrip(*Literal::CreateR2F32Linspace(0.0, 1.0, 128, 64)); + TestInfeedRoundTrip(*LiteralUtil::CreateR2F32Linspace(0.0, 1.0, 128, 64)); } TEST_F(InfeedTest, SingleInfeedR3F32) { TestInfeedRoundTrip( - *Literal::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, - {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); + *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); } TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) { const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2}); const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0}); - TestInfeedRoundTrip( - *Literal::CreateR3WithLayout({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, - {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, - r3_dim0minor)); + TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout( + {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0minor)); - TestInfeedRoundTrip( - *Literal::CreateR3WithLayout({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, - {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, - r3_dim0major)); + TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout( + {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0major)); } TEST_F(InfeedTest, SingleInfeedR4S32) { - TestInfeedRoundTrip(*Literal::CreateR4( + TestInfeedRoundTrip(*LiteralUtil::CreateR4( {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}}, {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}})); } 
TEST_F(InfeedTest, SingleInfeedTuple) { TestInfeedRoundTrip( - *Literal::MakeTuple({Literal::CreateR1({1, 2, 3}).get(), - Literal::CreateR0(false).get()})); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1, 2, 3}).get(), + LiteralUtil::CreateR0(false).get()})); } TEST_F(InfeedTest, SingleInfeedEmptyTuple) { - TestInfeedRoundTrip(*Literal::MakeTuple({})); + TestInfeedRoundTrip(*LiteralUtil::MakeTuple({})); } // Tests Infeed operation used in a while loop, as in the code below. The @@ -125,8 +125,8 @@ TEST_F(InfeedTest, DISABLED_SingleInfeedInWhile) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Gt(builder.ConstantR0(40.0f), prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Gt(ConstantR0(&builder, 40.0f), prev); condition = builder.Build().ConsumeValueOrDie(); } // Create a computation for the body: add the reduced value of the Infeed @@ -134,17 +134,16 @@ TEST_F(InfeedTest, DISABLED_SingleInfeedInWhile) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto infeed = builder.Infeed(infeed_shape); - auto addend = - builder.Reduce(infeed, builder.ConstantR0(0.0f), - CreateScalarAddComputation(F32, &builder), {0}); - builder.Add(prev, addend); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto infeed = Infeed(&builder, infeed_shape); + auto addend = Reduce(infeed, ConstantR0(&builder, 0.0f), + CreateScalarAddComputation(F32, &builder), {0}); + Add(prev, addend); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. - auto init = builder.ConstantR0(0.0f); - builder.While(condition, body, init); + auto init = ConstantR0(&builder, 0.0f); + While(condition, body, init); // Build and asynchronously launch the computation. 
auto computation = builder.Build().ConsumeValueOrDie(); @@ -157,13 +156,16 @@ TEST_F(InfeedTest, DISABLED_SingleInfeedInWhile) { }); // Send 5 Infeed data of shape F32[3]. - ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({1, 2, 3}))); - ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({4, 5, 6}))); - ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({7, 8, 9}))); ASSERT_IS_OK( - client_->TransferToInfeed(*Literal::CreateR1({10, 11, 12}))); + client_->TransferToInfeed(*LiteralUtil::CreateR1({1, 2, 3}))); + ASSERT_IS_OK( + client_->TransferToInfeed(*LiteralUtil::CreateR1({4, 5, 6}))); ASSERT_IS_OK( - client_->TransferToInfeed(*Literal::CreateR1({13, 14, 15}))); + client_->TransferToInfeed(*LiteralUtil::CreateR1({7, 8, 9}))); + ASSERT_IS_OK( + client_->TransferToInfeed(*LiteralUtil::CreateR1({10, 11, 12}))); + ASSERT_IS_OK( + client_->TransferToInfeed(*LiteralUtil::CreateR1({13, 14, 15}))); delete computation_thread; // Joins the thread. auto result_literal = client_->Transfer(*result).ConsumeValueOrDie(); @@ -207,8 +209,8 @@ TEST_F(InfeedTest, DISABLED_TwoInfeedsInTotalOrder) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.GetTupleElement(prev, 1); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + GetTupleElement(prev, 1); condition = builder.Build().ConsumeValueOrDie(); } @@ -218,47 +220,47 @@ TEST_F(InfeedTest, DISABLED_TwoInfeedsInTotalOrder) { // The body adds the reduced value of the Infeed data (first tuple element) // to the previous accumulator, and returns the accumulator and the continue // flag (second tuple element) as a tuple. 
- const auto build_body = [this, &result_shape](const Shape& infeed_shape) { + const auto build_body = [&result_shape](const Shape& infeed_shape) { XlaComputation body; XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto infeed = builder.Infeed(infeed_shape); - auto addend = builder.Reduce( - builder.GetTupleElement(infeed, 0), builder.ConstantR0(0.0f), - CreateScalarAddComputation(F32, &builder), {0}); - auto result = builder.Add(builder.GetTupleElement(prev, 0), addend); - builder.Tuple({result, builder.GetTupleElement(infeed, 1)}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto infeed = Infeed(&builder, infeed_shape); + auto addend = + Reduce(GetTupleElement(infeed, 0), ConstantR0(&builder, 0.0f), + CreateScalarAddComputation(F32, &builder), {0}); + auto result = Add(GetTupleElement(prev, 0), addend); + Tuple(&builder, {result, GetTupleElement(infeed, 1)}); return builder.Build().ConsumeValueOrDie(); }; // Create the first while loop with infeed1_shape. - auto init = builder.Tuple( - {builder.ConstantR0(0.0f), builder.ConstantR0(true)}); - auto while1 = builder.While(condition, build_body(infeed1_shape), init); - auto result1 = builder.Tuple( - {builder.GetTupleElement(while1, 0), builder.ConstantR0(true)}); + auto init = Tuple(&builder, {ConstantR0(&builder, 0.0f), + ConstantR0(&builder, true)}); + auto while1 = While(condition, build_body(infeed1_shape), init); + auto result1 = Tuple( + &builder, {GetTupleElement(while1, 0), ConstantR0(&builder, true)}); // Create the second while loop with infeed2_shape. Note that the result from // the first while loop is used as the initial value. - auto while2 = builder.While(condition, build_body(infeed2_shape), result1); - builder.GetTupleElement(while2, 0); + auto while2 = While(condition, build_body(infeed2_shape), result1); + GetTupleElement(while2, 0); // Build the computation. 
auto computation = builder.Build().ConsumeValueOrDie(); // Send the first 4 Infeed data of shape Tuple(F32[2], PRED). ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({1, 2}).get(), - Literal::CreateR0(true).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1, 2}).get(), + LiteralUtil::CreateR0(true).get()}))); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({3, 4}).get(), - Literal::CreateR0(true).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({3, 4}).get(), + LiteralUtil::CreateR0(true).get()}))); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({5, 6}).get(), - Literal::CreateR0(true).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({5, 6}).get(), + LiteralUtil::CreateR0(true).get()}))); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({7, 8}).get(), - Literal::CreateR0(false).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({7, 8}).get(), + LiteralUtil::CreateR0(false).get()}))); // Asynchronously launch the execution on the device. std::unique_ptr result; @@ -273,14 +275,14 @@ TEST_F(InfeedTest, DISABLED_TwoInfeedsInTotalOrder) { // Infeed data, and send the rest Infeed data of shape Tuple(F32[3], PRED). 
sleep(1); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({1, 2, 3}).get(), - Literal::CreateR0(true).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1, 2, 3}).get(), + LiteralUtil::CreateR0(true).get()}))); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({7, 8, 9}).get(), - Literal::CreateR0(false).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({7, 8, 9}).get(), + LiteralUtil::CreateR0(false).get()}))); ASSERT_IS_OK(client_->TransferToInfeed( - *Literal::MakeTuple({Literal::CreateR1({4, 5, 6}).get(), - Literal::CreateR0(true).get()}))); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({4, 5, 6}).get(), + LiteralUtil::CreateR0(true).get()}))); // Wait for the execution to be done, and transfer the result. delete computation_thread; // Joins the thread. diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index 973aac8766f5aabca15e5173b43480c113c100dd..a434c04a980b9b3cd849792b97a0d9e965ba09f2 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -17,10 +17,10 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -32,9 +32,9 @@ const char* const kTriple_android_arm = "armv7-none-android"; struct IntrinsicTestSpec { HloOpcode opcode; - tensorflow::StringPiece triple; - tensorflow::StringPiece features; - tensorflow::StringPiece check_lines; + absl::string_view triple; + absl::string_view features; + absl::string_view check_lines; }; // Tests that unary functions get lowered using intrinsic calls. @@ -65,9 +65,8 @@ class CpuUnaryIntrinsicTest features = ""; } - return tensorflow::strings::StrCat(opcode.c_str(), "_On_", triple.c_str(), - features.empty() ? "" : "_With", - features.c_str()); + return absl::StrCat(opcode, "_On_", triple, + (features.empty() ? "" : "_With"), features); } }; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc index d6e0425c5542be89835571f0103b1829f63cc2c2..3b87683ffffefd2aa24dd234cc072425bef00a24 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" namespace xla { namespace cpu { @@ -38,7 +38,9 @@ while_body { while_cond { arg_cond = f32[2,3,2] parameter(0) - ROOT unknown = pred[] infeed() + token = token[] after-all() + infeed = (pred[], token[]) infeed(token) + ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0 } ENTRY main { @@ -49,18 +51,19 @@ ENTRY main { {{2, 1}, {2001, 3002}, {2001, 2002}}}) const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body - out0 = () outfeed(f32[2,3,2] const_a) - ROOT out1 = () outfeed(f32[2,3,2] const_b) + token = token[] after-all() + out0 = token[] outfeed(f32[2,3,2] const_a, token[] token) + ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token) } )"; string filecheck_pattern = R"( -CHECK: private constant [2 x [3 x [2 x float]]] -CHECK-NOT: private constant [2 x [3 x [2 x float]]] +CHECK: private constant [48 x i8] +CHECK-NOT: private constant [48 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", @@ -78,34 +81,37 @@ TEST_F(CpuDuplicateConstantsTest, RepeatedTupleConstants) { HloModule RepeatedConstants while_body { - arg_body = (f32[2,1]{1,0}, f32[2]{0}) parameter(0) - ROOT const = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} )) + arg_body = (f32[2,1]{1,0}, f32[1]{0}) parameter(0) + ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} )) } while_cond { - arg_cond = (f32[2,1]{1,0}, f32[2]{0}) parameter(0) - ROOT unknown = pred[] infeed() + arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0) + token = token[] 
after-all() + infeed = (pred[], token[]) infeed(token) + ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0 } ENTRY main { param = f32[2,3,2] parameter(0) - const_a = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} )) - const_b = (f32[2,1]{1,0}, f32[2]{0}) while((f32[2,1]{1,0}, f32[2]{0}) const_a), condition=while_cond, body=while_body + const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} )) + const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body - out0 = () outfeed((f32[2,1]{1,0}, f32[2]{0}) const_a) - ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[2]{0}) const_b) + token = token[] after-all() + out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token) + ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token) } )"; string filecheck_pattern = R"( -CHECK: private constant [2 x float] -CHECK: private constant [2 x [1 x float]] -CHECK-NOT: private constant [2 x float] -CHECK-NOT: private constant [2 x [1 x float]] +CHECK: private constant [4 x i8] +CHECK: private constant [8 x i8] +CHECK-NOT: private constant [4 x i8] +CHECK-NOT: private constant [8 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc index 3b6b0ed74065615fb9e47a0ec3c6c4ab078e45c4..bb105194f1c9001ca4d9fff9174e1ea7e5d8b72a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc @@ -16,9 +16,9 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "llvm/IR/Module.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -42,7 +42,7 @@ TEST_F(CpuNoAliasTest, Concat) { HloComputation::Builder builder(TestName()); std::unique_ptr literal = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto param_shape = ShapeUtil::MakeShape(F32, {2, 2}); HloInstruction* param_x = builder.AddInstruction( HloInstruction::CreateParameter(0, param_shape, "x")); @@ -62,7 +62,8 @@ TEST_F(CpuNoAliasTest, Concat) { // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it. auto status_or_buffer_assn = BufferAssigner::Run( - hlo_module.get(), MakeUnique(hlo_module.get()), + hlo_module.get(), + absl::make_unique(hlo_module.get()), backend().compiler()->BufferSizeBytesFunction(), [](LogicalBuffer::Color) { return /*alignment=*/1; }); ASSERT_EQ(status_or_buffer_assn.status(), Status::OK()); @@ -78,7 +79,7 @@ TEST_F(CpuNoAliasTest, Concat) { llvm::Function* func = llvm::cast( ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context))); llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func); - llvm::IRBuilder<> ir_builder(bb); + llvm::IRBuilder<> b(bb); auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0); llvm_ir::IrArray::Index zero2D({zero, zero}); @@ -90,7 +91,7 @@ TEST_F(CpuNoAliasTest, Concat) { ir_module.getOrInsertGlobal("param_x", array2d_type); llvm_ir::IrArray param_x_array(param_x_val, param_shape); aa.AddAliasingInformationToIrArray(*param_x, ¶m_x_array); - param_x_array.EmitReadArrayElement(zero2D, &ir_builder) + param_x_array.EmitReadArrayElement(zero2D, &b) 
->setName("read_param_x_array"); } @@ -100,7 +101,7 @@ TEST_F(CpuNoAliasTest, Concat) { auto shape = ShapeUtil::MakeShape(F32, {2, 4}); llvm_ir::IrArray concat1_array(concat1_val, shape); aa.AddAliasingInformationToIrArray(*concat1, &concat1_array); - concat1_array.EmitReadArrayElement(zero2D, &ir_builder) + concat1_array.EmitReadArrayElement(zero2D, &b) ->setName("read_concat1_array"); } @@ -110,7 +111,7 @@ TEST_F(CpuNoAliasTest, Concat) { auto shape = ShapeUtil::MakeShape(F32, {2, 6}); llvm_ir::IrArray concat2_array(concat2_val, shape); aa.AddAliasingInformationToIrArray(*concat2, &concat2_array); - concat2_array.EmitReadArrayElement(zero2D, &ir_builder) + concat2_array.EmitReadArrayElement(zero2D, &b) ->setName("read_concat2_array"); } diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc index 879372eb13884cdb7edd8cfb3e8b4bac4e314951..e2c7af541eede5265f274c72f55305549f059839 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" namespace xla { namespace cpu { @@ -32,16 +32,18 @@ ENTRY main { {{{1, 2}, {1001, 1002}, {2001, 2002}}, {{2, 1}, {2001, 3002}, {2001, 2002}}}) - ROOT out = () outfeed(f32[2,3,2] const_a) + token = token[] after-all() + outfeed = token[] outfeed(f32[2,3,2] const_a, token) + ROOT root = () tuple() } )"; string filecheck_pattern = R"( -CHECK: private constant [2 x [3 x [2 x float]]] +CHECK: private constant [48 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", @@ -52,6 +54,33 @@ CHECK: private constant [2 x [3 x [2 x float]]] /*match_optimized_ir=*/false); } +TEST_F(CpuOutfeedTest, OutfeedTokenInTuple) { + const string hlo_text = R"( +HloModule OutfeedTokenInTuple + +ENTRY main { + const = f32[] constant(42) + epoch = token[] after-all() + outfeed.tok = token[] outfeed(const, epoch) + ROOT root = (token[], f32[]) tuple(outfeed.tok, const) +} +)"; + + string filecheck_pattern = R"( +CHECK: Outfeed +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + CpuAotCompilationOptions options{ + /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/false); +} } // namespace } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index 
cd1165e23812861ba9951546b7dd744529232196..1bd4b59dd604687589eee061d34aa9ca94f6d700 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h" +#include "absl/algorithm/container.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" @@ -23,14 +24,14 @@ namespace xla { namespace cpu { VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size, - llvm::IRBuilder<>* ir_builder, + llvm::IRBuilder<>* b, std::string name) : vector_size_(vector_size), primitive_type_(primitive_type), - ir_builder_(ir_builder), + b_(b), name_(std::move(name)) { scalar_type_ = llvm_ir::PrimitiveTypeToIrType( - primitive_type, ir_builder_->GetInsertBlock()->getModule()); + primitive_type, b_->GetInsertBlock()->getModule()); scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_); vector_type_ = llvm::VectorType::get(scalar_type_, vector_size); vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_); @@ -63,9 +64,9 @@ llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) { llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs, llvm::Value* rhs) { if (scalar_type_->isFloatingPointTy()) { - return ir_builder()->CreateFMul(lhs, rhs, name()); + return b()->CreateFMul(lhs, rhs, name()); } else { - return ir_builder()->CreateMul(lhs, rhs, name()); + return b()->CreateMul(lhs, rhs, name()); } } @@ -76,13 +77,13 @@ llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) { llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); - return ir_builder()->CreateFSub(lhs, rhs); + return b()->CreateFSub(lhs, rhs); } llvm::Value* 
VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); if (scalar_type_->isFloatingPointTy()) { - return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_); + return llvm_ir::EmitFloatMax(lhs, rhs, b_); } else { LOG(FATAL) << "Max for integers is unimplemented"; } @@ -91,13 +92,13 @@ llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) { llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) { AssertCorrectTypes({a}); return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a}, - {a->getType()}, ir_builder()); + {a->getType()}, b()); } llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); if (scalar_type_->isFloatingPointTy()) { - return ir_builder()->CreateFDiv(lhs, rhs, name()); + return b()->CreateFDiv(lhs, rhs, name()); } else { LOG(FATAL) << "Division for integers is unimplemented"; } @@ -111,42 +112,41 @@ llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a, CHECK(low.compare(high) == llvm::APFloat::cmpLessThan); CHECK(scalar_type_->isFloatingPointTy()); return llvm_ir::EmitFloatMin( - llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), ir_builder_), - GetConstantFloat(type, high), ir_builder_); + llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), b_), + GetConstantFloat(type, high), b_); } llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); - return I1ToFloat(ir_builder()->CreateFCmpOEQ(lhs, rhs, name())); + return I1ToFloat(b()->CreateFCmpOEQ(lhs, rhs, name())); } llvm::Value* VectorSupportLibrary::FCmpOLTMask(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); - return I1ToFloat(ir_builder()->CreateFCmpOLT(lhs, rhs, name())); + return I1ToFloat(b()->CreateFCmpOLT(lhs, rhs, name())); } llvm::Value* VectorSupportLibrary::FCmpULEMask(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); - return I1ToFloat(ir_builder()->CreateFCmpULE(lhs, 
rhs, name())); + return I1ToFloat(b()->CreateFCmpULE(lhs, rhs, name())); } llvm::Value* VectorSupportLibrary::I1ToFloat(llvm::Value* i1) { bool is_vector = llvm::isa(i1->getType()); llvm::Type* integer_type = IntegerTypeForFloatSize(is_vector); - return ir_builder()->CreateBitCast( - ir_builder()->CreateSExt(i1, integer_type, name()), - is_vector ? vector_type() : scalar_type(), name()); + return b()->CreateBitCast(b()->CreateSExt(i1, integer_type, name()), + is_vector ? vector_type() : scalar_type(), name()); } llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) { CHECK(scalar_type()->isFloatingPointTy()); const llvm::DataLayout& data_layout = - ir_builder()->GetInsertBlock()->getModule()->getDataLayout(); + b()->GetInsertBlock()->getModule()->getDataLayout(); int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type()); - llvm::Type* scalar_int_type = ir_builder()->getIntNTy(float_size_bits); + llvm::Type* scalar_int_type = b()->getIntNTy(float_size_bits); if (vector) { return llvm::VectorType::get(scalar_int_type, vector_size()); } else { @@ -156,7 +156,7 @@ llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) { llvm::Value* VectorSupportLibrary::BroadcastScalar(llvm::Value* x) { CHECK_EQ(x->getType(), scalar_type()); - return ir_builder()->CreateVectorSplat(vector_size(), x, name()); + return b()->CreateVectorSplat(vector_size(), x, name()); } llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs, @@ -164,10 +164,9 @@ llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs, AssertCorrectTypes({lhs, rhs}); llvm::Type* int_type = IntegerTypeForFloatSize(lhs->getType() == vector_type()); - return ir_builder()->CreateBitCast( - ir_builder()->CreateAnd( - ir_builder()->CreateBitCast(lhs, int_type, name()), - ir_builder()->CreateBitCast(rhs, int_type, name()), name()), + return b()->CreateBitCast( + b()->CreateAnd(b()->CreateBitCast(lhs, int_type, name()), + b()->CreateBitCast(rhs, int_type, name()), name()), 
vector_type()); } @@ -175,9 +174,8 @@ llvm::Value* VectorSupportLibrary::FloatNot(llvm::Value* lhs) { AssertCorrectTypes({lhs}); llvm::Type* int_type = IntegerTypeForFloatSize(lhs->getType() == vector_type()); - return ir_builder()->CreateBitCast( - ir_builder()->CreateNot( - ir_builder()->CreateBitCast(lhs, int_type, name()), name()), + return b()->CreateBitCast( + b()->CreateNot(b()->CreateBitCast(lhs, int_type, name()), name()), vector_type()); } @@ -185,47 +183,43 @@ llvm::Value* VectorSupportLibrary::FloatOr(llvm::Value* lhs, llvm::Value* rhs) { AssertCorrectTypes({lhs, rhs}); llvm::Type* int_type = IntegerTypeForFloatSize(lhs->getType() == vector_type()); - return ir_builder()->CreateBitCast( - ir_builder()->CreateOr(ir_builder()->CreateBitCast(lhs, int_type, name()), - ir_builder()->CreateBitCast(rhs, int_type, name()), - name()), + return b()->CreateBitCast( + b()->CreateOr(b()->CreateBitCast(lhs, int_type, name()), + b()->CreateBitCast(rhs, int_type, name()), name()), vector_type(), name()); } llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs, llvm::Value* rhs) { if (scalar_type_->isFloatingPointTy()) { - return ir_builder()->CreateFAdd(lhs, rhs, name()); + return b()->CreateFAdd(lhs, rhs, name()); } else { - return ir_builder()->CreateAdd(lhs, rhs, name()); + return b()->CreateAdd(lhs, rhs, name()); } } llvm::Value* VectorSupportLibrary::ComputeOffsetPointer( llvm::Value* base_pointer, llvm::Value* offset_elements) { if (base_pointer->getType() != scalar_pointer_type()) { - base_pointer = ir_builder()->CreateBitCast(base_pointer, - scalar_pointer_type(), name()); + base_pointer = + b()->CreateBitCast(base_pointer, scalar_pointer_type(), name()); } - return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements}, - name()); + return b()->CreateInBoundsGEP(base_pointer, {offset_elements}, name()); } llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) { if (pointer->getType() != vector_pointer_type()) { - pointer = - 
ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name()); + pointer = b()->CreateBitCast(pointer, vector_pointer_type(), name()); } - return ir_builder()->CreateAlignedLoad( + return b()->CreateAlignedLoad( pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name()); } llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) { if (pointer->getType() != scalar_pointer_type()) { - pointer = - ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name()); + pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name()); } - return ir_builder()->CreateAlignedLoad( + return b()->CreateAlignedLoad( pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name()); } @@ -233,30 +227,28 @@ void VectorSupportLibrary::StoreVector(llvm::Value* value, llvm::Value* pointer) { AssertCorrectTypes({value}); if (pointer->getType() != vector_pointer_type()) { - pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type()); + pointer = b()->CreateBitCast(pointer, vector_pointer_type()); } - ir_builder()->CreateAlignedStore( - value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)); + b()->CreateAlignedStore(value, pointer, + ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)); } void VectorSupportLibrary::StoreScalar(llvm::Value* value, llvm::Value* pointer) { AssertCorrectTypes({value}); if (pointer->getType() != scalar_pointer_type()) { - pointer = - ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name()); + pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name()); } - ir_builder()->CreateAlignedStore( - value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)); + b()->CreateAlignedStore(value, pointer, + ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)); } llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) { if (pointer->getType() != scalar_pointer_type()) { - pointer = - ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name()); + pointer = 
b()->CreateBitCast(pointer, scalar_pointer_type(), name()); } - return ir_builder()->CreateVectorSplat( - vector_size(), ir_builder()->CreateLoad(pointer), name()); + return b()->CreateVectorSplat(vector_size(), b()->CreateLoad(pointer), + name()); } llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) { @@ -267,20 +259,19 @@ llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) { for (unsigned j = 0; j < vector_size(); ++j) { if (j < (i / 2)) { - mask[j] = ir_builder()->getInt32(i / 2 + j); + mask[j] = b()->getInt32(i / 2 + j); } else { - mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty()); + mask[j] = llvm::UndefValue::get(b()->getInt32Ty()); } } - llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector( - vector, llvm::UndefValue::get(vector_type()), - llvm::ConstantVector::get(mask), ""); + llvm::Value* half_remaining_lanes = + b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()), + llvm::ConstantVector::get(mask), ""); vector = Add(vector, half_remaining_lanes); } - return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0), - name()); + return b()->CreateExtractElement(vector, b()->getInt32(0), name()); } llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs, @@ -307,19 +298,19 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs, // vector, which are the lanes 2 and 3 in the rhs vector. for (int i = 0; i < vector_size(); i += 2) { int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2); - mask_a.push_back(ir_builder()->getInt32(increment + i)); - mask_b.push_back(ir_builder()->getInt32(increment + i + 1)); + mask_a.push_back(b()->getInt32(increment + i)); + mask_b.push_back(b()->getInt32(increment + i + 1)); } for (int i = 0; i < vector_size(); i += 2) { int increment = i < vector_size() / 2 ? 
(vector_size() / 2) : vector_size(); - mask_a.push_back(ir_builder()->getInt32(increment + i)); - mask_b.push_back(ir_builder()->getInt32(increment + i + 1)); + mask_a.push_back(b()->getInt32(increment + i)); + mask_b.push_back(b()->getInt32(increment + i + 1)); } - llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector( - lhs, rhs, llvm::ConstantVector::get(mask_a)); - llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector( - lhs, rhs, llvm::ConstantVector::get(mask_b)); + llvm::Value* shuffle_0 = + b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_a)); + llvm::Value* shuffle_1 = + b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_b)); return Add(shuffle_0, shuffle_1); } @@ -327,23 +318,21 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs, llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) { llvm::SmallVector mask; for (int i = 0; i < vector_size() / 2; i++) { - mask.push_back(ir_builder()->getInt32(i)); + mask.push_back(b()->getInt32(i)); } - return ir_builder()->CreateShuffleVector(vector, - llvm::UndefValue::get(vector_type()), - llvm::ConstantVector::get(mask)); + return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()), + llvm::ConstantVector::get(mask)); } llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) { llvm::SmallVector mask; for (int i = 0; i < vector_size() / 2; i++) { - mask.push_back(ir_builder()->getInt32(i + vector_size() / 2)); + mask.push_back(b()->getInt32(i + vector_size() / 2)); } - return ir_builder()->CreateShuffleVector(vector, - llvm::UndefValue::get(vector_type()), - llvm::ConstantVector::get(mask)); + return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()), + llvm::ConstantVector::get(mask)); } std::vector VectorSupportLibrary::ComputeHorizontalSums( @@ -360,8 +349,8 @@ std::vector VectorSupportLibrary::ComputeHorizontalSums( [this](llvm::Value* vector) { return AddReduce(vector); }); if 
(init_values) { for (int64 i = 0, e = result.size(); i < e; i++) { - result[i] = Add(result[i], ir_builder()->CreateExtractElement( - init_values, ir_builder()->getInt32(i))); + result[i] = Add(result[i], + b()->CreateExtractElement(init_values, b()->getInt32(i))); } } return result; @@ -398,9 +387,9 @@ VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums( std::vector results; for (int i = 0; i < lane_width; i++) { - llvm::Value* scalar_result = ir_builder()->CreateExtractElement( - i < (lane_width / 2) ? low : high, - ir_builder()->getInt32(i % (lane_width / 2)), name()); + llvm::Value* scalar_result = + b()->CreateExtractElement(i < (lane_width / 2) ? low : high, + b()->getInt32(i % (lane_width / 2)), name()); results.push_back(scalar_result); } @@ -415,17 +404,36 @@ llvm::Value* VectorSupportLibrary::GetZeroScalar() { return llvm::Constant::getNullValue(scalar_type()); } -LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder) - : ir_builder_(ir_builder) { - alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_); +LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* b) : b_(b) { + alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", b_); } -llvm::Value* LlvmVariable::Get() const { - return ir_builder_->CreateLoad(alloca_); -} +llvm::Value* LlvmVariable::Get() const { return b_->CreateLoad(alloca_); } void LlvmVariable::Set(llvm::Value* new_value) { - ir_builder_->CreateStore(new_value, alloca_); + b_->CreateStore(new_value, alloca_); +} + +TileVariable::TileVariable(VectorSupportLibrary* vector_support, + std::vector initial_value) { + for (llvm::Value* initial_vector_value : initial_value) { + storage_.emplace_back(vector_support, initial_vector_value); + } +} + +std::vector TileVariable::Get() const { + std::vector result; + absl::c_transform(storage_, std::back_inserter(result), + [&](VectorVariable vect_var) { return vect_var.Get(); }); + return result; +} + +void TileVariable::Set(absl::Span value) { + 
CHECK_EQ(value.size(), storage_.size()); + for (int64 i = 0, e = value.size(); i < e; i++) { + storage_[i].Set(value[i]); + } } + } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index edcaec584997b17dce30b8c46fda4abc78441064..5690d2be2fe3e21c96b51a5226e0b29148217fd1 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -45,11 +46,11 @@ class VectorSupportLibrary { // instance (i.e. LoadVector will load a vector of type <`vector_size` x // `primitive_type`>). VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size, - llvm::IRBuilder<>* ir_builder, std::string name); + llvm::IRBuilder<>* b, std::string name); llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs); llvm::Value* Mul(int64 lhs, llvm::Value* rhs) { - return Mul(ir_builder()->getInt64(lhs), rhs); + return Mul(b()->getInt64(lhs), rhs); } llvm::Value* Mul(const llvm::APFloat& lhs, llvm::Value* rhs) { return Mul(GetConstantFloat(rhs->getType(), lhs), rhs); @@ -62,7 +63,7 @@ class VectorSupportLibrary { llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs); llvm::Value* Add(int64 lhs, llvm::Value* rhs) { - return Add(ir_builder()->getInt64(lhs), rhs); + return Add(b()->getInt64(lhs), rhs); } llvm::Value* Add(const llvm::APFloat& lhs, llvm::Value* rhs) { return Add(GetConstantFloat(rhs->getType(), lhs), rhs); @@ -146,13 +147,11 @@ class VectorSupportLibrary { llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer, llvm::Value* offset_elements, int64 scale) { return ComputeOffsetPointer( - base_pointer, - ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements)); + 
base_pointer, b_->CreateMul(b_->getInt64(scale), offset_elements)); } llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer, int64 offset_elements) { - return ComputeOffsetPointer(base_pointer, - ir_builder()->getInt64(offset_elements)); + return ComputeOffsetPointer(base_pointer, b()->getInt64(offset_elements)); } llvm::Value* LoadVector(llvm::Value* pointer); @@ -163,7 +162,7 @@ class VectorSupportLibrary { } llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) { - return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements)); + return LoadVector(base_pointer, b()->getInt64(offset_elements)); } llvm::Value* LoadScalar(llvm::Value* pointer); @@ -174,7 +173,7 @@ class VectorSupportLibrary { } llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) { - return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements)); + return LoadScalar(base_pointer, b()->getInt64(offset_elements)); } void StoreVector(llvm::Value* value, llvm::Value* pointer); @@ -186,7 +185,7 @@ class VectorSupportLibrary { void StoreVector(llvm::Value* value, llvm::Value* base_pointer, int64 offset_elements) { - StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements)); + StoreVector(value, base_pointer, b()->getInt64(offset_elements)); } void StoreScalar(llvm::Value* value, llvm::Value* pointer); @@ -197,7 +196,7 @@ class VectorSupportLibrary { void StoreScalar(llvm::Value* value, llvm::Value* base_pointer, int64 offset_elements) { - StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements)); + StoreScalar(base_pointer, b()->getInt64(offset_elements)); } llvm::Value* LoadBroadcast(llvm::Value* pointer); @@ -206,7 +205,7 @@ class VectorSupportLibrary { return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements)); } llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) { - return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements)); + return 
LoadBroadcast(base_pointer, b()->getInt64(offset_elements)); } // Compute the horizontal sum of each vector in `vectors`. The i'th element @@ -219,7 +218,7 @@ class VectorSupportLibrary { llvm::Value* GetZeroVector(); llvm::Value* GetZeroScalar(); - llvm::IRBuilder<>* ir_builder() const { return ir_builder_; } + llvm::IRBuilder<>* b() const { return b_; } int64 vector_size() const { return vector_size_; } llvm::Type* vector_type() const { return vector_type_; } llvm::Type* vector_pointer_type() const { return vector_pointer_type_; } @@ -276,7 +275,7 @@ class VectorSupportLibrary { int64 vector_size_; PrimitiveType primitive_type_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; llvm::Type* vector_type_; llvm::Type* vector_pointer_type_; llvm::Type* scalar_type_; @@ -288,22 +287,21 @@ class VectorSupportLibrary { // can later convert to a SSA value. class LlvmVariable { public: - LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder); + LlvmVariable(llvm::Type*, llvm::IRBuilder<>* b); llvm::Value* Get() const; void Set(llvm::Value* new_value); private: llvm::AllocaInst* alloca_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; }; class VectorVariable : public LlvmVariable { public: VectorVariable(VectorSupportLibrary* vector_support, llvm::Value* initial_value) - : LlvmVariable(vector_support->vector_type(), - vector_support->ir_builder()) { + : LlvmVariable(vector_support->vector_type(), vector_support->b()) { Set(initial_value); } }; @@ -312,11 +310,25 @@ class ScalarVariable : public LlvmVariable { public: ScalarVariable(VectorSupportLibrary* vector_support, llvm::Value* initial_value) - : LlvmVariable(vector_support->scalar_type(), - vector_support->ir_builder()) { + : LlvmVariable(vector_support->scalar_type(), vector_support->b()) { Set(initial_value); } }; + +// This wraps a set of alloca-backed stack variables that can, as a whole, store +// a tile. 
A "tile" is a sequence of vectors that is typically used as a 2D +// grid of scalar values (e.g. for tiled GEMMs). +class TileVariable { + public: + TileVariable(VectorSupportLibrary* vector_support, + std::vector initial_value); + + std::vector Get() const; + void Set(absl::Span value); + + private: + std::vector storage_; +}; } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc index 47543b2082f55cf7b8cf60f1c5bbb16a0a609912..b9e47f5aade3334bece28643e6e32ecfce3bf67b 100644 --- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc @@ -37,7 +37,7 @@ void XfeedQueueManager::Reset() { } void XfeedQueueManager::EnqueueBuffersAtomically( - tensorflow::gtl::ArraySlice buffers) { + absl::Span buffers) { tensorflow::mutex_lock l(mu_); bool was_empty = enqueued_buffers_.empty(); for (XfeedBuffer* b : buffers) { diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h index b4ace232607e14fbfec01d48946f0031d96cd027..990ff94ba2338cb663b655ca3106bda83ab718a3 100644 --- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h +++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h @@ -22,10 +22,10 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" namespace xla { @@ -63,8 +63,7 @@ class XfeedQueueManager { // called when the buffer will no longer be accessed by the XfeedManager, // either as a result of a call to Reset or because the runtime has dequeued // and used the buffer. 
- void EnqueueBuffersAtomically( - tensorflow::gtl::ArraySlice buffers); + void EnqueueBuffersAtomically(absl::Span buffers); // Blocks until the queue is non-empty, then returns the buffer at the head of // the queue. Sets the current buffer to be the returned buffer. It is an diff --git a/tensorflow/compiler/xla/service/defuser.h b/tensorflow/compiler/xla/service/defuser.h index 56b28fd22da1ea6bc19f98e76f0f2ef4044cd3af..c326beb899f9a434d772c0fda032efc9113b6f42 100644 --- a/tensorflow/compiler/xla/service/defuser.h +++ b/tensorflow/compiler/xla/service/defuser.h @@ -29,7 +29,7 @@ class Defuser : public HloPassInterface { public: Defuser() {} ~Defuser() override {} - tensorflow::StringPiece name() const override { return "defuser"; } + absl::string_view name() const override { return "defuser"; } // Run defusion on the given module. Returns whether the module was // changed. diff --git a/tensorflow/compiler/xla/service/defuser_test.cc b/tensorflow/compiler/xla/service/defuser_test.cc index 32b5c5d35fae61ae6cb17fafcada1abd6c3c088c..e727ba49cb6321e499b5d50d5f45e7f7f6bb6fef 100644 --- a/tensorflow/compiler/xla/service/defuser_test.cc +++ b/tensorflow/compiler/xla/service/defuser_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/defuser.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" @@ -124,7 +124,7 @@ TEST_F(DefuserTest, NonTrivialFusionInstruction) { auto div = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kDivide, mul, param3)); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto add2 = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div)); @@ -162,7 +162,7 @@ TEST_F(DefuserTest, MultipleFusionInstructions) { auto div = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kDivide, mul, param3)); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto add2 = builder.AddInstruction( HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div)); diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc index d938f3a2c4b5bfdd70d5a614b9890b4d7bf050f7..ba2a674d9af547ad574ae49e1e87f3afcaf6112a 100644 --- a/tensorflow/compiler/xla/service/despecializer.cc +++ b/tensorflow/compiler/xla/service/despecializer.cc @@ -21,8 +21,31 @@ limitations under the License. namespace xla { +namespace { + +// Pass which strips control dependencies from all instructions in the module. 
+class ControlDepRemover : public HloPassInterface { + public: + ControlDepRemover() = default; + absl::string_view name() const override { return "control-dep-remover"; } + + StatusOr Run(HloModule* module) override { + bool changed = false; + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + changed = changed || !instruction->control_predecessors().empty(); + TF_RETURN_IF_ERROR(instruction->DropAllControlDeps()); + } + } + return changed; + } +}; + +} // namespace + Despecializer::Despecializer() : pipeline_("despecializer") { // TODO(b/70588125): Also deal with window reversal in a fast way. + pipeline_.AddPass(); pipeline_.AddPass(); pipeline_.AddPass(); pipeline_.AddPass(); diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h index cc1695b7f863805e0b483478639c17cb9061310a..7be70add2f7566376b3179740e411d6341badf7c 100644 --- a/tensorflow/compiler/xla/service/despecializer.h +++ b/tensorflow/compiler/xla/service/despecializer.h @@ -33,7 +33,7 @@ namespace xla { class Despecializer : public HloPassInterface { public: Despecializer(); - tensorflow::StringPiece name() const override { return "despecializer"; } + absl::string_view name() const override { return "despecializer"; } StatusOr Run(HloModule* module) override; private: diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index e228bb56bce8febcca28ae171f6de90973d020ab..edbcb25247421cdb50a845df1ec8b1851970efe3 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -25,7 +25,7 @@ namespace xla { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( const se::Platform* platform, - tensorflow::gtl::ArraySlice stream_executors) + absl::Span stream_executors) : DeviceMemoryAllocator(platform), 
stream_executors_(stream_executors.begin(), stream_executors.end()) {} @@ -36,9 +36,8 @@ StatusOr StreamExecutorMemoryAllocator::Allocate( se::DeviceMemoryBase result = stream_executor->AllocateArray(size); if (size > 0 && result == nullptr) { return ResourceExhausted( - "Failed to allocate request for %s (%lluB) on device ordinal %d", - tensorflow::strings::HumanReadableNumBytes(size).c_str(), size, - device_ordinal); + "Failed to allocate request for %s (%uB) on device ordinal %d", + tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal); } return OwningDeviceMemory(result, device_ordinal, this); } @@ -61,12 +60,12 @@ StatusOr StreamExecutorMemoryAllocator::GetStreamExecutor( } if (device_ordinal >= stream_executors_.size()) { return InvalidArgument( - "device ordinal value (%d) >= number of devices (%zu)", device_ordinal, + "device ordinal value (%d) >= number of devices (%u)", device_ordinal, stream_executors_.size()); } if (stream_executors_[device_ordinal] == nullptr) { return NotFound("Device %s:%d present but not supported", - platform()->Name().c_str(), device_ordinal); + platform()->Name(), device_ordinal); } return stream_executors_[device_ordinal]; } diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index d87b86caf0d3acaa5bf9a455cff2315cedb2496d..a2308ee7a4137bbafe9804c30e33cc68d4628588 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -18,10 +18,10 @@ limitations under the License. 
#include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -80,7 +80,7 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( const se::Platform* platform, - tensorflow::gtl::ArraySlice stream_executors); + absl::Span stream_executors); StatusOr Allocate(int device_ordinal, uint64 size, bool retry_on_failure) override; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc index 2172ae0a29626660e8abd29a789e0baa3831519d..3e7373adc5ab8a60fd18348ce2477175aaaa8fd4 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc @@ -28,14 +28,14 @@ template Status DfsHloVisitorBase::HandleElementwiseUnary( HloInstructionPtr hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s", - HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); } template Status DfsHloVisitorBase::HandleElementwiseBinary( HloInstructionPtr hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s", - HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); } template diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 64678d9d7450974f68817f92526519697a83683c..5761573791d90e45c65b55124a4bae3c5b929ef1 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -19,14 +19,14 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -76,6 +76,7 @@ class DfsHloVisitorBase { virtual Status HandleClamp(HloInstructionPtr hlo) = 0; virtual Status HandleSelect(HloInstructionPtr hlo) = 0; + virtual Status HandleTupleSelect(HloInstructionPtr hlo) = 0; virtual Status HandleMaximum(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } @@ -105,6 +106,8 @@ class DfsHloVisitorBase { virtual Status HandleConvolution(HloInstructionPtr hlo) = 0; virtual Status HandleFft(HloInstructionPtr fft) = 0; virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0; + virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; + virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; virtual Status HandleCompare(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } @@ -183,6 +186,9 @@ class DfsHloVisitorBase { virtual Status HandleOr(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } + virtual Status HandleXor(HloInstructionPtr hlo) { + return HandleElementwiseBinary(hlo); + } virtual Status HandleShiftLeft(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } @@ -203,11 +209,11 @@ class DfsHloVisitorBase { virtual Status HandleInfeed(HloInstructionPtr hlo) = 0; virtual Status HandleOutfeed(HloInstructionPtr hlo) = 0; - virtual Status HandleHostCompute(HloInstructionPtr hlo) = 0; virtual Status 
HandleRng(HloInstructionPtr hlo) = 0; virtual Status HandleReverse(HloInstructionPtr hlo) = 0; virtual Status HandleSort(HloInstructionPtr hlo) = 0; virtual Status HandleConstant(HloInstructionPtr hlo) = 0; + virtual Status HandleIota(HloInstructionPtr hlo) = 0; virtual Status HandleGetTupleElement(HloInstructionPtr hlo) = 0; virtual Status HandleReduce(HloInstructionPtr hlo) = 0; virtual Status HandleBitcast(HloInstructionPtr hlo) = 0; @@ -228,6 +234,7 @@ class DfsHloVisitorBase { virtual Status HandleWhile(HloInstructionPtr hlo) = 0; virtual Status HandleConditional(HloInstructionPtr hlo) = 0; virtual Status HandleGather(HloInstructionPtr hlo) = 0; + virtual Status HandleScatter(HloInstructionPtr hlo) = 0; virtual Status HandlePad(HloInstructionPtr hlo) = 0; @@ -243,6 +250,8 @@ class DfsHloVisitorBase { virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0; + virtual Status HandleAfterAll(HloInstructionPtr token) = 0; + // Invoked to inform the visitor that the traversal has completed, and that // the root was "root". virtual Status FinishVisit(HloInstructionPtr root) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 240faebe62f5cee4f61b3c36b5e8f653cfd6db8e..4cd10ab06cd3b804406607212d3f3c316d6cff95 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -16,14 +16,14 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_ -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -79,6 +79,9 @@ class DfsHloVisitorWithDefaultBase Status HandleSelect(HloInstructionPtr select) override { return DefaultAction(select); } + Status HandleTupleSelect(HloInstructionPtr tuple_select) override { + return DefaultAction(tuple_select); + } Status HandleDot(HloInstructionPtr dot) override { return DefaultAction(dot); } @@ -91,6 +94,12 @@ class DfsHloVisitorWithDefaultBase Status HandleCrossReplicaSum(HloInstructionPtr crs) override { return DefaultAction(crs); } + Status HandleAllToAll(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } + Status HandleCollectivePermute(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } Status HandleRng(HloInstructionPtr random) override { return DefaultAction(random); } @@ -100,9 +109,6 @@ class DfsHloVisitorWithDefaultBase Status HandleOutfeed(HloInstructionPtr outfeed) override { return DefaultAction(outfeed); } - Status HandleHostCompute(HloInstructionPtr host_compute) override { - return DefaultAction(host_compute); - } Status HandleReverse(HloInstructionPtr reverse) override { return DefaultAction(reverse); } @@ -112,6 +118,9 @@ class DfsHloVisitorWithDefaultBase Status HandleConstant(HloInstructionPtr constant) override { return 
DefaultAction(constant); } + Status HandleIota(HloInstructionPtr iota) override { + return DefaultAction(iota); + } Status HandleGetTupleElement(HloInstructionPtr get_tuple_element) override { return DefaultAction(get_tuple_element); } @@ -188,6 +197,12 @@ class DfsHloVisitorWithDefaultBase Status HandleGather(HloInstructionPtr gather) override { return DefaultAction(gather); } + Status HandleScatter(HloInstructionPtr scatter) override { + return DefaultAction(scatter); + } + Status HandleAfterAll(HloInstructionPtr token) override { + return DefaultAction(token); + } // Invoked to inform the visitor that the traversal has completed, and that // the root was "root". diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc index 12faed69677cd99c6ed82c8d13dad3138d9461b7..09cb10d6ee579111b6e0cdb460b9af2b95d090db 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer.cc @@ -136,6 +136,7 @@ Status DecomposeBatchDot(HloInstruction* dot) { dot_dnums.add_rhs_contracting_dimensions(0); auto dot_r2 = computation->AddInstruction(HloInstruction::CreateDot( dot_shape_r2, lhs_slice_r2, rhs_slice_r2, dot_dnums)); + dot_r2->set_precision_config(dot->precision_config()); // Reshape Dot to R3 so we can concat along batch dimension. 
auto dot_r3 = computation->AddInstruction( diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h index 1959b687f16d6909a3283021c8635b3e65e6e412..fc38e317001695921d20f9bbe5775e61a8eeaa45 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer.h +++ b/tensorflow/compiler/xla/service/dot_decomposer.h @@ -29,7 +29,7 @@ class DotDecomposer : public HloPassInterface { DotDecomposer(bool decompose_batch_dot = true) : decompose_batch_dot_(decompose_batch_dot) {} ~DotDecomposer() = default; - tensorflow::StringPiece name() const override { return "dot_decomposer"; } + absl::string_view name() const override { return "dot_decomposer"; } // Run DotDecomposer pass on computations in 'module'. // Returns whether the 'module' was changed. diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 9a8bab353ef6b1e0b05b250d35296bc3cef8bc37..4bb1e071d8da75d0219d0b5cc9a6d16f1750a191 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -21,11 +21,15 @@ limitations under the License. #include // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" @@ -38,17 +42,16 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace xla { +using absl::StrCat; using llvm_ir::AsStringRef; using llvm_ir::IrArray; using llvm_ir::IrName; using llvm_ir::SetToFirstInsertPoint; -using tensorflow::strings::StrCat; namespace { @@ -61,13 +64,13 @@ int64 GlobalRandomValue() { llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, int64 mantissa_bits, - llvm::IRBuilder<>* ir_builder) { + llvm::IRBuilder<>* b) { // Integer and float types for casting and constant generation. llvm::Type* float_type = x->getType(); - llvm::IntegerType* int_type = ir_builder->getInt32Ty(); + llvm::IntegerType* int_type = b->getInt32Ty(); // Cast the input value to an integer for bitwise manipulation. - llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type); + llvm::Value* x_as_int = b->CreateBitCast(x, int_type); if (mantissa_bits < 23) { // Last remaining mantissa bit. @@ -77,22 +80,22 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, // equal to a base value of 0111... plus one bit if the last remaining // mantissa bit is 1. 
const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1; - llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr( - ir_builder->CreateAnd( - x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)), + llvm::Value* x_last_mantissa_bit = b->CreateLShr( + b->CreateAnd(x_as_int, + llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)), (23 - mantissa_bits)); - llvm::Value* x_rounding_bias = ir_builder->CreateAdd( - x_last_mantissa_bit, - llvm::ConstantInt::get(int_type, base_rounding_bias)); + llvm::Value* x_rounding_bias = + b->CreateAdd(x_last_mantissa_bit, + llvm::ConstantInt::get(int_type, base_rounding_bias)); // Add rounding bias, and mask out truncated bits. Note that the case // where adding the rounding bias overflows into the exponent bits is // correct; the non-masked mantissa bits will all be zero, and the // exponent will be incremented by one. const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1); - x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias); - x_as_int = ir_builder->CreateAnd( - x_as_int, llvm::ConstantInt::get(int_type, truncation_mask)); + x_as_int = b->CreateAdd(x_as_int, x_rounding_bias); + x_as_int = b->CreateAnd(x_as_int, + llvm::ConstantInt::get(int_type, truncation_mask)); } if (exponent_bits < 8) { @@ -120,29 +123,29 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, f32_exponent_bias - reduced_exponent_bias; // Do we overflow or underflow? 
- llvm::Value* x_exponent = ir_builder->CreateAnd( + llvm::Value* x_exponent = b->CreateAnd( x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask)); - llvm::Value* x_overflows = ir_builder->CreateICmpUGT( + llvm::Value* x_overflows = b->CreateICmpUGT( x_exponent, llvm::ConstantInt::get(int_type, reduced_max_exponent << 23)); - llvm::Value* x_underflows = ir_builder->CreateICmpULE( + llvm::Value* x_underflows = b->CreateICmpULE( x_exponent, llvm::ConstantInt::get(int_type, reduced_min_exponent << 23)); // Compute appropriately-signed values of zero and infinity. - llvm::Value* x_signed_zero = ir_builder->CreateAnd( + llvm::Value* x_signed_zero = b->CreateAnd( x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask)); - llvm::Value* x_signed_inf = ir_builder->CreateOr( + llvm::Value* x_signed_inf = b->CreateOr( x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask)); // Force to zero or infinity if overflow or underflow. (Note that this // truncates all denormal values to zero, rather than rounding them.) - x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int); - x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int); + x_as_int = b->CreateSelect(x_overflows, x_signed_inf, x_as_int); + x_as_int = b->CreateSelect(x_underflows, x_signed_zero, x_as_int); } // Cast the result back to a floating-point type. - llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type); + llvm::Value* result = b->CreateBitCast(x_as_int, float_type); // Correct result for NaN inputs. // @@ -154,60 +157,56 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, // // If the fast-math flags are set to assume no NaNs, the comparison is likely // to be optimized away, so there's no point in even emitting it. 
- if (!ir_builder->getFastMathFlags().noNaNs()) { - llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x); + if (!b->getFastMathFlags().noNaNs()) { + llvm::Value* x_is_nan = b->CreateFCmpUNO(x, x); if (mantissa_bits > 0) { - result = ir_builder->CreateSelect(x_is_nan, x, result); + result = b->CreateSelect(x_is_nan, x, result); } else { - result = ir_builder->CreateSelect( + result = b->CreateSelect( x_is_nan, llvm::ConstantFP::getInfinity(float_type), result); } } return result; } -llvm::Value* EmitF32ToBF16(llvm::Value* f32_value, - llvm::IRBuilder<>* ir_builder) { +llvm::Value* EmitF32ToBF16(llvm::Value* f32_value, llvm::IRBuilder<>* b) { auto reduced_precision = EmitReducePrecisionFloat( f32_value, /*exponent_bits=*/primitive_util::kBFloat16ExponentBits, - /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder); - auto as_int32 = - ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty()); - auto shifted = ir_builder->CreateLShr(as_int32, 16); - auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty()); - return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty()); + /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, b); + auto as_int32 = b->CreateBitCast(reduced_precision, b->getInt32Ty()); + auto shifted = b->CreateLShr(as_int32, 16); + auto truncated = b->CreateTrunc(shifted, b->getInt16Ty()); + return b->CreateBitCast(truncated, b->getInt16Ty()); } -llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value, - llvm::IRBuilder<>* ir_builder) { - auto as_int16 = - ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty()); - auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty()); - auto shifted = ir_builder->CreateShl(as_int32, 16); - return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy()); +llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value, llvm::IRBuilder<>* b) { + auto as_int16 = b->CreateBitCast(bf16_value, b->getInt16Ty()); + auto as_int32 = 
b->CreateZExt(as_int16, b->getInt32Ty()); + auto shifted = b->CreateShl(as_int32, 16); + return b->CreateBitCast(shifted, b->getFloatTy()); } llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value, PrimitiveType from_type, PrimitiveType to_type, llvm::Module* module, - llvm::IRBuilder<>* ir_builder) { + llvm::IRBuilder<>* b) { if (primitive_util::IsSignedIntegralType(from_type)) { - return ir_builder->CreateSIToFP( - integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module)); + return b->CreateSIToFP(integer_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module)); } else { CHECK(primitive_util::IsUnsignedIntegralType(from_type) || from_type == PRED); - return ir_builder->CreateUIToFP( - integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module)); + return b->CreateUIToFP(integer_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module)); } } } // namespace StatusOr ElementalIrEmitter::EmitUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { if (op->opcode() == HloOpcode::kCopy) { return operand_value; } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) || @@ -221,50 +220,52 @@ StatusOr ElementalIrEmitter::EmitUnaryOp( } StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { switch (op->opcode()) { case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); PrimitiveType to_type = op->shape().element_type(); - CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED); + CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED) + << from_type; if (from_type == to_type) { return operand_value; } + if (to_type == PRED) { + return b_->CreateZExt( + ICmpNE(operand_value, + llvm::ConstantInt::get(operand_value->getType(), 0)), + llvm_ir::PrimitiveTypeToIrType(PRED, module_)); + } if 
(primitive_util::IsIntegralType(to_type)) { - return ir_builder_->CreateIntCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_), - primitive_util::IsSignedIntegralType(from_type)); + return IntCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_), + primitive_util::IsSignedIntegralType(from_type)); } if (primitive_util::IsFloatingPointType(to_type)) { if (to_type == BF16) { - return EmitF32ToBF16( - EmitIntegralToFloating(operand_value, from_type, F32, module_, - ir_builder_), - ir_builder_); + return EmitF32ToBF16(EmitIntegralToFloating(operand_value, from_type, + F32, module_, b_), + b_); } return EmitIntegralToFloating(operand_value, from_type, to_type, - module_, ir_builder_); + module_, b_); } if (primitive_util::IsComplexType(to_type)) { auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType( primitive_util::ComplexComponentType(to_type), module_); if (primitive_util::IsSignedIntegralType(from_type)) { return EmitComposeComplex( - op, - ir_builder_->CreateSIToFP(operand_value, to_ir_component_type), - nullptr); + op, SIToFP(operand_value, to_ir_component_type), nullptr); } if (primitive_util::IsUnsignedIntegralType(from_type) || from_type == PRED) { return EmitComposeComplex( - op, - ir_builder_->CreateUIToFP(operand_value, to_ir_component_type), - nullptr); + op, UIToFP(operand_value, to_ir_component_type), nullptr); } } return Unimplemented("conversion from primitive type %s to %s", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str()); + PrimitiveType_Name(from_type), + PrimitiveType_Name(to_type)); } case HloOpcode::kBitcastConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -275,14 +276,13 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( } if (primitive_util::BitWidth(from_type) == primitive_util::BitWidth(to_type)) { - return ir_builder_->CreateBitCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return BitCast(operand_value, + 
llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return InvalidArgument( "bitcast conversion from primitive type %s to %s with unequal " "bit-widths (%u versus %u) ", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str(), + PrimitiveType_Name(from_type), PrimitiveType_Name(to_type), primitive_util::BitWidth(from_type), primitive_util::BitWidth(to_type)); } @@ -292,67 +292,55 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( if (is_signed) { auto type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto zero = llvm::ConstantInt::get(type, 0); - auto cmp = ir_builder_->CreateICmpSGE(operand_value, zero); - return ir_builder_->CreateSelect(cmp, operand_value, - ir_builder_->CreateNeg(operand_value)); + auto cmp = ICmpSGE(operand_value, GetZero(type)); + return Select(cmp, operand_value, Neg(operand_value)); } else { return operand_value; } } case HloOpcode::kClz: { - auto is_zero_undef = ir_builder_->getFalse(); - return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::ctlz, {operand_value, is_zero_undef}, - {operand_value->getType()}, ir_builder_); + auto is_zero_undef = b_->getFalse(); + return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ctlz, + {operand_value, is_zero_undef}, + {operand_value->getType()}, b_); } case HloOpcode::kSign: { - bool is_signed = - primitive_util::IsSignedIntegralType(op->shape().element_type()); + CHECK(primitive_util::IsSignedIntegralType(op->shape().element_type())) + << op->shape().element_type(); auto type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto zero = llvm::ConstantInt::get(type, 0); - auto cmp = ir_builder_->CreateICmpEQ(operand_value, zero); - if (is_signed) { - auto ashr = ir_builder_->CreateAShr(operand_value, - type->getIntegerBitWidth() - 1); - return ir_builder_->CreateSelect(cmp, zero, - ir_builder_->CreateOr(ashr, 1)); - } else { - return ir_builder_->CreateSelect(cmp, zero, - llvm::ConstantInt::get(type, 1)); - } + auto cmp = 
ICmpEQ(operand_value, GetZero(type)); + auto ashr = AShr(operand_value, type->getIntegerBitWidth() - 1); + return Select(cmp, GetZero(type), Or(ashr, 1)); } case HloOpcode::kNegate: - return ir_builder_->CreateNeg(operand_value); + return Neg(operand_value); case HloOpcode::kNot: { auto type = op->shape().element_type(); if (type == PRED) { // It is not sufficient to just call CreateNot() here because a PRED // is represented as an i8 and the truth value is stored only in the // bottom bit. - return ir_builder_->CreateZExt( - ir_builder_->CreateNot(ir_builder_->CreateTrunc( - operand_value, ir_builder_->getInt1Ty())), - llvm_ir::PrimitiveTypeToIrType(PRED, module_)); + return b_->CreateZExt(Not(Trunc(operand_value, b_->getInt1Ty())), + llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } else if (primitive_util::IsIntegralType(type)) { - return ir_builder_->CreateNot(operand_value); + return Not(operand_value); } return Unimplemented("unary op Not is not defined for type '%d'", type); } default: return Unimplemented("unary integer op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } StatusOr ElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { switch (op->opcode()) { case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); PrimitiveType to_type = op->shape().element_type(); - CHECK(primitive_util::IsFloatingPointType(from_type)); + CHECK(primitive_util::IsFloatingPointType(from_type)) << from_type; if (from_type == to_type) { return operand_value; } @@ -364,37 +352,42 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } return EmitComposeComplex( op, - ir_builder_->CreateFPCast( - operand_value, - llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)), + FPCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)), nullptr); } if (from_type == BF16) { 
TF_RET_CHECK(to_type != BF16); - operand_value = EmitBF16ToF32(operand_value, ir_builder_); + operand_value = EmitBF16ToF32(operand_value, b_); from_type = F32; if (from_type == to_type) { return operand_value; } } if (from_type == F32 && to_type == BF16) { - return EmitF32ToBF16(operand_value, ir_builder_); + return EmitF32ToBF16(operand_value, b_); + } + if (to_type == PRED) { + return b_->CreateZExt( + FCmpUNE(operand_value, + llvm::ConstantFP::get(operand_value->getType(), 0.0)), + llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } if (primitive_util::IsFloatingPointType(to_type)) { - return ir_builder_->CreateFPCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } if (primitive_util::IsSignedIntegralType(to_type)) { - return ir_builder_->CreateFPToSI( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPToSI(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } if (primitive_util::IsUnsignedIntegralType(to_type)) { - return ir_builder_->CreateFPToUI( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return FPToUI(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return Unimplemented("unhandled conversion operation: %s => %s", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str()); + PrimitiveType_Name(from_type), + PrimitiveType_Name(to_type)); } case HloOpcode::kBitcastConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -405,14 +398,13 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } if (primitive_util::BitWidth(from_type) == primitive_util::BitWidth(to_type)) { - return ir_builder_->CreateBitCast( - operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_)); + return BitCast(operand_value, + llvm_ir::PrimitiveTypeToIrType(to_type, module_)); } return InvalidArgument( "bitcast conversion from primitive type %s to 
%s with unequal " "bit-widths (%u versus %u) ", - PrimitiveType_Name(from_type).c_str(), - PrimitiveType_Name(to_type).c_str(), + PrimitiveType_Name(from_type), PrimitiveType_Name(to_type), primitive_util::BitWidth(from_type), primitive_util::BitWidth(to_type)); } @@ -428,56 +420,59 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( return EmitCos(op->shape().element_type(), operand_value); case HloOpcode::kSin: return EmitSin(op->shape().element_type(), operand_value); + case HloOpcode::kTanh: + return EmitTanh(op->shape().element_type(), operand_value); case HloOpcode::kFloor: - return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()}, - ir_builder_); + return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, + {operand_value}, + {operand_value->getType()}, b_); case HloOpcode::kCeil: - return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::ceil, {operand_value}, {operand_value->getType()}, - ir_builder_); + return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ceil, + {operand_value}, + {operand_value->getType()}, b_); case HloOpcode::kAbs: - return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::fabs, {operand_value}, {operand_value->getType()}, - ir_builder_); + return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, + {operand_value}, + {operand_value->getType()}, b_); case HloOpcode::kRoundNearestAfz: - return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::round, {operand_value}, {operand_value->getType()}, - ir_builder_); + return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round, + {operand_value}, + {operand_value->getType()}, b_); case HloOpcode::kSign: { // TODO(b/32151903): Ensure consistent sign behavior for -0.0. 
auto type = operand_value->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero); - auto olt = ir_builder_->CreateFCmpOLT(operand_value, zero); - return ir_builder_->CreateSelect( - oeq, zero, - ir_builder_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0), - llvm::ConstantFP::get(type, 1.0))); + auto oeq = FCmpOEQ(operand_value, zero); + auto olt = FCmpOLT(operand_value, zero); + return Select(oeq, zero, + Select(olt, llvm::ConstantFP::get(type, -1.0), + llvm::ConstantFP::get(type, 1.0))); } case HloOpcode::kIsFinite: { - // (x == x) && abs(x) != inf + // abs(x) o!= inf, this works because the comparison returns false if + // either operand is NaN. auto type = operand_value->getType(); - auto equal_self = - ir_builder_->CreateFCmpOEQ(operand_value, operand_value); auto abs_value = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::fabs, {operand_value}, {type}, ir_builder_); + llvm::Intrinsic::fabs, {operand_value}, {type}, b_); auto infinity = llvm::ConstantFP::getInfinity(type); - auto not_infinite = ir_builder_->CreateFCmpONE(abs_value, infinity); - auto result_i1 = ir_builder_->CreateAnd(equal_self, not_infinite); - return ir_builder_->CreateZExt( - result_i1, llvm_ir::PrimitiveTypeToIrType(PRED, module_)); + auto not_infinite = FCmpONE(abs_value, infinity); + return b_->CreateZExt(not_infinite, + llvm_ir::PrimitiveTypeToIrType(PRED, module_)); } case HloOpcode::kNegate: - return ir_builder_->CreateFNeg(operand_value); + return FNeg(operand_value); + case HloOpcode::kReal: + return operand_value; + case HloOpcode::kImag: + return llvm::ConstantFP::get(operand_value->getType(), 0.0); default: return Unimplemented("unary floating-point op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } StatusOr ElementalIrEmitter::EmitComplexUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { + const HloInstruction* op, llvm::Value* operand_value) { 
PrimitiveType input_type = op->operand(0)->shape().element_type(); PrimitiveType component_type = primitive_util::IsComplexType(input_type) @@ -489,13 +484,11 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto a = EmitExtractReal(operand_value); auto b = EmitExtractImag(operand_value); llvm::Type* llvm_ty = a->getType(); - auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a), - ir_builder_->CreateFMul(b, b)); + auto sum_sq = FAdd(FMul(a, a), FMul(b, b)); TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a)); auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); - return EmitComposeComplex( - op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle); + return EmitComposeComplex(op, FMul(one_half, log_sum_sq), angle); } case HloOpcode::kLog1p: { // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1) @@ -503,15 +496,12 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); llvm::Type* llvm_ty = a->getType(); auto one = llvm::ConstantFP::get(llvm_ty, 1.0); - auto a_plus_one = ir_builder_->CreateFAdd(a, one); - auto sum_sq = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(a_plus_one, a_plus_one), - ir_builder_->CreateFMul(b, b)); + auto a_plus_one = FAdd(a, one); + auto sum_sq = FAdd(FMul(a_plus_one, a_plus_one), FMul(b, b)); TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one)); auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); - return EmitComposeComplex( - op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle); + return EmitComposeComplex(op, FMul(one_half, log_sum_sq), angle); } case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); @@ -526,11 +516,8 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(to_component_type, module_); 
return EmitComposeComplex( - op, - ir_builder_->CreateFPCast(EmitExtractReal(operand_value), - to_ir_component_type), - ir_builder_->CreateFPCast(EmitExtractImag(operand_value), - to_ir_component_type)); + op, FPCast(EmitExtractReal(operand_value), to_ir_component_type), + FPCast(EmitExtractImag(operand_value), to_ir_component_type)); } case HloOpcode::kExp: { // e^(a+bi) = e^a*(cos(b)+sin(b)i) @@ -540,8 +527,7 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value))); TF_ASSIGN_OR_RETURN( auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value))); - return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b), - ir_builder_->CreateFMul(exp_a, sin_b)); + return EmitComposeComplex(op, FMul(exp_a, cos_b), FMul(exp_a, sin_b)); } case HloOpcode::kExpm1: { // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i @@ -552,9 +538,8 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( TF_ASSIGN_OR_RETURN( auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value))); auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0); - auto real_result = - ir_builder_->CreateFSub(ir_builder_->CreateFMul(exp_a, cos_b), one); - auto imag_result = ir_builder_->CreateFMul(exp_a, sin_b); + auto real_result = FSub(FMul(exp_a, cos_b), one); + auto imag_result = FMul(exp_a, sin_b); return EmitComposeComplex(op, real_result, imag_result); } case HloOpcode::kCos: { @@ -569,18 +554,13 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); auto type = a->getType(); TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b)); - auto half_exp_b = - ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b); - auto half_exp_neg_b = - ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_b = FMul(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_neg_b = FDiv(llvm::ConstantFP::get(type, 0.5), exp_b); TF_ASSIGN_OR_RETURN(auto cos_a, 
EmitCos(component_type, a)); TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a)); - return EmitComposeComplex( - op, - ir_builder_->CreateFMul( - cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)), - ir_builder_->CreateFMul( - sin_a, ir_builder_->CreateFSub(half_exp_neg_b, half_exp_b))); + return EmitComposeComplex(op, + FMul(cos_a, FAdd(half_exp_neg_b, half_exp_b)), + FMul(sin_a, FSub(half_exp_neg_b, half_exp_b))); } case HloOpcode::kSin: { // sin(z) = .5i(e^(-iz) - e^(iz)) @@ -596,18 +576,13 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( auto b = EmitExtractImag(operand_value); auto type = a->getType(); TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b)); - auto half_exp_b = - ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b); - auto half_exp_neg_b = - ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_b = FMul(llvm::ConstantFP::get(type, 0.5), exp_b); + auto half_exp_neg_b = FDiv(llvm::ConstantFP::get(type, 0.5), exp_b); TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a)); TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a)); - return EmitComposeComplex( - op, - ir_builder_->CreateFMul( - sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)), - ir_builder_->CreateFMul( - cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b))); + return EmitComposeComplex(op, + FMul(sin_a, FAdd(half_exp_b, half_exp_neg_b)), + FMul(cos_a, FSub(half_exp_b, half_exp_neg_b))); } case HloOpcode::kTanh: { /* @@ -635,77 +610,63 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a)); TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b)); TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b)); - auto exp_neg_a = ir_builder_->CreateFDiv( - llvm::ConstantFP::get(exp_a->getType(), 1), exp_a); - auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub( - ir_builder_->CreateFMul(exp_a, exp_a), - 
ir_builder_->CreateFMul(exp_neg_a, exp_neg_a)); - auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b); - auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b); - auto real_num = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a), - ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a)); - auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b); - auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a); + auto exp_neg_a = FDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a); + auto exp_2a_minus_exp_neg_2a = + FSub(FMul(exp_a, exp_a), FMul(exp_neg_a, exp_neg_a)); + auto cos_b_sq = FMul(cos_b, cos_b); + auto sin_b_sq = FMul(sin_b, sin_b); + auto real_num = FAdd(FMul(cos_b_sq, exp_2a_minus_exp_neg_2a), + FMul(sin_b_sq, exp_2a_minus_exp_neg_2a)); + auto cos_b_sin_b = FMul(cos_b, sin_b); + auto exp_a_plus_exp_neg_a = FAdd(exp_a, exp_neg_a); auto exp_a_plus_exp_neg_a_sq = - ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a); - auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a); + FMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a); + auto exp_a_minus_exp_neg_a = FSub(exp_a, exp_neg_a); auto exp_a_minus_exp_neg_a_sq = - ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a); - auto imag_num = ir_builder_->CreateFMul( - cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq, - exp_a_minus_exp_neg_a_sq)); - auto denom = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq), - ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq)); - return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom), - ir_builder_->CreateFDiv(imag_num, denom)); + FMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a); + auto imag_num = FMul( + cos_b_sin_b, FSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq)); + auto denom = FAdd(FMul(cos_b_sq, exp_a_plus_exp_neg_a_sq), + FMul(sin_b_sq, exp_a_minus_exp_neg_a_sq)); + return 
EmitComposeComplex(op, FDiv(real_num, denom), + FDiv(imag_num, denom)); } case HloOpcode::kAbs: { - auto sum_sq = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(operand_value), - EmitExtractReal(operand_value)), - ir_builder_->CreateFMul(EmitExtractImag(operand_value), - EmitExtractImag(operand_value))); + auto sum_sq = FAdd( + FMul(EmitExtractReal(operand_value), EmitExtractReal(operand_value)), + FMul(EmitExtractImag(operand_value), EmitExtractImag(operand_value))); return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {sum_sq}, - {sum_sq->getType()}, ir_builder_); + {sum_sq->getType()}, b_); } case HloOpcode::kSign: { // Sign(c) = c / |c| - auto sum_sq = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(operand_value), - EmitExtractReal(operand_value)), - ir_builder_->CreateFMul(EmitExtractImag(operand_value), - EmitExtractImag(operand_value))); + auto sum_sq = FAdd( + FMul(EmitExtractReal(operand_value), EmitExtractReal(operand_value)), + FMul(EmitExtractImag(operand_value), EmitExtractImag(operand_value))); auto cplx_abs = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, ir_builder_); + llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, b_); auto type = cplx_abs->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = ir_builder_->CreateFCmpOEQ(cplx_abs, zero); - return ir_builder_->CreateSelect( + auto oeq = FCmpOEQ(cplx_abs, zero); + return Select( oeq, EmitComposeComplex(op, zero, zero), - EmitComposeComplex( - op, - ir_builder_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs), - ir_builder_->CreateFDiv(EmitExtractImag(operand_value), - cplx_abs))); + EmitComposeComplex(op, FDiv(EmitExtractReal(operand_value), cplx_abs), + FDiv(EmitExtractImag(operand_value), cplx_abs))); } case HloOpcode::kNegate: - return EmitComposeComplex( - op, ir_builder_->CreateFNeg(EmitExtractReal(operand_value)), - ir_builder_->CreateFNeg(EmitExtractImag(operand_value))); + return 
EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)), + FNeg(EmitExtractImag(operand_value))); case HloOpcode::kReal: return EmitExtractReal(operand_value); case HloOpcode::kImag: return EmitExtractImag(operand_value); default: return Unimplemented("unary complex op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } StatusOr ElementalIrEmitter::EmitBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { PrimitiveType operand_type = op->operand(0)->shape().element_type(); if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) || operand_type == PRED) { @@ -720,21 +681,20 @@ StatusOr ElementalIrEmitter::EmitBinaryOp( } StatusOr ElementalIrEmitter::EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { switch (op->opcode()) { case HloOpcode::kComplex: return EmitComposeComplex(op, lhs_value, rhs_value); case HloOpcode::kAdd: - return ir_builder_->CreateFAdd(lhs_value, rhs_value); + return FAdd(lhs_value, rhs_value); case HloOpcode::kSubtract: - return ir_builder_->CreateFSub(lhs_value, rhs_value); + return FSub(lhs_value, rhs_value); case HloOpcode::kMultiply: - return ir_builder_->CreateFMul(lhs_value, rhs_value); + return FMul(lhs_value, rhs_value); case HloOpcode::kDivide: - return ir_builder_->CreateFDiv(lhs_value, rhs_value); + return FDiv(lhs_value, rhs_value); case HloOpcode::kRemainder: - return ir_builder_->CreateFRem(lhs_value, rhs_value); + return FRem(lhs_value, rhs_value); // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered // comparisons always return false when one of the operands is NaN, whereas // unordered comparisons return true. @@ -744,22 +704,22 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( // matches C++'s semantics. 
case HloOpcode::kEq: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kNe: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kLt: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kGt: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGT, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kLe: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLE, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kGe: return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGE, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kMaximum: return EmitFloatMax(lhs_value, rhs_value); @@ -771,74 +731,52 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( return EmitAtan2(op->shape().element_type(), lhs_value, rhs_value); default: return Unimplemented("binary floating point op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } StatusOr ElementalIrEmitter::EmitComplexBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { switch (op->opcode()) { case HloOpcode::kAdd: return EmitComposeComplex( - op, - ir_builder_->CreateFAdd(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFAdd(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); + op, FAdd(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FAdd(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); case HloOpcode::kSubtract: return EmitComposeComplex( - op, - ir_builder_->CreateFSub(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFSub(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); + op, 
FSub(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FSub(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); case HloOpcode::kMultiply: return EmitComposeComplex( op, - ir_builder_->CreateFSub( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))), - ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value)))); + FSub(FMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))), + FAdd(FMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value)))); case HloOpcode::kDivide: { // (a+bi) / (c+di) = ((a+bi)(c-di)) / ((c+di)(c-di)) // = ((ac + bd) + (bc - ad)i) / (c^2 + d^2) - auto rhs_sum_sq = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(rhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(rhs_value), - EmitExtractImag(rhs_value))); + auto rhs_sum_sq = + FAdd(FMul(EmitExtractReal(rhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(rhs_value), EmitExtractImag(rhs_value))); auto type = rhs_sum_sq->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); - auto oeq = ir_builder_->CreateFCmpOEQ(rhs_sum_sq, zero); - auto real_inf_or_nan = - ir_builder_->CreateFDiv(EmitExtractReal(lhs_value), zero); - auto imag_inf_or_nan = - ir_builder_->CreateFDiv(EmitExtractImag(lhs_value), zero); - return ir_builder_->CreateSelect( + auto oeq = FCmpOEQ(rhs_sum_sq, zero); + auto real_inf_or_nan = FDiv(EmitExtractReal(lhs_value), zero); + auto imag_inf_or_nan = FDiv(EmitExtractImag(lhs_value), zero); + return Select( oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan), - EmitComposeComplex( - op, - ir_builder_->CreateFDiv( - 
ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))), - rhs_sum_sq), - ir_builder_->CreateFDiv( - ir_builder_->CreateFSub( - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value))), - rhs_sum_sq))); + EmitComposeComplex(op, + FDiv(FAdd(FMul(EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value))), + rhs_sum_sq), + FDiv(FSub(FMul(EmitExtractImag(lhs_value), + EmitExtractReal(rhs_value)), + FMul(EmitExtractReal(lhs_value), + EmitExtractImag(rhs_value))), + rhs_sum_sq))); } // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered // comparisons always return false when one of the operands is NaN, whereas @@ -848,21 +786,19 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( // unordered comparison. This makes x != y equivalent to !(x == y), and // matches C++'s semantics. 
case HloOpcode::kEq: - return ir_builder_->CreateAnd( - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, - EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value), ir_builder_), - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, - EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value), ir_builder_)); + return And(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, + EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value), b_), + llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, + EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value), b_)); case HloOpcode::kNe: - return ir_builder_->CreateOr( - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, - EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value), ir_builder_), - llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, - EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value), ir_builder_)); + return Or(llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, + EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value), b_), + llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, + EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value), b_)); case HloOpcode::kPower: { // (a+bi)^(c+di) = @@ -874,48 +810,43 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( auto b = EmitExtractImag(lhs_value); auto c = EmitExtractReal(rhs_value); auto d = EmitExtractImag(rhs_value); - auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a), - ir_builder_->CreateFMul(b, b)); + auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b)); auto one_half = llvm::ConstantFP::get(a->getType(), 0.5); - auto half_c = ir_builder_->CreateFMul(one_half, c); + auto half_c = FMul(one_half, c); TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c, EmitPow(component_type, aa_p_bb, half_c)); - auto neg_d = ir_builder_->CreateFNeg(d); + auto neg_d = FNeg(d); TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a)); - auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs); + auto neg_d_arg_lhs = FMul(neg_d, arg_lhs); TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs, 
EmitExp(component_type, neg_d_arg_lhs)); - auto coeff = - ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs); + auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs); TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb)); - auto half_d = ir_builder_->CreateFMul(one_half, d); - auto q = - ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs), - ir_builder_->CreateFMul(half_d, ln_aa_p_bb)); + auto half_d = FMul(one_half, d); + auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb)); TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q)); TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q)); - return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q), - ir_builder_->CreateFMul(coeff, sin_q)); + return EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)); } default: return Unimplemented("binary complex op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, - llvm::Value* rhs_value) const { - return llvm_ir::EmitFloatMax(lhs_value, rhs_value, ir_builder_); + llvm::Value* rhs_value) { + return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_); } llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, - llvm::Value* rhs_value) const { - return llvm_ir::EmitFloatMin(lhs_value, rhs_value, ir_builder_); + llvm::Value* rhs_value) { + return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_); } StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, - llvm::Value* x) const { + llvm::Value* x) { if (prim_type != F32) { // TODO(b/34339814): Implement inverse erf for F64. 
return Unimplemented( @@ -923,15 +854,14 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, "type F32."); } auto getFloat = [&](const float f) { - return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f); + return llvm::ConstantFP::get(b_->getFloatTy(), f); }; - auto multiply_add = [&](tensorflow::gtl::ArraySlice coefficients, + auto multiply_add = [&](absl::Span coefficients, llvm::Value* w) { llvm::Value* p = getFloat(coefficients.front()); - coefficients.pop_front(); + coefficients.remove_prefix(1); for (float coefficient : coefficients) { - p = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(p, w), - getFloat(coefficient)); + p = FAdd(FMul(p, w), getFloat(coefficient)); } return p; }; @@ -949,108 +879,101 @@ StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, // } // return p*x llvm::Function* logf_fn = llvm::Intrinsic::getDeclaration( - module_, llvm::Intrinsic::log, {ir_builder_->getFloatTy()}); + module_, llvm::Intrinsic::log, {b_->getFloatTy()}); - llvm::Value* w = ir_builder_->CreateFNeg(ir_builder_->CreateCall( - logf_fn, - {ir_builder_->CreateFMul(ir_builder_->CreateFSub(getFloat(1.0f), x), - ir_builder_->CreateFAdd(getFloat(1.0f), x))})); + llvm::Value* w = FNeg( + Call(logf_fn, {FMul(FSub(getFloat(1.0f), x), FAdd(getFloat(1.0f), x))})); - llvm::Value* p_addr = llvm_ir::EmitAllocaAtFunctionEntry( - ir_builder_->getFloatTy(), "p.addr", ir_builder_); + llvm::Value* p_addr = + llvm_ir::EmitAllocaAtFunctionEntry(b_->getFloatTy(), "p.addr", b_); - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(ir_builder_->CreateFCmpOLT(w, getFloat(5.0f)), - "w_less_than_five", ir_builder_); + llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( + FCmpOLT(w, getFloat(5.0f)), "w_less_than_five", b_); // Handle true BB. 
- SetToFirstInsertPoint(if_data.true_block, ir_builder_); + SetToFirstInsertPoint(if_data.true_block, b_); { - llvm::Value* lw = ir_builder_->CreateFSub(w, getFloat(2.5f)); - tensorflow::gtl::ArraySlice lq{ + llvm::Value* lw = FSub(w, getFloat(2.5f)); + absl::Span lq{ 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, -4.39150654e-06f, 0.00021858087f, -0.00125372503f, -0.00417768164f, 0.246640727f, 1.50140941f}; llvm::Value* p = multiply_add(lq, lw); - ir_builder_->CreateStore(p, p_addr); + Store(p, p_addr); } // Handle false BB. - SetToFirstInsertPoint(if_data.false_block, ir_builder_); + SetToFirstInsertPoint(if_data.false_block, b_); { llvm::Function* sqrtf_fn = llvm::Intrinsic::getDeclaration( - module_, llvm::Intrinsic::sqrt, {ir_builder_->getFloatTy()}); + module_, llvm::Intrinsic::sqrt, {b_->getFloatTy()}); - llvm::Value* gw = ir_builder_->CreateFSub( - ir_builder_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f)); - tensorflow::gtl::ArraySlice gq{ + llvm::Value* gw = FSub(Call(sqrtf_fn, w), getFloat(3.0f)); + absl::Span gq{ -0.000200214257f, 0.000100950558f, 0.00134934322f, -0.00367342844f, 0.00573950773f, -0.0076224613f, 0.00943887047f, 1.00167406f, 2.83297682f}; llvm::Value* p = multiply_add(gq, gw); - ir_builder_->CreateStore(p, p_addr); + Store(p, p_addr); } - SetToFirstInsertPoint(if_data.after_block, ir_builder_); - llvm::Value* p = ir_builder_->CreateLoad(p_addr); - return ir_builder_->CreateFMul(p, x); + SetToFirstInsertPoint(if_data.after_block, b_); + llvm::Value* p = Load(p_addr); + return FMul(p, x); } -StatusOr ElementalIrEmitter::EmitErfcInv( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr ElementalIrEmitter::EmitErfcInv(PrimitiveType prim_type, + llvm::Value* value) { // Compute erfcinv(value) by calculating erfinv(1.0 - value). 
auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); - return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value)); + return EmitErfInv(prim_type, FSub(one, value)); } StatusOr ElementalIrEmitter::EmitLog(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value}, - {value->getType()}, ir_builder_); + {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { auto x = value; auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); auto negative_half = llvm::ConstantFP::get(type, -0.5); // When x is large, the naive evaluation of ln(x + 1) is more // accurate than the Taylor series. - TF_ASSIGN_OR_RETURN(auto for_large_x, - EmitLog(prim_type, ir_builder_->CreateFAdd(x, one))); + TF_ASSIGN_OR_RETURN(auto for_large_x, EmitLog(prim_type, FAdd(x, one))); // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + …. 
- auto for_small_x = ir_builder_->CreateFMul( - ir_builder_->CreateFAdd(ir_builder_->CreateFMul(negative_half, x), one), - x); + auto for_small_x = FMul(FAdd(FMul(negative_half, x), one), x); const auto kAntilogarithmIsSmallThreshold = 1e-4; - auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, - {type}, ir_builder_); - auto x_is_small = ir_builder_->CreateFCmpOLT( + auto abs_x = + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); + auto x_is_small = FCmpOLT( abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold)); - return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x); + return Select(x_is_small, for_small_x, for_large_x); } StatusOr ElementalIrEmitter::EmitSin(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value}, - {value->getType()}, ir_builder_); + {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitCos(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value}, - {value->getType()}, ir_builder_); + {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitExp(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value}, - {value->getType()}, ir_builder_); + {value->getType()}, b_); } StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const { + llvm::Value* value) { auto x = value; auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); auto one = llvm::ConstantFP::get(type, 1.0); @@ -1058,44 +981,48 @@ StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, // When the exponent is large, the naive evaluation of e^(x) - 1 is more // accurate than the Taylor series. 
TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value)); - auto for_large_x = ir_builder_->CreateFSub(exp_x, one); + auto for_large_x = FSub(exp_x, one); // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + …. // We want exp(x)-1 which is x + x^2/2 + x^3/6 + …. - auto x_squared = ir_builder_->CreateFAdd(x, x); - auto x_squared_over_two = ir_builder_->CreateFMul(x_squared, half); - auto for_small_x = ir_builder_->CreateFAdd(x, x_squared_over_two); + auto x_squared = FAdd(x, x); + auto x_squared_over_two = FMul(x_squared, half); + auto for_small_x = FAdd(x, x_squared_over_two); const auto kExponentIsSmallThreshold = 1e-5; - auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, - {type}, ir_builder_); - auto x_is_small = ir_builder_->CreateFCmpOLT( - abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold)); - return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x); + auto abs_x = + llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_); + auto x_is_small = + FCmpOLT(abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold)); + return Select(x_is_small, for_small_x, for_large_x); } StatusOr ElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs}, - {lhs->getType()}, ir_builder_); + {lhs->getType()}, b_); } StatusOr ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return Unimplemented("atan2"); } +StatusOr ElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) { + return Unimplemented("tanh"); +} + StatusOr ElementalIrEmitter::EmitReducePrecision( - const HloInstruction* hlo, llvm::Value* x) const { + const HloInstruction* hlo, llvm::Value* x) { if (hlo->operand(0)->shape().element_type() != F32) { return Unimplemented("reduce-precision only implemented for F32"); } return 
EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(), - /*mantissa_bits=*/hlo->mantissa_bits(), - ir_builder_); + /*mantissa_bits=*/hlo->mantissa_bits(), b_); } -static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder, +static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs, llvm::Value* shift_result, bool saturate_to_sign_bit) { @@ -1108,64 +1035,145 @@ static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder, llvm::ConstantInt* minus_one = llvm::ConstantInt::get(integer_type, -1); llvm::Value* saturated_value; if (saturate_to_sign_bit) { - saturated_value = ir_builder->CreateSelect( - ir_builder->CreateICmpSLT(lhs, zero), minus_one, zero); + saturated_value = + b->CreateSelect(b->CreateICmpSLT(lhs, zero), minus_one, zero); } else { saturated_value = zero; } llvm::Value* shift_amt_in_range = - ir_builder->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk"); - return ir_builder->CreateSelect(shift_amt_in_range, shift_result, - saturated_value); + b->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk"); + return b->CreateSelect(shift_amt_in_range, shift_result, saturated_value); +} + +llvm::Value* ElementalIrEmitter::GetOne(llvm::Type* type) { + return llvm::ConstantInt::get(llvm::cast(type), 1); +} + +llvm::Value* ElementalIrEmitter::GetZero(llvm::Type* type) { + return llvm::ConstantInt::get(llvm::cast(type), 0); +} + +llvm::Value* ElementalIrEmitter::GetIntSMin(llvm::Type* type) { + auto* integer_type = llvm::cast(type); + return llvm::ConstantInt::get(integer_type, llvm::APInt::getSignedMinValue( + integer_type->getBitWidth())); +} + +llvm::Value* ElementalIrEmitter::GetMinusOne(llvm::Type* type) { + auto* integer_type = llvm::cast(type); + return llvm::ConstantInt::get( + integer_type, llvm::APInt::getAllOnesValue(integer_type->getBitWidth())); +} + +llvm::Value* ElementalIrEmitter::IsZero(llvm::Value* v) { + return ICmpEQ(v, 
llvm::ConstantInt::get(v->getType(), 0)); +} + +llvm::Value* ElementalIrEmitter::IsIntMinDivisionOverflow(llvm::Value* lhs, + llvm::Value* rhs) { + return And(ICmpEQ(lhs, GetIntSMin(lhs->getType())), + ICmpEQ(rhs, GetMinusOne(rhs->getType()))); +} + +llvm::Value* ElementalIrEmitter::EmitIntegerDivide(llvm::Value* lhs, + llvm::Value* rhs, + bool is_signed) { + // Integer division overflow behavior: + // + // X / 0 == -1 + // INT_SMIN /s -1 = INT_SMIN + + if (!is_signed) { + llvm::Value* udiv_is_unsafe = IsZero(rhs); + llvm::Value* safe_rhs = Select(udiv_is_unsafe, GetOne(lhs->getType()), rhs); + llvm::Value* safe_div = UDiv(lhs, safe_rhs); + return Select(udiv_is_unsafe, GetMinusOne(lhs->getType()), safe_div); + } + + llvm::Value* has_zero_divisor = IsZero(rhs); + llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs); + llvm::Value* sdiv_is_unsafe = Or(has_int_min_overflow, has_zero_divisor); + llvm::Value* safe_rhs = Select(sdiv_is_unsafe, GetOne(lhs->getType()), rhs); + llvm::Value* safe_div = SDiv(lhs, safe_rhs); + + return Select( + has_zero_divisor, GetMinusOne(lhs->getType()), + Select(has_int_min_overflow, GetIntSMin(lhs->getType()), safe_div)); +} + +llvm::Value* ElementalIrEmitter::EmitIntegerRemainder(llvm::Value* lhs, + llvm::Value* rhs, + bool is_signed) { + // Integer remainder overflow behavior: + // + // X % 0 == X + // INT_SMIN %s -1 = 0 + + if (!is_signed) { + llvm::Value* urem_is_unsafe = IsZero(rhs); + llvm::Value* safe_rhs = Select(urem_is_unsafe, GetOne(lhs->getType()), rhs); + llvm::Value* safe_rem = URem(lhs, safe_rhs); + return Select(urem_is_unsafe, lhs, safe_rem); + } + + llvm::Value* has_zero_divisor = IsZero(rhs); + llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs); + llvm::Value* srem_is_unsafe = Or(has_int_min_overflow, has_zero_divisor); + llvm::Value* safe_rhs = Select(srem_is_unsafe, GetOne(lhs->getType()), rhs); + llvm::Value* safe_rem = SRem(lhs, safe_rhs); + + return Select( + 
has_zero_divisor, lhs, + Select(has_int_min_overflow, GetZero(lhs->getType()), safe_rem)); } StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { + bool is_signed) { switch (op->opcode()) { // TODO(jingyue): add the "nsw" attribute for signed types. case HloOpcode::kAdd: - return ir_builder_->CreateAdd(lhs_value, rhs_value); + return Add(lhs_value, rhs_value); case HloOpcode::kSubtract: - return ir_builder_->CreateSub(lhs_value, rhs_value); + return Sub(lhs_value, rhs_value); case HloOpcode::kMultiply: - return ir_builder_->CreateMul(lhs_value, rhs_value); + return Mul(lhs_value, rhs_value); case HloOpcode::kDivide: - return is_signed ? ir_builder_->CreateSDiv(lhs_value, rhs_value) - : ir_builder_->CreateUDiv(lhs_value, rhs_value); + return EmitIntegerDivide(lhs_value, rhs_value, is_signed); case HloOpcode::kRemainder: - return is_signed ? ir_builder_->CreateSRem(lhs_value, rhs_value) - : ir_builder_->CreateURem(lhs_value, rhs_value); + return EmitIntegerRemainder(lhs_value, rhs_value, is_signed); case HloOpcode::kEq: return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kNe: return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_NE, lhs_value, - rhs_value, ir_builder_); + rhs_value, b_); case HloOpcode::kLt: return llvm_ir::EmitComparison( is_signed ? llvm::CmpInst::ICMP_SLT : llvm::CmpInst::ICMP_ULT, - lhs_value, rhs_value, ir_builder_); + lhs_value, rhs_value, b_); case HloOpcode::kGt: return llvm_ir::EmitComparison( is_signed ? llvm::CmpInst::ICMP_SGT : llvm::CmpInst::ICMP_UGT, - lhs_value, rhs_value, ir_builder_); + lhs_value, rhs_value, b_); case HloOpcode::kLe: return llvm_ir::EmitComparison( is_signed ? llvm::CmpInst::ICMP_SLE : llvm::CmpInst::ICMP_ULE, - lhs_value, rhs_value, ir_builder_); + lhs_value, rhs_value, b_); case HloOpcode::kGe: return llvm_ir::EmitComparison( is_signed ? 
llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE, - lhs_value, rhs_value, ir_builder_); + lhs_value, rhs_value, b_); case HloOpcode::kMinimum: return EmitIntegralMin(lhs_value, rhs_value, is_signed); case HloOpcode::kMaximum: return EmitIntegralMax(lhs_value, rhs_value, is_signed); case HloOpcode::kAnd: - return ir_builder_->CreateAnd(lhs_value, rhs_value); + return And(lhs_value, rhs_value); case HloOpcode::kOr: - return ir_builder_->CreateOr(lhs_value, rhs_value); + return Or(lhs_value, rhs_value); + case HloOpcode::kXor: + return Xor(lhs_value, rhs_value); // Shifting out bits >= the number of bits in the type being shifted // produces a poison value in LLVM which is basically "deferred undefined @@ -1173,260 +1181,342 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( // UB. We replace the poison value with a constant to avoid this deferred // UB. case HloOpcode::kShiftRightArithmetic: - return SaturateShiftIfNecessary( - ir_builder_, lhs_value, rhs_value, - ir_builder_->CreateAShr(lhs_value, rhs_value), - /*saturate_to_sign_bit=*/true); + return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, + AShr(lhs_value, rhs_value), + /*saturate_to_sign_bit=*/true); case HloOpcode::kShiftLeft: - return SaturateShiftIfNecessary( - ir_builder_, lhs_value, rhs_value, - ir_builder_->CreateShl(lhs_value, rhs_value), - /*saturate_to_sign_bit=*/false); + return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, + Shl(lhs_value, rhs_value), + /*saturate_to_sign_bit=*/false); case HloOpcode::kShiftRightLogical: - return SaturateShiftIfNecessary( - ir_builder_, lhs_value, rhs_value, - ir_builder_->CreateLShr(lhs_value, rhs_value), - /*saturate_to_sign_bit=*/false); + return SaturateShiftIfNecessary(b_, lhs_value, rhs_value, + LShr(lhs_value, rhs_value), + /*saturate_to_sign_bit=*/false); default: return Unimplemented("binary integer op '%s'", - HloOpcodeString(op->opcode()).c_str()); + HloOpcodeString(op->opcode())); } } llvm::Value* 
ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, - lhs_value, rhs_value), - lhs_value, rhs_value); + bool is_signed) { + return Select(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE + : llvm::ICmpInst::ICMP_UGE, + lhs_value, rhs_value), + lhs_value, rhs_value); } llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const { - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, - lhs_value, rhs_value), - lhs_value, rhs_value); + bool is_signed) { + return Select(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE + : llvm::ICmpInst::ICMP_ULE, + lhs_value, rhs_value), + lhs_value, rhs_value); } llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, - int64 operand_no) const { + int64 operand_no) { CHECK(hlo.IsElementwise()) << "HLO " << hlo.ToString() << " is not elementwise."; const Shape& operand_shape = hlo.operand(operand_no)->shape(); // If the operand is scalar, the source index is always {}. if (ShapeUtil::IsScalar(operand_shape)) { - return llvm_ir::IrArray::Index(); + return llvm_ir::IrArray::Index(target_index.GetType()); } // If no implicit broadcast is needed for this operand, returns the target // index as the source index. - if (ShapeUtil::CompatibleIgnoringElementType(operand_shape, hlo.shape())) { + // + // `IrArray::Index` may contain a physical linear which we can propagate to + // our operand only if our layouts match. "only if" is a bit strong since + // e.g. 
we can still forward the linear index if the operand shape is + // [5,1,1,5]{3,2,1,0} and the HLO shape is[5,1,1,5]{3,1,2,0}, but those cases + // are probably not worth handling here for now. + if (ShapeUtil::CompatibleIgnoringElementType(operand_shape, hlo.shape()) && + LayoutUtil::Equal(operand_shape.layout(), hlo.shape().layout())) { return target_index; } // If implicit broadcast is needed, the source dimensions that are broadcast // have index 0. CHECK_EQ(ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(hlo.shape())); - llvm_ir::IrArray::Index source_index; + llvm_ir::IrArray::Index source_index(target_index.GetType()); for (int64 i = 0; i < ShapeUtil::Rank(hlo.shape()); ++i) { if (hlo.shape().dimensions(i) == operand_shape.dimensions(i)) { source_index.push_back(target_index[i]); } else { CHECK_EQ(1, operand_shape.dimensions(i)); - source_index.push_back(ir_builder_->getInt64(0)); + source_index.push_back(target_index.GetConstantWithIndexType(0)); } } return source_index; } -llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( +StatusOr ElementalIrEmitter::ConvertValueForDistribution( const HloInstruction* hlo, - const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) - const { - PrimitiveType param_prim_type = hlo->operand(0)->shape().element_type(); - llvm::Type* param_ir_type = - llvm_ir::PrimitiveTypeToIrType(param_prim_type, module_); - - // Same values as PCG library - // https://github.com/imneme/pcg-c/blob/master/include/pcg_variants.h - llvm::Value* multiplier = ir_builder_->getInt( - llvm::APInt(128, {0x4385DF649FCCF645, 0x2360ED051FC65DA4})); - llvm::Value* increment = ir_builder_->getInt( - llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D})); - - auto random_value_from_hlo = [hlo]() { - const HloModule* module = - hlo->IsFused() ? 
hlo->parent()->FusionInstruction()->parent()->parent() - : hlo->parent()->parent(); - return module->RandomNew64(); - }; + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) { + TF_ASSIGN_OR_RETURN(llvm::Value * a_or_mean, + operand_to_generator.at(hlo->operand(0))(index)); + TF_ASSIGN_OR_RETURN(llvm::Value * b_or_sigma, + operand_to_generator.at(hlo->operand(1))(index)); + PrimitiveType elem_prim_ty = hlo->shape().element_type(); + llvm::Type* elem_ir_ty = + llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_); + llvm::Type* raw_value_ty = raw_value->getType(); + + // Convert raw integer to float in range [0, 1) if the element is a float. + llvm::Value* elem_value = raw_value; + if (elem_ir_ty->isFloatingPointTy()) { + unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits(); + CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64); + // Perform the division using the float type with the same number of bits + // as the raw value to avoid overflow. + if (raw_value_size_in_bits == 32) { + elem_value = UIToFP(elem_value, b_->getFloatTy()); + elem_value = FDiv(elem_value, + llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32))); + } else { + elem_value = UIToFP(elem_value, b_->getDoubleTy()); + elem_value = FDiv( + elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64))); + } + + if (elem_ir_ty != elem_value->getType()) { + elem_value = FPTrunc(elem_value, elem_ir_ty); + } + } - // Seed each RNG emitter with a new 64-bit seed from the HloModule. If the - // compilation order is deterministic (i.e., RandomNew64 invocation order is - // deterministic), then the order of RNG is deterministic for a given seed and - // hence tests will be deterministic. 
- // If the user provides a global seed instruction then we only use 64-bits of - // the host's random number generator to seed the 128 bit value with the other - // 64-bits is due to a user specified global seed instruction. - // Create a GlobalVariable to maintain state between invocations. There is a - // bug in NVPTX with GlobalVariable and 128 bit values, so using 2 64-bit + // Convert the value for the requested distribution. + switch (hlo->random_distribution()) { + case RNG_UNIFORM: { + if (elem_ir_ty->isFloatingPointTy()) { + return FAdd(FMul(FSub(b_or_sigma, a_or_mean), elem_value), a_or_mean); + } else { + // To generate a uniform random value in [a, b) from a raw random sample + // in range [0, 2^N), we let range = b - a and return + // (a + raw_value % range). If range is not a power of 2, raw values + // larger than (2^N - 2^N % range) are biased toward results in + // [a, a + (limit % range)). An unbiased algorithm would need to drop + // raw values and re-sample, but we don't do this because re-sampling in + // an efficient way is complex, and it's not clear that users need it. + // In particular, if one thread in a GPU warp needs to re-sample, we pay + // the same cost as if the whole warp were to re-sample. So an + // efficient re-sampling implementation on GPU would need to do + // nontrivial work to share entropy between threads in the warp. 
+ auto range = Sub(b_or_sigma, a_or_mean); + return Add(a_or_mean, URem(elem_value, range)); + } + } + case RNG_NORMAL: { + TF_ASSIGN_OR_RETURN( + llvm::Value * r, + EmitErfcInv(elem_prim_ty, FMul(llvm::ConstantFP::get(elem_ir_ty, 2.0), + elem_value))); + return FAdd(FMul(r, b_or_sigma), a_or_mean); + } + default: + return InvalidArgument( + "unhandled distribution %s", + RandomDistribution_Name(hlo->random_distribution())); + } +} + +namespace { + +// Checks that the primitive type is supported by the elemental IR emitter for +// Philox RNG and returns the number of elements in each 128 bit sample of the +// Philox RNG algorithm. +int32 GetNumberOfElementsPerPhiloxRngSample(PrimitiveType elem_prim_ty) { + // Calculate the number of elements, that is the number of random numbers, in + // a 128 bit sample. + switch (elem_prim_ty) { + case U32: + case S32: + case F32: + // The algorithm uses 32 bits to generate values for F16. + case F16: + return 4; + case U64: + case S64: + case F64: + return 2; + default: + // BF16 is converted to F16 by the hlo pass HloElementTypeConverter. + // Other data types are not supported by XLA random operation. + LOG(FATAL) << "Unrecognized primitive type for RNG " << elem_prim_ty; + } + return 0; +} + +// Calculates the four uint32 values for the 128-bit Philox sample. +std::array CalculateSampleValues( + llvm::Value* sample_idx, llvm::Value* hlo_random_value, + llvm::Value* global_random_number, llvm::Value* rng_state, + llvm::IRBuilder<>* b) { + llvm::Type* index_ty = sample_idx->getType(); + + std::array counter_values; + + // Use the sample index to initialize counter[0] and counter[1]. 
+ unsigned index_ty_size_in_bits = index_ty->getPrimitiveSizeInBits(); + CHECK(index_ty_size_in_bits == 32 || index_ty_size_in_bits == 64); + if (index_ty_size_in_bits == 32) { + counter_values[0] = sample_idx; + counter_values[1] = b->getInt32(0); + } else { + std::tie(counter_values[0], counter_values[1]) = + llvm_ir::SplitInt64ToInt32s(b, sample_idx); + } + + // Xor the global state variable with the global random number seed and use + // the result to initialize counter[2] and counter[3]. + std::tie(counter_values[2], counter_values[3]) = llvm_ir::SplitInt64ToInt32s( + b, b->CreateXor(rng_state, global_random_number)); + + // The algorithm uses a 64 bit key, which is also interpreted as two uint32 // values. - llvm::GlobalVariable* state_ptr0 = new llvm::GlobalVariable( - /*M=*/*module_, - /*Ty=*/ir_builder_->getInt64Ty(), - /*isConstant=*/false, - /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()), - /*Name=*/"state_ptr0"); - - // When the module config seed is 0, the expected result of a prng is a random - // value. Instead of using the random_value_from_hlo, we need a global random - // value as the graph seed. This is because if we use random_value_from_hlo - // here, then for a newly built hlo graph, it always gives the same number. - uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed() - : GlobalRandomValue(); - llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable( - /*M=*/*module_, - /*Ty=*/ir_builder_->getInt64Ty(), - /*isConstant=*/false, - /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/ir_builder_->getInt64(graph_seed), - /*Name=*/"state_ptr1"); - - // We want each thread to use its own stream, so we modify the increment per - // thread. We want the increment to remain odd, so we shift the thread id left - // 1 and add it to the increment. 
- increment = ir_builder_->CreateAdd(increment, - ir_builder_->CreateShl(EmitThreadId(), 1)); - - // PCG-XSL-RR algorithm - // http://www.pcg-random.org/pdf/toms-oneill-pcg-family-v1.02.pdf - // state = multiplier * state + increment - // return uint64_t(state ^ (state >> 64))) >>> (state >> 122) - // where ">>>" is bitwise rotation - auto get_next_i64 = [=]() { - llvm::Value* state0 = ir_builder_->CreateZExtOrTrunc( - ir_builder_->CreateLoad(state_ptr0, "state0"), - ir_builder_->getInt128Ty()); - llvm::Value* state1 = ir_builder_->CreateShl( - ir_builder_->CreateZExtOrTrunc( - ir_builder_->CreateLoad(state_ptr1, "state1"), - ir_builder_->getInt128Ty()), - 64); - llvm::Value* state = ir_builder_->CreateOr(state0, state1); - llvm::Value* updated = ir_builder_->CreateAdd( - ir_builder_->CreateMul(state, multiplier), increment); - ir_builder_->CreateStore( - ir_builder_->CreateTrunc(updated, ir_builder_->getInt64Ty()), - state_ptr0); - ir_builder_->CreateStore( - ir_builder_->CreateTrunc(ir_builder_->CreateLShr(updated, 64), - ir_builder_->getInt64Ty()), - state_ptr1); - - return llvm_ir::CreateRor( - ir_builder_->CreateTrunc( - ir_builder_->CreateXor(state, ir_builder_->CreateLShr(state, 64)), - ir_builder_->getInt64Ty()), - ir_builder_->CreateTrunc(ir_builder_->CreateLShr(state, 122), - ir_builder_->getInt64Ty()), - ir_builder_); - }; + llvm::Value* key_values[2]; + + // Use a module random number to initialize the key. + std::tie(key_values[0], key_values[1]) = + llvm_ir::SplitInt64ToInt32s(b, hlo_random_value); + + // Prepare the constants used in the Philox RNG Algorithm. + llvm::Value* philoxW32A = b->getInt32(0x9E3779B9); + llvm::Value* philoxW32B = b->getInt32(0xBB67AE85); + llvm::Value* philoxM4xW32A = b->getInt32(0xD2511F53); + llvm::Value* philoxM4xW32B = b->getInt32(0xCD9E8D57); + + // Compute the 128 bit value for the current sample by repeating the + // single round computation and key raising computation for ten times. 
+ for (int round = 0; round < 10; ++round) { + // A single round of computation of the counter values is as follows: + // MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); + // MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); + // counter[0] = hi1 ^ counter[1] ^ key[0]; + // counter[1] = lo1; + // counter[2] = hi0 ^ counter[3] ^ key[1]; + // counter[3] = lo0; + llvm::Value* lo0; + llvm::Value* hi0; + std::tie(lo0, hi0) = + llvm_ir::UMulLowHigh32(b, philoxM4xW32A, counter_values[0]); + llvm::Value* lo1; + llvm::Value* hi1; + std::tie(lo1, hi1) = + llvm_ir::UMulLowHigh32(b, philoxM4xW32B, counter_values[2]); + counter_values[0] = + b->CreateXor(hi1, b->CreateXor(counter_values[1], key_values[0])); + counter_values[1] = lo1; + counter_values[2] = + b->CreateXor(hi0, b->CreateXor(counter_values[3], key_values[1])); + counter_values[3] = lo0; + key_values[0] = b->CreateAdd(key_values[0], philoxW32A); + key_values[1] = b->CreateAdd(key_values[1], philoxW32B); + } - auto get_next_uniform_float = [=]() { - return ir_builder_->CreateFDiv( - ir_builder_->CreateUIToFP(get_next_i64(), param_ir_type), - llvm::ConstantFP::get(param_ir_type, 0x1p64)); - }; + return counter_values; +} + +} // namespace +// Implements the Philox algorithm to generate random numbers in parallel. +// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf +// +// The paper presents a few variants of the Philox algorithm, we picked the +// 4x32_10 version of the algorithm for the following reasons: +// . 4x32 uses 32-bit multiplication which is fast on GPUs. +// . The authors recommend the 10-round variant, and TensorFlow also uses it. +// +// Precondition: the RNG instruction is not fused. 
+llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) { + VLOG(3) << "Using philox RNG algorithm"; + CHECK(!hlo->IsFused()); + // A random number generated by the per module random number generator. + // This ensures that each RNG HLO generates a different random sequence. + llvm::Value* hlo_random_value = b_->getInt64(hlo->GetModule()->RandomNew64()); + // A value specified by the configuration or generated by a global random + // number generator. + llvm::Value* global_random_number = + b_->getInt64(hlo_module_config_.seed() != 0 ? hlo_module_config_.seed() + : GlobalRandomValue()); + + int elems_per_sample = + GetNumberOfElementsPerPhiloxRngSample(hlo->shape().element_type()); + + // Allocate stack storage for the 128 bit sample as four int32. + llvm::Type* int32_ty = b_->getInt32Ty(); + llvm::Value* sample_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount( + int32_ty, /*element_count=*/b_->getInt32(4), "sample", b_); + + // Load the global state variable for the Philox RNG algorithm. + llvm::GlobalVariable* rng_state_ptr = + llvm_ir::GetOrCreateVariableForPhiloxRngState(module_, b_); + llvm::Value* rng_state = Load(rng_state_ptr, "rng_state_value"); + + // Build and return the elemental IR generator to generate a random value for + // the element corresponding to the current thread. + // + // This elemental IR generator computes one sample with multiple random + // numbers but only returns one random number. As a result, neighboring + // threads may calculate the same sample unnecessarily. However, if the + // kernel containing the RNG hlo is unrolled, LLVM is able to optimize away + // the duplicated computation of the same sample. In particular, if the unroll + // factor is a multiplier of elems_per_sample, LLVM is able to completely + // remove such duplicated computation. 
If the unroll factor is a non-trivial + // factor of elems_per_sample, LLVM can only partially remove such duplicated + // computation. return [=](const llvm_ir::IrArray::Index& index) -> StatusOr { - switch (hlo->random_distribution()) { - case RNG_UNIFORM: { - TF_ASSIGN_OR_RETURN(llvm::Value * p, - operand_to_generator.at(hlo->operand(0))(index)); - TF_ASSIGN_OR_RETURN(llvm::Value * q, - operand_to_generator.at(hlo->operand(1))(index)); - if (primitive_util::IsFloatingPointType(param_prim_type)) { - return ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(ir_builder_->CreateFSub(q, p), - get_next_uniform_float()), - p); - } else { - auto r = ir_builder_->CreateSub(q, p); - auto leading_zeros = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(true)}, - {param_ir_type}, ir_builder_); - auto in_block = ir_builder_->GetInsertBlock(); - - // A terminator should be present iff we're emitting code - // into the middle (as opposed to the end) of a basic block. - CHECK_EQ(ir_builder_->GetInsertPoint() == in_block->end(), - in_block->getTerminator() == nullptr); - - llvm::BasicBlock* body_block; - llvm::BasicBlock* out_block; - - if (ir_builder_->GetInsertPoint() == in_block->end()) { - body_block = llvm_ir::CreateBasicBlock( - nullptr, IrName(hlo, "rng_body"), ir_builder_); - out_block = llvm_ir::CreateBasicBlock( - nullptr, IrName(hlo, "rng_out"), ir_builder_); - llvm::BranchInst::Create(body_block, in_block); - } else { - body_block = in_block->splitBasicBlock( - ir_builder_->GetInsertPoint(), "rng_body"); - out_block = body_block->splitBasicBlock( - ir_builder_->GetInsertPoint(), "rng_out"); - body_block->getTerminator()->eraseFromParent(); - } + llvm::Type* index_ty = index.GetType(); + // Calculate the linear element index. 
+ llvm::Value* elem_idx = index.linear(); + if (elem_idx == nullptr) { + elem_idx = index.Linearize(AsInt64Slice(hlo->shape().dimensions()), b_); + } - SetToFirstInsertPoint(body_block, ir_builder_); - auto random = ir_builder_->CreateAnd( - ir_builder_->CreateZExtOrTrunc(get_next_i64(), param_ir_type), - ir_builder_->CreateLShr(llvm::ConstantInt::get(param_ir_type, ~0), - leading_zeros)); - llvm::BranchInst::Create(out_block, body_block, - ir_builder_->CreateICmpULT(random, r), - body_block); - SetToFirstInsertPoint(out_block, ir_builder_); - return ir_builder_->CreateAdd( - p, ir_builder_->CreateSelect( - ir_builder_->CreateICmpEQ(p, q), - llvm::ConstantInt::get(param_ir_type, 0), random)); - } - } - case RNG_NORMAL: { - TF_ASSIGN_OR_RETURN(llvm::Value * m, - operand_to_generator.at(hlo->operand(0))(index)); - TF_ASSIGN_OR_RETURN(llvm::Value * s, - operand_to_generator.at(hlo->operand(1))(index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * r, - EmitErfcInv(param_prim_type, - ir_builder_->CreateFMul( - llvm::ConstantFP::get(param_ir_type, 2.0), - get_next_uniform_float()))); - return ir_builder_->CreateFAdd(ir_builder_->CreateFMul(r, s), m); - } - default: - return InvalidArgument( - "unhandled distribution %s", - RandomDistribution_Name(hlo->random_distribution()).c_str()); + // Calculate the index for the 128 bit sample and the offset of the current + // element within the sample. + llvm::Value* elems_per_sample_value = + llvm::ConstantInt::get(index_ty, elems_per_sample); + llvm::Value* sample_idx = UDiv(elem_idx, elems_per_sample_value); + llvm::Value* elem_offset = URem(elem_idx, elems_per_sample_value); + + std::array counter_values = CalculateSampleValues( + sample_idx, hlo_random_value, global_random_number, rng_state, b_); + + // Store the four counter_values into the sample_address alloca so we can + // load the elem_offset'th one below. 
+ for (int idx = 0; idx < 4; ++idx) { + Store(counter_values[idx], + InBoundsGEP(sample_address, b_->getInt32(idx))); } + + llvm::Type* int64_ty = b_->getInt64Ty(); + CHECK(elems_per_sample == 2 || elems_per_sample == 4); + llvm::Type* raw_value_ty = elems_per_sample == 2 ? int64_ty : int32_ty; + // Retrieve the raw value for the current element from the current sample. + llvm::Value* raw_elem_value = Load( + InBoundsGEP(PointerCast(sample_address, raw_value_ty->getPointerTo()), + elem_offset), + "raw_elem_value"); + + return ConvertValueForDistribution(hlo, operand_to_generator, index, + raw_elem_value); }; } StatusOr ElementalIrEmitter::EmitElementalSelect( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { TF_ASSIGN_OR_RETURN(llvm::Value * pred_value, operand_to_generator.at(hlo->operand(0))( ElementwiseSourceIndex(index, *hlo, 0))); @@ -1436,15 +1526,14 @@ StatusOr ElementalIrEmitter::EmitElementalSelect( TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value, operand_to_generator.at(hlo->operand(2))( ElementwiseSourceIndex(index, *hlo, 2))); - return ir_builder_->CreateSelect( - ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()), - on_true_value, on_false_value); + return Select(Trunc(pred_value, b_->getInt1Ty()), on_true_value, + on_false_value); } StatusOr ElementalIrEmitter::EmitElementalClamp( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { TF_ASSIGN_OR_RETURN(llvm::Value * min_value, operand_to_generator.at(hlo->operand(0))( ElementwiseSourceIndex(index, *hlo, 0))); @@ -1463,120 +1552,115 @@ StatusOr ElementalIrEmitter::EmitElementalClamp( max_value, EmitIntegralMax(min_value, arg_value, is_signed), is_signed); } else { return Unimplemented("Clamp unimplemented for %s", - 
PrimitiveType_Name(prim_type).c_str()); + PrimitiveType_Name(prim_type)); } } StatusOr ElementalIrEmitter::EmitElementalConcatenate( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& target_index) const { + const llvm_ir::IrArray::Index& target_index) { const int64 concat_dim = hlo->dimensions(0); auto source_index = target_index; - llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock(); + llvm::BasicBlock* init_block = b_->GetInsertBlock(); // A terminator should be present iff we're emitting code // into the middle (as opposed to the end) of a basic block. - CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(), + CHECK_EQ(b_->GetInsertPoint() == init_block->end(), init_block->getTerminator() == nullptr); llvm::BasicBlock* exit_block; - if (ir_builder_->GetInsertPoint() == init_block->end()) { + if (b_->GetInsertPoint() == init_block->end()) { exit_block = llvm_ir::CreateBasicBlock( - /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_); + /*insert_before=*/nullptr, IrName(hlo, "merge"), b_); } else { - exit_block = init_block->splitBasicBlock(ir_builder_->GetInsertPoint(), + exit_block = init_block->splitBasicBlock(b_->GetInsertPoint(), AsStringRef(IrName(hlo, "merge"))); init_block->getTerminator()->eraseFromParent(); } - llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_); - llvm::PHINode* output = ir_builder_->CreatePHI( - llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), - hlo->operands().size()); - auto prior_insert_point = ir_builder_->GetInsertPoint(); + llvm_ir::SetToFirstInsertPoint(exit_block, b_); + llvm::PHINode* output = + PHI(llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), + hlo->operands().size()); + auto prior_insert_point = b_->GetInsertPoint(); - ir_builder_->SetInsertPoint(init_block); + b_->SetInsertPoint(init_block); for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); 
++operand_idx) { const HloInstruction* operand = hlo->operand(operand_idx); auto true_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_from_operand", operand_idx), - ir_builder_); + exit_block, StrCat("concat_index_from_operand", operand_idx), b_); auto false_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_not_from_operand", operand_idx), - ir_builder_); + exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_); auto concat_dim_size = llvm::ConstantInt::get(source_index[concat_dim]->getType(), operand->shape().dimensions(concat_dim)); - ir_builder_->CreateCondBr( - ir_builder_->CreateICmpULT(source_index[concat_dim], concat_dim_size), - true_block, false_block); + CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block, + false_block); // Create the terminator of the true block before calling operand // generators, because they require non-degenerate basic blocks. - ir_builder_->SetInsertPoint( + b_->SetInsertPoint( llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block)); TF_ASSIGN_OR_RETURN(llvm::Value * value, operand_to_generator.at(operand)(source_index)); - output->addIncoming(value, ir_builder_->GetInsertBlock()); + output->addIncoming(value, b_->GetInsertBlock()); // Subtract the size of the concat dimension of the current operand // from the source index. 
- ir_builder_->SetInsertPoint(false_block); - source_index[concat_dim] = - ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size); + b_->SetInsertPoint(false_block); + source_index[concat_dim] = Sub(source_index[concat_dim], concat_dim_size); } - ir_builder_->CreateUnreachable(); - ir_builder_->SetInsertPoint(exit_block, prior_insert_point); + Unreachable(); + b_->SetInsertPoint(exit_block, prior_insert_point); return output; } StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { // Emit IR to read dynamic start indices from hlo->operand(1). const HloInstruction* input_hlo = hlo->operand(0); const int64 rank = ShapeUtil::Rank(input_hlo->shape()); - llvm_ir::IrArray::Index slice_start_index(rank); + // Use the same index type for all tensor accesses in the same kernel. + llvm::Type* index_type = index.GetType(); + llvm_ir::IrArray::Index slice_start_index(index_type, rank); for (int64 i = 0; i < rank; ++i) { - llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_type, c); + }; + llvm_ir::IrArray::Index dim_index(1, index_typed_const(i)); TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value, operand_to_generator.at(hlo->operand(1))(dim_index)); // Clamp the start index so that the sliced portion fits in the operand: // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size) + start_index_value = SExtOrTrunc(start_index_value, index_type); + int64 largest_valid_start_index = + input_hlo->shape().dimensions(i) - hlo->shape().dimensions(i); + CHECK_GE(largest_valid_start_index, 0); - // TODO(b/74360564): This is implementation defined behavior, but is - // currently respected by all implementations. 
Change this if we ever decide - // to oficially document different behavior. - start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value, - index[i]->getType()); - llvm::Value* operand_dim_size = llvm::ConstantInt::get( - start_index_value->getType(), input_hlo->shape().dimensions(i)); - llvm::Value* output_dim_size = llvm::ConstantInt::get( - start_index_value->getType(), hlo->shape().dimensions(i)); - + bool is_signed = ShapeUtil::ElementIsSigned(hlo->operand(1)->shape()); start_index_value = EmitIntegralMin( - ir_builder_->CreateSub(operand_dim_size, output_dim_size), - EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0), - start_index_value, /*is_signed=*/true), - /*is_signed=*/true); + index_typed_const(largest_valid_start_index), + EmitIntegralMax(index_typed_const(0), start_index_value, is_signed), + is_signed); start_index_value->setName( AsStringRef(IrName(hlo, StrCat("start_idx", i)))); slice_start_index[i] = start_index_value; } - llvm_ir::IrArray::Index input_index(rank); + llvm_ir::IrArray::Index input_index(index_type, rank); for (int64 i = 0; i < rank; ++i) { // Emit IR which computes: // input_index = start_index + offset_index - input_index[i] = ir_builder_->CreateAdd(slice_start_index[i], index[i]); + input_index[i] = Add(slice_start_index[i], index[i]); } return operand_to_generator.at(input_hlo)(input_index); } @@ -1584,7 +1668,7 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( StatusOr ElementalIrEmitter::EmitElementalGather( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { const Shape& operand_shape = hlo->operand(0)->shape(); const Shape& indices_shape = hlo->operand(1)->shape(); const Shape& output_shape = hlo->shape(); @@ -1596,29 +1680,32 @@ StatusOr ElementalIrEmitter::EmitElementalGather( const llvm_ir::ElementGenerator& indices_generator = 
operand_to_generator.at(hlo->operand(1)); + llvm::Type* index_type = index.GetType(); // This is the index into `operand` that holds the element we want to - // generate. This index "unsafe" as in the components in here may be - // out of bounds. - IrArray::Index unsafe_operand_index; - - // First copy in the window indices to unsafe_operand_index. - for (int64 i = 0, e = operand_shape.dimensions_size(), - unsafe_operand_index_dim = 0; + // generate. + IrArray::Index operand_index(index_type); + + // First copy in the window indices to operand_index. Also collect a mapping + // from operand dimension to output window dimension. Elided window dimensions + // map to -1. + std::vector operand_to_output_dim(operand_shape.dimensions_size(), -1); + for (int64 i = 0, e = operand_shape.dimensions_size(), operand_index_dim = 0; i < e; i++) { - if (c_binary_search(dim_numbers.elided_window_dims(), i)) { - unsafe_operand_index.push_back(ir_builder_->getInt64(0)); + if (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) { + operand_index.push_back(index.GetConstantWithIndexType(0)); } else { - unsafe_operand_index.push_back( - index[dim_numbers.output_window_dims(unsafe_operand_index_dim++)]); + int64 output_window_dim = dim_numbers.offset_dims(operand_index_dim++); + operand_to_output_dim[i] = output_window_dim; + operand_index.push_back(index[output_window_dim]); } } - // This is the index of the index vector in the gather_indices tensor. - IrArray::Index gather_index_index; + // This is the index of the index vector in the start_indices tensor. 
+ IrArray::Index gather_index_index(index_type); { std::vector gather_index_index_components; for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) { - if (!c_binary_search(dim_numbers.output_window_dims(), i)) { + if (!absl::c_binary_search(dim_numbers.offset_dims(), i)) { gather_index_index.push_back(index[i]); } } @@ -1628,95 +1715,104 @@ StatusOr ElementalIrEmitter::EmitElementalGather( } } - auto add_to_unsafe_operand_index = [&](llvm::Value* index_component, - int64 dim) { - llvm::Value* gather_dim_component_extended = ir_builder_->CreateSExtOrTrunc( - index_component, ir_builder_->getInt64Ty()); - unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)] = - ir_builder_->CreateAdd( - unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)], - gather_dim_component_extended); + auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) { + llvm::Value* gather_dim_component_extended = + SExtOrTrunc(index_component, index_type); + int64 operand_dim = dim_numbers.start_index_map(dim); + int64 output_dim = operand_to_output_dim[operand_dim]; + // If 'output_dim' is -1, it means 'operand_dim' is an elided window dim. + // This means we set the iteration index to 0, so for the purpose of the + // following calculations we can consider the output dimension size to be 1. + int64 output_dim_size = + output_dim == -1 ? 1 : output_shape.dimensions(output_dim); + int64 largest_valid_start_index = + operand_shape.dimensions(operand_dim) - output_dim_size; + CHECK_GE(largest_valid_start_index, 0); + + // Clamp the gather index so that the gather region fits in the operand. + // gather_dim_component_extended_inbound = + // clamp(gather_dim_component_extended, 0, largest_valid_start_index); + + // TODO(b/111078873): This is implementation defined behavior. 
+ bool is_signed = ShapeUtil::ElementIsSigned(indices_shape); + auto gather_dim_component_extended_inbound = EmitIntegralMin( + index.GetConstantWithIndexType(largest_valid_start_index), + EmitIntegralMax(index.GetConstantWithIndexType(0), + gather_dim_component_extended, is_signed), + is_signed); + + operand_index[operand_dim] = + Add(operand_index[operand_dim], gather_dim_component_extended_inbound); }; if (indices_shape.dimensions_size() == dim_numbers.index_vector_dim()) { TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component, indices_generator(gather_index_index)); - add_to_unsafe_operand_index(gather_dim_component, 0); + add_to_operand_index(gather_dim_component, 0); } else { int64 index_vector_size = indices_shape.dimensions(dim_numbers.index_vector_dim()); for (int64 i = 0; i < index_vector_size; i++) { gather_index_index[dim_numbers.index_vector_dim()] = - ir_builder_->getInt64(i); + index.GetConstantWithIndexType(i); TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component, indices_generator(gather_index_index)); - add_to_unsafe_operand_index(gather_dim_component, i); + add_to_operand_index(gather_dim_component, i); } } - - IrArray::Index safe_operand_index; - for (int64 i = 0, e = unsafe_operand_index.size(); i < e; i++) { - safe_operand_index.push_back(ir_builder_->CreateURem( - unsafe_operand_index[i], - ir_builder_->getInt64(operand_shape.dimensions(i)))); - } - - return operand_generator(safe_operand_index); + return operand_generator(operand_index); } StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const { + const llvm_ir::IrArray::Index& index) { const HloInstruction* input_hlo = hlo->operand(0); const HloInstruction* update_hlo = hlo->operand(1); const HloInstruction* start_hlo = hlo->operand(2); // Calculate slice start/end indices. 
const int64 rank = ShapeUtil::Rank(input_hlo->shape()); - llvm_ir::IrArray::Index slice_start_index(rank); - llvm_ir::IrArray::Index slice_limit_index(rank); + llvm_ir::IrArray::Index slice_start_index(index.GetType(), rank); + llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank); // Slice intersection gathers (ANDs) conditions on all ranks for which // 'input' is set to 'update' - llvm::Value* slice_intersection = ir_builder_->getTrue(); + llvm::Value* slice_intersection = b_->getTrue(); for (int64 i = 0; i < rank; ++i) { - llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); + llvm::Type* index_type = index[0]->getType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_type, c); + }; + llvm_ir::IrArray::Index dim_index(1, index_typed_const(i)); TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value, operand_to_generator.at(start_hlo)(dim_index)); // Clamp the start index so that the update region fits in the operand. // start_index = clamp(start_index, 0, input_dim_size - update_dim_size) - - // TODO(b/74360564): This is implementation defined behavior, but is - // currently respected by all implementations. Change this if we ever decide - // to oficially document different behavior. 
- start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value, - index[i]->getType()); - llvm::Value* input_dim_size = llvm::ConstantInt::get( - index[i]->getType(), input_hlo->shape().dimensions(i)); - llvm::Value* update_dim_size = llvm::ConstantInt::get( - index[i]->getType(), update_hlo->shape().dimensions(i)); - + start_index_value = SExtOrTrunc(start_index_value, index_type); + llvm::Value* update_dim_size = + index_typed_const(update_hlo->shape().dimensions(i)); + int64 largest_valid_start_index = + input_hlo->shape().dimensions(i) - update_hlo->shape().dimensions(i); + CHECK_GE(largest_valid_start_index, 0); + + bool is_signed = ShapeUtil::ElementIsSigned(start_hlo->shape()); start_index_value = EmitIntegralMin( - ir_builder_->CreateSub(input_dim_size, update_dim_size), - EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0), - start_index_value, /*is_signed=*/true), - /*is_signed=*/true); + index_typed_const(largest_valid_start_index), + EmitIntegralMax(index_typed_const(0), start_index_value, is_signed), + is_signed); start_index_value->setName( AsStringRef(IrName(hlo, StrCat("start_idx", i)))); slice_start_index[i] = start_index_value; - slice_limit_index[i] = - ir_builder_->CreateAdd(slice_start_index[i], update_dim_size); - - slice_intersection = ir_builder_->CreateAnd( - slice_intersection, - ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), - "slice_intersection"); - slice_intersection = ir_builder_->CreateAnd( - slice_intersection, - ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]), - "slice_intersection"); + slice_limit_index[i] = Add(slice_start_index[i], update_dim_size); + + slice_intersection = + And(slice_intersection, ICmpSGE(index[i], slice_start_index[i]), + "slice_intersection"); + slice_intersection = + And(slice_intersection, ICmpSLT(index[i], slice_limit_index[i]), + "slice_intersection"); } // Emit: @@ -1724,62 +1820,58 @@ StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( // 
else -> return data from 'input'. llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), - "ret_value_addr", ir_builder_); - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - slice_intersection, "slice_intersection", ir_builder_); + "ret_value_addr", b_); + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(slice_intersection, "slice_intersection", b_); // Handle true BB (return data from 'update') - SetToFirstInsertPoint(if_data.true_block, ir_builder_); + SetToFirstInsertPoint(if_data.true_block, b_); // Compute update index for intersection case. - llvm_ir::IrArray::Index update_index(rank); + llvm_ir::IrArray::Index update_index(index.GetType(), rank); for (int64 i = 0; i < rank; ++i) { - update_index[i] = ir_builder_->CreateSub(index[i], slice_start_index[i]); + update_index[i] = Sub(index[i], slice_start_index[i]); } TF_ASSIGN_OR_RETURN(llvm::Value * true_value, operand_to_generator.at(update_hlo)(update_index)); - ir_builder_->CreateStore(true_value, ret_value_addr); + Store(true_value, ret_value_addr); // Handle false BB (return data from 'input') - SetToFirstInsertPoint(if_data.false_block, ir_builder_); + SetToFirstInsertPoint(if_data.false_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * false_value, operand_to_generator.at(input_hlo)(index)); - ir_builder_->CreateStore(false_value, ret_value_addr); + Store(false_value, ret_value_addr); - SetToFirstInsertPoint(if_data.after_block, ir_builder_); - return ir_builder_->CreateLoad(ret_value_addr); + SetToFirstInsertPoint(if_data.after_block, b_); + return Load(ret_value_addr); } StatusOr ElementalIrEmitter::EmitElementalPad( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& padded_index) const { + const llvm_ir::IrArray::Index& padded_index) { auto index = padded_index; - llvm::Value* in_bounds = ir_builder_->getTrue(); + llvm::Value* 
in_bounds = b_->getTrue(); for (size_t i = 0; i < index.size(); ++i) { auto index_typed_const = [=](int64 n) { return llvm::ConstantInt::get(index[i]->getType(), n); }; const auto& pad_dim = hlo->padding_config().dimensions(i); - index[i] = ir_builder_->CreateSub( - index[i], index_typed_const(pad_dim.edge_padding_low())); - in_bounds = ir_builder_->CreateAnd( - in_bounds, ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)), - "in_bounds"); - in_bounds = ir_builder_->CreateAnd( + index[i] = Sub(index[i], index_typed_const(pad_dim.edge_padding_low())); + in_bounds = + And(in_bounds, ICmpSGE(index[i], index_typed_const(0)), "in_bounds"); + in_bounds = And( in_bounds, - ir_builder_->CreateICmpEQ( + ICmpEQ( index_typed_const(0), - ir_builder_->CreateURem( - index[i], index_typed_const(pad_dim.interior_padding() + 1))), - "in_bounds"); - index[i] = ir_builder_->CreateSDiv( - index[i], index_typed_const(pad_dim.interior_padding() + 1)); - in_bounds = ir_builder_->CreateAnd( - in_bounds, - ir_builder_->CreateICmpSLT( - index[i], - index_typed_const(hlo->operand(0)->shape().dimensions(i))), + URem(index[i], index_typed_const(pad_dim.interior_padding() + 1))), "in_bounds"); + index[i] = + SDiv(index[i], index_typed_const(pad_dim.interior_padding() + 1)); + in_bounds = + And(in_bounds, + ICmpSLT(index[i], + index_typed_const(hlo->operand(0)->shape().dimensions(i))), + "in_bounds"); } // if (in_bounds) { @@ -1789,31 +1881,32 @@ StatusOr ElementalIrEmitter::EmitElementalPad( // } llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), - "pad_result_addr", ir_builder_); + "pad_result_addr", b_); llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); - SetToFirstInsertPoint(if_data.true_block, ir_builder_); + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); + SetToFirstInsertPoint(if_data.true_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * 
operand_value, operand_to_generator.at(hlo->operand(0))(index)); - ir_builder_->CreateStore(operand_value, ret_value_addr); + Store(operand_value, ret_value_addr); - SetToFirstInsertPoint(if_data.false_block, ir_builder_); + SetToFirstInsertPoint(if_data.false_block, b_); TF_ASSIGN_OR_RETURN(llvm::Value * padding_value, - operand_to_generator.at(hlo->operand(1))({})); - ir_builder_->CreateStore(padding_value, ret_value_addr); + operand_to_generator.at(hlo->operand(1))( + IrArray::Index(index.GetType()))); + Store(padding_value, ret_value_addr); - SetToFirstInsertPoint(if_data.after_block, ir_builder_); + SetToFirstInsertPoint(if_data.after_block, b_); // Don't create phi(operand_value, padding_value) here, because invoking // operand_to_generator may create new basic blocks, making the parent // of operand_value or padding_value no longer a predecessor of // if_data.after_block. - return ir_builder_->CreateLoad(ret_value_addr); + return Load(ret_value_addr); } StatusOr ElementalIrEmitter::EmitElementalDot( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& dot_result_index) const { + const llvm_ir::IrArray::Index& dot_result_index) { auto lhs_generator = operand_to_generator.at(hlo->operand(0)); auto rhs_generator = operand_to_generator.at(hlo->operand(1)); @@ -1826,21 +1919,24 @@ StatusOr ElementalIrEmitter::EmitElementalDot( int64 lhs_dims = hlo->operand(0)->shape().dimensions_size(); int64 rhs_dims = hlo->operand(1)->shape().dimensions_size(); + llvm::Type* index_type = dot_result_index[0]->getType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_type, c); + }; + std::unique_ptr inner_loop = llvm_ir::ForLoop::EmitForLoop( - IrName(hlo, "inner"), ir_builder_->getInt64(0), - ir_builder_->getInt64(contracted_dim_size), ir_builder_->getInt64(1), - ir_builder_); + IrName(hlo, "inner"), index_typed_const(0), + 
index_typed_const(contracted_dim_size), index_typed_const(1), b_); - SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), ir_builder_); + SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), b_); PrimitiveType primitive_type = hlo->shape().element_type(); llvm::Type* primitive_type_llvm = llvm_ir::PrimitiveTypeToIrType(primitive_type, module_); - llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry( - primitive_type_llvm, "dot_acc", ir_builder_); - ir_builder_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm), - accumulator_alloca); + llvm::Value* accumulator_alloca = + llvm_ir::EmitAllocaAtFunctionEntry(primitive_type_llvm, "dot_acc", b_); + Store(llvm::Constant::getNullValue(primitive_type_llvm), accumulator_alloca); - SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_); + SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), b_); // This is the inner reduction loop for a dot operation that produces // one element in the output. 
If the operands to the dot operation have @@ -1848,7 +1944,7 @@ StatusOr ElementalIrEmitter::EmitElementalDot( // Given an output index [a,b,c,d,e] in the result, we compute: // sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T)) - IrArray::Index lhs_index, rhs_index; + IrArray::Index lhs_index(index_type), rhs_index(index_type); for (int64 i = 0; i < lhs_dims - 1; i++) { lhs_index.push_back(dot_result_index[i]); @@ -1860,49 +1956,37 @@ StatusOr ElementalIrEmitter::EmitElementalDot( } rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue()); - llvm::Value* current_accumulator = - ir_builder_->CreateLoad(accumulator_alloca); + llvm::Value* current_accumulator = Load(accumulator_alloca); TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index)); TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index)); llvm::Value* next_accumulator; if (primitive_util::IsComplexType(primitive_type)) { - llvm::Value* product_real = ir_builder_->CreateFSub( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); - llvm::Value* product_imag = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value))); - next_accumulator = ir_builder_->CreateInsertValue( + llvm::Value* product_real = + FSub(FMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))); + llvm::Value* product_imag = + FAdd(FMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)), + FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value))); + next_accumulator = InsertValue( current_accumulator, - ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator), - product_real), - {0}); - next_accumulator = ir_builder_->CreateInsertValue( + 
FAdd(EmitExtractReal(current_accumulator), product_real), {0}); + next_accumulator = InsertValue( next_accumulator, - ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator), - product_imag), - {1}); + FAdd(EmitExtractImag(current_accumulator), product_imag), {1}); } else if (primitive_util::IsFloatingPointType(primitive_type)) { - next_accumulator = ir_builder_->CreateFAdd( - current_accumulator, ir_builder_->CreateFMul(lhs_value, rhs_value)); + next_accumulator = FAdd(current_accumulator, FMul(lhs_value, rhs_value)); } else { - next_accumulator = ir_builder_->CreateAdd( - current_accumulator, ir_builder_->CreateMul(lhs_value, rhs_value)); + next_accumulator = Add(current_accumulator, Mul(lhs_value, rhs_value)); } - ir_builder_->CreateStore(next_accumulator, accumulator_alloca); + Store(next_accumulator, accumulator_alloca); - SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_); - return ir_builder_->CreateLoad(accumulator_alloca); + SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), b_); + return Load(accumulator_alloca); } llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) - const { + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { case HloOpcode::kAbs: case HloOpcode::kRoundNearestAfz: @@ -1947,6 +2031,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kMultiply: case HloOpcode::kNe: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kPower: case HloOpcode::kRemainder: case HloOpcode::kShiftLeft: @@ -1995,10 +2080,10 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* operand = hlo->operand(0); auto source_index = target_index; for (int64 dim : hlo->dimensions()) { - source_index[dim] = ir_builder_->CreateSub( - llvm::ConstantInt::get(target_index[dim]->getType(), - 
hlo->shape().dimensions(dim) - 1), - target_index[dim]); + source_index[dim] = + Sub(llvm::ConstantInt::get(target_index[dim]->getType(), + hlo->shape().dimensions(dim) - 1), + target_index[dim]); } return operand_to_generator.at(operand)(source_index); }; @@ -2008,16 +2093,71 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* operand = hlo->operand(0); // The `dimensions` member of the broadcast instruction maps from // input dimensions to output dimensions. - return operand_to_generator.at( - operand)(target_index.SourceIndexOfBroadcast( - hlo->shape(), operand->shape(), hlo->dimensions(), ir_builder_)); + return operand_to_generator.at(operand)( + target_index.SourceIndexOfBroadcast(hlo->shape(), operand->shape(), + hlo->dimensions(), b_)); + }; + case HloOpcode::kIota: + return [this, hlo]( + const IrArray::Index& target_index) -> StatusOr { + auto* iota = Cast(hlo); + PrimitiveType element_type = iota->shape().element_type(); + IrArray::Index elem_index = + ShapeUtil::Rank(iota->shape()) > 1 + ? target_index.SourceIndexOfBroadcast( + iota->shape(), + ShapeUtil::MakeShapeWithDescendingLayout( + element_type, + {iota->shape().dimensions(iota->iota_dimension())}), + {iota->iota_dimension()}, b_) + : target_index; + llvm::Value* elem_index_linear = elem_index.linear(); + if (elem_index_linear == nullptr) { + std::vector iota_bound = { + iota->shape().dimensions(iota->iota_dimension())}; + elem_index_linear = elem_index.Linearize(iota_bound, b_); + } + Shape component_shape = + ShapeUtil::ElementIsComplex(iota->shape()) + ? 
ShapeUtil::ComplexComponentShape(iota->shape()) + : iota->shape(); + PrimitiveType component_element_type = component_shape.element_type(); + llvm::Value* iota_result; + if (ShapeUtil::ElementIsIntegral(component_shape)) { + iota_result = b_->CreateIntCast( + elem_index_linear, + llvm_ir::PrimitiveTypeToIrType(component_element_type, module_), + /*isSigned=*/false); + } else { + TF_RET_CHECK(ShapeUtil::ElementIsFloating(component_shape)) + << component_element_type; + llvm::Type* float_ir_type; + if (component_element_type == BF16) { + float_ir_type = llvm_ir::PrimitiveTypeToIrType(F32, module_); + } else { + float_ir_type = + llvm_ir::PrimitiveTypeToIrType(component_element_type, module_); + } + llvm::Value* float_val = + b_->CreateUIToFP(elem_index_linear, float_ir_type); + if (component_element_type == BF16) { + iota_result = EmitF32ToBF16(float_val, b_); + } else { + iota_result = float_val; + } + } + if (ShapeUtil::ElementIsComplex(iota->shape())) { + return EmitComposeComplex(iota, iota_result, nullptr); + } else { + return iota_result; + } }; case HloOpcode::kSlice: return [this, hlo, &operand_to_generator]( const IrArray::Index& index) -> StatusOr { IrArray::Index sliced_index = index.SourceIndexOfSlice( /*shape=*/hlo->shape(), /*starts=*/hlo->slice_starts(), - /*strides=*/hlo->slice_strides(), /*builder=*/ir_builder_); + /*strides=*/hlo->slice_strides(), /*builder=*/b_); return operand_to_generator.at(hlo->operand(0))(sliced_index); }; case HloOpcode::kDynamicSlice: @@ -2042,27 +2182,26 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( ShapeUtil::ElementsIn(hlo->operand(0)->shape())); return [this, hlo, &operand_to_generator](const IrArray::Index& index) { const HloInstruction* operand = hlo->operand(0); - return operand_to_generator.at(operand)(index.SourceIndexOfBitcast( - hlo->shape(), operand->shape(), ir_builder_)); + return operand_to_generator.at(operand)( + index.SourceIndexOfBitcast(hlo->shape(), operand->shape(), b_)); }; 
case HloOpcode::kReshape: CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()), ShapeUtil::ElementsIn(hlo->operand(0)->shape())); return [this, hlo, &operand_to_generator](const IrArray::Index& index) { const HloInstruction* operand = hlo->operand(0); - return operand_to_generator.at(operand)(index.SourceIndexOfReshape( - hlo->shape(), operand->shape(), ir_builder_)); + return operand_to_generator.at(operand)( + index.SourceIndexOfReshape(hlo->shape(), operand->shape(), b_)); }; case HloOpcode::kTranspose: return [this, hlo, &operand_to_generator](const IrArray::Index& target_index) { return operand_to_generator.at(hlo->operand(0))( target_index.SourceIndexOfTranspose( - hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(), - ir_builder_)); + hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(), b_)); }; case HloOpcode::kRng: - return MakeRngElementGenerator(hlo, operand_to_generator); + return MakePhiloxRngElementGenerator(hlo, operand_to_generator); case HloOpcode::kPad: return [this, hlo, &operand_to_generator]( const IrArray::Index& padded_index) -> StatusOr { @@ -2076,30 +2215,30 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; default: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", - HloOpcodeString(hlo->opcode()).c_str()); + HloOpcodeString(hlo->opcode())); }; } } -llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) const { - return ir_builder_->CreateExtractValue(value, {0}); +llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) { + return ExtractValue(value, {0}); } -llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) const { - return ir_builder_->CreateExtractValue(value, {1}); +llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) { + return 
ExtractValue(value, {1}); } llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, llvm::Value* real, - llvm::Value* imag) const { + llvm::Value* imag) { auto cplx_type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_); - auto complex = ir_builder_->CreateInsertValue( - llvm::ConstantAggregateZero::get(cplx_type), real, {0}); + auto complex = + InsertValue(llvm::ConstantAggregateZero::get(cplx_type), real, {0}); if (imag != nullptr) { - complex = ir_builder_->CreateInsertValue(complex, imag, {1}); + complex = InsertValue(complex, imag, {1}); } return complex; } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index d199473374ad394913413a7d3fe805f8782936f7..d3e2acaabd4f602171def70ccd3d4fd5adce0d0d 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -23,116 +23,132 @@ limitations under the License. 
#include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { -class ElementalIrEmitter { +class ElementalIrEmitter : public IrBuilderMixin { public: using HloToElementGeneratorMap = std::unordered_map; ElementalIrEmitter(const HloModuleConfig& hlo_module_config, - llvm::Module* module, llvm::IRBuilder<>* ir_builder) - : ir_builder_(ir_builder), - module_(module), - hlo_module_config_(hlo_module_config) {} + llvm::Module* module, llvm::IRBuilder<>* b) + : b_(b), module_(module), hlo_module_config_(hlo_module_config) {} virtual ~ElementalIrEmitter() = default; virtual StatusOr EmitUnaryOp(const HloInstruction* op, - llvm::Value* operand_value) const; + llvm::Value* operand_value); virtual StatusOr EmitBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); // Returns a function to generate an element of the output of `hlo`, given a // map of functions to generate elements of its operands. virtual llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const; + const HloToElementGeneratorMap& operand_to_generator); - llvm::IRBuilder<>* ir_builder() const { return ir_builder_; } - llvm::Module* module() const { return module_; } + llvm::IRBuilder<>* b() { return b_; } + + // builder() is for IrBuilderMixin. 
+ llvm::IRBuilder<>* builder() { return b_; } + + llvm::Module* module() { return module_; } protected: - virtual StatusOr EmitIntegerUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + virtual StatusOr EmitIntegerUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); + + virtual StatusOr EmitFloatUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); + + virtual StatusOr EmitComplexUnaryOp(const HloInstruction* op, + llvm::Value* operand_value); - virtual StatusOr EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + llvm::Value* IsZero(llvm::Value* v); + llvm::Value* IsIntMinDivisionOverflow(llvm::Value* lhs, llvm::Value* rhs); + llvm::Value* GetZero(llvm::Type* type); + llvm::Value* GetOne(llvm::Type* type); + llvm::Value* GetIntSMin(llvm::Type* type); + llvm::Value* GetMinusOne(llvm::Type* type); - virtual StatusOr EmitComplexUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const; + llvm::Value* EmitIntegerDivide(llvm::Value* lhs, llvm::Value* rhs, + bool is_signed); + llvm::Value* EmitIntegerRemainder(llvm::Value* lhs, llvm::Value* rhs, + bool is_signed); virtual StatusOr EmitIntegerBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); - virtual StatusOr EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + virtual StatusOr EmitFloatBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + llvm::Value* rhs_value); - virtual StatusOr EmitComplexBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + virtual StatusOr EmitComplexBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + llvm::Value* rhs_value); virtual llvm::Value* EmitFloatMax(llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value, - llvm::Value* 
rhs_value) const; + llvm::Value* rhs_value); llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - bool is_signed) const; + bool is_signed); virtual StatusOr EmitErfInv(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitErfcInv(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitAtan2(PrimitiveType prim_type, - llvm::Value* lhs, - llvm::Value* rhs) const; + llvm::Value* lhs, llvm::Value* rhs); virtual StatusOr EmitLog(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitSin(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitCos(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitExp(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const; + llvm::Value* value); virtual StatusOr EmitPow(PrimitiveType prim_type, - llvm::Value* lhs, - llvm::Value* rhs) const; + llvm::Value* lhs, llvm::Value* rhs); + + virtual StatusOr EmitTanh(PrimitiveType prim_type, + llvm::Value* value); virtual StatusOr EmitReducePrecision(const HloInstruction* hlo, - llvm::Value* x) const; + llvm::Value* x); - virtual llvm::Value* EmitExtractReal(llvm::Value* value) const; - virtual llvm::Value* EmitExtractImag(llvm::Value* value) const; + virtual llvm::Value* EmitExtractReal(llvm::Value* value); + virtual llvm::Value* EmitExtractImag(llvm::Value* value); // Composes a complex struct. imag may be nullptr for simple cast operations. 
llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real, - llvm::Value* imag) const; + llvm::Value* imag); // A helper method for MakeElementGenerator. Given an elementwise op `hlo` and // the target array index, computes the source array index of its @@ -141,54 +157,52 @@ class ElementalIrEmitter { // Precondition: `hlo` is an elementwise op. llvm_ir::IrArray::Index ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, - int64 operand_no) const; + int64 operand_no); // Identifier of the thread unique among all threads on the device - virtual llvm::Value* EmitThreadId() const { - return ir_builder_->getIntN(128, 0); - } + virtual llvm::Value* EmitThreadId() { return b_->getIntN(128, 0); } StatusOr EmitElementalSelect( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalClamp( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalConcatenate( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& target_index) const; + const llvm_ir::IrArray::Index& target_index); StatusOr EmitElementalDynamicSlice( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalGather( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr EmitElementalDynamicUpdateSlice( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& index) const; + const llvm_ir::IrArray::Index& index); StatusOr 
EmitElementalPad( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& padded_index) const; + const llvm_ir::IrArray::Index& padded_index); StatusOr EmitElementalDot( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator, - const llvm_ir::IrArray::Index& dot_result_index) const; + const llvm_ir::IrArray::Index& dot_result_index); - llvm::IRBuilder<>* const ir_builder_; + llvm::IRBuilder<>* const b_; llvm::Module* module_; @@ -197,10 +211,17 @@ class ElementalIrEmitter { const HloModuleConfig& hlo_module_config_; private: - // Returns a ElementGenerator for a RNG HloInstruction. - llvm_ir::ElementGenerator MakeRngElementGenerator( + // Returns a ElementGenerator for an RNG HloInstruction using the Philox + // random number generation algorithm. + llvm_ir::ElementGenerator MakePhiloxRngElementGenerator( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator); + // Converts the raw value generated by a random number generation algorithm + // to the distribution requested by the RNG HloInstruction. + StatusOr ConvertValueForDistribution( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const; + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index, llvm::Value* raw_value); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc index b43dc0c65d9b6e7c05e06010ba2ff2eb27392295..1b3be199f632a2aa6bd2c5a3820c7c5ce9b1382e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc @@ -14,26 +14,25 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { -using tensorflow::gtl::nullopt; +using absl::nullopt; class ElementalIrEmitterExecutionTest : public HloTestBase { protected: - void RunTest(const string& hlo_text, - tensorflow::gtl::ArraySlice args) { + void RunTest(const string& hlo_text, absl::Span args) { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text, config)); + ParseHloString(hlo_text, config)); EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), args, nullopt)); } }; @@ -57,8 +56,8 @@ ENTRY main { } )"; - std::unique_ptr lhs = Literal::CreateR3({{{1}, {2}}}); - std::unique_ptr rhs = Literal::CreateR3({{{3}, {4}}}); + std::unique_ptr lhs = LiteralUtil::CreateR3({{{1}, {2}}}); + std::unique_ptr rhs = LiteralUtil::CreateR3({{{3}, {4}}}); RunTest(hlo_text, {lhs.get(), rhs.get()}); } } // namespace diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 8119478ce934da06969024905e5e054e0b509b03..47c56e2f7fbd9f53be6a2b189c5c36cf4fdcdccb 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/executable.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/status.h" @@ -22,16 +24,14 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" -using tensorflow::gtl::ArraySlice; namespace xla { StatusOr> Executable::ExecuteOnStreams( - ArraySlice run_options, - ArraySlice> arguments) { + absl::Span run_options, + absl::Span> arguments) { TF_RET_CHECK(run_options.size() == arguments.size()); std::vector return_values; @@ -62,7 +62,7 @@ StatusOr> Executable::ExecuteOnStreams( StatusOr Executable::ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, - ArraySlice arguments) { + absl::Span arguments) { se::Stream* stream = run_options->stream(); std::unique_ptr timer; if (profile != nullptr) { @@ -76,13 +76,24 @@ StatusOr Executable::ExecuteOnStreamWrapper( std::unique_ptr profile_ptr = module_config().debug_options().xla_hlo_profile() && hlo_profiling_enabled() - ? MakeUnique(&hlo_profile_printer_data(), - &hlo_profile_index_map()) + ? absl::make_unique(&hlo_profile_printer_data(), + &hlo_profile_index_map()) : nullptr; StatusOr return_value = ExecuteOnStream(run_options, arguments, profile_ptr.get()); - TF_RETURN_IF_ERROR(return_value.status()); + if (!return_value.status().ok()) { + if (profile != nullptr) { + // Ensure the ThenStartTimer call has completed before we destroy timer. + // We already have a failure status to return, so just log this if it + // fails. 
+ Status status = stream->BlockHostUntilDone(); + if (!status.ok()) { + LOG(ERROR) << "Failed to BlockHostUntilDone: " << status; + } + } + return return_value.status(); + } if (profile != nullptr) { VLOG(1) << "enqueueing 'stop timer' and blocking host until done..."; @@ -116,6 +127,11 @@ StatusOr Executable::ExecuteOnStreamWrapper( if (profile->compute_time_ns() == 0) { profile->set_compute_time_ns(profile->compute_and_transfer_time_ns()); } + + const int64 executable_size_in_bytes = SizeInBytes(); + if (executable_size_in_bytes != 0) { + profile->set_executable_size_in_bytes(executable_size_in_bytes); + } } if (profile_ptr != nullptr) { @@ -129,19 +145,7 @@ StatusOr Executable::ExecuteOnStreamWrapper( return return_value; } -Status Executable::DumpSessionModule() { - TF_RET_CHECK(dumping()); - const string& directory_path = - module_config().debug_options().xla_dump_executions_to(); - VersionedComputationHandle versioned_handle = entry_computation_handle(); - // This filename does not include the version number because the computation - // is only ever executed at one version. 
- string filename = tensorflow::strings::Printf( - "computation_%lld__%s__execution_%lld", versioned_handle.handle.handle(), - session_module_->entry().name().c_str(), ++execution_count_); - return Executable::DumpToDirectory(directory_path, filename, - *session_module_); -} +int64 Executable::SizeInBytes() { return -1; } Status Executable::DumpHloSnapshot() { TF_RET_CHECK(dumping_snapshot()); @@ -150,32 +154,12 @@ Status Executable::DumpHloSnapshot() { const string& directory_path = module_config().debug_options().xla_dump_executions_to(); const auto& module = hlo_snapshot_->hlo().hlo_module(); - string filename = tensorflow::strings::Printf( - "computation_%lld__%s__execution_%lld", module.id(), - module.entry_computation_name().c_str(), ++execution_count_); + string filename = + absl::StrFormat("computation_%d__%s__execution_%d", module.id(), + module.entry_computation_name(), ++execution_count_); return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_); } -/* static */ Status Executable::DumpToDirectory( - const string& directory_path, string filename, - const SessionModule& session_module) { - tensorflow::Env* env = tensorflow::Env::Default(); - if (!env->IsDirectory(directory_path).ok()) { - // NB! CreateDir does not work reliably with multiple XLA threads -- two - // threads can race to observe the absence of the dump directory and - // simultaneously try to create it, causing the "losing" thread to get a - // "directory already exists" error. 
- TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path)); - } - filename = SanitizeFileName(std::move(filename)); - string file_path = tensorflow::io::JoinPath(directory_path, filename); - string result; - TF_RET_CHECK( - tensorflow::SerializeToStringDeterministic(session_module, &result)); - return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path, - result); -} - /* static */ Status Executable::DumpToDirectory( const string& directory_path, string filename, const HloSnapshot& hlo_session) { diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 4f0466c544738fa1ec4602ee5104daee8d969c83..3a6780f2a67f230cae626ea00cfbf93b4e60d968 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -18,7 +18,10 @@ limitations under the License. #include #include +#include +#include "absl/types/span.h" +#include "absl/types/variant.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -26,20 +29,33 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" +#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" +#include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" namespace xla { +// ExecutionOutput encapsulates the output buffers of a execution and the +// leftover buffers to be released by the caller. +struct ExecutionOutput { + ExecutionOutput(ScopedShapedBuffer result, + std::vector to_be_released) + : result(std::move(result)), to_be_released(std::move(to_be_released)) {} + ScopedShapedBuffer result; + + // Leftover buffers for the caller to release. Elements in this list are + // donated input memory buffers that are not reused by XLA as outputs. + std::vector to_be_released; +}; + // A given platform's compiler will produce an Executable -- this is a uniform // interface that is used for launching compiled programs across platforms. class Executable { @@ -65,33 +81,53 @@ class Executable { // Returns a shaped buffer containing the result of the computation. 
virtual StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) = 0; // Same as ExecuteOnStream(), but this call is non-blocking and returns as // soon as all of the operations are enqueued for launch on the stream. virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) = 0; + absl::Span arguments) = 0; + + // Starts the given program executing on the given stream/executor. + // + // `arguments` are ShapeTree containing the input parameters. For each element + // in the shape tree, if the element holds the ownership of the memory, it is + // considered donated and XLA will potentially reuse it as output buffers. For + // all donated inputs, XLA is also responsible for freeing them. + // + // If an input is donated to XLA but is not reused as output, it is returned + // as an leftover buffer for the caller to release. + virtual StatusOr ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + std::vector> arguments, + HloExecutionProfile* hlo_execution_profile) { + return Unimplemented( + "MaybeOwningDeviceMemory version of overload is not implemented "); + } + + virtual StatusOr ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + std::vector> arguments) { + return Unimplemented( + "MaybeOwningDeviceMemory version of overload is not implemented "); + } // Same as ExecuteOnStream(), but runs this executable on multiple // streams. arguments[i] contains the arguments to the execution on // run_options[i]->stream() and the returned value is at index i of the // returned vector. virtual StatusOr> ExecuteOnStreams( - tensorflow::gtl::ArraySlice - run_options, - tensorflow::gtl::ArraySlice< - tensorflow::gtl::ArraySlice> - arguments); + absl::Span run_options, + absl::Span> arguments); // Populates `hlo_execution_profile` from `executor`. 
This is implicit in any // Execute* API call that takes a hlo_execution_profile argument, but must be // called explicitly for other (async, for example) variants after the stream // has completed. virtual Status PopulateExecutionProfile( - HloExecutionProfile* hlo_execution_profile, - se::StreamExecutor* executor) { + HloExecutionProfile* hlo_execution_profile, se::Stream* stream) { return Status::OK(); } @@ -100,7 +136,7 @@ class Executable { // given ExecutionProfile if non-null. StatusOr ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); // Returns the ExecutionProfile from executing on the device. This includes // the number of cycles taken for the computation or the compilation time. @@ -132,25 +168,15 @@ class Executable { const HloModuleConfig& module_config() const { return hlo_module_->config(); } - // Returns the versioned computation handle of the computation computed by - // this executable. - const VersionedComputationHandle& entry_computation_handle() const { - return hlo_module_->entry_computation_handle(); - } - // The shape (including layout) that results from this execution. This is the // shape of the DeviceMemoryBase result value in ExecuteOnStream above. - const Shape& host_result_shape() const { - return hlo_module_->config().host_entry_computation_layout().result_shape(); + const Shape& result_shape() const { + return hlo_module_->config().entry_computation_layout().result_shape(); } - // TODO(b/74197823): Delete the session module dumping helpers. - void set_session_module(std::unique_ptr session_module) { - session_module_ = std::move(session_module); - } - bool dumping() const { return session_module_ != nullptr; } - SessionModule* session_module() const { return session_module_.get(); } - Status DumpSessionModule(); + // Returns the size of the executable in bytes. 
Returns -1 by default if the + // method is not overridden to support this kind of query. + virtual int64 SizeInBytes(); // Dumping helpers. void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { @@ -160,10 +186,6 @@ class Executable { HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } Status DumpHloSnapshot(); - // Dump session_module to directory_path/filename. - static Status DumpToDirectory(const string& directory_path, string filename, - const SessionModule& session_module); - // Dump hlo snapshot to directory_path/filename. static Status DumpToDirectory(const string& directory_path, string filename, const HloSnapshot& hlo_session); @@ -179,9 +201,6 @@ class Executable { // around. const std::unique_ptr hlo_module_; - // SessionModule this was compiled from. Null if not dumping executions. - std::unique_ptr session_module_; - // HloSnapshot this was compiled from. Null if not dumping executions. std::unique_ptr hlo_snapshot_; diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc index 6794cfe297b0fb9a15eb9b7e6906d225f9597d07..997db7c058af6da8ecff399769b85b803e2e5785 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.cc +++ b/tensorflow/compiler/xla/service/execution_tracker.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -25,7 +25,7 @@ limitations under the License. 
namespace xla { AsyncExecution::AsyncExecution(Backend* backend, - std::vector streams, + std::vector streams, const ExecutionProfile& profile, GlobalDataHandle result) : backend_(CHECK_NOTNULL(backend)), @@ -46,14 +46,15 @@ Status AsyncExecution::BlockUntilDone() const { ExecutionTracker::ExecutionTracker() : next_handle_(1) {} -ExecutionHandle ExecutionTracker::Register( - Backend* backend, std::vector streams, - const ExecutionProfile& profile, GlobalDataHandle result) { +ExecutionHandle ExecutionTracker::Register(Backend* backend, + std::vector streams, + const ExecutionProfile& profile, + GlobalDataHandle result) { tensorflow::mutex_lock lock(execution_mutex_); int64 handle = next_handle_++; auto inserted = handle_to_execution_.emplace( - handle, - MakeUnique(backend, std::move(streams), profile, result)); + handle, absl::make_unique(backend, std::move(streams), + profile, result)); CHECK(inserted.second); ExecutionHandle execution_handle; @@ -65,7 +66,7 @@ Status ExecutionTracker::Unregister(const ExecutionHandle& handle) { tensorflow::mutex_lock lock(execution_mutex_); auto it = handle_to_execution_.find(handle.handle()); if (it == handle_to_execution_.end()) { - return NotFound("no execution record for execution handle: %lld", + return NotFound("no execution record for execution handle: %d", handle.handle()); } handle_to_execution_.erase(handle.handle()); @@ -77,7 +78,7 @@ StatusOr ExecutionTracker::Resolve( tensorflow::mutex_lock lock(execution_mutex_); auto it = handle_to_execution_.find(handle.handle()); if (it == handle_to_execution_.end()) { - return NotFound("no execution record for execution handle: %lld", + return NotFound("no execution record for execution handle: %d", handle.handle()); } return it->second.get(); diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h index 4458152dd9a98890fc3a3e7f324245ec68821467..4e9b9f883e26f5564a9c63a40d2b4b9348908214 100644 --- 
a/tensorflow/compiler/xla/service/execution_tracker.h +++ b/tensorflow/compiler/xla/service/execution_tracker.h @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/backend.h" -#include "tensorflow/compiler/xla/service/pool.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -40,7 +40,7 @@ namespace xla { // the stream when destructed. class AsyncExecution { public: - AsyncExecution(Backend* backend, std::vector streams, + AsyncExecution(Backend* backend, std::vector streams, const ExecutionProfile& profile, GlobalDataHandle result); Status BlockUntilDone() const; @@ -54,7 +54,7 @@ class AsyncExecution { Backend* backend_; // Stream on which the execution is launched. - std::vector streams_; + std::vector streams_; // Profile object of the execution to be returned to the user. ExecutionProfile profile_; @@ -72,7 +72,7 @@ class ExecutionTracker { // Registers an execution with its backend, streams, and data handle to the // execution result. Returns a handle for the registered execution. ExecutionHandle Register(Backend* backend, - std::vector stream, + std::vector stream, const ExecutionProfile& profile, GlobalDataHandle data); diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.h b/tensorflow/compiler/xla/service/flatten_call_graph.h index d3efab3614912e4b0c2c8aa3b80277c326382ed0..3cccec9862e0f92df478006939552099868121b9 100644 --- a/tensorflow/compiler/xla/service/flatten_call_graph.h +++ b/tensorflow/compiler/xla/service/flatten_call_graph.h @@ -28,7 +28,7 @@ namespace xla { // points-to analysis (see b/36865746 for details). 
class FlattenCallGraph : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "flatten-call-graph"; } + absl::string_view name() const override { return "flatten-call-graph"; } // Duplicates computations called from multiple call- or while-nodes to // flatten the call graph. diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc index d3854b40de3572a60df1ad99d8a4589f59ad7194..8f6608241ed02bbb7e9fde9b6d767c002435e777 100644 --- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc +++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -80,7 +80,7 @@ class FlattenCallGraphTest : public HloTestBase { HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, kScalarShape, "param0")); HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero)); return builder.Build(); @@ -157,7 +157,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) { builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(PRED, {}), "param0")); HloInstruction* false_constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); builder.AddInstruction( HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), 
HloOpcode::kEq, param0, false_constant)); @@ -168,7 +168,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) { { HloComputation::Builder builder(TestName() + ".entry"); HloInstruction* false_constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); builder.AddInstruction(HloInstruction::CreateWhile( ShapeUtil::MakeShape(PRED, {}), cond_computation, cond_computation, false_constant)); @@ -232,11 +232,11 @@ TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) { // computation in the true and false branch. HloComputation::Builder builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(56.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(56.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(12.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(12.0f))); builder.AddInstruction(HloInstruction::CreateConditional( kScalarShape, pred, constant1, sub_computation, constant2, sub_computation)); diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/service/g3doc/hlo_parser.md similarity index 100% rename from tensorflow/compiler/xla/tools/parser/README.md rename to tensorflow/compiler/xla/service/g3doc/hlo_parser.md diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc index 2d3e4b1fcdf6675955714cab262a8b2ca8ff4297..cb86c9857936f21d9d2ac6bc22c725b89cca6482 100644 --- a/tensorflow/compiler/xla/service/gather_expander.cc +++ b/tensorflow/compiler/xla/service/gather_expander.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gather_expander.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -23,88 +25,87 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" namespace xla { -using tensorflow::gtl::ArraySlice; static StatusOr TransposeIndexVectorDimToLast( - HloInstruction* gather_indices, int64 index_vector_dim) { - const Shape& gather_indices_shape = gather_indices->shape(); + HloInstruction* start_indices, int64 index_vector_dim) { + const Shape& start_indices_shape = start_indices->shape(); - if (gather_indices_shape.dimensions_size() == index_vector_dim) { - return gather_indices; + if (start_indices_shape.dimensions_size() == index_vector_dim) { + return start_indices; } - if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) { - return gather_indices; + if (index_vector_dim == (start_indices_shape.dimensions_size() - 1)) { + return start_indices; } std::vector permutation; - permutation.reserve(gather_indices_shape.dimensions_size()); - for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) { + permutation.reserve(start_indices_shape.dimensions_size()); + for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { if (i != index_vector_dim) { permutation.push_back(i); } } permutation.push_back(index_vector_dim); - return MakeTransposeHlo(gather_indices, permutation); + return MakeTransposeHlo(start_indices, permutation); } -// Canonicalizes the gather_indices tensors so that we only have deal with some +// Canonicalizes the start_indices tensors so that we only have deal with some // specific cases in the while loop that does the heavy lifting. // // See the "High Level Algorithm" section for a broader picture. 
static StatusOr CanonicalizeGatherIndices( - HloInstruction* gather_indices, int64 index_vector_dim) { + HloInstruction* start_indices, int64 index_vector_dim) { // Transpose the non-index-vector dimensions to the front. TF_ASSIGN_OR_RETURN( - HloInstruction * transposed_gather_indices, - TransposeIndexVectorDimToLast(gather_indices, index_vector_dim)); + HloInstruction * transposed_start_indices, + TransposeIndexVectorDimToLast(start_indices, index_vector_dim)); bool indices_are_scalar = - index_vector_dim == gather_indices->shape().dimensions_size(); + index_vector_dim == start_indices->shape().dimensions_size(); - // The number of dimensions in gather_indices that are index dimensions. - const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1; + // The number of dimensions in start_indices that are index dimensions. + const int64 index_dims_in_start_indices = indices_are_scalar ? 0 : 1; - // If there is only one index (i.e. gather_indices has rank 1 and this gather + // If there is only one index (i.e. start_indices has rank 1 and this gather // is really just a dynamic slice) add a leading degenerate dimension for // uniformity. Otherwise create a "collapsed" leading dimension that subsumes // all of the non-index-vector dimensions. - const Shape& shape = transposed_gather_indices->shape(); - if (shape.dimensions_size() == index_dims_in_gather_indices) { - return PrependDegenerateDims(transposed_gather_indices, 1); + const Shape& shape = transposed_start_indices->shape(); + if (shape.dimensions_size() == index_dims_in_start_indices) { + return PrependDegenerateDims(transposed_start_indices, 1); } else { - // Collapse all but the dimensions (0 or 1) in gather_indices containing the + // Collapse all but the dimensions (0 or 1) in start_indices containing the // index vectors. 
return CollapseFirstNDims( - transposed_gather_indices, - shape.dimensions_size() - index_dims_in_gather_indices); + transposed_start_indices, + shape.dimensions_size() - index_dims_in_start_indices); } } // Expands out or contracts away the gather dimensions in the accumulator // produced by the while loop. -static StatusOr AdjustGatherDimsInAccumulator( - const Shape& gather_indices_shape, HloInstruction* accumulator, +static StatusOr AdjustBatchDimsInAccumulator( + const Shape& start_indices_shape, HloInstruction* accumulator, int64 index_vector_dim) { - std::vector output_gather_dim_bounds; - output_gather_dim_bounds.reserve(gather_indices_shape.dimensions_size()); - for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) { + std::vector batch_dim_bounds; + batch_dim_bounds.reserve(start_indices_shape.dimensions_size()); + for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { if (i != index_vector_dim) { - output_gather_dim_bounds.push_back(gather_indices_shape.dimensions(i)); + batch_dim_bounds.push_back(start_indices_shape.dimensions(i)); } } - if (output_gather_dim_bounds.empty()) { - // If output_gather_dim_bounds is empty we must be lowering a (effectively) + if (batch_dim_bounds.empty()) { + // If batch_dim_bounds is empty we must be lowering a (effectively) // dynamic-slice. In that case, there is a leading degenerate gather // dimension that we added to make this special case play well with the // general while loop which we need to remove now. return ElideDegenerateDims(accumulator, {0}); } - return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds); + return ExpandFirstDimIntoNDims(accumulator, batch_dim_bounds); } -// Expand an index vector from the gather_indices tensor into a vector that can +// Expand an index vector from the start_indices tensor into a vector that can // be used to dynamic-slice out of the gather operand. 
static StatusOr ExpandIndexVectorIntoOperandSpace( HloInstruction* index_vector, const GatherDimensionNumbers& dim_numbers, @@ -113,17 +114,15 @@ static StatusOr ExpandIndexVectorIntoOperandSpace( const Shape& index_shape = index_vector->shape(); HloInstruction* zero = computation->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateFromDimensions(index_shape.element_type(), {1}))); + LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1}))); // We extract out individual components from the smaller index and concatenate // them (interspersing zeros as needed) into the larger index. std::vector expanded_index_components; for (int i = 0; i < operand_rank; i++) { - int64 index_vector_dim_index = - FindIndex(dim_numbers.gather_dims_to_operand_dims(), i); - if (index_vector_dim_index != - dim_numbers.gather_dims_to_operand_dims_size()) { + int64 index_vector_dim_index = FindIndex(dim_numbers.start_index_map(), i); + if (index_vector_dim_index != dim_numbers.start_index_map_size()) { TF_ASSIGN_OR_RETURN( HloInstruction * component_to_concat, MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index}, @@ -146,10 +145,10 @@ static StatusOr> GatherLoopBody( const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers(); CHECK_EQ(incoming_loop_state.size(), 3); HloInstruction* const operand = incoming_loop_state[0]; - HloInstruction* const gather_indices = incoming_loop_state[1]; + HloInstruction* const start_indices = incoming_loop_state[1]; HloInstruction* const output_accumulator = incoming_loop_state[2]; - bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1; + bool has_scalar_indices = start_indices->shape().dimensions_size() == 1; CHECK_EQ(has_scalar_indices, dim_numbers.index_vector_dim() == gather.operand(1)->shape().dimensions_size()); @@ -162,24 +161,24 @@ static StatusOr> GatherLoopBody( HloInstruction* index_vector; if (has_scalar_indices) { - // In this case gather_indices has rank 1 and 
induction_var_as_vector (of + // In this case start_indices has rank 1 and induction_var_as_vector (of // shape {1}) is an index into this rank 1 tensor. TF_ASSIGN_OR_RETURN( index_vector, - MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1})); + MakeDynamicSliceHlo(start_indices, induction_var_as_vector, {1})); } else { - // In this case gather_indices has rank 2 and induction_var_as_vector (of + // In this case start_indices has rank 2 and induction_var_as_vector (of // shape {1}) is an index into just the first dimension of this rank 2 // tensor. TF_ASSIGN_OR_RETURN( - HloInstruction * index_into_gather_indices, + HloInstruction * index_into_start_indices, PadVectorWithZeros(induction_var_as_vector, /*zeros_to_prepend=*/0, /*zeros_to_append=*/1)); - int64 index_vector_size = gather_indices->shape().dimensions(1); + int64 index_vector_size = start_indices->shape().dimensions(1); TF_ASSIGN_OR_RETURN( HloInstruction * index_vector_2d, - MakeDynamicSliceHlo(gather_indices, index_into_gather_indices, + MakeDynamicSliceHlo(start_indices, index_into_start_indices, {1, index_vector_size})); TF_ASSIGN_OR_RETURN(index_vector, @@ -193,26 +192,26 @@ static StatusOr> GatherLoopBody( TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice, MakeDynamicSliceHlo(operand, gathered_slice_start, - gather.gather_window_bounds())); + gather.gather_slice_sizes())); TF_ASSIGN_OR_RETURN( - HloInstruction * gathered_slice_with_dims_elided, + HloInstruction* const gathered_slice_with_dims_collapsed, ElideDegenerateDims(gathered_slice, - AsInt64Slice(dim_numbers.elided_window_dims()))); + AsInt64Slice(dim_numbers.collapsed_slice_dims()))); TF_ASSIGN_OR_RETURN( - HloInstruction * gathered_slice_for_update, - PrependDegenerateDims(gathered_slice_with_dims_elided, 1)); + HloInstruction* const gathered_slice_for_update, + PrependDegenerateDims(gathered_slice_with_dims_collapsed, 1)); TF_ASSIGN_OR_RETURN( - HloInstruction * index_vector_into_accumulator, + HloInstruction* const 
index_vector_into_accumulator, PadVectorWithZeros( induction_var_as_vector, /*zeros_to_prepend=*/0, /*zeros_to_append=*/ - gathered_slice_with_dims_elided->shape().dimensions_size())); + gathered_slice_with_dims_collapsed->shape().dimensions_size())); TF_ASSIGN_OR_RETURN( - HloInstruction * updated_accumulator, + HloInstruction* const updated_accumulator, MakeDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update, index_vector_into_accumulator)); @@ -220,19 +219,19 @@ static StatusOr> GatherLoopBody( // WhileUtil::MakeCountedLoop functions takes care of the induction variable // and the while loop exit condition. return StatusOr>{ - {operand, gather_indices, updated_accumulator}}; + {operand, start_indices, updated_accumulator}}; } static StatusOr CreateGatherLoopAccumulatorInitValue( HloComputation* computation, PrimitiveType element_type, - ArraySlice window_bounds, int64 gather_loop_trip_count, + absl::Span slice_sizes, int64 gather_loop_trip_count, const GatherDimensionNumbers& dim_numbers) { std::vector accumulator_state_shape_dims; - accumulator_state_shape_dims.reserve(1 + window_bounds.size()); + accumulator_state_shape_dims.reserve(1 + slice_sizes.size()); accumulator_state_shape_dims.push_back(gather_loop_trip_count); - for (int64 i = 0; i < window_bounds.size(); i++) { - if (!c_binary_search(dim_numbers.elided_window_dims(), i)) { - accumulator_state_shape_dims.push_back(window_bounds[i]); + for (int64 i = 0; i < slice_sizes.size(); i++) { + if (!absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) { + accumulator_state_shape_dims.push_back(slice_sizes[i]); } } return BroadcastZeros(computation, element_type, @@ -240,23 +239,23 @@ static StatusOr CreateGatherLoopAccumulatorInitValue( } // `accumulator` is almost the tensor the gather operation would have produced, -// except that it has the dimensions in the wrong order -- the gather dimensions -// are the major dimensions and the window dimensions are the minor dimensions. 
+// except that it has the dimensions in the wrong order -- the batch dimensions +// are the major dimensions and the offset dimensions are the minor dimensions. // Fix this up with a transpose. -static StatusOr PermuteGatherAndWindowDims( - HloInstruction* accumulator, ArraySlice output_window_dims, +static StatusOr PermuteBatchAndOffsetDims( + HloInstruction* accumulator, absl::Span offset_dims, int64 output_rank) { std::vector permutation; permutation.reserve(output_rank); - int64 gather_idx_counter = 0; - int64 window_idx_counter = output_rank - output_window_dims.size(); + int64 batch_idx_counter = 0; + int64 offset_idx_counter = output_rank - offset_dims.size(); for (int64 i = 0; i < output_rank; i++) { - bool is_window_dim = c_binary_search(output_window_dims, i); - if (is_window_dim) { - permutation.push_back(window_idx_counter++); + bool is_offset_dim = absl::c_binary_search(offset_dims, i); + if (is_offset_dim) { + permutation.push_back(offset_idx_counter++); } else { - permutation.push_back(gather_idx_counter++); + permutation.push_back(batch_idx_counter++); } } @@ -267,11 +266,11 @@ static StatusOr PermuteGatherAndWindowDims( // // We follow the following steps in sequence: // -// 1. We canonicalize the gather_indices tensor such that it has rank +// 1. We canonicalize the start_indices tensor such that it has rank // 2 (i.e. is a matrix) where each row is an index vector into the // operand. // 2. We iterate over the set of indices in the canonicalized -// gather_indices tensor using a while loop, accumulating slices +// start_indices tensor using a while loop, accumulating slices // of the operand tensor into an accumulator using // DynamicUpdateSlice. // 3. 
The accumulator result from the while loop from (2) is then @@ -286,11 +285,11 @@ static StatusOr PermuteGatherAndWindowDims( // operand = s32[3,3] parameter(0) // indices = s32[2,2] parameter(1) // ROOT gather = s32[2,3,2] gather(operand, indices), -// output_window_dims={1}, -// elided_window_dims={1}, -// gather_dims_to_operand_dims={1}, +// offset_dims={1}, +// collapsed_slice_dims={1}, +// start_index_map={1}, // index_vector_dim=2, -// window_bounds={3, 1} +// slice_sizes={3, 1} // } // // We'd first reshape indices to s32[4,1], where each row is an index @@ -300,12 +299,12 @@ static StatusOr PermuteGatherAndWindowDims( StatusOr GatherExpander::ExpandGather( HloInstruction* gather_instr) { - CHECK(!ShapeUtil::HasZeroElements(gather_instr->shape())); + CHECK(!ShapeUtil::IsZeroElementArray(gather_instr->shape())); HloComputation* computation = gather_instr->parent(); HloInstruction* operand = gather_instr->mutable_operand(0); - HloInstruction* gather_indices = gather_instr->mutable_operand(1); - const Shape& gather_indices_shape = gather_indices->shape(); + HloInstruction* start_indices = gather_instr->mutable_operand(1); + const Shape& start_indices_shape = start_indices->shape(); const Shape& output_shape = gather_instr->shape(); int64 output_rank = output_shape.dimensions_size(); @@ -313,9 +312,9 @@ StatusOr GatherExpander::ExpandGather( gather_instr->gather_dimension_numbers(); int64 gather_loop_trip_count = 1; - for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) { + for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) { if (i != dim_numbers.index_vector_dim()) { - gather_loop_trip_count *= gather_indices_shape.dimensions(i); + gather_loop_trip_count *= start_indices_shape.dimensions(i); } } @@ -323,27 +322,27 @@ StatusOr GatherExpander::ExpandGather( return Unimplemented( "Gather operations with more than 2147483647 gather indices are not " "supported. 
This error occurred for %s.", - gather_instr->ToString().c_str()); + gather_instr->ToString()); } - TF_ASSIGN_OR_RETURN(HloInstruction * canonical_gather_indices, - CanonicalizeGatherIndices( - gather_indices, dim_numbers.index_vector_dim())); + TF_ASSIGN_OR_RETURN( + HloInstruction * canonical_start_indices, + CanonicalizeGatherIndices(start_indices, dim_numbers.index_vector_dim())); CHECK_EQ(gather_loop_trip_count, - canonical_gather_indices->shape().dimensions(0)); + canonical_start_indices->shape().dimensions(0)); TF_ASSIGN_OR_RETURN( HloInstruction * accumulator_init, CreateGatherLoopAccumulatorInitValue( computation, output_shape.element_type(), - gather_instr->gather_window_bounds(), gather_loop_trip_count, + gather_instr->gather_slice_sizes(), gather_loop_trip_count, gather_instr->gather_dimension_numbers())); StatusOr> gather_loop_result_or_error = WhileUtil::MakeCountedLoop( computation, gather_loop_trip_count, - {operand, canonical_gather_indices, accumulator_init}, + {operand, canonical_start_indices, accumulator_init}, [&](HloInstruction* indvar, const std::vector& loop_state) { return GatherLoopBody(*gather_instr, indvar, loop_state); @@ -355,13 +354,13 @@ StatusOr GatherExpander::ExpandGather( HloInstruction* accumulator_result = gather_loop_result.back(); TF_ASSIGN_OR_RETURN( - HloInstruction * accumulator_with_output_gather_dims_decanonicalized, - AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result, - dim_numbers.index_vector_dim())); + HloInstruction* const accumulator_with_batch_dims_decanonicalized, + AdjustBatchDimsInAccumulator(start_indices->shape(), accumulator_result, + dim_numbers.index_vector_dim())); - return PermuteGatherAndWindowDims( - accumulator_with_output_gather_dims_decanonicalized, - AsInt64Slice(dim_numbers.output_window_dims()), output_rank); + return PermuteBatchAndOffsetDims(accumulator_with_batch_dims_decanonicalized, + AsInt64Slice(dim_numbers.offset_dims()), + output_rank); } StatusOr 
GatherExpander::Run(HloModule* module) { @@ -369,13 +368,13 @@ StatusOr GatherExpander::Run(HloModule* module) { return inst->opcode() == HloOpcode::kGather && // Avoid expanding gather ops that produce zero sized tensors, // instead punt these to ZeroSizedHloElimination. - !ShapeUtil::HasZeroElements(inst->shape()); + !ShapeUtil::IsZeroElementArray(inst->shape()); }; std::vector gather_instrs; for (HloComputation* computation : module->MakeNonfusionComputations()) { - c_copy_if(computation->instructions(), std::back_inserter(gather_instrs), - is_nontrivial_gather); + absl::c_copy_if(computation->instructions(), + std::back_inserter(gather_instrs), is_nontrivial_gather); } for (HloInstruction* inst : gather_instrs) { diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h index c1fc8574da99fff223c7dbb570b4533f76905b9a..7bd9ea598417a931d2df507d472c6a60be05e0bc 100644 --- a/tensorflow/compiler/xla/service/gather_expander.h +++ b/tensorflow/compiler/xla/service/gather_expander.h @@ -25,7 +25,7 @@ namespace xla { // nevertheless have a minimum level of support. class GatherExpander : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "gather_expander"; } + absl::string_view name() const override { return "gather_expander"; } StatusOr Run(HloModule* module) override; private: diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc index 1c72ca066502eb549bf8638cdf0b7827b06f92d7..141dd4d6f10272ce749edc4e91153c365ed322e6 100644 --- a/tensorflow/compiler/xla/service/gather_expander_test.cc +++ b/tensorflow/compiler/xla/service/gather_expander_test.cc @@ -14,9 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gather_expander.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -28,15 +28,15 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2147483647,5] parameter(1) ROOT gather = s32[2147483647,3,5] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); Status status = GatherExpander{}.Run(module.get()).status(); EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED); @@ -55,15 +55,15 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[3,2] gather(operand, indices), - output_window_dims={0}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); ASSERT_TRUE(changed); diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index 5ee67ccb4ae147683c7b41941670c6fc413a0d09..4ed91ef18768d09c252d1b73890637227f0ce717 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -20,11 +20,10 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -43,8 +42,7 @@ se::Platform::Id GenericTransferManager::PlatformId() const { } Status GenericTransferManager::WriteSingleTupleIndexTable( - se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice elements, + se::Stream* stream, absl::Span elements, const Shape& shape, se::DeviceMemoryBase* region) { TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape)); @@ -52,13 +50,27 @@ Status GenericTransferManager::WriteSingleTupleIndexTable( for (const se::DeviceMemoryBase& element : elements) { element_pointers.push_back(element.opaque()); } - return TransferBufferToDevice(executor, GetByteSizeRequirement(shape), - element_pointers.data(), region); + TF_RETURN_IF_ERROR(TransferBufferToDevice( + stream, GetByteSizeRequirement(shape), element_pointers.data(), region)); + // Ensure the buffer is transferred before we destroy element_pointers. 
+ return stream->BlockHostUntilDone(); } -StatusOr> -GenericTransferManager::TransferLiteralFromDevice( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer) { +void GenericTransferManager::TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal, std::function done) { + Status status = stream->BlockHostUntilDone(); + if (!status.ok()) { + return done(status); + } + + done(TransferLiteralFromDeviceInternal(stream->parent(), device_buffer, + literal)); +} + +Status GenericTransferManager::TransferLiteralFromDeviceInternal( + se::StreamExecutor* executor, const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal) { VLOG(2) << "transferring literal from device ordinal " << executor->device_ordinal() << "; device buffer: " << device_buffer; TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); @@ -68,28 +80,24 @@ GenericTransferManager::TransferLiteralFromDevice( TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(), device_buffer.on_host_shape())); - std::unique_ptr literal = - Literal::CreateFromShape(device_buffer.on_host_shape()); - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( device_buffer.on_host_shape(), [&](const Shape& subshape, const ShapeIndex& index) -> Status { - if (!ShapeUtil::IsTuple(subshape)) { - TF_RETURN_IF_ERROR(TransferBufferFromDevice( - executor, + if (ShapeUtil::IsArray(subshape)) { + TF_RETURN_IF_ERROR(executor->SynchronousMemcpyD2H( /*source=*/device_buffer.buffer(index), /*size=*/GetByteSizeRequirement(subshape), /*destination=*/ - literal->untyped_data(index))); + literal.untyped_data(index))); } return Status::OK(); })); - return std::move(literal); + return Status::OK(); } -Status GenericTransferManager::TransferLiteralToDevice( - se::StreamExecutor* executor, const LiteralSlice& literal, +Status GenericTransferManager::TransferLiteralToDeviceAsync( + se::Stream* stream, const LiteralSlice& literal, const 
ShapedBuffer& device_buffer) { const Shape& shape = literal.shape(); VLOG(2) << "transferring literal shape to device: " @@ -103,9 +111,10 @@ Status GenericTransferManager::TransferLiteralToDevice( TF_RET_CHECK( ShapeUtil::Compatible(literal.shape(), device_buffer.on_host_shape())); - TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); + TF_RET_CHECK(stream->parent()->device_ordinal() == + device_buffer.device_ordinal()); - TF_RETURN_IF_ERROR(WriteTupleIndexTables(executor, device_buffer)); + TF_RETURN_IF_ERROR(WriteTupleIndexTables(stream, device_buffer)); return ShapeUtil::ForEachSubshapeWithStatus( device_buffer.on_host_shape(), @@ -121,16 +130,21 @@ Status GenericTransferManager::TransferLiteralToDevice( if (LayoutUtil::Equal(device_subshape.layout(), subliteral.shape().layout())) { source = subliteral.untyped_data(); + return TransferBufferToDevice( + stream, + /*size=*/GetByteSizeRequirement(device_subshape), source, + &device_memory); } else { // Relayout data before transferring. 
relayed_out_literal = subliteral.Relayout(device_subshape.layout(), /*shape_index=*/{}); source = relayed_out_literal->untyped_data(); + TF_RETURN_IF_ERROR(TransferBufferToDevice( + stream, + /*size=*/GetByteSizeRequirement(device_subshape), source, + &device_memory)); + return stream->BlockHostUntilDone(); } - return TransferBufferToDevice( - executor, - /*size=*/GetByteSizeRequirement(device_subshape), source, - &device_memory); } return Status::OK(); }); @@ -141,20 +155,14 @@ Status GenericTransferManager::TransferLiteralToInfeed( return Unimplemented("Generic transfer to Infeed"); } -Status GenericTransferManager::TransferBufferToInfeed( - se::StreamExecutor* executor, int64 size, const void* source) { - return Unimplemented("Generic transfer to Infeed"); -} - Status GenericTransferManager::TransferLiteralFromOutfeed( se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) { - return Unimplemented( - "Outfeed is not supported on this platform (b/30467474)"); + MutableBorrowingLiteral literal) { + return Unimplemented("Generic transfer from Outfeed"); } Status GenericTransferManager::ResetDevices( - tensorflow::gtl::ArraySlice + absl::Span /*executors*/) { return Unimplemented( "Device reset is not yet supported on this platform (b/30481585)"); diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index 3da9570ef7eebcdf618439f628fb4d5589993e4f..86c8b1c145a25149a25e7b272babc5c858d476af 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -19,7 +19,6 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/transfer_manager.h" -#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -41,34 +40,35 @@ class GenericTransferManager : public TransferManager { se::Platform::Id PlatformId() const override; - StatusOr> TransferLiteralFromDevice( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override; + void TransferLiteralFromDevice(se::Stream* stream, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal, + std::function done) override; - Status TransferLiteralToDevice(se::StreamExecutor* executor, - const LiteralSlice& literal, - const ShapedBuffer& device_buffer) override; + Status TransferLiteralToDeviceAsync( + se::Stream* stream, const LiteralSlice& literal, + const ShapedBuffer& device_buffer) override; Status TransferLiteralToInfeed(se::StreamExecutor* executor, const LiteralSlice& literal) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + MutableBorrowingLiteral literal) override; - Status ResetDevices( - tensorflow::gtl::ArraySlice executors) override; + Status ResetDevices(absl::Span executors) override; int64 GetByteSizeRequirement(const Shape& shape) const override; protected: - Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, - const void* source) override; - Status WriteSingleTupleIndexTable( - se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice elements, + se::Stream* stream, absl::Span elements, const Shape& shape, se::DeviceMemoryBase* region) override; private: + Status TransferLiteralFromDeviceInternal(se::StreamExecutor* executor, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal); + // The platform this transfer manager targets. 
const se::Platform::Id platform_id_; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 27949302487b80d055cfe37378ecdc32ab898075..a68b7a1bef81e369dc1bbcd249642e5b80401c64 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1,6 +1,9 @@ # Description: # GPU-specific components in XLA service implementation. +load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") +load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") + licenses(["notice"]) # Apache 2.0 package(default_visibility = [":friends"]) @@ -23,12 +26,18 @@ filegroup( load("//tensorflow:tensorflow.bzl", "tf_cc_test") +xla_proto_library( + name = "backend_configs", + srcs = ["backend_configs.proto"], +) + cc_library( name = "gpu_constants", srcs = ["gpu_constants.cc"], hdrs = ["gpu_constants.h"], deps = [ "//tensorflow/compiler/xla:types", + "//tensorflow/core:framework", ], ) @@ -47,6 +56,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -82,6 +93,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_reachability", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], ) @@ -98,6 +110,8 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -106,15 +120,19 @@ cc_library( srcs = ["hlo_to_ir_bindings.cc"], hdrs = ["hlo_to_ir_bindings.h"], deps = [ + ":buffer_allocations", ":ir_emission_utils", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", + 
"//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) @@ -133,6 +151,8 @@ cc_library( "ir_emitter_unnested.h", ], deps = [ + ":backend_configs", + ":buffer_allocations", ":cudnn_convolution_runner", ":elemental_ir_emitter", ":gpu_constants", @@ -141,8 +161,7 @@ cc_library( ":ir_emission_utils", ":parallel_loop_emitter", ":partition_assignment", - ":while_transformer", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -154,15 +173,27 @@ cc_library( "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:name_uniquer", + "//tensorflow/compiler/xla/service:while_loop_analysis", + "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", + "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:ir_builder_mixin", + "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library", + "//tensorflow/compiler/xla/service/llvm_ir:kernel_tiling", "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", - "//tensorflow/compiler/xla/service/llvm_ir:ops", + "//tensorflow/compiler/xla/service/llvm_ir:sort_util", "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + 
"@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", ], @@ -190,7 +221,7 @@ cc_library( srcs = ["elemental_ir_emitter.cc"], hdrs = ["elemental_ir_emitter.h"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -205,7 +236,10 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", + "//tensorflow/compiler/xla/service/llvm_ir:math_ops", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", ], @@ -225,6 +259,23 @@ cc_library( "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "hlo_execution_profiler", + srcs = ["hlo_execution_profiler.cc"], + hdrs = ["hlo_execution_profiler.h"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_execution_profile", + "//tensorflow/compiler/xla/service:stream_pool", + "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", + "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", ], ) @@ -242,7 +293,9 @@ cc_library( "infeed_thunk.cc", "kernel_thunk.cc", "memset_thunk.cc", + "outfeed_thunk.cc", "sequential_thunk.cc", + "thunk.cc", "thunk_schedule.cc", "tuple_thunk.cc", "while_thunk.cc", @@ -259,6 +312,7 @@ cc_library( "infeed_thunk.h", "kernel_thunk.h", "memset_thunk.h", + "outfeed_thunk.h", 
"sequential_thunk.h", "thunk.h", "thunk_schedule.h", @@ -268,11 +322,14 @@ cc_library( deps = [ ":buffer_allocations", ":cudnn_convolution_runner", + ":hlo_execution_profiler", ":infeed_manager", ":ir_emission_utils", + ":outfeed_manager", ":partition_assignment", ":stream_assignment", "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", @@ -290,6 +347,7 @@ cc_library( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service:tuple_points_to_analysis", + "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -298,6 +356,11 @@ cc_library( "//tensorflow/core/platform/default/build_config:cufft_plugin", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", # build_cleaner: keep "//tensorflow/stream_executor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -322,14 +385,21 @@ cc_library( srcs = ["cudnn_convolution_algorithm_picker.cc"], hdrs = ["cudnn_convolution_algorithm_picker.h"], deps = [ + ":backend_configs", + ":buffer_comparator", ":cudnn_convolution_runner", ":gpu_executable", ":ir_emission_utils", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", ], ) @@ -347,6 +417,7 @@ 
cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", ], ) @@ -356,7 +427,7 @@ cc_library( hdrs = ["cudnn_convolution_rewriter.h"], deps = [ ":ir_emission_utils", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", @@ -377,7 +448,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:shape_inference", - "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:test", ], @@ -388,6 +459,7 @@ cc_library( srcs = ["instruction_fusion.cc"], hdrs = ["instruction_fusion.h"], deps = [ + ":gpu_fusible", ":ir_emission_utils", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", @@ -406,9 +478,43 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + ], +) + +cc_library( + name = "multi_output_fusion", + srcs = ["multi_output_fusion.cc"], + hdrs = ["multi_output_fusion.h"], + deps = [ + ":gpu_fusible", + ":instruction_fusion", + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:multi_output_fusion", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + ], +) + +tf_cc_test( + name = "multi_output_fusion_test", + srcs = 
["multi_output_fusion_test.cc"], + deps = [ + ":instruction_fusion", + ":multi_output_fusion", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -432,6 +538,7 @@ cc_library( srcs = ["fusion_merger.cc"], hdrs = ["fusion_merger.h"], deps = [ + ":gpu_fusible", ":instruction_fusion", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", @@ -439,6 +546,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", ], ) @@ -450,9 +559,9 @@ tf_cc_test( ":instruction_fusion", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -462,6 +571,25 @@ cc_library( hdrs = ["pad_insertion.h"], deps = [ ":ir_emission_utils", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_creation_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:shape_inference", + "@com_google_absl//absl/memory", + ], +) + +cc_library( + name = "pad_for_tensor_cores", + srcs = ["pad_for_tensor_cores.cc"], + hdrs = ["pad_for_tensor_cores.h"], + deps = [ + ":ir_emission_utils", + 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", @@ -472,13 +600,31 @@ cc_library( ], ) +tf_cc_test( + name = "pad_for_tensor_cores_test", + srcs = ["pad_for_tensor_cores_test.cc"], + deps = [ + ":ir_emission_utils", + ":pad_for_tensor_cores", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # build_cleaner: keep + ], +) + cc_library( name = "gpu_transfer_manager", srcs = ["gpu_transfer_manager.cc"], hdrs = ["gpu_transfer_manager.h"], deps = [ ":gpu_compiler", + ":outfeed_manager", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -490,6 +636,7 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:infeed_manager", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", "@llvm//:core", ], alwayslink = True, # Contains per-platform transfer manager registration @@ -497,8 +644,8 @@ cc_library( cc_library( name = "gpu_compiler", - srcs = ["gpu_compiler.cc"], - hdrs = ["gpu_compiler.h"], + srcs = ["nvptx_compiler.cc"], + hdrs = ["nvptx_compiler.h"], deps = [ ":cudnn_convolution_algorithm_picker", ":cudnn_convolution_rewriter", @@ -506,15 +653,18 @@ cc_library( ":gpu_constants", ":gpu_copy_insertion", ":gpu_executable", + ":gpu_hlo_schedule", ":gpu_hlo_support_checker", ":gpu_layout_assignment", - ":hlo_schedule", ":instruction_fusion", ":ir_emission_utils", ":ir_emitter", + ":multi_output_fusion", + ":pad_for_tensor_cores", ":pad_insertion", ":partition_assignment", 
":stream_assignment", + ":stream_executor_util", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -526,10 +676,8 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_liveness", "//tensorflow/compiler/xla/service:call_inliner", "//tensorflow/compiler/xla/service:conditional_simplifier", - "//tensorflow/compiler/xla/service:dot_decomposer", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", - "//tensorflow/compiler/xla/service:gather_expander", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_constant_folding", "//tensorflow/compiler/xla/service:hlo_cse", @@ -544,10 +692,10 @@ cc_library( "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", + "//tensorflow/compiler/xla/service:scatter_expander", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service:tuple_simplifier", "//tensorflow/compiler/xla/service:while_loop_constant_sinking", - "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion", "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", "//tensorflow/compiler/xla/service/gpu:cudnn_batchnorm_rewriter", @@ -558,8 +706,11 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@llvm//:core", - "@llvm//:support", ], alwayslink = True, # Contains compiler registration ) @@ -570,21 +721,44 @@ cc_library( hdrs = ["cudnn_batchnorm_rewriter.h"], deps = [ ":ir_emission_utils", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", 
"//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", ], ) +cc_library( + name = "xfeed_queue", + hdrs = ["xfeed_queue.h"], + deps = ["//tensorflow/core:lib"], +) + cc_library( name = "infeed_manager", srcs = ["infeed_manager.cc"], hdrs = ["infeed_manager.h"], deps = [ + ":xfeed_queue", + "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:types", + "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + ], +) + +cc_library( + name = "outfeed_manager", + srcs = ["outfeed_manager.cc"], + hdrs = ["outfeed_manager.h"], + deps = [ + ":xfeed_queue", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", ], ) @@ -619,56 +793,46 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # build_cleaner: keep + "@com_google_absl//absl/strings", ], ) cc_library( - name = "hlo_schedule", - srcs = ["hlo_schedule.cc"], - hdrs = ["hlo_schedule.h"], + name = "gpu_hlo_schedule", + srcs = ["gpu_hlo_schedule.cc"], + hdrs = ["gpu_hlo_schedule.h"], deps = [ ":stream_assignment", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_value", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_ordering", "//tensorflow/compiler/xla/service:hlo_reachability", "//tensorflow/compiler/xla/service:hlo_scheduling", + "@com_google_absl//absl/memory", ], ) tf_cc_test( - name = 
"hlo_schedule_test", + name = "gpu_hlo_schedule_test", srcs = [ - "hlo_schedule_test.cc", + "gpu_hlo_schedule_test.cc", ], deps = [ - ":hlo_schedule", + ":gpu_hlo_schedule", ":stream_assignment", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - ], -) - -cc_library( - name = "while_transformer", - srcs = ["while_transformer.cc"], - hdrs = ["while_transformer.h"], - deps = [ - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) @@ -677,12 +841,12 @@ tf_cc_test( srcs = ["while_transformer_test.cc"], deps = [ ":instruction_fusion", - ":while_transformer", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:copy_insertion", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:while_loop_analysis", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -717,7 +881,11 @@ cc_library( hdrs = ["stream_executor_util.h"], deps = [ "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:stream_executor_no_cuda", ], ) @@ -735,3 +903,57 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +cc_library( + name = "buffer_comparator", + srcs = ["buffer_comparator.cc"], + hdrs = 
["buffer_comparator.h"], + deps = [ + ":gpu_executable", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/strings", + ], +) + +xla_test( + name = "buffer_comparator_test", + srcs = ["buffer_comparator_test.cc"], + backends = [ + "cpu", + "gpu", + ], + deps = [ + ":buffer_comparator", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/service:backend", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "gpu_fusible", + srcs = ["gpu_fusible.cc"], + hdrs = ["gpu_fusible.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla/service:hlo", + ], +) + +tf_cc_test( + name = "gpu_fusible_test", + srcs = ["gpu_fusible_test.cc"], + deps = [ + ":gpu_fusible", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/backend_configs.proto b/tensorflow/compiler/xla/service/gpu/backend_configs.proto new file mode 100644 index 0000000000000000000000000000000000000000..640c6392b8b820c708b853c2a3cea4d4116e85a8 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/backend_configs.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; + +package xla.gpu; + +// Backend configs for XLA:GPU. +// +// These are metadata that the GPU backend attaches to HloInstrucitons and later +// uses during e.g. codegen. +// +// Remember that proto3 doesn't give clients a way to tell the difference +// between a field not being present and a field having the default value. +// Choose your defaults carefully. +// +// No guarantee is made about the stability of these protos. 
+// +// See HloInstruction::backend_config() for more info. + +// Backend config for a convolution that runs through cudnn. +message CudnnConvBackendConfig { + // Opaque algorithm number of cudnn algorithm chosen for this conv. + int64 algorithm = 1; + + // Whether we may use tensor cores when running this conv. Even if this is + // true, cudnn may choose not to use tensor cores, e.g. because the GPU or + // selected algorithm doesn't support it. + bool tensor_ops_enabled = 2; +} diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index ab5149dcdb09290cd0c0b2233029d0988a95f036..528209abc75777440163c2e1512658b8ad36315b 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -17,8 +17,8 @@ limitations under the License. #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -40,21 +40,31 @@ StatusOr> BufferAllocations::Builder::Build( const BufferAssignment* buffer_assignment, int device_ordinal, DeviceMemoryAllocator* memory_allocator) { const int64 num_buffers = buffer_assignment->Allocations().size(); - auto buffer_allocations = WrapUnique(new BufferAllocations( + auto buffer_allocations = absl::WrapUnique(new BufferAllocations( num_buffers, device_ordinal, memory_allocator, buffer_assignment)); for (BufferAllocation::Index i = 0; i < num_buffers; ++i) { + const BufferAllocation& allocation = buffer_assignment->GetAllocation(i); + const int64 expected_alignment = [&] { + if (allocation.is_entry_computation_parameter()) { + return kEntryParameterAlignBytes; + } else if (allocation.is_constant()) { + return kConstantBufferAlignBytes; + } else { + return 
kXlaAllocatedBufferAlignBytes; + } + }(); + // If buffer #i's address is already registered (e.g. external arguments or // result buffers), use that registered buffer. if (registered_buffers_.count(i)) { se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i); - if (reinterpret_cast(address.opaque()) % - kCudaMallocAlignBytes != + if (reinterpret_cast(address.opaque()) % expected_alignment != 0) { return InternalError( - "Address of registered buffer %lld must be a multiple of %llx, but " + "Address of registered buffer %d must be a multiple of %x, but " "was %p", - i, kCudaMallocAlignBytes, address.opaque()); + i, kEntryParameterAlignBytes, address.opaque()); } buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i)); continue; @@ -62,7 +72,6 @@ StatusOr> BufferAllocations::Builder::Build( // Allocate each allocation that might escape, or is the temp buffer. bool seen_temp_buffer = false; - const BufferAllocation& allocation = buffer_assignment->GetAllocation(i); if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) { const int64 buffer_size = allocation.size(); se::DeviceMemoryBase buffer_address; @@ -70,13 +79,12 @@ StatusOr> BufferAllocations::Builder::Build( OwningDeviceMemory buffer; TF_ASSIGN_OR_RETURN( buffer, memory_allocator->Allocate(device_ordinal, buffer_size)); - if (reinterpret_cast(buffer.opaque()) % - kCudaMallocAlignBytes != + if (reinterpret_cast(buffer.opaque()) % expected_alignment != 0) { return InternalError( "Address returned by memory_allocator->Allocate must be a " - "multiple of %llx, but was %p", - kCudaMallocAlignBytes, buffer.opaque()); + "multiple of 0x%x, but was %p", + kXlaAllocatedBufferAlignBytes, buffer.opaque()); } // We do manual memory management within BufferAllocations. 
Be sure not // to do a TF_RETURN_IF_ERROR between this line and the @@ -165,5 +173,10 @@ void BufferAllocations::SetBuffer(BufferAllocation::Index buffer_index, buffers_[buffer_index] = buffer; } +bool ShouldEmitLiteralInLlvmIr(const Literal& literal) { + // LLVM can sometimes do interesting optimizations using scalar constants. + return ShapeUtil::IsScalar(literal.shape()); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index 636623502597b3a66523938ba430e9d5a82f796c..14186b8faa68ad8492ea4863fcd7bd746e2eae48 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -20,10 +20,10 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -107,6 +107,12 @@ class BufferAllocations { bool torn_down_ = false; }; +// LLVM and PTXAS don't deal well with large constants, so we only emit very +// small constants directly in LLVM IR. Larger constants are emitted with zero +// initializers in LLVM IR and are later overwritten when the PTX/CUBIN is +// loaded. +bool ShouldEmitLiteralInLlvmIr(const Literal& literal); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc new file mode 100644 index 0000000000000000000000000000000000000000..13c83c9199fb1bbd8b00dbd601afcb677f92bbee --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -0,0 +1,204 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" + +#include +#include "absl/strings/str_replace.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { +namespace gpu { + +static constexpr float kTolerance = 0.1f; + +static string GetCompHloText(size_t num_elements) { + // Implements the textual format of the comparison routine, as it's more + // readable. + static constexpr char kF16CompHloText[] = R"( +HloModule CompareF16 + +MaxF32 { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %max = f32[] maximum(%lhs, %rhs) +} + +Canonicalize (aparam: f16[SIZE]) -> f32[SIZE] { + %min_constant = f32[] constant(-65505) + %max_constant = f32[] constant(65505) + %large_constant = f32[] constant(1048576) + %min_values = f32[SIZE] broadcast(%min_constant), dimensions={} + %max_values = f32[SIZE] broadcast(%max_constant), dimensions={} + %large_values = f32[SIZE] broadcast(%large_constant), dimensions={} + + %a = f16[SIZE] parameter(0) + %converted = f32[SIZE] convert(%a) + %clamped = f32[SIZE] clamp(%min_values, %converted, %max_values) + + // Since the clamp() above already took care of infs, only NaNs will cause + // is-finite() to return false. 
+ %is_finite = pred[SIZE] is-finite(%clamped) + ROOT %result = f32[SIZE] select(%is_finite, %clamped, %large_values) +} + +ENTRY MaxDifference { + %one_constant = f32[] constant(1.0) + %zero_constant = f32[] constant(0.0) + + %ones = f32[SIZE] broadcast(%one_constant), dimensions={} + + %lhs = f16[SIZE] parameter(0) + %rhs = f16[SIZE] parameter(1) + %lhs_canonical = f32[SIZE] call(%lhs), to_apply=Canonicalize + %rhs_canonical = f32[SIZE] call(%rhs), to_apply=Canonicalize + %sub = f32[SIZE] subtract(%lhs_canonical, %rhs_canonical) + %sub_abs = f32[SIZE] abs(%sub) + %lhs_abs = f32[SIZE] abs(%lhs_canonical) + %rhs_abs = f32[SIZE] abs(%rhs_canonical) + %max = f32[SIZE] maximum(%lhs_abs, %rhs_abs) + %denominator = f32[SIZE] add(%max, %ones) + %error = f32[SIZE] divide(%sub_abs, %denominator) + ROOT %max_diff = f32[] reduce(%error, %zero_constant), dimensions={0}, to_apply=MaxF32 +})"; + return absl::StrReplaceAll(kF16CompHloText, + {{"SIZE", absl::StrCat(num_elements)}}); +} + +StatusOr F16BufferComparator::Create( + se::DeviceMemory ref_buffer, Compiler* compiler, + DeviceMemoryAllocator* allocator, se::Stream* stream) { + auto stream_exec = stream->parent(); + int64 num_elements = ref_buffer.ElementCount(); + + // One may consider using hlo_runner to do all the compilation and execution. + // However, as of the time hlo_runner doesn't support injection for Compiler*, + // Stream*, or even the allocator. We may revisit this in the future if it + // proves to be a maintenance burden. 
+ TF_ASSIGN_OR_RETURN( + auto exec, ([&]() -> StatusOr> { + HloModuleConfig config; + DebugOptions debug_options; + debug_options.set_xla_backend_optimization_level(2); + config.set_debug_options(debug_options); + TF_ASSIGN_OR_RETURN( + auto module, ParseHloString(GetCompHloText(num_elements), config)); + TF_ASSIGN_OR_RETURN( + module, + compiler->RunHloPasses(std::move(module), stream_exec, nullptr)); + return compiler->RunBackend(std::move(module), stream_exec, nullptr); + }())); + + TF_ASSIGN_OR_RETURN( + auto shaped_buffer, ([&]() -> StatusOr { + auto device_ordinal = stream_exec->device_ordinal(); + TF_ASSIGN_OR_RETURN( + auto owning_buffer, + allocator->Allocate(device_ordinal, ref_buffer.size())); + se::DeviceMemory buffer( + owning_buffer.AsDeviceMemoryBase()); + stream->ThenMemcpy(&buffer, ref_buffer, ref_buffer.size()); + Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements}); + ScopedShapedBuffer ret(shape, shape, allocator, device_ordinal); + ret.set_buffer(std::move(owning_buffer), {}); + return std::move(ret); + }())); + + return F16BufferComparator(stream, allocator, std::move(exec), + std::move(shaped_buffer)); +} + +StatusOr F16BufferComparator::CompareEqualImpl( + se::DeviceMemory test_buffer) { + if (ref_buffer_.root_buffer().size() != test_buffer.size()) { + return InternalError("Mismatched buffer size: %d vs %d", + ref_buffer_.root_buffer().size(), test_buffer.size()); + } + + int64 num_elements = test_buffer.ElementCount(); + + TF_ASSIGN_OR_RETURN( + auto result_buffer, ([&]() -> StatusOr { + auto stream_exec = stream_->parent(); + Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements}); + auto device_ordinal = stream_exec->device_ordinal(); + ShapedBuffer shaped_test_buffer(shape, shape, stream_exec->platform(), + device_ordinal); + shaped_test_buffer.set_buffer(test_buffer, {}); + ExecutableRunOptions run_options; + run_options.set_device_ordinal(stream_exec->device_ordinal()); + run_options.set_stream(stream_); + 
run_options.set_allocator(allocator_); + ServiceExecutableRunOptions service_run_options(run_options); + return exec_->ExecuteOnStream( + &service_run_options, {&ref_buffer_, &shaped_test_buffer}, nullptr); + }())); + + float result; + CHECK(result_buffer.root_buffer().size() == sizeof(result)); + stream_->ThenMemcpy(&result, result_buffer.root_buffer(), sizeof(result)); + TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone()); + return result < kTolerance; +} + +StatusOr F16BufferComparator::CompareEqual( + se::DeviceMemory test_buffer) { + TF_ASSIGN_OR_RETURN(auto result, CompareEqualImpl(test_buffer)); + if (result) { + return true; + } + // Host side code that does the same thing, but report some of the + // differences as well. + int64 n = test_buffer.ElementCount(); + std::vector host_ref_buffer(n), host_test_buffer(n); + stream_->ThenMemcpy(host_ref_buffer.data(), ref_buffer_.root_buffer(), + ref_buffer_.root_buffer().size()); + stream_->ThenMemcpy(host_test_buffer.data(), test_buffer, test_buffer.size()); + TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone()); + + const auto canonicalize = [](float a) -> float { + constexpr float kBigNumer = 1048576.; + constexpr float kMaxFp16Value = 65504.; + if (std::isnan(a)) { + return kBigNumer; + } + if (std::isinf(a)) { + if (a < 0) { + return -(kMaxFp16Value + 1); + } + return kMaxFp16Value + 1; + } + return a; + }; + int differences_seen = 0; + for (int64 i = 0; i < n && differences_seen < 10; i++) { + float original_ref = static_cast(host_ref_buffer[i]); + float original_test = static_cast(host_test_buffer[i]); + float ref = canonicalize(original_ref); + float test = canonicalize(original_test); + if (!(std::abs(ref - test) / (std::max(std::abs(ref), std::abs(test)) + 1) < + kTolerance)) { + differences_seen++; + LOG(ERROR) << "Difference at " << i << ": " << original_ref << " vs " + << original_test; + } + } + + return false; +} + +} // namespace gpu +} // namespace xla diff --git 
a/tensorflow/compiler/xla/service/gpu/buffer_comparator.h b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h new file mode 100644 index 0000000000000000000000000000000000000000..bf2ba78ceacaea1070830f758c3712b1378bd96f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h @@ -0,0 +1,71 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_ + +#include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// A fp16 comparator that internally keeps a reference buffer, and compares it +// against other test buffers. +class F16BufferComparator { + public: + F16BufferComparator(const F16BufferComparator&) = delete; + F16BufferComparator(F16BufferComparator&&) = default; + + // Creates a new comparator. It internally allocates a buffer initialized by + // ref_buffer. 
+ static StatusOr Create( + se::DeviceMemory ref_buffer, Compiler* compiler, + DeviceMemoryAllocator* allocator, se::Stream* stream); + + // Returns true if the internally allocated buffer "compares equal" to + // test_buffer. The definition of "equal" is: + // * All NaNs equal. + // * All infs are treated as 65505 or -65505, so that this checker is tolerant + // to fp16 overflows. + // * With NaNs and infs taken care of, a and b compare equal iff: + // abs(a - b) / (max(abs(a), abs(b)) + 1) < tolerance + // + // See the implementation for the tolerance value. + StatusOr CompareEqual(se::DeviceMemory test_buffer); + + private: + F16BufferComparator(se::Stream* stream, DeviceMemoryAllocator* allocator, + std::unique_ptr exec, + ScopedShapedBuffer ref_buffer) + : stream_(stream), + allocator_(allocator), + exec_(std::move(exec)), + ref_buffer_(std::move(ref_buffer)) {} + + StatusOr CompareEqualImpl(se::DeviceMemory test_buffer); + + se::Stream* stream_; + DeviceMemoryAllocator* allocator_; + std::unique_ptr exec_; + ScopedShapedBuffer ref_buffer_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_ diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..33761d1bd8807df225e2cf505303b120e418576f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc @@ -0,0 +1,126 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" + +#include +#include "tensorflow/compiler/xla/service/backend.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class BufferComparatorTest : public testing::Test { + protected: + BufferComparatorTest() + : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()), + stream_exec_(backend_->default_stream_executor()), + allocator_(stream_exec_->platform(), {stream_exec_}), + compiler_(Compiler::GetForPlatform(stream_exec_->platform()) + .ConsumeValueOrDie()) {} + + // Take floats only for convenience. Still uses half internally. 
+ bool CompareEqualFloatBuffers(const std::vector& lhs_float, + const std::vector& rhs_float) { + std::vector lhs(lhs_float.begin(), lhs_float.end()); + std::vector rhs(rhs_float.begin(), rhs_float.end()); + se::Stream stream(stream_exec_); + stream.Init(); + + auto owning_lhs_buffer = + allocator_ + .Allocate(stream_exec_->device_ordinal(), lhs.size() * sizeof(half)) + .ConsumeValueOrDie(); + + auto owning_rhs_buffer = + allocator_ + .Allocate(stream_exec_->device_ordinal(), rhs.size() * sizeof(half)) + .ConsumeValueOrDie(); + + auto lhs_buffer = + se::DeviceMemory(owning_lhs_buffer.AsDeviceMemoryBase()); + auto rhs_buffer = + se::DeviceMemory(owning_rhs_buffer.AsDeviceMemoryBase()); + + stream.ThenMemcpy(&lhs_buffer, lhs.data(), lhs_buffer.size()); + stream.ThenMemcpy(&rhs_buffer, rhs.data(), rhs_buffer.size()); + + TF_CHECK_OK(stream.BlockHostUntilDone()); + + return F16BufferComparator::Create(lhs_buffer, compiler_, &allocator_, + &stream) + .ConsumeValueOrDie() + .CompareEqual(rhs_buffer) + .ConsumeValueOrDie(); + } + + std::unique_ptr backend_; + se::StreamExecutor* stream_exec_; + StreamExecutorMemoryAllocator allocator_; + Compiler* compiler_; +}; + +TEST_F(BufferComparatorTest, TestNaNs) { + EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("")})); + // NaN values with different bit patterns should compare equal. 
+ EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("1234")})); + EXPECT_FALSE(CompareEqualFloatBuffers({std::nanf("")}, {1.})); +} + +TEST_F(BufferComparatorTest, TestInfs) { + const auto inf = std::numeric_limits::infinity(); + EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {std::nanf("")})); + EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {inf})); + EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {65504})); + EXPECT_TRUE(CompareEqualFloatBuffers({-inf}, {-65504})); + EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-65504})); + EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {65504})); + + EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {20})); + EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-20})); + EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {20})); + EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {-20})); +} + +TEST_F(BufferComparatorTest, TestNumbers) { + EXPECT_TRUE(CompareEqualFloatBuffers({20}, {20.1})); + EXPECT_FALSE(CompareEqualFloatBuffers({0}, {1})); + EXPECT_TRUE(CompareEqualFloatBuffers({0.9}, {1})); + EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10})); + EXPECT_TRUE(CompareEqualFloatBuffers({10}, {9})); +} + +TEST_F(BufferComparatorTest, TestMultiple) { + EXPECT_TRUE(CompareEqualFloatBuffers({20, 30, 40, 50, 60}, + {20.1, 30.1, 40.1, 50.1, 60.1})); + std::vector lhs(200); + std::vector rhs(200); + for (int i = 0; i < 200; i++) { + EXPECT_TRUE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the same at index " << i; + lhs[i] = 3; + rhs[i] = 5; + EXPECT_FALSE(CompareEqualFloatBuffers(lhs, rhs)) + << "should be the different at index " << i; + lhs[i] = 0; + rhs[i] = 0; + } +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 77a48965e031349b045a956fd3f28c58607328e5..9ed523998bf07567133fdac0e40b12b8ce4ea3b0 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ 
b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -15,7 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" @@ -32,8 +33,11 @@ ConditionalThunk::ConditionalThunk( predicate_buffer_index_(predicate_buffer_index), true_operand_buffer_index_(true_operand_buffer_index), false_operand_buffer_index_(false_operand_buffer_index), - true_thunk_(std::move(true_thunk_sequence), hlo), - false_thunk_(std::move(false_thunk_sequence), hlo) {} + // Pass nullptr as the HloInstruction* to the true_thunk_ and false_thunk_ + // constructors because these SequentialThunks are logically "part of" + // this ConditionalThunk, and shouldn't be profiled separately from it. + true_thunk_(std::move(true_thunk_sequence), nullptr), + false_thunk_(std::move(false_thunk_sequence), nullptr) {} Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { @@ -43,7 +47,9 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable, } Status ConditionalThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); // Copy the predicate value from device. 
bool predicate; se::DeviceMemoryBase predicate_address = @@ -53,16 +59,21 @@ Status ConditionalThunk::ExecuteOnStream( Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("Failed to retrieve predicate value on stream %p: %s.", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } // Execute the true or the false computation depending on the value of the // predicate. if (predicate) { - TF_RETURN_IF_ERROR(true_thunk_.ExecuteOnStream(buffer_allocations, stream)); + profiler->StartHloComputation(); + TF_RETURN_IF_ERROR( + true_thunk_.ExecuteOnStream(buffer_allocations, stream, profiler)); + profiler->FinishHloComputation(hlo_instruction()->true_computation()); } else { + profiler->StartHloComputation(); TF_RETURN_IF_ERROR( - false_thunk_.ExecuteOnStream(buffer_allocations, stream)); + false_thunk_.ExecuteOnStream(buffer_allocations, stream, profiler)); + profiler->FinishHloComputation(hlo_instruction()->false_computation()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index ee03865d174469285a9e98b8a30fea90d997df37..aef24342c9fe182eb54b1c2beff840a76e7b8115 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_ #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -50,7 +51,8 @@ class ConditionalThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: BufferAllocation::Slice predicate_buffer_index_; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index f0881124128c9b043392ffc4fa3aee2cd5b754c7..05448d863dd2cfe69ad70168be40cdea5bc7017f 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -17,11 +17,11 @@ limitations under the License. 
#include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -37,8 +37,8 @@ ConvolutionThunk::ConvolutionThunk( const BufferAllocation::Slice& tuple_result_buffer, const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, - bool tensor_ops_enabled, const HloInstruction* hlo) + const ConvolutionDimensionNumbers& dim_nums, int64 feature_group_count, + int64 algorithm, bool tensor_ops_enabled, const HloInstruction* hlo) : Thunk(Kind::kConvolution, hlo), convolution_kind_(convolution_kind), input_buffer_(input_buffer), @@ -51,11 +51,13 @@ ConvolutionThunk::ConvolutionThunk( output_shape_(output_shape), window_(window), dim_nums_(dim_nums), + feature_group_count_(feature_group_count), algorithm_(algorithm), tensor_ops_enabled_(tensor_ops_enabled) {} Status ConvolutionThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase input_data = buffer_allocations.GetDeviceAddress(input_buffer_); se::DeviceMemoryBase filter_data = @@ -68,10 +70,11 @@ Status ConvolutionThunk::ExecuteOnStream( se::dnn::AlgorithmConfig algorithm_config( se::dnn::AlgorithmDesc(algorithm_, tensor_ops_enabled_)); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); TF_RETURN_IF_ERROR(RunCudnnConvolution( convolution_kind_, input_shape_, 
filter_shape_, output_shape_, input_data, - filter_data, output_data, scratch, window_, dim_nums_, algorithm_config, - stream)); + filter_data, output_data, scratch, window_, dim_nums_, + feature_group_count_, algorithm_config, stream)); // Figure out which of output/input/filter is the result produced by // this op, and write the result tuple. diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 6d845025b1aef2b0a5f147401b6db0598ba94d6d..68d67c40c56145a137398540e90b75b33642589f 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -16,16 +16,17 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_THUNK_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_THUNK_H_ +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -58,7 +59,8 @@ class ConvolutionThunk : public Thunk { const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, + const ConvolutionDimensionNumbers& dim_nums, + int64 feature_group_count, int64 algorithm, bool 
tensor_ops_enabled, const HloInstruction* hlo); ConvolutionThunk(const ConvolutionThunk&) = delete; @@ -66,22 +68,10 @@ class ConvolutionThunk : public Thunk { // Does the convolution for the thunk on "stream". Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: - class ScratchAllocator; - - Status Convolve(const se::dnn::BatchDescriptor& input_descriptor, - se::DeviceMemory input_data, - const se::dnn::FilterDescriptor& filter_descriptor, - se::DeviceMemory filter_data, - const se::dnn::BatchDescriptor& output_descriptor, - se::DeviceMemory output_data, - const se::dnn::ConvolutionDescriptor& convolution_descriptor, - const se::dnn::AlgorithmConfig& algorithm_config, - se::Stream* stream, ScratchAllocator* scratch_allocator, - se::dnn::ProfileResult* profile_result); - const CudnnConvKind convolution_kind_; const BufferAllocation::Slice input_buffer_; @@ -96,6 +86,7 @@ class ConvolutionThunk : public Thunk { const Window window_; const ConvolutionDimensionNumbers dim_nums_; + int64 feature_group_count_; int64 algorithm_; bool tensor_ops_enabled_; }; diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index ee38c0318a878c7bcdc02afdcd146bfb4498d9a2..92e03f94c11f68082f0a8caa64f82e8533557194 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -30,9 +31,11 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk( mem_size_(mem_size) {} Status HostToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemcpy(&destination_data, source_address_, mem_size_); return Status::OK(); } @@ -47,11 +50,13 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( mem_size_(mem_size) {} Status DeviceToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); se::DeviceMemoryBase source_data = buffer_allocations.GetDeviceAddress(source_buffer_); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemcpy(&destination_data, source_data, mem_size_); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index 8b128386f61636de9ac41e856a2b00c578e05735..91564b520acae1839e0a466cf580db00bdf57e46 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -40,7 +41,8 @@ class HostToDeviceCopyThunk : public Thunk { HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const void* source_address_; @@ -63,7 +65,8 @@ class DeviceToDeviceCopyThunk : public Thunk { DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const BufferAllocation::Slice source_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc index db6924c742e4a949a3e939b6d6659e92c2d1e312..60289506524759580dbb9b82147c78c4ce1cb25e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" @@ -66,11 +67,12 @@ Status Visitor::HandleBatchNormInference(HloInstruction* batch_norm) { return Status::OK(); } - HloInstruction* epsilon = computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon()))); + HloInstruction* epsilon = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(batch_norm->epsilon()))); HloInstruction* feature_index = computation_->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(batch_norm->feature_index()))); + LiteralUtil::CreateR0(batch_norm->feature_index()))); std::vector operands(batch_norm->operands().begin(), batch_norm->operands().end()); @@ -101,11 +103,12 @@ Status Visitor::HandleBatchNormTraining(HloInstruction* batch_norm) { return Status::OK(); } - HloInstruction* epsilon = computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon()))); + HloInstruction* epsilon = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(batch_norm->epsilon()))); HloInstruction* feature_index = computation_->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(batch_norm->feature_index()))); + LiteralUtil::CreateR0(batch_norm->feature_index()))); std::vector operands(batch_norm->operands().begin(), batch_norm->operands().end()); @@ -126,12 +129,17 @@ Status Visitor::HandleBatchNormTraining(HloInstruction* batch_norm) { HloInstruction* variance_plus_epsilon = computation_->AddInstruction(HloInstruction::CreateBinary( inverse_stddev->shape(), HloOpcode::kPower, inverse_stddev, - computation_->AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(-2))))); + computation_->AddInstruction(HloInstruction::CreateBroadcast( + inverse_stddev->shape(), + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-2))), + {})))); HloInstruction* variance = computation_->AddInstruction(HloInstruction::CreateBinary( variance_plus_epsilon->shape(), HloOpcode::kSubtract, - variance_plus_epsilon, epsilon)); + variance_plus_epsilon, + computation_->AddInstruction(HloInstruction::CreateBroadcast( + variance_plus_epsilon->shape(), epsilon, {})))); // Repackage the results. std::unique_ptr new_tuple = HloInstruction::CreateTuple({ @@ -164,23 +172,29 @@ Status Visitor::HandleBatchNormGrad(HloInstruction* batch_norm) { return Status::OK(); } - HloInstruction* epsilon = computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon()))); + HloInstruction* epsilon = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(batch_norm->epsilon()))); HloInstruction* feature_index = computation_->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(batch_norm->feature_index()))); + LiteralUtil::CreateR0(batch_norm->feature_index()))); // The cudnn libcall expects its input to be rsqrt(variance + epsilon), but // the batchnorm HLO takes plain variance as input. Fix it up. 
HloInstruction* var_plus_epsilon = computation_->AddInstruction(HloInstruction::CreateBinary( batch_norm->operand(3)->shape(), HloOpcode::kAdd, - batch_norm->mutable_operand(3), epsilon)); + batch_norm->mutable_operand(3), + computation_->AddInstruction(HloInstruction::CreateBroadcast( + batch_norm->operand(3)->shape(), epsilon, {})))); HloInstruction* inverse_stddev = computation_->AddInstruction(HloInstruction::CreateBinary( var_plus_epsilon->shape(), HloOpcode::kPower, var_plus_epsilon, - computation_->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(-.5))))); + computation_->AddInstruction(HloInstruction::CreateBroadcast( + var_plus_epsilon->shape(), + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-.5))), + {})))); std::vector operands(batch_norm->operands().begin(), batch_norm->operands().end()); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h index e09cde9abf85454c7a020566cd8c2671ae12ffc3..6e2e330edd4beabe0b395f05b80d57612d63f110 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h @@ -54,9 +54,7 @@ namespace gpu { // BatchNormRewriter. 
class CudnnBatchNormRewriter : public HloPassInterface { public: - tensorflow::StringPiece name() const override { - return "cudnn_batchnorm_rewriter"; - } + absl::string_view name() const override { return "cudnn_batchnorm_rewriter"; } StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index 68099fd63847ef9993f9bc7ac0e28b2939631b35..bc3c6f72f6799f84169748465d62c3f2a306d5fc 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -17,11 +17,11 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -99,13 +99,15 @@ CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk( } Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; std::tie(operand_desc, scale_offset_desc) = MakeDescriptors(hlo_instruction()->shape(), feature_index_); se::DeviceMemory output(buffer_allocations.GetDeviceAddress(output_)); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenBatchNormalizationForward( se::DeviceMemory(buffer_allocations.GetDeviceAddress(operand_)), 
se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), @@ -123,6 +125,7 @@ Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( /*is_training=*/false, // /*var_to_inv_var=*/nullptr, // /*inv_var_to_var=*/nullptr); + if (!stream->ok()) { return InternalError("BatchNormalizationForward call failed."); } @@ -158,7 +161,8 @@ CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk( } Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; // The BatchNormTraining HLO outputs a tuple of three elements: output data, @@ -175,6 +179,7 @@ Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( buffer_allocations.GetDeviceAddress(output_inv_stddev_)); se::DeviceMemory null_device_ptr(nullptr); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenBatchNormalizationForward( se::DeviceMemory(buffer_allocations.GetDeviceAddress(operand_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), @@ -240,7 +245,8 @@ CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk( } Status CudnnBatchNormBackwardThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; @@ -257,6 +263,7 @@ Status CudnnBatchNormBackwardThunk::ExecuteOnStream( se::DeviceMemory output_grad_offset( buffer_allocations.GetDeviceAddress(output_grad_offset_)); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenBatchNormalizationBackward( se::DeviceMemory( buffer_allocations.GetDeviceAddress(grad_output_)), diff --git 
a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h index 874f85a863092ee05ae5df1f92d732318c5a0554..d2143b3952984722d136757255aa0aa60e9cab7e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" @@ -60,7 +61,8 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk { const CudnnBatchNormForwardInferenceThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: BufferAllocation::Slice operand_; @@ -90,7 +92,8 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk { const CudnnBatchNormForwardTrainingThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: BufferAllocation::Slice operand_; @@ -123,7 +126,8 @@ class CudnnBatchNormBackwardThunk : public Thunk { delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: BufferAllocation::Slice operand_; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc index 6a46bdb9b438f81dc564b9033f5d302f90b6a997..5c2555148ae5de4a15e5a5f003b4783c64a20e9c 100644 --- 
a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -14,21 +14,25 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" +#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mutex.h" namespace xla { namespace gpu { namespace { +using absl::optional; using se::DeviceMemoryBase; using se::dnn::AlgorithmConfig; using se::dnn::AlgorithmDesc; -using tensorflow::gtl::nullopt; -using tensorflow::gtl::optional; class ScratchAllocator : public se::ScratchAllocator { public: @@ -56,8 +60,8 @@ StatusOr> ScratchAllocator::AllocateBytes( if (byte_size > GetMemoryLimitInBytes(stream)) { return se::port::Status( se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + absl::StrFormat( + "Allocating %d bytes exceeds the memory limit of %d bytes.", byte_size, GetMemoryLimitInBytes(stream))); } @@ -79,8 +83,7 @@ bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape, const ConvolutionDimensionNumbers& dnums, se::StreamExecutor* stream_exec) { // Skip this check for cudnn7 and newer. 
- auto version = - stream_exec->AsDnn()->GetVersion(); + auto version = stream_exec->AsDnn()->GetVersion(); if (version.ok() && version.ValueOrDie().major_version() >= 7) { return true; } @@ -126,14 +129,36 @@ std::vector GetAlgorithms(CudnnConvKind kind, string AlgorithmToString(const AlgorithmDesc& algo) { if (algo.tensor_ops_enabled()) { - return tensorflow::strings::StrCat(algo.algo_id(), "+TC"); + return absl::StrCat(algo.algo_id(), "+TC"); } - return tensorflow::strings::StrCat(algo.algo_id()); + return absl::StrCat(algo.algo_id()); } string NumBytesToString(int64 bytes) { - return tensorflow::strings::StrCat( - tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)"); + return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (", + bytes, "B)"); +} + +// Acquires a process-global lock on the device pointed to by the given +// StreamExecutor. +// +// This is used to prevent other XLA instances from trying to autotune on this +// device while we're using it. +tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + // se::Platform*s are global singletons guaranteed to live forever. + static auto* mutexes = + new std::map, + tensorflow::mutex>(); + + tensorflow::mutex_lock global_lock(mu); + auto it = mutexes + ->emplace(std::piecewise_construct, + std::make_tuple(stream_exec->platform(), + stream_exec->device_ordinal()), + std::make_tuple()) + .first; + return tensorflow::mutex_lock{it->second}; } } // anonymous namespace @@ -149,11 +174,31 @@ string NumBytesToString(int64 bytes) { // cache misses and doing extra work. Overall, caching doesn't seem worth the // trouble, but we may want to revisit this if we ever find a model where // caching would speed up compilation a lot. 
-optional> +StatusOr> CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) { + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, + HloInstruction* instr) { + CHECK_EQ(input_shape.element_type(), filter_shape.element_type()); + CHECK_EQ(input_shape.element_type(), output_shape.element_type()); + // TODO(timshen): for now only check fp16. It can be expanded to other types, + // with some work on the HLO routines. + const bool cross_check_enabled = input_shape.element_type() == xla::F16; + + // Don't run this function concurrently on the same GPU. + // + // This is a bit of a hack and doesn't protect us against arbitrary concurrent + // use of a GPU, but it's sufficient to let us compile two HLO modules + // concurrently and then run them sequentially. + tensorflow::mutex_lock lock = LockGpu(stream_exec_); + + // Make sure any previous activity on this executor is done. We don't want to + // interfere with programs that are still running on the GPU. + if (!stream_exec_->SynchronizeAllActivity()) { + return InternalError("Failed to synchronize GPU for autotuning."); + } + // Create a stream for us to do our work on. se::Stream stream{stream_exec_}; stream.Init(); @@ -166,60 +211,82 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( if (allocator_ != nullptr) { allocator = allocator_; } else { - se_allocator.emplace( - stream_exec_->platform(), - tensorflow::gtl::ArraySlice({stream_exec_})); + se_allocator.emplace(stream_exec_->platform(), + absl::Span({stream_exec_})); allocator = &*se_allocator; } // Allocate space for the input, filter, and output of the convolution. We // use a ScratchAllocator for this instead of calling allocator_ directly so // that our allocations don't leak. 
- // - // We don't put any data in these buffers, because (in theory, anyway) the - // speed of a conv isn't affected by the data being convolved. ScratchAllocator input_output_allocator(device_ordinal, allocator); - StatusOr maybe_input_buf = - input_output_allocator.AllocateBytes(&stream, - ShapeUtil::ByteSizeOf(input_shape)); - StatusOr maybe_filter_buf = - input_output_allocator.AllocateBytes(&stream, - ShapeUtil::ByteSizeOf(filter_shape)); - StatusOr maybe_output_buf = - input_output_allocator.AllocateBytes(&stream, - ShapeUtil::ByteSizeOf(output_shape)); - if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() || - !maybe_output_buf.ok()) { - LOG(WARNING) - << "Couldn't allocate space for input/filter/output of convolution " - << instr->ToString() << ". Falling back to default algorithm."; - return nullopt; + TF_ASSIGN_OR_RETURN(DeviceMemoryBase input_buf, + input_output_allocator.AllocateBytes( + &stream, ShapeUtil::ByteSizeOf(input_shape))); + TF_ASSIGN_OR_RETURN(DeviceMemoryBase filter_buf, + input_output_allocator.AllocateBytes( + &stream, ShapeUtil::ByteSizeOf(filter_shape))); + TF_ASSIGN_OR_RETURN(DeviceMemoryBase output_buf, + input_output_allocator.AllocateBytes( + &stream, ShapeUtil::ByteSizeOf(output_shape))); + + if (cross_check_enabled) { + // Broadcast a constant to the buffer, instead of zeroing the buffer. A + // non-zero constant is useful for the cross checking, because zero-inputs + // may not always reveal the bugs. 
+ const auto initialize_f16 = [&stream](DeviceMemoryBase buffer) { + CHECK_EQ(0, (uintptr_t)buffer.opaque() % 4); + size_t left_over_bytes = buffer.size() % 4; + CHECK_EQ(0, left_over_bytes % 2); + + constexpr float kBroadcastedConstant = 0.1f; + static const Eigen::half halfs[2] = {Eigen::half(kBroadcastedConstant), + Eigen::half(kBroadcastedConstant)}; + uint32 bits; + static_assert(sizeof(bits) == sizeof(halfs), ""); + memcpy(&bits, halfs, sizeof(bits)); + + size_t aligned_size = buffer.size() / 4 * 4; + stream.ThenMemset32(&buffer, bits, aligned_size); + + DeviceMemoryBase left_over( + static_cast(buffer.opaque()) + aligned_size, left_over_bytes); + stream.ThenMemcpy(&left_over, halfs, left_over_bytes); + }; + initialize_f16(input_buf); + initialize_f16(filter_buf); + initialize_f16(output_buf); + } else { + // Although we don't have evidence this matters, zero out the buffers before + // autotuning. It's conceivable that using uninitialized memory as the + // inputs might affect performance if e.g. the inputs contain denormals, and + // this is easy enough. + stream.ThenMemZero(&input_buf, input_buf.size()) + .ThenMemZero(&filter_buf, filter_buf.size()) + .ThenMemZero(&output_buf, output_buf.size()); } - DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie(); - DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie(); - DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie(); - - // Although we don't have evidence this matters, zero out the buffers before - // autotuning. It's conceivable that using uninitialized memory as the inputs - // might affect performance if e.g. the inputs contain denormals, and this is - // easy enough. - if (!stream.ThenMemZero(&input_buf, input_buf.size()) - .ThenMemZero(&filter_buf, filter_buf.size()) - .ThenMemZero(&output_buf, output_buf.size()) - .BlockHostUntilDone() - .ok()) { - LOG(WARNING) - << "Couldn't zero out input/filter/output buffer for convolution " - << instr->ToString() << ". 
Falling back to default algorithm."; - return nullopt; - } + DeviceMemoryBase* result_buf = [&] { + switch (kind) { + case CudnnConvKind::kBackwardFilter: + return &filter_buf; + case CudnnConvKind::kBackwardInput: + return &input_buf; + case CudnnConvKind::kForward: + return &output_buf; + } + }(); const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo( input_shape, output_shape, dnums, stream_exec_); se::dnn::ProfileResult best_result; int64 best_result_bytes_used = 0; + optional comparator; + // Use the first algorithm that's supported as reference. There isn't a + // particular reason to use it, as any algorithm suffices. It doesn't make + // this algorithm considered correct, though. + optional first_algorithm; for (const AlgorithmDesc& alg : GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) { ScratchAllocator scratch_allocator(device_ordinal, allocator); @@ -228,13 +295,49 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( << instr->ToString(); bool launch_ok = - RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, - input_buf, filter_buf, output_buf, - &scratch_allocator, window, dnums, - AlgorithmConfig(alg), &stream, &profile_result) + RunCudnnConvolution( + kind, input_shape, filter_shape, output_shape, input_buf, + filter_buf, output_buf, &scratch_allocator, window, dnums, + feature_group_count, AlgorithmConfig(alg), &stream, &profile_result) .ok(); if (launch_ok && profile_result.is_valid()) { + const bool crash_on_checking_failure = + instr->GetModule() + ->config() + .debug_options() + .xla_gpu_crash_on_verification_failures(); + if (comparator.has_value()) { + StatusOr result = comparator->CompareEqual( + se::DeviceMemory(*result_buf)); + if (!result.ok()) { + LOG(ERROR) << "Unable to compare " + << AlgorithmToString(*first_algorithm) << " against " + << AlgorithmToString(alg) << " for " << instr->ToString() + << ": " << result.status(); + CHECK(!crash_on_checking_failure); + } else if (!result.ValueOrDie()) { 
+ LOG(ERROR) << "Results mismatch between different convolution " + "algorithms. This is likely a bug in convolution, or " + "an excessive loss of precision in convolution. " + << instr->ToString() << " for " + << AlgorithmToString(*first_algorithm) << " vs " + << AlgorithmToString(alg); + CHECK(!crash_on_checking_failure); + } + } else if (cross_check_enabled) { + auto comp = F16BufferComparator::Create( + se::DeviceMemory(*result_buf), compiler_, allocator, + &stream); + if (comp.ok()) { + comparator.emplace(comp.ConsumeValueOrDie()); + first_algorithm.emplace(alg); + } else { + LOG(ERROR) << "Fail to initialize buffer comparator: " + << comp.status() << ", instruction: " << instr->ToString(); + CHECK(!crash_on_checking_failure); + } + } int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " succeeded, taking " << profile_result.elapsed_time_in_ms() @@ -261,9 +364,10 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( best_result_bytes_used); } - LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString() - << " failed. Falling back to default algorithm."; - return nullopt; + return InternalError( + "All algorithms tried for convolution %s failed. 
Falling back to " + "default algorithm.", + instr->ToString()); } StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( @@ -274,28 +378,33 @@ StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( const auto& lhs_shape = instr->operand(0)->shape(); const auto& rhs_shape = instr->operand(1)->shape(); const auto& conv_result_shape = instr->shape().tuple_shapes(0); - optional> alg_scratch_and_tc; + StatusOr> alg_scratch_and_tc; if (call_target == kCudnnConvForwardCallTarget) { - alg_scratch_and_tc = PickBestAlgorithm( - CudnnConvKind::kForward, /*input_shape=*/lhs_shape, - /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape, - instr->window(), instr->convolution_dimension_numbers(), instr); + alg_scratch_and_tc = + PickBestAlgorithm(CudnnConvKind::kForward, /*input_shape=*/lhs_shape, + /*filter_shape=*/rhs_shape, + /*output_shape=*/conv_result_shape, instr->window(), + instr->convolution_dimension_numbers(), + instr->feature_group_count(), instr); } else if (call_target == kCudnnConvBackwardInputCallTarget) { alg_scratch_and_tc = PickBestAlgorithm( CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape, /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(), - instr->convolution_dimension_numbers(), instr); + instr->convolution_dimension_numbers(), instr->feature_group_count(), + instr); } else if (call_target == kCudnnConvBackwardFilterCallTarget) { alg_scratch_and_tc = PickBestAlgorithm( CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape, /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape, - instr->window(), instr->convolution_dimension_numbers(), instr); + instr->window(), instr->convolution_dimension_numbers(), + instr->feature_group_count(), instr); } else { LOG(FATAL) << "Unknown custom call target for cudnn conv: " << instr->ToString(); } - if (!alg_scratch_and_tc.has_value()) { + if (!alg_scratch_and_tc.ok()) { + LOG(ERROR) << alg_scratch_and_tc.status(); return false; } @@ -303,7 +412,8 @@ 
StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( bool tensor_ops_enabled; int64 scratch_bytes; - std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc; + std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = + alg_scratch_and_tc.ConsumeValueOrDie(); VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and " << NumBytesToString(scratch_bytes) @@ -316,21 +426,15 @@ StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( Shape new_call_shape = ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0), ShapeUtil::MakeShape(U8, {scratch_bytes})}); - HloInstruction* algorithm_hlo = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(algorithm))); - HloInstruction* tensor_ops_enabled_hlo = - computation->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(tensor_ops_enabled))); - - HloInstruction* new_call = - computation->AddInstruction(HloInstruction::CreateCustomCall( - new_call_shape, - {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo, - tensor_ops_enabled_hlo}, - instr->custom_call_target())); - new_call->set_window(instr->window()); - new_call->set_convolution_dimension_numbers( - instr->convolution_dimension_numbers()); + + CudnnConvBackendConfig backend_config; + backend_config.set_algorithm(algorithm); + backend_config.set_tensor_ops_enabled(tensor_ops_enabled); + + HloInstruction* new_call = computation->AddInstruction( + instr->CloneWithNewOperands(new_call_shape, {instr->mutable_operand(0), + instr->mutable_operand(1)})); + TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config)); // Repackage new_call so it has the same shape as the original call, namely // (conv_result, u8[0]). 
@@ -338,8 +442,8 @@ StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( computation->AddInstruction(HloInstruction::CreateTuple( {computation->AddInstruction(HloInstruction::CreateGetTupleElement( new_call_shape.tuple_shapes(0), new_call, 0)), - computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({})))})); + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({})))})); TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple)); return true; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h index bc5d1ce94afd2075a006899f0f6bcf64352e5e99..0cb01161b023b900c8c4b1386b679fe2bd5db802 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h @@ -16,11 +16,12 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -34,10 +35,11 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface { // memory while timing the various convolution algorithms. If it's null, // we'll use the default allocator on the StreamExecutor. 
CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator) - : stream_exec_(stream_exec), allocator_(allocator) {} + DeviceMemoryAllocator* allocator, + Compiler* compiler) + : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {} - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "cudnn-convolution-algorithm-picker"; } @@ -46,13 +48,15 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface { private: StatusOr RunOnComputation(HloComputation* computation); StatusOr RunOnInstruction(HloInstruction* instr); - tensorflow::gtl::optional> PickBestAlgorithm( + StatusOr> PickBestAlgorithm( CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dnums, HloInstruction* instr); + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, + HloInstruction* instr); se::StreamExecutor* stream_exec_; // never null DeviceMemoryAllocator* allocator_; // may be null + Compiler* compiler_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc index e0c73aa73acb7f3313eb54fb07390cb76590433e..9bf721ecd2ad938e71f88a6fc65cd2d3bd25161e 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -42,8 +42,8 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { } // CuDNN does not accept zero-element arguments - if (ShapeUtil::HasZeroElements(conv->operand(0)->shape()) || - ShapeUtil::HasZeroElements(conv->operand(1)->shape())) { + if (ShapeUtil::IsZeroElementArray(conv->operand(0)->shape()) || + ShapeUtil::IsZeroElementArray(conv->operand(1)->shape())) { return false; } @@ -59,6 +59,11 @@ std::tuple MatchBackwardFilter( HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); + // TODO(b/31709653): Figure out if we can use grouped convolutions also on + // backward filter. + if (conv->feature_group_count() > 1) { + return no_match_result; + } // Step 1: match the instruction pattern without considering the paddings and // dimension numbers just yet. We may need some generic pattern matcher // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h @@ -218,6 +223,12 @@ std::tuple MatchBackwardInput( const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); + // TODO(b/31709653): Figure out if we can use grouped convolutions also on + // backward input. + if (conv->feature_group_count() > 1) { + return no_match_result; + } + // Match instruction pattern. 
CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); HloInstruction* reverse_filter = conv->mutable_operand(1); @@ -234,6 +245,23 @@ std::tuple MatchBackwardInput( << "Backward input convolution should reverse all kernel dimensions."; return no_match_result; } + } else if (reverse_filter->IsConstant()) { + // If the filter is a constant, we're willing to pattern-match to a + // backwards-input conv, on the theory that + // + // a) reversing a constant is free, and + // b) even if the user specified this filter as reverse(constant), we would + // long ago have constant-folded away the reverse. + // + // If the constant has any other uses, reversing it isn't entirely free, + // since we'd now have two constants to keep in memory. But hopefully it's + // free enough. + // + // TODO(jlebar): Should we do this even if the filter is not a constant? + // Reversing a non-constant filter is probably cheaper than padding the + // input! + + // Nothing to do, just fall through. } else { // Possibly 1x1 filter. for (int64 i = 0; i < kernel_spatial_dims.size(); ++i) { @@ -373,22 +401,25 @@ std::tuple MatchBackwardInput( } } - // Fuse the matched HLOs into a backward convolution instruction. - // - // If the reverse is omitted (for 1x1 filters) in the original pattern, we add - // it back in the fusion instruction so that later passes (such as - // PadInsertion) can handle such fusion instructions easily. + // OK, it's a match! Canonicalize the conv's filter so that it's a reverse. + // This simplifies things for our caller, and algebraic-simplifier will later + // remove any unnecessary reverses. if (reverse_filter->opcode() != HloOpcode::kReverse) { - reverse_filter = reverse_filter->parent()->AddInstruction( + // Create a double-reverse, which is a nop. 
+ HloComputation* c = conv->parent(); + reverse_filter = c->AddInstruction( + HloInstruction::CreateReverse(reverse_filter->shape(), reverse_filter, + AsInt64Slice(kernel_spatial_dims))); + reverse_filter = c->AddInstruction( HloInstruction::CreateReverse(reverse_filter->shape(), reverse_filter, AsInt64Slice(kernel_spatial_dims))); TF_CHECK_OK(conv->ReplaceOperandWith(/*operand_no=*/1, reverse_filter)); } + dnums.set_kernel_input_feature_dimension( conv->convolution_dimension_numbers().kernel_output_feature_dimension()); dnums.set_kernel_output_feature_dimension( conv->convolution_dimension_numbers().kernel_input_feature_dimension()); - return std::make_tuple(true, new_window, dnums); } @@ -405,7 +436,7 @@ StatusOr RunOnInstruction(HloInstruction* conv) { if (match) { return CreateCudnnConvBackwardFilter( conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1), - window, dnums); + window, dnums, conv->feature_group_count()); } std::tie(match, window, dnums) = MatchBackwardInput(conv); @@ -415,15 +446,17 @@ StatusOr RunOnInstruction(HloInstruction* conv) { CHECK_EQ(reverse->opcode(), HloOpcode::kReverse); HloInstruction* rhs = reverse->mutable_operand(0); - return CreateCudnnConvBackwardInput( - conv->shape(), conv->mutable_operand(0), rhs, window, dnums); + return CreateCudnnConvBackwardInput(conv->shape(), + conv->mutable_operand(0), rhs, window, + dnums, conv->feature_group_count()); } // If all else fails, try a forward convolution. 
if (CanImplementAsCudnnForwardConv(conv)) { return CreateCudnnConvForward(conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1), conv->window(), - conv->convolution_dimension_numbers()); + conv->convolution_dimension_numbers(), + conv->feature_group_count()); } return nullptr; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h index 0c0578d88840fed1d77f7456c9acef27dec380f5..fbe7e9849458e9d52be15b3f5610479ab68ffa4c 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h @@ -26,7 +26,7 @@ namespace gpu { // backwards-input convolutions into CustomCall HLOs that call into cuDNN. class CudnnConvolutionRewriter : public HloPassInterface { public: - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "cudnn-convolution-rewriter"; } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc index 65588b6aaf24da628ea586eb52c462b78b8daaa7..46c23db4652cccb06c9ca2a199a46ae04b332286 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -32,10 +32,13 @@ namespace gpu { namespace { namespace op = xla::testing::opcode_matchers; +using ::testing::_; -class CudnnConvolutionRewriterTest : public HloTestBase { +class CudnnConvolutionRewriterTest : public HloVerifiedTestBase { public: - CudnnConvolutionRewriterTest() { + CudnnConvolutionRewriterTest() + : HloVerifiedTestBase(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false) { for (int i = 0; i < 2; ++i) { WindowDimension* window_dim = default_conv_window_.add_dimensions(); window_dim->set_size(1); @@ -114,7 +117,7 @@ TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -142,7 +145,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -172,7 +175,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 
0)); @@ -202,7 +205,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -230,7 +233,7 @@ TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); @@ -280,7 +283,7 @@ TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( @@ -325,7 +328,7 @@ TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -357,7 +360,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT( entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); @@ -410,7 +413,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); 
HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -457,7 +460,7 @@ TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT( entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); @@ -510,7 +513,7 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); const HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); ASSERT_THAT(entry_computation->root_instruction(), op::GetTupleElement( op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); @@ -562,12 +565,38 @@ TEST_F(CudnnConvolutionRewriterTest, auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(RunPass(module.get())); + EXPECT_TRUE(RunPass(module)); EXPECT_THAT( entry_computation->root_instruction(), op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } +// Check that we will materialize a reversed version of a constant in order to +// pattern-match a backwards input convolution. 
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveConstantFilter) { + Array4D constant_arr(4, 4, 2, 2); + constant_arr.FillIota(0); + string constant_str = + LiteralUtil::CreateR4FromArray4D(constant_arr)->ToString(); + ParseAndVerifyModule(absl::StrFormat(R"( + HloModule test + + ENTRY entry_computation { + param0 = f32[128,2,16,16]{3,2,1,0} parameter(0) + constant = f32[4,4,2,2]{3,2,1,0} constant(%s) + ROOT convolution = f32[128,2,32,32]{3,2,1,0} convolution(param0, constant), + window={size=4x4 pad=2_2x2_2 lhs_dilate=2x2}, + dim_labels=bf01_01oi->bf01, feature_group_count=1 + })", + constant_str)); + EXPECT_TRUE(RunPass(&module())); + EXPECT_THAT( + module().entry_computation()->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvBackwardInputCallTarget, _, + op::Reverse(op::Constant())), + 0)); +} + } // anonymous namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc index 0645fbb3ad39f1f1649caf45a6068b5a196c30b9..05125e9d1fb3cd03cb72b7854fc28c767b49fd64 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -56,7 +57,7 @@ class ScratchBufAllocator : public se::ScratchAllocator { "Can't allocate twice from a ScratchBufAllocator."); } if (byte_size > scratch_.size()) { - return se::port::InternalError(tensorflow::strings::StrCat( + return se::port::InternalError(absl::StrCat( "Can't allocate ", byte_size, " bytes from a ScratchBufAllocator of size ", scratch_.size())); } @@ -76,8 +77,9 @@ Status RunCudnnConvolution( const Shape& output_shape, DeviceMemory input_buf, DeviceMemory filter_buf, DeviceMemory output_buf, se::ScratchAllocator* scratch_allocator, const Window& window, - const ConvolutionDimensionNumbers& dnums, AlgorithmConfig algorithm, - Stream* stream, ProfileResult* profile_result /*= nullptr*/) { + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, + AlgorithmConfig algorithm, Stream* stream, + ProfileResult* profile_result /*= nullptr*/) { VLOG(3) << "Convolution Algorithm: " << algorithm.algorithm().algo_id(); VLOG(3) << "tensor_ops_enabled: " << algorithm.algorithm().tensor_ops_enabled(); @@ -96,15 +98,9 @@ Status RunCudnnConvolution( // tensorflow/python/ops/nn_ops.py). 
const int effective_num_dimensions = std::max(2, num_dimensions); - if (std::is_same::value) { - CHECK_EQ(F32, output_shape.element_type()) - << ShapeUtil::HumanString(output_shape); - } else if (std::is_same::value) { - CHECK_EQ(F16, output_shape.element_type()) - << ShapeUtil::HumanString(output_shape); - } else { - LOG(FATAL) << ShapeUtil::HumanString(output_shape); - } + CHECK_EQ(primitive_util::NativeToPrimitiveType(), + output_shape.element_type()) + << ShapeUtil::HumanString(output_shape); CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()); CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()); @@ -149,6 +145,7 @@ Status RunCudnnConvolution( } ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); + convolution_descriptor.set_group_count(feature_group_count); for (int dim = 0; dim < num_dimensions; ++dim) { convolution_descriptor .set_zero_padding( @@ -202,8 +199,8 @@ Status RunCudnnConvolution( if (!stream->ok()) { return InternalError( - "Unable to launch convolution with type %s and algorithm (%lld, %lld)", - CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(), + "Unable to launch convolution with type %s and algorithm (%d, %d)", + CudnnConvKindToString(kind), algorithm.algorithm().algo_id(), algorithm.algorithm_no_scratch().algo_id()); } return Status::OK(); @@ -227,14 +224,14 @@ Status RunCudnnConvolution( const Shape& output_shape, se::DeviceMemoryBase input_buf, se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf, se::DeviceMemoryBase scratch_buf, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, se::dnn::AlgorithmConfig algorithm, se::Stream* stream, se::dnn::ProfileResult* profile_result) { ScratchBufAllocator scratch_allocator(scratch_buf); - return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, - input_buf, filter_buf, output_buf, - &scratch_allocator, window, dnums, 
algorithm, - stream, profile_result); + return RunCudnnConvolution( + kind, input_shape, filter_shape, output_shape, input_buf, filter_buf, + output_buf, &scratch_allocator, window, dnums, feature_group_count, + algorithm, stream, profile_result); } Status RunCudnnConvolution( @@ -242,25 +239,35 @@ Status RunCudnnConvolution( const Shape& output_shape, se::DeviceMemoryBase input_buf, se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf, se::ScratchAllocator* scratch_allocator, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, se::dnn::AlgorithmConfig algorithm, se::Stream* stream, se::dnn::ProfileResult* profile_result) { PrimitiveType output_primitive_type = output_shape.element_type(); - CHECK(output_primitive_type == F32 || output_primitive_type == F16) - << ShapeUtil::HumanString(output_shape); - if (output_primitive_type == F32) { - return RunCudnnConvolution( - kind, input_shape, filter_shape, output_shape, - se::DeviceMemory(input_buf), se::DeviceMemory(filter_buf), - se::DeviceMemory(output_buf), scratch_allocator, window, dnums, - algorithm, stream, profile_result); + switch (output_primitive_type) { + case F16: + return RunCudnnConvolution( + kind, input_shape, filter_shape, output_shape, + se::DeviceMemory(input_buf), + se::DeviceMemory(filter_buf), + se::DeviceMemory(output_buf), scratch_allocator, window, + dnums, feature_group_count, algorithm, stream, profile_result); + case F32: + return RunCudnnConvolution( + kind, input_shape, filter_shape, output_shape, + se::DeviceMemory(input_buf), + se::DeviceMemory(filter_buf), + se::DeviceMemory(output_buf), scratch_allocator, window, dnums, + feature_group_count, algorithm, stream, profile_result); + case F64: + return RunCudnnConvolution( + kind, input_shape, filter_shape, output_shape, + se::DeviceMemory(input_buf), + se::DeviceMemory(filter_buf), + se::DeviceMemory(output_buf), scratch_allocator, window, + 
dnums, feature_group_count, algorithm, stream, profile_result); + default: + LOG(FATAL) << ShapeUtil::HumanString(output_shape); } - return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, - se::DeviceMemory(input_buf), - se::DeviceMemory(filter_buf), - se::DeviceMemory(output_buf), - scratch_allocator, window, dnums, algorithm, - stream, profile_result); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h index 944e4ac686d45408b08ff1faa321510c1c8920ba..a1b4fc71d0cac3e5ea067ca7941b07cbade8d7cc 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h @@ -75,7 +75,7 @@ Status RunCudnnConvolution( const Shape& output_shape, se::DeviceMemoryBase input_buf, se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf, se::DeviceMemoryBase scratch_buf, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, se::dnn::AlgorithmConfig algorithm, se::Stream* stream, se::dnn::ProfileResult* profile_result = nullptr); @@ -84,7 +84,7 @@ Status RunCudnnConvolution( const Shape& output_shape, se::DeviceMemoryBase input_buf, se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf, se::ScratchAllocator* scratch_allocator, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count, se::dnn::AlgorithmConfig algorithm, se::Stream* stream, se::dnn::ProfileResult* profile_result = nullptr); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index e5e2a0478a0659986ddec8d6785827b14b9efb56..c1aaa4bf04ddc31edf723c056805ae5aad994e55 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ 
b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -23,18 +23,21 @@ limitations under the License. #include "tensorflow/core/platform/types.h" // IWYU pragma: no_include "llvm/IR/Attributes.gen.inc" // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "llvm/ADT/APInt.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -42,35 +45,37 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace gpu { +using absl::StrAppend; using llvm_ir::IrArray; using llvm_ir::IrName; using llvm_ir::SetToFirstInsertPoint; -using tensorflow::strings::StrAppend; +namespace { // Returns whether operand is a floating-point literal with the given value. 
bool IsFPLiteralWithValue(const HloInstruction* operand, float value) { - return operand->opcode() == HloOpcode::kConstant && - operand->literal().IsAllFloat(value); + if (operand->opcode() == HloOpcode::kConstant && + operand->literal().IsAllFloat(value)) { + return true; + } + return operand->opcode() == HloOpcode::kBroadcast && + IsFPLiteralWithValue(operand->operand(0), value); } +} // namespace GpuElementalIrEmitter::GpuElementalIrEmitter( const HloModuleConfig& hlo_module_config, llvm::Module* module, - llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested) - : ElementalIrEmitter(hlo_module_config, module, ir_builder), + llvm::IRBuilder<>* b, NestedComputer compute_nested) + : ElementalIrEmitter(hlo_module_config, module, b), hlo_module_config_(hlo_module_config), compute_nested_(std::move(compute_nested)) {} StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type) { // The libdevice math functions differentiate between "double" and "float" by // appending an 'f' to the function's name. 
libdevice doesn't have f16 math // functions, so we convert the operands to f32 before calling the function @@ -86,8 +91,8 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( cast_result_to_fp16 = true; for (int64 i = 0; i < operands.size(); ++i) { if (input_types[i] == F16) { - converted_operands[i] = ir_builder_->CreateFPCast( - converted_operands[i], ir_builder_->getFloatTy()); + converted_operands[i] = + FPCast(converted_operands[i], b_->getFloatTy()); converted_input_types[i] = F32; } } @@ -100,22 +105,20 @@ StatusOr GpuElementalIrEmitter::EmitLibdeviceMathCall( break; default: return Unimplemented("Bad type for libdevice math call: %s", - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(output_type)); } llvm::Value* result = EmitMathCall(munged_callee, converted_operands, converted_input_types, output_type) .ValueOrDie(); if (cast_result_to_fp16) { - result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy()); + result = FPCast(result, b_->getHalfTy()); } return result; } StatusOr GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type) { // llvm intrinsics differentiate between half/float/double functions via // the suffixes ".f16", ".f32" and ".f64". 
string munged_callee = callee_name; @@ -131,22 +134,20 @@ StatusOr GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall( break; default: return Unimplemented("Bad type for llvm intrinsic math call: %s", - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(output_type)); } return EmitMathCall(munged_callee, operands, input_types, output_type); } StatusOr GpuElementalIrEmitter::EmitMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const { + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type) { // Binary math functions transform are of type [T] -> T. for (PrimitiveType input_type : input_types) { if (output_type != input_type) { return Unimplemented("Input type ≠ output type: %s ≠ %s", - PrimitiveType_Name(input_type).c_str(), - PrimitiveType_Name(output_type).c_str()); + PrimitiveType_Name(input_type), + PrimitiveType_Name(output_type)); } } @@ -156,8 +157,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( } StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { PrimitiveType lhs_input_type = op->operand(0)->shape().element_type(); PrimitiveType rhs_input_type = op->operand(1)->shape().element_type(); PrimitiveType output_type = op->shape().element_type(); @@ -176,8 +176,7 @@ StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( } StatusOr GpuElementalIrEmitter::EmitPowerOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const { + const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) { CHECK_EQ(op->opcode(), HloOpcode::kPower); PrimitiveType lhs_input_type = op->operand(0)->shape().element_type(); PrimitiveType rhs_input_type = op->operand(1)->shape().element_type(); @@ -203,13 +202,15 @@ StatusOr 
GpuElementalIrEmitter::EmitPowerOp( return make_sqrt(); } - if (hlo_module_config_.debug_options().xla_enable_fast_math() && - IsFPLiteralWithValue(rhs, -.5)) { + if (IsFPLiteralWithValue(rhs, -.5)) { VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString(); // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX // rsqrt.approx instruction. + // + // TODO(jlebar): Does this happen with fastmath disabled? If not, should + // we force-enable it? TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt()); - return ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt); + return FDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt); } VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString(); @@ -218,72 +219,74 @@ StatusOr GpuElementalIrEmitter::EmitPowerOp( } StatusOr GpuElementalIrEmitter::EmitErfcInv( - PrimitiveType prim_type, llvm::Value* value) const { + PrimitiveType prim_type, llvm::Value* value) { return EmitLibdeviceMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitLog( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitLog(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_log", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitLog1p( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_log1p", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitSin( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitSin(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_sin", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitCos( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitCos(PrimitiveType prim_type, + 
llvm::Value* value) { return EmitLibdeviceMathCall("__nv_cos", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitExp( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitExp(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_exp", {value}, {prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitExpm1( - PrimitiveType prim_type, llvm::Value* value) const { +StatusOr GpuElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, + llvm::Value* value) { return EmitLibdeviceMathCall("__nv_expm1", {value}, {prim_type}, prim_type); } StatusOr GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const { + llvm::Value* rhs) { return EmitLibdeviceMathCall("__nv_pow", {lhs, rhs}, {prim_type, prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitAtan2( - PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { +StatusOr GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, + llvm::Value* lhs, + llvm::Value* rhs) { return EmitLibdeviceMathCall("__nv_atan2", {lhs, rhs}, {prim_type, prim_type}, prim_type); } -StatusOr GpuElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { - PrimitiveType input_type = op->operand(0)->shape().element_type(); - PrimitiveType output_type = op->shape().element_type(); - switch (op->opcode()) { - case HloOpcode::kTanh: - return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type}, - output_type); - default: - return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value); - } +StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) { + // Emit a fast approximation of tanh instead of calling __nv_tanh. + // __nv_tanh is particularly bad because it contains branches, thus + // preventing LLVM's load-store vectorizer from working its magic across a + // function which contains tanh calls. 
+ // + // This routine isn't numerically precise, but it's good enough for ML. + + // Upcast F16 to F32 if necessary. + llvm::Type* type = prim_type == F16 ? b_->getFloatTy() : value->getType(); + llvm::Value* input = FPCast(value, type); + llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input); + return FPCast(fast_tanh, value->getType()); } llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type, - tensorflow::gtl::ArraySlice attributes) const { + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type, + absl::Span attributes) { std::vector ir_input_types; for (PrimitiveType input_type : input_types) { ir_input_types.push_back( @@ -296,37 +299,35 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( // Declares the callee if it is not declared already. llvm::Function* callee = llvm::cast( - ir_builder_->GetInsertBlock()->getModule()->getOrInsertFunction( + b_->GetInsertBlock()->getModule()->getOrInsertFunction( llvm_ir::AsStringRef(callee_name), callee_type)); for (auto attribute : attributes) { callee->addFnAttr(attribute); } - return ir_builder_->CreateCall(callee, llvm_ir::AsArrayRef(operands)); + return Call(callee, llvm_ir::AsArrayRef(operands)); } -llvm::Value* GpuElementalIrEmitter::EmitThreadId() const { - llvm::Value* block_id = ir_builder_->CreateIntCast( - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - {}, {}, ir_builder_), - ir_builder_->getIntNTy(128), /*isSigned=*/true, "block.id"); - llvm::Value* thread_id_in_block = ir_builder_->CreateIntCast( - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, - {}, {}, ir_builder_), - ir_builder_->getIntNTy(128), /*isSigned=*/true, "thread.id"); - llvm::Value* threads_per_block = ir_builder_->CreateIntCast( - 
llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, - {}, {}, ir_builder_), - ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); - return ir_builder_->CreateNSWAdd( - ir_builder_->CreateNSWMul(block_id, threads_per_block), - thread_id_in_block); +llvm::Value* GpuElementalIrEmitter::EmitThreadId() { + llvm::Value* block_id = + IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "block.id"); + llvm::Value* thread_id_in_block = + IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); + llvm::Value* threads_per_block = + IntCast(llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); + return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const { + const HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { case HloOpcode::kMap: return [=, &operand_to_generator]( @@ -367,14 +368,20 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( PrimitiveType operand_element_type = operand->shape().element_type(); llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accum_ptr", ir_builder_); + "reduce_window_accum_ptr", b_); { TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))({})); - ir_builder_->CreateStore(init_value, accum_ptr); + operand_to_generator.at(hlo->operand(1))( + IrArray::Index(index.GetType()))); + Store(init_value, accum_ptr); } - llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_); + llvm::Type* index_type = 
index.GetType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return index.GetConstantWithIndexType(c); + }; + + llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); std::vector window_size; for (const auto& dim : window.dimensions()) { window_size.push_back(dim.size()); @@ -383,57 +390,59 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( ShapeUtil::MakeShape(operand_element_type, window_size), "window"); CHECK_EQ(window_index.size(), index.size()); - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); - IrArray::Index input_index(index.size()); - llvm::Value* in_bounds = ir_builder_->getInt1(true); + IrArray::Index input_index(index_type, index.size()); + llvm::Value* in_bounds = b_->getInt1(true); for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* stridden_index = ir_builder_->CreateNSWMul( - index[i], ir_builder_->getInt64(window.dimensions(i).stride())); - input_index[i] = ir_builder_->CreateNSWSub( - ir_builder_->CreateNSWAdd(stridden_index, window_index[i]), - ir_builder_->getInt64(window.dimensions(i).padding_low())); + llvm::Value* stridden_index = NSWMul( + index[i], index_typed_const(window.dimensions(i).stride())); + input_index[i] = + NSWSub(NSWAdd(stridden_index, window_index[i]), + index_typed_const(window.dimensions(i).padding_low())); // We must check whether 0 ≤ input_index[i] < bound, as otherwise // we are in the pad and so can skip the computation. This // comparison is equivalent to the unsigned comparison // input_index[i] < bound, as a negative value wraps to a large // positive value. 
- in_bounds = ir_builder_->CreateAnd( - in_bounds, - ir_builder_->CreateICmpULT( - input_index[i], - ir_builder_->getInt64(operand->shape().dimensions(i)))); + in_bounds = + And(in_bounds, + ICmpULT(input_index[i], + index_typed_const(operand->shape().dimensions(i)))); } llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); - SetToFirstInsertPoint(if_data.true_block, ir_builder_); + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); + SetToFirstInsertPoint(if_data.true_block, b_); // We are not in pad, so do the computation. TF_ASSIGN_OR_RETURN(llvm::Value * input_value, operand_to_generator.at(operand)(input_index)); TF_ASSIGN_OR_RETURN( llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), - {ir_builder_->CreateLoad(accum_ptr), input_value})); - ir_builder_->CreateStore(accum_value, accum_ptr); + compute_nested_(*hlo->to_apply(), {Load(accum_ptr), input_value})); + Store(accum_value, accum_ptr); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder_); - return ir_builder_->CreateLoad(accum_ptr); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); + return Load(accum_ptr); }; case HloOpcode::kReduce: + // TODO(b/112040122): This should be supported. 
+ CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce"; return [=, &operand_to_generator]( const IrArray::Index& output_index) -> StatusOr { const HloInstruction* operand = hlo->operand(0); llvm::Value* accum_ptr = - ir_builder()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType( + b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType( hlo->shape().element_type(), module_)); + llvm::Type* index_type = output_index.GetType(); TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))({})); - ir_builder()->CreateStore(init_value, accum_ptr); + operand_to_generator.at(hlo->operand(1))( + IrArray::Index(index_type))); + b()->CreateStore(init_value, accum_ptr); - llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_); + llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions( operand->shape(), hlo->dimensions(), "reduction_dim"); if (!ShapeUtil::IsScalar(hlo->shape())) { @@ -448,18 +457,17 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( CHECK_EQ(output_index.size(), j); } - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder()); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); TF_ASSIGN_OR_RETURN( llvm::Value * input_value, operand_to_generator.at(hlo->operand(0))(input_index)); TF_ASSIGN_OR_RETURN( llvm::Value * accum_value, - compute_nested_( - *hlo->to_apply(), - {ir_builder()->CreateLoad(accum_ptr), input_value})); - ir_builder()->CreateStore(accum_value, accum_ptr); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder()); - return ir_builder()->CreateLoad(accum_ptr); + compute_nested_(*hlo->to_apply(), + {b()->CreateLoad(accum_ptr), input_value})); + b()->CreateStore(accum_value, accum_ptr); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); + return b()->CreateLoad(accum_ptr); }; default: return ElementalIrEmitter::MakeElementGenerator(hlo, diff --git 
a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index 91f4d960aa62fff3e0699ece37a8c74d7dcf2f59..e8b56a39ce58b6aab35c1c977553c7ff7e753273 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" @@ -30,7 +31,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace gpu { @@ -38,95 +38,87 @@ namespace gpu { class GpuElementalIrEmitter : public ElementalIrEmitter { public: // A NestedComputer computes an element of the output of the given computation - // given an ArraySlice of its input elements. + // given a Span of its input elements. 
using NestedComputer = std::function( - const HloComputation&, tensorflow::gtl::ArraySlice)>; + const HloComputation&, absl::Span)>; GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config, - llvm::Module* module, llvm::IRBuilder<>* ir_builder, + llvm::Module* module, llvm::IRBuilder<>* b, NestedComputer compute_nested); llvm_ir::ElementGenerator MakeElementGenerator( const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) const override; + const HloToElementGeneratorMap& operand_to_generator) override; protected: - StatusOr EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const override; - - StatusOr EmitFloatBinaryOp( - const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const override; + StatusOr EmitFloatBinaryOp(const HloInstruction* op, + llvm::Value* lhs_value, + llvm::Value* rhs_value) override; StatusOr EmitErfcInv(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitLog(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitLog1p(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitSin(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitCos(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitExp(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitExpm1(PrimitiveType prim_type, - llvm::Value* value) const override; + llvm::Value* value) override; StatusOr EmitPow(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, - llvm::Value* rhs) const override; + llvm::Value* rhs) override; + + StatusOr EmitTanh(PrimitiveType prim_type, + 
llvm::Value* value) override; - llvm::Value* EmitThreadId() const override; + llvm::Value* EmitThreadId() override; private: // Emits IR for op, which must have opcode kPower. StatusOr EmitPowerOp(const HloInstruction* op, llvm::Value* lhs_value, - llvm::Value* rhs_value) const; + llvm::Value* rhs_value); // Emits IR to call a device function named "callee_name" on the given // operand. Returns the IR value that represents the return value. llvm::Value* EmitDeviceFunctionCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_type, - PrimitiveType output_type, - tensorflow::gtl::ArraySlice attributes) const; + const string& callee_name, absl::Span operands, + absl::Span input_type, PrimitiveType output_type, + absl::Span attributes); // Emits IR to call an LLVM intrinsic of type [T] -> T. Adjusts // callee_name according to T. Returns the IR value that represents the // return value of the function. StatusOr EmitLlvmIntrinsicMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type); // Emits IR to call a libdevice function of type [T] -> T. Adjusts // callee_name according to T. Returns the IR value that represents the // return value of the function. StatusOr EmitLibdeviceMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type); // Emits IR to call a function of type [T] -> T. Does not munge callee_name. // Returns the IR value that represents the return value of the function. 
StatusOr EmitMathCall( - const string& callee_name, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice input_types, - PrimitiveType output_type) const; + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type); const HloModuleConfig& hlo_module_config_; NestedComputer compute_nested_; diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index e14ee6918bf148861ecccac99355fccf7ae93103..ca4a605af5d3b6b58b603d7ddad60ed9ae8a212f 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -17,10 +17,11 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -42,8 +43,8 @@ StatusOr> FftScratchAllocator::AllocateBytes( if (byte_size > GetMemoryLimitInBytes(stream)) { return se::port::Status( se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + absl::StrFormat( + "Allocating %d bytes exceeds the memory limit of %d bytes.", byte_size, GetMemoryLimitInBytes(stream))); } @@ -91,8 +92,7 @@ string FftTypeToString(se::fft::Type type) { } // namespace -FftThunk::FftThunk(FftType fft_type, - tensorflow::gtl::ArraySlice fft_length, +FftThunk::FftThunk(FftType fft_type, absl::Span fft_length, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& output_buffer, const Shape& input_shape, const Shape& output_shape, @@ -107,7 +107,8 @@ FftThunk::FftThunk(FftType fft_type, 
output_shape_(output_shape) {} Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { + se::Stream* stream, + HloExecutionProfiler* profiler) { VLOG(3) << "FFT type: " << FftTypeToString(fft_type_); VLOG(3) << "Input shape: " << ShapeUtil::HumanStringWithLayout(input_shape_); VLOG(3) << "Output shape: " @@ -116,6 +117,7 @@ Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, FftScratchAllocator scratch_allocator(buffer_allocations.device_ordinal(), buffer_allocations.memory_allocator()); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); if (fft_plan_ == nullptr) { const int64 fft_rank = fft_length_.size(); CHECK_LE(fft_rank, 3); @@ -210,7 +212,7 @@ Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, return Status::OK(); } return InternalError("Unable to launch fft for thunk %p with type %s", this, - FftTypeToString(fft_type_).c_str()); + FftTypeToString(fft_type_)); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index b0a22564f3a09bb67a3c01723f6e37c604656d45..2be50e08bd2b561b44245b20e1fb200e31e65a41 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -16,15 +16,16 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_ +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -61,7 +62,7 @@ class FftThunk : public Thunk { public: // Constructs a thunk for launching an FFT on a stream. // Semantics of null hlo_instruction argument are as in Thunk. - FftThunk(FftType fft_type, tensorflow::gtl::ArraySlice fft_length, + FftThunk(FftType fft_type, absl::Span fft_length, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& output_buffer, const Shape& input_shape, const Shape& output_shape, @@ -72,7 +73,8 @@ class FftThunk : public Thunk { // Does the FFT for the thunk on "stream". Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const se::fft::Type fft_type_; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index b36539e0cb8d0a2f4758dd90acbdd8fc7181b8ca..88f0b4d71c915c37f0b58cb91a8788fd8f9cc452 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -15,7 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/for_thunk.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" @@ -27,8 +28,11 @@ ForThunk::ForThunk(const int64 loop_limit, const HloInstruction* hlo) : Thunk(Kind::kWhile, hlo), loop_limit_(loop_limit), - body_thunk_sequence_( - MakeUnique(std::move(*body_thunk_sequence), hlo)) {} + body_thunk_sequence_(absl::make_unique( + // Pass nullptr as the HloInstruction* to the body_thunk_sequence_ + // constructor because this SequentialThunk is logically "part of" + // this ForThunk, and shouldn't be profiled separately from it. + std::move(*body_thunk_sequence), nullptr)) {} Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { @@ -37,11 +41,17 @@ Status ForThunk::Initialize(const GpuExecutable& executable, } Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { + se::Stream* stream, + HloExecutionProfiler* profiler) { + VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters for " + << (hlo_instruction() ? hlo_instruction()->ToString() : ""); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (int64 i = 0; i < loop_limit_; ++i) { + profiler->StartHloComputation(); // Invoke loop body thunk sequence. 
- TF_RETURN_IF_ERROR( - body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations, + stream, profiler)); + profiler->FinishHloComputation(hlo_instruction()->while_body()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 41ddfe0ceb1d0516c1c64feca53212a925632209..c2d39071b292c6704e9b5857a68bd8b3f3b9a914 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -39,7 +40,8 @@ class ForThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const int64 loop_limit_; diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 3cd30b754c3242f00c704de1afab2282ed827b41..30c1f9088968305ad0207164ecb07ba13cc89ee6 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -18,12 +18,14 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace gpu { @@ -64,10 +66,11 @@ double CalculateBytesReadByFusionParameter(HloInstruction* param) { // Slice for a more accurate estimate of bytes read. double bytes = 0.0; for (auto& instruction : instructions) { - if (c_all_of(instruction->users(), [](const HloInstruction* instruction) { - return instruction->opcode() == HloOpcode::kSlice || - instruction->opcode() == HloOpcode::kDynamicSlice; - })) { + if (absl::c_all_of( + instruction->users(), [](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kSlice || + instruction->opcode() == HloOpcode::kDynamicSlice; + })) { // All users are slice: accumulate bytes of all user slice instructions. for (auto& user : instruction->users()) { bytes += ShapeUtil::ByteSizeOf(user->shape()); @@ -223,10 +226,11 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // Skip 'fusion' instruction if we cannot merge into all of its users. // Merging into all users enables the removal of 'fusion' from the // computation. 
- if (!c_all_of(fusion->users(), [](const HloInstruction* user) { + if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) { return user->opcode() == HloOpcode::kFusion && (user->fusion_kind() == HloInstruction::FusionKind::kLoop || - user->fusion_kind() == HloInstruction::FusionKind::kInput); + (user->fusion_kind() == HloInstruction::FusionKind::kInput && + LayoutsAreReduceInputFusionFriendly(*fusion, *user))); })) { VLOG(3) << "Not merging " << fusion->name() << ": Some of its users are not loop/input fusion kernels."; @@ -241,11 +245,11 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // If 'fusion' has just one user, then an earlier fusion pass chose not to // fuse this producer/comsumer pair (likely because of expensive instruction // re-use by the consumer), and so we honor that choice here as well. - if (c_any_of(fusion->fused_instructions(), - [](const HloInstruction* instruction) { - return instruction->opcode() != HloOpcode::kParameter && - GpuInstructionFusion::IsExpensive(*instruction); - })) { + if (absl::c_any_of(fusion->fused_instructions(), + [](const HloInstruction* instruction) { + return instruction->opcode() != HloOpcode::kParameter && + GpuInstructionFusion::IsExpensive(*instruction); + })) { VLOG(3) << "Not merging " << fusion->name() << ": Contains one or more expensive instructions."; ++num_fail_expensive_fused_instruction_; @@ -287,11 +291,10 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion) << " merged_to_current_bytes_ratio: " << merged_to_current_bytes_ratio << " into users { " - << tensorflow::str_util::Join(users, ", ", - [](string* out, HloInstruction* user) { - tensorflow::strings::StrAppend( - out, user->name()); - }) + << absl::StrJoin(users, ", ", + [](string* out, HloInstruction* user) { + absl::StrAppend(out, user->name()); + }) << " }"; // Remove 'fusion' instruction. 
CHECK_EQ(0, fusion->user_count()); diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h index 4c523a66de977cd32423b25f0d165c4f4ba51c4a..7e3f5775b8d97f43a0bba201d24f34c2d337fabb 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h @@ -34,7 +34,7 @@ namespace gpu { // class FusionMerger : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "fusion merger"; } + absl::string_view name() const override { return "fusion merger"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index 2217776c7d5a5f92c520d56222988f80401be9e4..7cc869ed9e89688d6ea06428a7bade3ebe55ea23 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -17,9 +17,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace gpu { @@ -40,7 +40,7 @@ class FusionMergerTest : public HloTestBase {}; // Tuple // TEST_F(FusionMergerTest, MergeSharedFusionInstruction) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule MergeSharedFusionInstruction comp.3 { @@ -104,7 +104,7 @@ ENTRY MergeSharedFusionInstruction.Computation0 { // // Fusion2 is not merged because it exceeds the threshold flops-to-bytes ratio. 
TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule FlopsToBytesRatioThresholdExceeded comp.2 { @@ -162,7 +162,7 @@ ENTRY FlopsToBytesRatioThresholdExceeded.Computation1 { // is merged into Fusion0 and Fusion1) would exceed the bytes transferred // threshold. TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule BytesTransferredThresholdExeceeded comp.2 { @@ -210,7 +210,7 @@ ENTRY BytesTransferredThresholdExeceeded.Computation2 { // Fusion2 is reduced for this test which makes the merge operation into its // operand below the bytes transferred threshold. TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule BytesTransferredThresholdNotExeceeded comp.2 { @@ -253,7 +253,7 @@ ENTRY BytesTransferredThresholdNotExeceeded.Computation2 { // Check that we're willing to merge f1_computation into f2_computation, even // though f2 is an input fusion node. TEST_F(FusionMergerTest, WillMergeIntoInputFusion) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule m f1_computation { @@ -286,6 +286,39 @@ TEST_F(FusionMergerTest, WillMergeIntoInputFusion) { op::Fusion(op::Parameter())); } +TEST_F(FusionMergerTest, WillNotMergeReduceUnfriendlyLayouts) { + auto module = ParseHloString(R"( + HloModule m + + f1_computation { + f1_p0 = f32[16,16,256]{0,1,2} parameter(0) + add = f32[16,16,256]{0,1,2} add(f1_p0, f1_p0) + // Note that the copy changes the layout from {0,1,2} to {2,1,0}. 
+ ROOT f1_root = f32[16,16,256]{2,1,0} copy(add) + } + + add_computation { + add_lhs = f32[] parameter(0) + add_rhs = f32[] parameter(1) + ROOT add_root = f32[] add(add_lhs, add_rhs) + } + + f2_computation { + f2_p0 = f32[16,16,256]{2,1,0} parameter(0) + f2_zero = f32[] constant(0) + ROOT f2_root = f32[] reduce(f2_p0, f2_zero), dimensions={0,1,2}, + to_apply=add_computation + } + + ENTRY entry { + p0 = f32[16,16,256]{0,1,2} parameter(0) + f1 = f32[16,16,256]{2,1,0} fusion(p0), kind=kLoop, calls=f1_computation + ROOT f2 = f32[] fusion(f1), kind=kInput, calls=f2_computation + })") + .ValueOrDie(); + EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 79fca43d022816645b8a07b9e806fe9cc3745e7c..9c4a4903667ea1a6c99ce9e912c9d0497b8e389f 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -31,16 +32,19 @@ namespace { // dimensions. struct MatrixDescriptor { MatrixDescriptor(se::DeviceMemoryBase matrix_data, bool needs_transpose, - int64 matrix_num_rows, int64 matrix_num_cols) + int64 matrix_num_rows, int64 matrix_num_cols, + int64 matrix_batch_size) : data(matrix_data), transpose(needs_transpose), num_rows(matrix_num_rows), - num_cols(matrix_num_cols) {} + num_cols(matrix_num_cols), + batch_size(matrix_batch_size) {} se::DeviceMemoryBase data; bool transpose; // Whether this matrix needs to be transposed. 
int64 num_rows; int64 num_cols; + int64 batch_size; }; // Performs a gemm call without an explicit algorithm on lhs_matrix and @@ -50,6 +54,9 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, MatrixDescriptor output_matrix, double alpha, se::Stream* stream) { DCHECK(!output_matrix.transpose); + const int64 batch_size = lhs_matrix.batch_size; + CHECK_EQ(batch_size, rhs_matrix.batch_size); + CHECK_EQ(batch_size, output_matrix.batch_size); se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); se::DeviceMemory output_data(output_matrix.data); @@ -60,13 +67,30 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, : se::blas::Transpose::kNoTranspose; auto k = lhs_matrix.transpose ? lhs_matrix.num_rows : lhs_matrix.num_cols; + if (batch_size == 1) { + return stream + ->ThenBlasGemm( + lhs_transpose, rhs_transpose, output_matrix.num_rows, + output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha, + lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, + &output_data, /*leading dim of output=*/output_matrix.num_rows) + .ok(); + } + + int64 lhs_stride = lhs_matrix.num_rows * lhs_matrix.num_cols; + int64 rhs_stride = rhs_matrix.num_rows * rhs_matrix.num_cols; + int64 output_stride = output_matrix.num_rows * output_matrix.num_cols; return stream - ->ThenBlasGemm( + ->ThenBlasGemmStridedBatched( lhs_transpose, rhs_transpose, output_matrix.num_rows, - output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha, - lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, - /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, - &output_data, /*leading dim of output=*/output_matrix.num_rows) + output_matrix.num_cols, /*size of reduce dim=*/k, + /*alpha=*/alpha, lhs_data, + /*leading dim of LHS=*/lhs_matrix.num_rows, lhs_stride, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, rhs_stride, + /*beta=*/0.0, 
&output_data, + /*leading dim of output=*/output_matrix.num_rows, output_stride, + batch_size) .ok(); } @@ -93,6 +117,10 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix, se::blas::ProfileResult* output_profile_result) { DCHECK(!output_matrix.transpose); + CHECK_EQ(1, lhs_matrix.batch_size); + CHECK_EQ(1, rhs_matrix.batch_size); + CHECK_EQ(1, output_matrix.batch_size); + se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); se::DeviceMemory output_data(output_matrix.data); @@ -141,9 +169,15 @@ StatusOr DoGemmAutotune( alpha, computation_type, algorithm, stream, &profile_result)); - if (profile_result.is_valid() && profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; + if (profile_result.is_valid()) { + VLOG(3) << "cublas gemm algorithm " << algorithm << " took " + << profile_result.elapsed_time_in_ms() << "ms"; + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + } else { + VLOG(4) << "cublas gemm algorithm " << algorithm << " failed."; } } @@ -152,7 +186,7 @@ StatusOr DoGemmAutotune( } return InternalError( - "Unable to autotune cuBLAS gemm on stream %p; none of the %zu algorithms " + "Unable to autotune cuBLAS gemm on stream %p; none of the %u algorithms " "ran successfully", stream, algorithms.size()); } @@ -167,6 +201,8 @@ auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm) { return &DoGemm; case F64: return &DoGemm; + case C64: + return &DoGemm>; default: LOG(FATAL) << "Unsupported type."; } @@ -180,6 +216,8 @@ auto GetGemmWithAlgorithmFn(PrimitiveType type) return &DoGemmWithAlgorithm; case F64: return &DoGemmWithAlgorithm; + case C64: + return &DoGemmWithAlgorithm>; default: LOG(FATAL) << "Unsupported type."; } @@ -192,6 +230,8 @@ auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune) { return &DoGemmAutotune; case F64: return &DoGemmAutotune; + case C64: + return 
&DoGemmAutotune>; default: LOG(FATAL) << "Unsupported type."; } @@ -210,6 +250,8 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) { return se::blas::ComputationType::kF32; case F64: return se::blas::ComputationType::kF64; + case C64: + return se::blas::ComputationType::kComplexF32; default: LOG(FATAL) << "Unsupported type."; } @@ -252,7 +294,8 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer, alpha_(alpha) {} Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { + se::Stream* stream, + HloExecutionProfiler* profiler) { VLOG(2) << "Executing a GemmThunk"; se::DeviceMemoryBase lhs_data = @@ -262,12 +305,37 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, se::DeviceMemoryBase output_data = buffer_allocations.GetDeviceAddress(output_buffer_); + DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size(), + dim_nums.rhs_batch_dimensions_size()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, + ShapeUtil::Rank(output_shape_)); + + int64 row_dim = dim_nums.lhs_batch_dimensions_size(); + int64 col_dim = dim_nums.lhs_batch_dimensions_size() + 1; + int64 batch_size = std::accumulate(output_shape_.dimensions().begin(), + output_shape_.dimensions().end() - 2, 1, + std::multiplies()); + + // Check that the batch dims don't cover the last two dims. + for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) { + CHECK_NE(row_dim, batch_dim); + CHECK_NE(col_dim, batch_dim); + } + + // Verify that the non-batch dimensions are minor-most. This is required for + // efficient access. + for (const auto* shape : {&lhs_shape_, &rhs_shape_, &output_shape_}) { + CHECK_LT(shape->layout().minor_to_major(row_dim), 2); + CHECK_LT(shape->layout().minor_to_major(col_dim), 2); + } + // BLAS gemm reduces rows of LHS and columns of RHS. 
The Dot operator between // matrices reduces dimension 1 of LHS and dimension 0 of RHS regardless of // their layout. Therefore, we should treat dimension 0 as row and dimension 1 // as column when mapping a matrix Dot to BLAS gemm. - int64 output_num_rows = output_shape_.dimensions(0); - int64 output_num_cols = output_shape_.dimensions(1); + int64 output_num_rows = output_shape_.dimensions(row_dim); + int64 output_num_cols = output_shape_.dimensions(col_dim); // BLAS gemm expects the inputs and the output are in column-major order. // Therefore, we need to convert dot between row-major matrices to that @@ -290,34 +358,46 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, // the leading dimension of the LHS matrix of gemm is the number of rows in // B^T and thus the number of columns in B. - auto make_descriptor = [this](se::DeviceMemoryBase data, const Shape& shape, - bool transpose) -> MatrixDescriptor { - bool is_row_major = LayoutUtil::Minor(shape.layout(), 0) != 0; - bool layout_mismatch = LayoutUtil::Minor(shape.layout(), 0) != - LayoutUtil::Minor(output_shape_.layout(), 0); - return MatrixDescriptor(data, transpose ^ layout_mismatch, - shape.dimensions(is_row_major), - shape.dimensions(!is_row_major)); + auto make_descriptor = [&](se::DeviceMemoryBase data, const Shape& shape, + bool transpose) -> MatrixDescriptor { + bool is_row_major = LayoutUtil::Minor(shape.layout(), row_dim) != 0; + bool layout_mismatch = LayoutUtil::Minor(shape.layout(), row_dim) != + LayoutUtil::Minor(output_shape_.layout(), row_dim); + return MatrixDescriptor( + data, transpose ^ layout_mismatch, + shape.dimensions(row_dim + static_cast(is_row_major)), + shape.dimensions(row_dim + static_cast(!is_row_major)), + batch_size); }; - DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction()); - const MatrixDescriptor lhs_descriptor = make_descriptor( - lhs_data, lhs_shape_, dim_nums.lhs_contracting_dimensions(0) == 0); + lhs_data, lhs_shape_, 
dim_nums.lhs_contracting_dimensions(0) == row_dim); const MatrixDescriptor rhs_descriptor = make_descriptor( - rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == 1); + rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == col_dim); // Dispatches to a regular cublas gemm, a gemm-with-algorithm, or attempts to // autotune this gemm to figure out the best algorithm. - auto launch = [this](MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, - MatrixDescriptor output_matrix, se::Stream* stream) { + auto launch = [&](MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, se::Stream* stream) { PrimitiveType element_type = output_shape_.element_type(); se::blas::ComputationType computation_type = GetBlasComputationType(element_type); + // TODO(b/112111608): Implement auto tune for batched gemm. + if (batch_size != 1) { + return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, + alpha_, stream); + } + + auto thunk_name = [&] { + return hlo_instruction() != nullptr ? hlo_instruction()->ToString() + : ""; + }; + const string& device_name = stream->parent()->GetDeviceDescription().name(); auto autotune_it = autotune_results_.find(device_name); if (autotune_it == autotune_results_.end()) { + VLOG(3) << "Starting autotune of GemmThunk " << thunk_name(); StatusOr best_algorithm = GetGemmAutotuneFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type, stream); @@ -325,11 +405,11 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, autotune_results_.insert({device_name, best_algorithm}).first; if (autotune_it->second.ok()) { - VLOG(2) << "Autotune on GemmThunk " << this + VLOG(2) << "Autotune on GemmThunk " << thunk_name() << " successful; best algorithm is " << best_algorithm.ValueOrDie(); } else { - VLOG(2) << "Autotune on GemmThunk " << this + VLOG(2) << "Autotune on GemmThunk " << thunk_name() << " unsuccessful. 
Will use generic gemm."; } } @@ -339,7 +419,7 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, if (best_algorithm.ok()) { auto algorithm = best_algorithm.ValueOrDie(); VLOG(2) << "Using algorithm " << algorithm - << " chosen by autotuning on GemmThunk " << this; + << " chosen by autotuning on GemmThunk " << thunk_name(); return GetGemmWithAlgorithmFn(element_type)( lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type, algorithm, stream, @@ -352,17 +432,18 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, alpha_, stream); }; + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); bool launch_ok; - if (LayoutUtil::Minor(output_shape_.layout(), 0) == 0) { - launch_ok = launch( - lhs_descriptor, rhs_descriptor, - MatrixDescriptor(output_data, false, output_num_rows, output_num_cols), - stream); + if (LayoutUtil::Minor(output_shape_.layout(), row_dim) == 0) { + launch_ok = launch(lhs_descriptor, rhs_descriptor, + MatrixDescriptor(output_data, false, output_num_rows, + output_num_cols, batch_size), + stream); } else { - launch_ok = launch( - rhs_descriptor, lhs_descriptor, - MatrixDescriptor(output_data, false, output_num_cols, output_num_rows), - stream); + launch_ok = launch(rhs_descriptor, lhs_descriptor, + MatrixDescriptor(output_data, false, output_num_cols, + output_num_rows, batch_size), + stream); } if (!launch_ok) { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index 7a4830d64e7caef5a1170cbdbf8ab373fdaf16e2..12c81f9bfc6bfdac63edf9c826b835057107fa41 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -48,14 +49,15 @@ class GemmThunk : public Thunk { // Does the gemm operation for the thunk on "stream", which must be non-null. Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; - - // Returns true if we'll perform autotuning if run on the given stream. If - // so, we want the GPU to be quiescent during autotuning, so as not to - // introduce noise in our results. - bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override { - return autotune_results_.count( - stream->parent()->GetDeviceDescription().name()) != 0; + se::Stream* stream, + HloExecutionProfiler* profiler) override; + + bool WillAutotuneKernel(se::Stream* stream) override { + // We will autotune this kernel if we don't already have a autotune result + // for the stream device. + return autotune_results_.find( + stream->parent()->GetDeviceDescription().name()) == + autotune_results_.end(); } private: @@ -73,6 +75,8 @@ class GemmThunk : public Thunk { // results. The map's value is the best algorithm we've found for this thunk // on this device, or an error if none of the algorithms worked and we should // use the regular gemm without an algorithm. + // + // TODO(b/112415150): Make this thread safe. 
std::unordered_map> autotune_results_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc deleted file mode 100644 index b85721980715e2ce2cd7a689ab12a6cea55ba3f1..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ /dev/null @@ -1,788 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" - -#include -#include -#include -#include // NOLINT(build/c++11): only using std::call_once, not mutex. 
-#include - -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" -#include "tensorflow/compiler/xla/service/batchnorm_expander.h" -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/buffer_liveness.h" -#include "tensorflow/compiler/xla/service/call_inliner.h" -#include "tensorflow/compiler/xla/service/conditional_simplifier.h" -#include "tensorflow/compiler/xla/service/dot_decomposer.h" -#include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/service/gather_expander.h" -#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" -#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" -#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" -#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h" -#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" -#include "tensorflow/compiler/xla/service/gpu/pad_insertion.h" -#include 
"tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" -#include "tensorflow/compiler/xla/service/hlo.pb.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" -#include "tensorflow/compiler/xla/service/hlo_cse.h" -#include "tensorflow/compiler/xla/service/hlo_dce.h" -#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" -#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" -#include "tensorflow/compiler/xla/service/hlo_proto_util.h" -#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" -#include "tensorflow/compiler/xla/service/hlo_verifier.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" -#include "tensorflow/compiler/xla/service/reshape_mover.h" -#include "tensorflow/compiler/xla/service/transpose_folding.h" -#include "tensorflow/compiler/xla/service/tuple_simplifier.h" -#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" -#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" -#include "tensorflow/compiler/xla/service/while_loop_simplifier.h" -#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/cuda_libdevice_path.h" -#include "tensorflow/core/platform/env.h" -#include 
"tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/subprocess.h" -#include "tensorflow/core/platform/tracing.h" -#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" - -namespace xla { -namespace gpu { - -/* static */ const char* GpuCompiler::kTargetTriple = "nvptx64-nvidia-cuda"; -/* static */ const char* GpuCompiler::kDataLayout = - "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"; - -namespace { - -namespace tracing = tensorflow::tracing; - -// Returns the directory containing nvvm libdevice files. config_cuda_data_dir -// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the -// HloModule being compiled. -string GetLibdeviceDir(const string& config_cuda_data_dir) { - std::vector potential_libdevice_dirs; - if (!config_cuda_data_dir.empty()) { - potential_libdevice_dirs.push_back(config_cuda_data_dir); - } - potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot()); - - // Tries all potential libdevice directories in the order they are inserted. - // Returns the first directory that exists in the file system. - for (const string& potential_libdevice_dir : potential_libdevice_dirs) { - if (tensorflow::Env::Default()->IsDirectory(potential_libdevice_dir).ok()) { - VLOG(2) << "Found libdevice dir " << potential_libdevice_dir; - return potential_libdevice_dir; - } - VLOG(2) << "Unable to find potential libdevice dir " - << potential_libdevice_dir; - } - - // Last resort: maybe in the current folder. - return "."; -} - -// Runs optimization passes on the given HLO module. 
-Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { - { - HloPassPipeline pipeline("optimization"); - pipeline.AddInvariantChecker(); - pipeline.AddPass(); - ReducePrecisionInsertion::AddPasses( - &pipeline, hlo_module->config().debug_options(), - ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION); - - // TODO(b/64094172): make Call work on GPU instead of inlining. - pipeline.AddPass(); - // Convert BF16 operations to F32 operations so that the GPU backend can - // support BF16 operations without directly implementing a BF16 lowering for - // most ops. - pipeline.AddPass(BF16, F32); - pipeline.AddPass(); - - { - auto& pass = - pipeline.AddPass>("simplification"); - pass.AddInvariantChecker(); - - // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls - // where possible. Not every batchnorm op can be implemented as a call to - // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. - if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { - pass.AddPass(); - } - // TODO(kramerb): Remove use_fusion once instruction fusion can create - // multi-output fusions from the unfused expander output. - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true, - /*use_fusion=*/true); - - // Rewrite gather ops into smaller ones. - pass.AddPass(); - - // BatchNormExpander can create zero-sized ops, so zero-sized HLO - // elimination has to come after that pass. - pipeline.AddPass(); - - pass.AddPass( - /*is_layout_sensitive=*/false, - [](const Shape&, const Shape&) { return false; }); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - pass.AddPass(); - } - - pipeline.AddPass( - [](const HloInstruction& dot, - const TransposeFolding::OperandIndices& candidate_operands) { - return ImplementedAsGemm(dot) ? 
candidate_operands - : TransposeFolding::OperandIndices{}; - }, - TransposeFolding::NeverFoldTranspose); - pipeline.AddPass(/*is_layout_sensitive=*/false); - pipeline.AddPass(); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - // Convert convolutions into CustomCalls to cudnn, then canonicalize them - // (PadInsertion). - HloPassPipeline pipeline("conv_canonicalization"); - pipeline.AddInvariantChecker(); - pipeline.AddPass(); - pipeline.AddPass(); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - HloPassPipeline pipeline("layout_assignment"); - pipeline.AddPass( - hlo_module->mutable_device_entry_computation_layout(), stream_exec); - - // The LayoutAssignment pass may leave behind kCopy instructions which are - // duplicate or NOPs, so remove them with algebraic simplification and CSE. - pipeline.AddPass>( - /*is_layout_sensitive=*/true, - /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { - return true; - }); - - // Choose the fastest algorithm for each conv. - // - // We pick the algorithm before fusion so we can generate better HLO. After - // CudnnConvolutionRewriter, our convolutions are CustomCalls which return a - // tuple (conv_result, scratch_memory), and the each conv uses 0 bytes of - // scratch: - // - // customcall = (f32[...], f32[0]) - // return gte(customcall, 0) - // - // The algorithm picker then chooses the best algorithm, and potentially - // increases the scratch space. It replaces customcall with new_tuple, - // giving us the following: - // - // new_customcall = (f32[...], f32[N]) - // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) - // return gte(new_tuple, 0) - // - // The new tuple and gte instructions then be simplified away, because - // nobody is expected to use the scratch value. - // - // However, if we were to run CudnnConvolutionAlgorithmPicker after fusion - // the gte(customcall, 0) would probably already be into a fusion node. 
We - // can't simplify across HloComputation boundaries, so in this case we - // wouldn't be able to simplify away the new_tuple bits. - pipeline.AddPass(stream_exec, - device_allocator); - // Clean up new_tuple described above. - pipeline.AddPass(); - - pipeline.AddPass(/*is_layout_sensitive=*/true); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - HloPassFix fusion("fusion"); - fusion.AddInvariantChecker(); - fusion.AddPass(/*may_duplicate=*/false); - fusion.AddPass(/*may_duplicate=*/true); - fusion.AddPass(); - TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); - - HloPassPipeline reduce_pipeline("reduce-precision"); - reduce_pipeline.AddInvariantChecker(); - ReducePrecisionInsertion::AddPasses( - &reduce_pipeline, hlo_module->config().debug_options(), - ReducePrecisionInsertion::PassTiming::AFTER_FUSION); - StatusOr reduce_result = reduce_pipeline.Run(hlo_module); - TF_RETURN_IF_ERROR(reduce_result.status()); - - if (reduce_result.ValueOrDie()) { - // Do another fusion pass, with the expectation that we may be able to - // fuse the new ReducePrecision operations. - TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); - } - } - - { - // Do an aggressive LICM pass over while loops. In particular, this hoists - // constants that were sunk by WhileLoopConstantSinking. Leaving them in - // the while loop may result in unnecessary copies. - HloPassPipeline pipeline("while-loop-licm"); - pipeline.AddPass(true); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - return Status::OK(); -} - -// Modifies the given HLO module so that it will be accepted by IrEmitter. -// Unlike optimization passes, the passes are necessary for correctness. -Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { - // In some cases, we have to place the result of an instruction in a temporary - // buffer. 
For instance, the buffer that holds an external parameter is - // assumed immutable at this point, and should not be reused for output - // (b/27180329). Therefore, in that case, we set the output to be a copy of - // the parameter. - HloPassPipeline pipeline("GPU-ir-emit-prepare"); - pipeline.AddInvariantChecker(); - - // Copy insertion should be performed immediately before IR emission to avoid - // inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes an - // instruction which materializes a value). DCE must be run immediately before - // (and sometime after) copy insertion, to avoid dead code from interfering - // with the rewrites. - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - return pipeline.Run(hlo_module).status(); -} - -// Prints a warning if the ptxas at ptxas_path has known bugs. -// -// Only prints a warning the first time it's called for a particular value of -// ptxas_path. -void WarnIfBadPtxasVersion(const string& ptxas_path) { - static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); - static std::unordered_set* seen_ptxas_paths GUARDED_BY(mu) = - new std::unordered_set(); - - tensorflow::mutex_lock lock(mu); - if (!seen_ptxas_paths->insert(ptxas_path).second) { - // Already checked this ptx binary, nothing to do. 
- return; - } - - tensorflow::SubProcess ptxas; - ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"}); - ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE); - if (!ptxas.Start()) { - LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version"; - return; - } - - string out; - int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out, - /*stderr_output=*/nullptr); - if (exit_code != 0) { - LOG(WARNING) << "Running " << ptxas_path << " --version returned " - << exit_code; - return; - } - - int64 vmaj, vmin, vdot; - string vmaj_str, vmin_str, vdot_str; - if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, - &vmin_str, &vdot_str) || - !tensorflow::strings::safe_strto64(vmaj_str, &vmaj) || - !tensorflow::strings::safe_strto64(vmin_str, &vmin) || - !tensorflow::strings::safe_strto64(vdot_str, &vdot)) { - LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path - << " --version:\n" - << out; - return; - } - - // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some - // address calculations with large offsets (e.g. "load ptr + large_constant"), - // b/70245379. - if ((vmaj == 9 && vmin == 0 && vdot < 276) || - (vmaj == 9 && vmin == 1 && vdot < 121)) { - LOG(WARNING) << "*** WARNING *** You are using ptxas " << vmaj << "." - << vmin << "." << vdot - << ", which is in range [9.0.0, 9.0.276) + [9.1.0, 9.1.121). " - "These versions are known to miscompile XLA code, leading " - "to incorrect results or invalid-address errors."; - } -} - -// Prints a warning if the ptx->sass JIT in the driver has known bugs. -// -// Using such a driver only a problem if we fail to use ptxas to compile our ptx -// and have to use the driver instead, so you should only call this function if -// we're going to use the driver JIT. -// -// Only prints a warning the first time it's called. 
-void WarnIfBadDriverJITVersion() { - static std::once_flag run_once; - std::call_once(run_once, [] { - auto version_or_status = se::cuda::Diagnostician::FindKernelDriverVersion(); - if (!version_or_status.ok()) { - LOG(WARNING) << "Couldn't read CUDA driver version."; - return; - } - se::cuda::DriverVersion version = version_or_status.ValueOrDie(); - - // The following versions of the driver JIT miscompile some address - // calculations with large offsets (e.g. "load ptr + large_constant"), - // b/70245379: - // - // - 384.x before 384.108 - // - 387.x before 387.40 - // - 390.x before 390.10. - auto vmaj = std::get<0>(version); - auto vmin = std::get<1>(version); - if ((vmaj == 384 && vmin < 108) || // - (vmaj == 387 && vmin < 40) || // - (vmaj == 390 && vmin < 10)) { - LOG(WARNING) - << "*** WARNING *** Invoking the PTX->SASS JIT from driver version " - << se::cuda::DriverVersionToString(version) - << ", which is in range [384.0.0, 384.108.0) + [387.0.0, 387.40.0) + " - "[390.0.0, 390.10.0). These versions are known to miscompile XLA " - "code, leading to incorrect results or invalid-address errors."; - } - }); -} - -// Compiles the given PTX string using ptxas and returns the resulting machine -// code (i.e. a cubin) as a byte array. -StatusOr> CompilePtx(const string& ptx, int cc_major, - int cc_minor) { - tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true); - const string ptxas_path = - tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas"); - VLOG(2) << "Using ptxas at " << ptxas_path; - auto env = tensorflow::Env::Default(); - TF_RETURN_IF_ERROR(env->FileExists(ptxas_path)); - - WarnIfBadPtxasVersion(ptxas_path); - - // Write ptx into a temporary file. 
- string ptx_path; - if (!env->LocalTempFilename(&ptx_path)) { - return InternalError("couldn't get temp PTX file name"); - } - auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { - TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); - }); - - TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx)); - VLOG(2) << "ptx written to: " << ptx_path; - - // Invoke ptxas and collect its output. - string cubin_path; - if (!env->LocalTempFilename(&cubin_path)) { - return InternalError("couldn't get temp CUBIN file name"); - } - auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] { - // CUBIN file may never be created, so the failure to delete it should not - // produce TF error. - tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError(); - }); - tensorflow::SubProcess ptxas_info_dumper; - std::vector ptxas_args = { - ptxas_path, ptx_path, "-o", cubin_path, - tensorflow::strings::StrCat("-arch=sm_", cc_major, cc_minor)}; - if (VLOG_IS_ON(2)) { - ptxas_args.push_back("-v"); - } - ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args); - ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR, - tensorflow::ACTION_PIPE); - if (!ptxas_info_dumper.Start()) { - return InternalError("Failed to launch ptxas"); - } - string stderr_output; - int exit_status = ptxas_info_dumper.Communicate( - /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output); - XLA_LOG_LINES(tensorflow::INFO, stderr_output); - if (exit_status != 0) { - return InternalError("ptxas exited with non-zero error code %d", - exit_status); - } - - // Read in the result of compilation and return it as a byte vector. 
- string cubin; - TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), - cubin_path, &cubin)); - std::vector cubin_vector(cubin.begin(), cubin.end()); - return cubin_vector; -} - -} // namespace - -GpuCompiler::GpuCompiler() - : pointer_size_(llvm::DataLayout(kDataLayout) - .getPointerSize(0 /* default address space */)) {} - -StatusOr> GpuCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses"); - tracing::ScopedActivity activity("HLO Transforms", module->name(), - /*is_expensive=*/true); - TF_RETURN_IF_ERROR( - OptimizeHloModule(module.get(), stream_exec, device_allocator)); - return std::move(module); -} - -StatusOr> GpuCompiler::RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend"); - - TF_RET_CHECK(stream_exec != nullptr); - - TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get())); - - llvm::LLVMContext llvm_context; - std::string buffer; - llvm::raw_string_ostream error(buffer); - llvm::DiagnosticPrinterRawOStream printer(error); - auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info, - void* Context) { - auto printer = static_cast(Context); - diag_info.print(*printer); - }; - llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer); - - llvm::Module llvm_module(module->name().c_str(), llvm_context); - // Set the target triple and the data layout. - llvm_module.setTargetTriple(kTargetTriple); - llvm_module.setDataLayout(kDataLayout); - - // Determine the HLO schedule, which is an ordering of HLO instructions. This - // is used by buffer assignment to enable buffer reuse, and the same ordering - // must also be used to determine the thunk launch schedule. 
- std::unique_ptr stream_assignment = AssignStreams(*module); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_schedule, - HloSchedule::Build(*module, *stream_assignment, pointer_size_)); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer_assignment, - BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(), - BufferSizeBytesFunction(), - /*color_alignment=*/[](LogicalBuffer::Color) { - return kCudaMallocAlignBytes; - })); - // BufferAssignment::Stats::ToString() and BufferAssignment::ToString() - // include headers, so no need for us to print them ourselves. - XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString()); - XLA_VLOG_LINES(2, buffer_assignment->ToString()); - XLA_VLOG_LINES(2, module->ToString()); - const string xla_dump_optimized_hlo_proto_to = - module->config().debug_options().xla_dump_optimized_hlo_proto_to(); - if (!xla_dump_optimized_hlo_proto_to.empty()) { - HloProto proto = MakeHloProto(*module, *buffer_assignment); - TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_optimized_hlo_proto_to, module->name())); - } - - IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), - &stream_exec->GetDeviceDescription(), - &llvm_module); - - HloComputation* entry_computation = module->entry_computation(); - IrEmitterUnnested ir_emitter(module->config(), entry_computation, - &ir_emitter_context); - { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission"); - TF_RETURN_IF_ERROR( - entry_computation->root_instruction()->Accept(&ir_emitter)); - } - - if (user_pre_optimization_hook_) { - TF_CHECK_OK(user_pre_optimization_hook_(llvm_module)); - } - string ir_module_string_before_opt; - const bool embed_ir_in_executable = - module->config().debug_options().xla_embed_ir_in_executable(); - if (VLOG_IS_ON(2) || embed_ir_in_executable) { - ir_module_string_before_opt = 
llvm_ir::DumpModuleToString(llvm_module); - VLOG(2) << "LLVM module before optimizations:"; - XLA_VLOG_LINES(2, ir_module_string_before_opt); - } - - const string& ir_dump_directory = - module->config().debug_options().xla_dump_ir_to(); - - if (!ir_dump_directory.empty()) { - TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory( - /*directory_name=*/ir_dump_directory, - /*hlo_module_name=*/module->name(), llvm_module, - /*optimized=*/false)); - } - - { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - Running LLVM verifier"); - - std::string err; - llvm::raw_string_ostream err_stream(err); - - // verifyModule() returns true if the module is broken. - TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) - << "Invalid LLVM IR before optimizations:\n" - << err_stream.str() - << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " - "Rerun with --xla_dump_ir_to to get the IR. "; - } - - string libdevice_dir; - { - tensorflow::mutex_lock lock(mutex_); - - // Find the directory containing libdevice. To avoid searching for it every - // time, we have a one-element cache, keyed on the module's config's - // cuda_data_dir. 
- const auto& config_cuda_data_dir = - module->config().debug_options().xla_gpu_cuda_data_dir(); - if (cached_libdevice_dir_.empty() || - cached_cuda_data_dir_ != config_cuda_data_dir) { - cached_cuda_data_dir_ = config_cuda_data_dir; - cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir); - } - libdevice_dir = cached_libdevice_dir_; - } - int cc_major, cc_minor; - if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, - &cc_minor)) { - LOG(WARNING) - << "Couldn't get compute capability for device; assuming sm_20."; - cc_major = 2; - cc_minor = 0; - } - - string ptx; - { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - CompileToPtx"); - TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, - module->config(), libdevice_dir)); - } - - if (!ir_dump_directory.empty()) { - TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory( - /*directory_name=*/ir_dump_directory, - /*hlo_module_name=*/module->name(), llvm_module, - /*optimized=*/true)); - } - - if (user_post_optimization_hook_) { - TF_CHECK_OK(user_post_optimization_hook_(llvm_module)); - } - VLOG(2) << "LLVM module after optimizations:"; - XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module)); - VLOG(2) << "PTX:"; - XLA_VLOG_LINES(2, ptx); - - // Write PTX to IR dump directory, if IR dumping was requested. 
- if (!ir_dump_directory.empty()) { - const string ptx_outfile = tensorflow::io::JoinPath( - ir_dump_directory, tensorflow::strings::StrCat(module->name(), ".ptx")); - auto status = [&] { - auto* env = tensorflow::Env::Default(); - TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory)); - TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_outfile, ptx)); - return Status::OK(); - }(); - if (!status.ok()) { - LOG(WARNING) << "Couldn't dump PTX for module " << module->name() - << " to " << ptx_outfile << ": " << status; - } - } - - const std::vector cubin = - CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor); - - auto thunk_schedule = MakeUnique( - ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment), - hlo_schedule->ThunkLaunchOrder()); - VLOG(2) << "Printing the thunk schedule..."; - XLA_VLOG_LINES(2, thunk_schedule->ToString()); - - std::unique_ptr profile_index_map; - std::unique_ptr profile_printer; - - if (module->config().hlo_profiling_enabled()) { - HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); - cost_analysis.set_bytes_per_second( - stream_exec->GetDeviceDescription().memory_bandwidth()); - TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); - profile_index_map = MakeUnique(*module); - profile_printer = - CreateHloProfilePrinterData(*profile_index_map, cost_analysis); - } - - auto* gpu_executable = new GpuExecutable( - ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule), - std::move(module), std::move(buffer_assignment), - std::move(profile_printer), std::move(profile_index_map)); - if (embed_ir_in_executable) { - DCHECK_NE("", ir_module_string_before_opt); - gpu_executable->set_ir_module_string(ir_module_string_before_opt); - } - return std::unique_ptr(gpu_executable); -} - -std::vector GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx, - int cc_major, - int cc_minor) { - XLA_SCOPED_LOGGING_TIMER("GpuCompiler::CompilePtxOrGetCachedResult"); - tracing::ScopedActivity 
activity("PTX->CUBIN", /*is_expensive=*/true); - bool inserted; - decltype(compilation_cache_.begin()) iter; - // Pointers into compilation_cache_ where the ptx and (optional) cubin are - // stored. - const string* cache_ptx = nullptr; - CompilationCacheValue* cache_value = nullptr; - - { - tensorflow::mutex_lock lock(mutex_); - std::tie(iter, inserted) = compilation_cache_.emplace( - std::piecewise_construct, - std::forward_as_tuple(ptx, cc_major, cc_minor), - std::forward_as_tuple()); - cache_ptx = &iter->first.ptx; - cache_value = &iter->second; - } - - // Compile the ptx if it wasn't in the cache before we called this function. - // Other threads asking for the same compilation key will block on - // cache_value->mutex_ until compilation is done. - { - tensorflow::mutex_lock lock(cache_value->mutex_); - if (inserted) { - CHECK(!cache_value->compilation_done); - if (!ptx.empty()) { - StatusOr> maybe_cubin = - CompilePtx(*cache_ptx, cc_major, cc_minor); - if (maybe_cubin.ok()) { - cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie(); - VLOG(2) << "Compiled PTX size:" << ptx.size() - << " CUBIN size: " << cache_value->cubin_data.size(); - } else { - bool log_warning = true; - if (maybe_cubin.status().code() == - tensorflow::error::Code::NOT_FOUND) { - // Missing ptxas is expected in some environments where CUDA SDK - // binaries are not available. We don't want to spam logs with - // identical warnings in this case. - - // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N - // for more general usage. - static std::atomic warning_done(false); - log_warning = !warning_done.exchange(true); - } - if (log_warning) { - LOG(WARNING) - << "Failed to compile ptx to cubin. Will attempt to let " - "GPU driver compile the ptx. " - << maybe_cubin.status(); - } - - // We're going to use the driver to JIT our PTX->SASS, so warn if - // the JIT in the driver has known bugs. 
- WarnIfBadDriverJITVersion(); - } - } - cache_value->compilation_done = true; - cache_value->compilation_done_cv_.notify_all(); - } else { - while (!cache_value->compilation_done) { - cache_value->compilation_done_cv_.wait(lock); - } - } - } - - CHECK(cache_value != nullptr); - CHECK(cache_value->compilation_done); - return cache_value->cubin_data; -} - -StatusOr>> -GpuCompiler::CompileAheadOfTime(std::vector> module, - const AotCompilationOptions& options) { - return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime"); -} - -se::Platform::Id GpuCompiler::PlatformId() const { - return se::cuda::kCudaPlatformId; -} - -} // namespace gpu -} // namespace xla - -static bool InitModule() { - xla::Compiler::RegisterCompilerFactory( - stream_executor::cuda::kCudaPlatformId, - []() { return xla::MakeUnique(); }); - return true; -} -static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h deleted file mode 100644 index f3b02ae5d8867bdf1d970e809bff95a15d9f54d2..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ - -#include -#include -#include - -#include "tensorflow/compiler/xla/service/executable.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/llvm_compiler.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/optional.h" -#include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/thread_annotations.h" - -namespace xla { -namespace gpu { - -// The GPU compiler generates efficient GPU executables. -class GpuCompiler : public LLVMCompiler { - public: - GpuCompiler(); - ~GpuCompiler() override {} - - // Bring in - // StatusOr>> Compile( - // std::vector> modules, - // std::vector> - // stream_execs) - using LLVMCompiler::Compile; - - StatusOr> RunHloPasses( - std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; - - StatusOr> RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; - - StatusOr>> - CompileAheadOfTime(std::vector> module, - AotCompilationOptions const& options) override; - - se::Platform::Id PlatformId() const override; - - HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { - // Capture just the pointer size, not the entire GpuCompiler object. - int64 pointer_size = pointer_size_; - return [pointer_size](const Shape& shape) { - return ShapeUtil::ByteSizeOf(shape, pointer_size); - }; - } - - // The triple that represents our target. 
- static const char* kTargetTriple; - - // The data layout of the emitted module. Copied from computeDataLayout in - // NVPTXTargetMachine.cpp. - static const char* kDataLayout; - - private: - // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. - const int64 pointer_size_; - - tensorflow::mutex mutex_; - - // When compiling an HLO module, we need to find a path to the nvvm libdevice - // files. We search in the module's config.debug_options().cuda_data_dir() - // and in tensorflow::LibdeviceRoot(), the latter of which is a constant. - // - // We cache the cuda_data_dir() and the result of our search, so that if the - // next module we have to compile has the same cuda_data_dir(), we can skip - // the search. - string cached_cuda_data_dir_ GUARDED_BY(mutex_); - string cached_libdevice_dir_ GUARDED_BY(mutex_); - - // Tries to compile the given ptx string to cubin. Returns a vector with the - // compiled cubin. If compilation was unsuccessful, returns an empty vector. - std::vector CompilePtxOrGetCachedResult(const string& ptx, - int cc_major, int cc_minor); - - // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor} - // -> cubin so we don't recompile the same ptx twice. This is important for - // some interactive workflows. (We also cache at the HLO level, but sometimes - // we can't realize that two modules are the same until we lower to ptx.) - // - // Compilation of distinct PTX happens in parallel. If more than one thread - // attempts to compile the same PTX, the fist thread to obtain - // cache_value_->mutex_ performs the compilation. The rest wait() on - // cache_value_->compilation_done_cv_ until the compilation is done. - // - // If compiling the ptx fails, we return an empty cubin, cross our fingers, - // and leave compilation up to the driver. 
- struct CompilationCacheKey { - CompilationCacheKey(std::string ptx, int cc_major, int cc_minor) - : ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {} - string ptx; - int cc_major; - int cc_minor; - }; - struct CompilationCacheHash { - size_t operator()(const CompilationCacheKey& key) const { - return tensorflow::Hash64Combine( - tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major), - key.cc_minor); - } - }; - struct CompilationCacheEq { - size_t operator()(const CompilationCacheKey& a, - const CompilationCacheKey& b) const { - return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor && - a.ptx == b.ptx; - } - }; - struct CompilationCacheValue { - bool compilation_done = false; - std::vector cubin_data; - // mutex and condition variable to serialize compilation completing. - tensorflow::mutex mutex_; - tensorflow::condition_variable compilation_done_cv_; - }; - - // Don't even think about switching this to FlatMap; iterator stability is - // critical here. - std::unordered_map - compilation_cache_ GUARDED_BY(mutex_); - - TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler); -}; - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COMPILER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc index aa360c7f73de2f0f9cf59c22b552b8e60ddb3a87..7f0b030fece8f25578bd90a538279d455350278a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc @@ -14,12 +14,23 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" +#include "tensorflow/core/framework/allocator.h" namespace xla { namespace gpu { -// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses -const int64 kCudaMallocAlignBytes = 256; +// kEntryParameterAlignBytes is equal to EIGEN_MAX_ALIGN_BYTES, though including +// Eigen headers here to get that symbol may not be a good idea. +// EIGEN_MAX_ALIGN_BYTES may differ between CUDA-enabled builds vs CUDA-disabled +// builds and we don't want the IR generated by XLA:GPU to depend on that. +// +// TODO(b/111767313): Consider raising EIGEN_MAX_ALIGN_BYTES if it helps. +const int64 kEntryParameterAlignBytes = 16; + +const int64 kXlaAllocatedBufferAlignBytes = + tensorflow::Allocator::kAllocatorAlignment; + +const int64 kConstantBufferAlignBytes = kXlaAllocatedBufferAlignBytes; } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.h b/tensorflow/compiler/xla/service/gpu/gpu_constants.h index eb1ca4c6c95a23d2a08f5f9c3cbc85e7d47d4f89..6f5f1fa09c57dfd246d702c0adc92c7e2e76805a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_constants.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.h @@ -21,9 +21,15 @@ limitations under the License. namespace xla { namespace gpu { -// Minimum alignment of cudaMalloc. We require that buffers created by our -// DeviceMemoryAllocator, and all input/output buffers, have this alignment. -extern const int64 kCudaMallocAlignBytes; +// Minimum alignment for buffers passed as incoming arguments by TensorFlow. +extern const int64 kEntryParameterAlignBytes; + +// Minimum alignment for buffers allocated by XLA: the temp buffers and the live +// out (result) buffers. +extern const int64 kXlaAllocatedBufferAlignBytes; + +// Minimum alignment for constant buffers. 
+extern const int64 kConstantBufferAlignBytes; } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc index d9560779f313d5a559c3eb0f5b28ff5dd210d9d5..75f414e47fe3edcc1b10b392ed5cc5038be6c190 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc @@ -48,85 +48,17 @@ StatusOr GpuCopyInsertion::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module)); - TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow, - HloDataflowAnalysis::Run(*module)); - - // Make sure all operands of a library call are in memory instead of constants - // in IR. - for (HloInstruction* hlo : - module->entry_computation()->MakeInstructionPostOrder()) { - // Inserts a copy of hlo->operand(n) if it's a constant. - auto copy_operand_if_constant = [&](int64 n) -> Status { - HloInstruction* operand = hlo->mutable_operand(n); - TF_RET_CHECK(ShapeUtil::IsArray(operand->shape())); - const auto& values = dataflow->GetValueSet(operand).values(); - if (std::any_of(values.begin(), values.end(), [](const HloValue* value) { - return value->defining_instruction()->opcode() == - HloOpcode::kConstant; - })) { - TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand)); - TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(n, copy)); - changed = true; - } - return Status::OK(); - }; - - if (IsCustomCallToDnnBatchNorm(*hlo)) { - // The epsilon and feature_index operands to a CUDNN batchnorm op don't - // need to be materialized in memory -- in fact, they must be constants. - // These are the last two operands of all three batchnorm ops. 
- for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { - TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); - } - } else if (IsCustomCallToDnnConvolution(*hlo)) { - // The last two arguments to a CUDNN convolution are two HLO constants for - // cudnn algorithm and tensor_ops_enabled flag, which shouldn't be copied. - for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { - TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); - } - } else if (ImplementedAsLibraryCall(*hlo) || - hlo->opcode() == HloOpcode::kCrossReplicaSum) { - // For all other library calls and cross-replica-sum, materialize all the - // operands into memory. (Cross-replica-sum gets its constant args - // materialized even if it's not implemented as a libcall to simplify the - // implementation. It's slower, but we can constant fold away constant - // args *anyway*, so we just need to make it work.) - for (int64 i = 0; i < hlo->operand_count(); ++i) { - TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); - } - } - } - - // Init values of while and conditional nodes cannot be constants. Insert - // copies for any constants found at the operands of these nodes. - tensorflow::gtl::FlatSet inserted_copies; + // Check the assumption that the epsilon and feature_index constants of the + // CUDNN batchnorm op are not shared with other ops where we would replace + // them with a copy. These custom op calls are generated with the + // CudnnBatchNormRewriter, so this would only happen if HloCSE merges them. 
for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() != HloOpcode::kWhile && - instruction->opcode() != HloOpcode::kConditional) { + for (HloInstruction* hlo : computation->instructions()) { + if (!IsCustomCallToDnnBatchNorm(*hlo)) { continue; } - for (auto operand : instruction->operands()) { - // Skip the operands that have already been replaced with a copy in a - // previous iteration (which is possible when a constant is used as an - // operand in multiple places). - if (ContainsKey(inserted_copies, operand)) { - continue; - } - for (auto& pair : dataflow->GetInstructionValueSet(operand)) { - const HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - if (value->defining_instruction()->IsConstant() && - !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) { - HloInstruction* constant = value->defining_instruction(); - TF_ASSIGN_OR_RETURN(HloInstruction * copy, - FindOrInsertCopy(constant)); - TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy)); - inserted_copies.insert(copy); - changed = true; - } - } - } + for (int64 i = hlo->operand_count() - 2; i < hlo->operand_count(); ++i) { + CHECK_EQ(hlo->operand(i)->opcode(), HloOpcode::kConstant); } } } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h index 0c6f9b511f3aac5f62182273b827adcd068cd633..8ffae18fe820aa01701731ee56a83aeacf0eab0d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h @@ -27,7 +27,7 @@ namespace gpu { // inserting kCopy instructions. 
class GpuCopyInsertion : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "copy-insertion"; } + absl::string_view name() const override { return "copy-insertion"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 25d8f720ea4791a4c94efcad6909cd0c113fbe70..31a9f9b1beb81da81a06f6dc8e7c13c105514092 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -19,11 +19,12 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" @@ -41,77 +42,6 @@ namespace { using tensorflow::tracing::ScopedAnnotation; -// A helper class for profiling HLO in the course of GPU program execution. -// All of the profiling is guarded internally, to avoid the caller needing to -// have lots of conditionals sprinkled around. -class HloExecutionProfiler { - public: - // If profiling is enabled, start an execution timer running. 
- explicit HloExecutionProfiler( - bool do_profile, HloExecutionProfile* profile, se::Stream* stream, - const std::vector::SmartPtr>& sub_streams, - const HloComputation* computation) - : do_profile_(do_profile), - profile_(profile), - stream_(stream), - sub_streams_(sub_streams), - computation_(computation) { - if (do_profile_) { - clock_rate_ghz_ = - stream->parent()->GetDeviceDescription().clock_rate_ghz(); - execution_timer_.reset(new se::Timer(stream->parent())); - per_op_timer_.reset(new se::Timer(stream->parent())); - stream->InitTimer(execution_timer_.get()) - .ThenStartTimer(execution_timer_.get()); - stream->InitTimer(per_op_timer_.get()); - } - } - - // If profiling is enabled, sets the total cycle count on the profile from the - // execution timer. - void FinishExecution() { - CHECK(!finished_execution_) << "Call FinishExecution only once!"; - finished_execution_ = true; - if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(execution_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); - profile_->set_total_cycles_executed( - *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_); - } - } - - // If profiling is enabled, starts the per-operation timer. - void StartOperation() { - if (do_profile_) { - stream_->ThenStartTimer(per_op_timer_.get()); - } - } - - // If profiling is enabled, stops the per-operation timer and records the time - // that the hlo_instruction took to execute in the profile. 
- void FinishOperation(const HloInstruction* hlo_instruction) { - if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(per_op_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); - profile_->SetCyclesTakenBy( - hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_); - } - } - - private: - const bool do_profile_; - double clock_rate_ghz_; - HloExecutionProfile* profile_; - se::Stream* stream_; - const std::vector::SmartPtr>& sub_streams_; - const HloComputation* computation_; - std::unique_ptr execution_timer_; - std::unique_ptr per_op_timer_; - bool finished_execution_ = false; -}; - } // namespace // Implementation note: HLO profiling is always enabled for GPU executables, @@ -155,7 +85,7 @@ Status GpuExecutable::ExecuteThunks( } // Stream 0 indicates `main_stream` and substreams start from stream 1. - std::vector::SmartPtr> sub_streams; + std::vector sub_streams; sub_streams.reserve(thunk_schedule_->StreamCount() - 1); while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) { sub_streams.emplace_back(); @@ -182,7 +112,7 @@ Status GpuExecutable::ExecuteThunks( // // TODO(jlebar): Should we cache the results of HloInstruction::ToString(), // since we expect it to be an expensive call? - tensorflow::gtl::optional op_annotation; + absl::optional op_annotation; if (top_level_annotation.IsEnabled()) { op_annotation.emplace( thunk->hlo_instruction() != nullptr @@ -201,24 +131,24 @@ Status GpuExecutable::ExecuteThunks( stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get()); } - // If this thunk requests it, wait for all currently-executing thunks to - // finish. This is useful e.g. if the thunk is about to perform autotuning. - if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) { + // If this thunk is about to autotune then wait for all currently executing + // thunks to finish. This reduces noise and thus the probability of + // choosing a suboptimal algorithm. 
+ if (thunk->WillAutotuneKernel(stream)) { TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone()); } - profiler.StartOperation(); VLOG(2) << "Executing the thunk for " << thunk->hlo_instruction()->ToString() << " on stream " << stream_no; - TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); + TF_RETURN_IF_ERROR( + thunk->ExecuteOnStream(buffer_allocations, stream, &profiler)); if (thunk_schedule_->Depended(thunk)) { - auto finish_event = MakeUnique(main_stream->parent()); + auto finish_event = absl::make_unique(main_stream->parent()); finish_event->Init(); stream->ThenRecordEvent(finish_event.get()); thunk_to_finish_event[thunk] = std::move(finish_event); } - profiler.FinishOperation(thunk->hlo_instruction()); } main_stream->ThenWaitFor(&sub_streams); @@ -230,7 +160,7 @@ Status GpuExecutable::ExecuteThunks( if (!block_status.ok()) { return InternalError( "Failed to complete all kernels launched on stream %p: %s", - main_stream, block_status.error_message().c_str()); + main_stream, block_status.error_message()); } } @@ -253,9 +183,58 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } +StatusOr +GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) { + tensorflow::mutex_lock lock(module_handle_mutex_); + auto it = module_globals_.find(executor); + if (it != module_globals_.end()) { + return &it->second; + } + + se::MultiModuleLoaderSpec module_spec; + if (!cubin().empty()) { + module_spec.AddCudaCubinInMemory(cubin()); + } + module_spec.AddCudaPtxInMemory(ptx().c_str()); + + tensorflow::gtl::FlatMap globals; + se::ModuleHandle module_handle; + executor->LoadModule(module_spec, &module_handle); + + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); + ++i) { + const BufferAllocation& allocation = assignment_->GetAllocation(i); + if (allocation.is_constant()) { + TF_ASSIGN_OR_RETURN( + se::DeviceMemoryBase global, + executor->GetUntypedSymbol( + 
llvm_ir::ConstantBufferAllocationToGlobalName(allocation), + module_handle)); + VLOG(3) << "Resolved global " + << llvm_ir::ConstantBufferAllocationToGlobalName(allocation) + << " to " << global.opaque(); + InsertOrDie(&globals, i, global); + + const Literal& literal = + llvm_ir::LiteralForConstantAllocation(allocation); + CHECK(ShapeUtil::IsArray(literal.shape())); + if (!ShouldEmitLiteralInLlvmIr(literal)) { + VLOG(3) << "H2D memcpy for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D( + literal.untyped_data(), allocation.size(), &global)); + } + } + } + + module_handles_.emplace(executor, + se::ScopedModuleHandle(executor, module_handle)); + return &module_globals_.emplace(executor, std::move(globals)).first->second; +} + StatusOr GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { DeviceMemoryAllocator* memory_allocator = run_options->allocator(); @@ -264,6 +243,10 @@ StatusOr GpuExecutable::ExecuteOnStream( } BufferAllocations::Builder buffer_allocations_builder; + se::StreamExecutor* executor = run_options->stream()->parent(); + + TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor)); + for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); ++i) { const BufferAllocation& allocation = assignment_->GetAllocation(i); @@ -277,16 +260,19 @@ StatusOr GpuExecutable::ExecuteOnStream( if (buffer.is_null() && buffer.size() > 0) { return FailedPrecondition( "Cannot run XLA computation because pointer to (sub-)buffer at " - "index %s of parameter %lld was null. All pointers to " - "(sub-)buffers must not be null, unless the (sub-)buffer has zero " - "elements.", - allocation.param_shape_index().ToString().c_str(), param_no); + "index %s of parameter %d was null. 
All pointers to (sub-)buffers " + "must not be null, unless the (sub-)buffer has zero elements.", + allocation.param_shape_index().ToString(), param_no); } buffer_allocations_builder.RegisterBuffer(i, buffer); } + + if (allocation.is_constant()) { + buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i)); + } } - se::StreamExecutor* executor = run_options->stream()->parent(); + TF_ASSIGN_OR_RETURN( auto buffer_allocations, buffer_allocations_builder.Build( @@ -307,7 +293,7 @@ StatusOr GpuExecutable::ExecuteOnStream( // the respective location in ShapedBuffer. std::set buffers_in_result; TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus( - [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( + [&buffer_allocations, &buffers_in_result, this]( const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); // The points-to set is unambiguous so the set should be a @@ -339,7 +325,7 @@ StatusOr GpuExecutable::ExecuteOnStream( StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { // TODO(b/30671675): Implement asynchronous execution mode. return Unimplemented( "Asynchronous execution on stream is not yet supported on GPU."); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 80ec38c3ac114fe4ad9d56784330c1144d913db1..38b0f8f15bd28cf2659e4a53b6634e981545716b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -19,6 +19,9 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" @@ -32,8 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -66,7 +68,7 @@ class GpuExecutable : public Executable { } // Returns the compiled PTX for the computation. - tensorflow::StringPiece ptx() const { return ptx_; } + const string& ptx() const { return ptx_; } // Returns the cubin (compiled PTX) stored in this GpuExecutable. May be // empty, in which case compilation is left up to the GPU driver. @@ -76,12 +78,12 @@ class GpuExecutable : public Executable { // match the compute capability passed to this object's constructor. StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override; StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) override; + absl::Span arguments) override; private: // If `block_host_until_done` is false, execution will not block the host @@ -98,6 +100,15 @@ class GpuExecutable : public Executable { // computation. Uses points-to analysis from buffer assignment. 
const PointsToSet& GetRootPointsToSet() const; + using BufferAllocToDeviceMemoryMap = + tensorflow::gtl::FlatMap; + + // Loads the PTX or CUBIN for this executable into `executor` and resolves the + // globals corresponding to constant buffers. Returns a map mapping buffer + // allocation indices to GPU pointers. + StatusOr ResolveConstantGlobals( + stream_executor::StreamExecutor* executor); + // The LLVM IR, in string format, of the unoptimized module generated for this // GpuExecutable. We save a string instead of an llvm::Module* because leaving // llvm::Module* in a singleton can cause the heap checker to emit false @@ -126,6 +137,14 @@ class GpuExecutable : public Executable { // memory for every output/temp buffers. const std::unique_ptr assignment_; + // Cache of module handles and constant buffer allocation maps used by + // `ResolveConstantGlobals`. + tensorflow::mutex module_handle_mutex_; + std::map + module_handles_ GUARDED_BY(module_handle_mutex_); + std::map + module_globals_ GUARDED_BY(module_handle_mutex_); + TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d31fd5570c468b0c42fa308535fd335f3588a79 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -0,0 +1,84 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" + +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" + +namespace xla { +namespace gpu { + +namespace { +void AppendParams(const HloInstruction& instr, + std::vector* params) { + if (instr.opcode() == HloOpcode::kFusion) { + params->insert(std::end(*params), std::begin(instr.fused_parameters()), + std::end(instr.fused_parameters())); + } else { + for (HloInstruction* operand : instr.operands()) { + params->push_back(operand); + } + } +} +} // namespace + +bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, + const HloInstruction& reduce) { + std::vector params; + AppendParams(producer, ¶ms); + AppendParams(reduce, ¶ms); + int64 max_rank = -1; + const Layout* max_rank_layout; + for (HloInstruction* param : params) { + if (ShapeUtil::IsArray(param->shape()) && + ShapeUtil::Rank(param->shape()) > max_rank) { + max_rank = ShapeUtil::Rank(param->shape()); + max_rank_layout = ¶m->shape().layout(); + } + } + return absl::c_all_of(params, [&](HloInstruction* param) { + return (!ShapeUtil::IsArray(param->shape())) || + (ShapeUtil::Rank(param->shape()) < max_rank) || + (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout)); + }); +} + +bool IsInputFusibleReduction(const HloInstruction& instr) { + if (instr.IsMultiOutputFusion()) { + for (const HloInstruction* operand : + instr.fused_expression_root()->operands()) { + if (IsReductionToVector(*operand)) { + CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) + << " Multi-output fusion rooted at reduction-to-vector ops must be " + "of kind kInput: " + << instr.ToString(); + return true; + } + } + return false; + } else if (instr.opcode() == HloOpcode::kFusion) { + if (IsReductionToVector(*instr.fused_expression_root())) { + 
CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) + << " Fusion rooted at reduction-to-vector op must be of kind kInput: " + << instr.ToString(); + return true; + } + return false; + } + return IsReductionToVector(instr); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c24a0d5bbfcc61389ea19ae7f769671e4e974d --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +// TODO(b/112957171): Extract logic to determine fusibility of HLO ops from +// GpuInstructionFusion, FusionMerger, and GpuMultiOutputFusion. + +namespace xla { +namespace gpu { + +// The code emitted for reduce-rooted input fusions (EmitReductionToVector) +// suffers from poor data locality if the layouts of input parameters differ. In +// such situtations it is better not to fuse. Only input params with +// maximum rank are considered. 
Params with smaller ranks will be broadcasted +// and have not been observed to cause data locality issues. +// TODO(b/111977086): Improve reduce emitters to remove this limitation. +bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, + const HloInstruction& reduce); + +// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` +// is either an unfused reduction-to-vector op, an input fusion rooted at a +// reduction-to-vector op, or a multi-output input fusion with at least one +// reduction-to-vector op root. +// Note that reduction ops are lowered in different ways. Reduce input fusions +// are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at +// reduction-to-vector ops. Other reduction ops are lowered by +// GpuElementalIrEmitter and fused like elementwise ops. +bool IsInputFusibleReduction(const HloInstruction& instr); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_FUSIBLE_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d91b7bc61fda5a07c163a07ec0e1644d2ad9db49 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc @@ -0,0 +1,332 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace gpu { + +using GpuFusibleTest = HloTestBase; + +const char kModulePrefix[] = R"( + HloModule test_module + scalar_add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + })"; + +TEST_F(GpuFusibleTest, + LayoutsAreReduceInputFusionFriendly_ElementwiseProducer) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY entry { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + exp = f32[2,2,2]{2,1,0} exponential(p0) + ROOT reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + const HloInstruction* exp = + module->entry_computation()->root_instruction()->operand(0); + ASSERT_EQ(exp->opcode(), HloOpcode::kExp); + EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*exp, *reduce)); +} + +TEST_F(GpuFusibleTest, + LayoutsAreReduceInputFusionFriendly_MixedLayoutProducer) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + mixed_input_layouts_computation { + p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1) + copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1) + c0 = f16[] constant(0) + broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={} + greater-than = pred[128,1024,32,32]{1,3,2,0} greater-than(copy, broadcast) + ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1, broadcast) + } + fused_reduce { + p0.2 = 
f16[128,1024,32,32]{1,3,2,0} parameter(0) + convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2) + c0.2 = f32[] constant(0) + ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3}, to_apply=scalar_add + } + ENTRY entry { + p0 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + p1 = f16[128,1024,32,32]{3,2,1,0} parameter(1) + loop_fusion = f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop, calls=mixed_input_layouts_computation + reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput, calls=fused_reduce + ROOT root = (f32[1024]{0}, f16[128,1024,32,32]{1,3,2,0}) tuple(reduce_fusion, loop_fusion) + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce_fusion = + module->entry_computation()->root_instruction()->operand(0); + ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(), + HloOpcode::kReduce); + const HloInstruction* loop_fusion = + module->entry_computation()->root_instruction()->operand(1); + ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kSelect); + EXPECT_FALSE( + LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion)); +} + +TEST_F(GpuFusibleTest, LayoutsAreReduceInputFusionFriendly_CopyProducer) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduce { + p0.1 = f32[128,1024,32,32]{1,3,2,0} parameter(0) + c0.1 = f32[] constant(0) + ROOT reduce = f32[1024]{0} reduce(p0.1, c0.1), dimensions={0,2,3}, to_apply=scalar_add + } + ENTRY entry { + p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0) + copy = f32[128,1024,32,32]{1,3,2,0} copy(p0) + ROOT reduce_fusion = f32[1024]{0} fusion(copy), kind=kInput, calls=fused_reduce + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->fused_expression_root()->opcode(), HloOpcode::kReduce); + const HloInstruction* copy = + module->entry_computation()->root_instruction()->operand(0); + 
ASSERT_EQ(copy->opcode(), HloOpcode::kCopy); + EXPECT_FALSE(LayoutsAreReduceInputFusionFriendly(*copy, *reduce)); +} + +TEST_F(GpuFusibleTest, + LayoutsAreReduceInputFusionFriendly_LayoutChangingFusionProducer) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + layout_changing_computation { + p0.1 = f16[128,1024,32,32]{3,2,1,0} parameter(0) + p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1) + c0 = f16[] constant(0) + broadcast = f16[128,1024,32,32]{3,2,1,0} broadcast(c0), dimensions={} + greater-than = pred[128,1024,32,32]{3,2,1,0} greater-than(p1.1, broadcast) + select = f16[128,1024,32,32]{3,2,1,0} select(greater-than, p0.1, broadcast) + ROOT root = f16[128,1024,32,32]{1,3,2,0} copy(select) + } + fused_reduce { + p0.2 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2) + c0.2 = f32[] constant(0) + ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3}, to_apply=scalar_add + } + ENTRY entry { + p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0) + p1 = f16[128,1024,32,32]{3,2,1,0} parameter(1) + loop_fusion = f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop, calls=layout_changing_computation + ROOT reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput, calls=fused_reduce + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce_fusion = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(), + HloOpcode::kReduce); + const HloInstruction* loop_fusion = + module->entry_computation()->root_instruction()->operand(0); + ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kCopy); + EXPECT_FALSE( + LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion)); +} + +TEST_F(GpuFusibleTest, + LayoutsAreReduceInputFusionFriendly_ConsiderMaximumRanksParamsOnly) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + broadcasting_computation { + p0.1 = 
f32[128,1024,32,32]{1,3,2,0} parameter(0) + p1.1 = f32[128]{0} parameter(1) + broadcast = f32[128,1024,32,32]{1,3,2,0} broadcast(p1.1), dimensions={0} + ROOT add = f32[128,1024,32,32]{1,3,2,0} add(p0.1, broadcast) + } + ENTRY entry { + p0 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + p1 = f16[128]{0} parameter(1) + loop_fusion = f32[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop, calls=broadcasting_computation + c0.2 = f32[] constant(0) + ROOT reduce = f32[128,1024]{0,1} reduce(loop_fusion, c0.2), dimensions={0,2,3}, to_apply=scalar_add + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + const HloInstruction* loop_fusion = + module->entry_computation()->root_instruction()->operand(0); + ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kAdd); + EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY entry { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + // Reduction-to-vector lowered by IrEmitterUnnested. + ROOT reduce = f32[512]{0} reduce(p1, c0), dimensions={0,2,3}, to_apply=scalar_add + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_TRUE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY entry { + c0 = f32[] parameter(0) + p1 = f32[8,512,5,16,1,1]{5,4,3,2,1,0} parameter(1) + // Reduction lowered by GpuElementalIrEmitter. 
+ ROOT reduce = f32[8,512,5,1,1]{4,3,2,1,0} reduce(p1, c0), dimensions={3}, to_apply=scalar_add + })")) + .ValueOrDie(); + SCOPED_TRACE(module->ToString()); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_FALSE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + ROOT reduce = f32[128,512]{1,0} reduce(p1, c0), dimensions={2,3}, to_apply=scalar_add + } + ENTRY entry { + p0 = f32[128,512,28,28]{3,2,1,0} parameter(0) + ROOT fusion = f32[128,512]{1,0} fusion(p0), kind=kInput, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[8,512,5,16,1,1]{5,4,3,2,1,0} parameter(1) + ROOT reduce = f32[8,5,1,1]{3,2,1,0} reduce(p1, c0), dimensions={1,3}, to_apply=scalar_add + } + ENTRY entry { + p0 = f32[8,512,5,16,1,1]{5,4,3,2,1,0} parameter(0) + ROOT fusion = f32[8,5,1,1]{3,2,1,0} fusion(p0), kind=kLoop, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + reduce.0 = 
f32[128,512]{1,0} reduce(p1, c0), dimensions={2,3}, to_apply=scalar_add + reduce.1 = f32[128,512]{1,0} reduce(p1, c0), dimensions={2,3}, to_apply=scalar_add + ROOT root = (f32[128,512]{1,0}, f32[128,512]{1,0}) tuple(reduce.0, reduce.1) + } + ENTRY entry { + p0 = f32[128,512,28,28]{3,2,1,0} parameter(0) + ROOT fusion = (f32[128,512]{1,0}, f32[128,512]{1,0}) fusion(p0), kind=kInput, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, + IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + reduce = f32[128,512]{1,0} reduce(p1, c0), dimensions={2,3}, to_apply=scalar_add + mul = f32[128,512,28,28]{3,2,1,0} multiply(p1, p1) + ROOT root = (f32[128,512]{1,0}, f32[128,512,28,28]{3,2,1,0}) tuple(reduce, mul) + } + ENTRY entry { + p0 = f32[128,512,28,28]{3,2,1,0} parameter(0) + ROOT fusion = (f32[128,512]{1,0}, f32[128,512,28,28]{3,2,1,0}) fusion(p0), kind=kInput, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + reduce.0 = f32[512,28]{1,0} reduce(p1, c0), dimensions={0,2}, to_apply=scalar_add + reduce.1 = f32[512,28]{1,0} reduce(p1, c0), dimensions={0,2}, to_apply=scalar_add + ROOT root = (f32[512,28]{1,0}, f32[512,28]{1,0}) tuple(reduce.0, reduce.1) + } + ENTRY entry { + p0 = 
f32[128,512,28,28]{3,2,1,0} parameter(0) + ROOT fusion = (f32[512,28]{1,0}, f32[512,28]{1,0}) fusion(p0), kind=kLoop, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsInputFusibleReduction(*reduce)); +} + +TEST_F(GpuFusibleTest, + IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_reduction { + c0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + reduce = f32[512,28]{1,0} reduce(p1, c0), dimensions={0,2}, to_apply=scalar_add + mul = f32[128,512,28,28]{3,2,1,0} multiply(p1, p1) + ROOT root = (f32[512,28]{1,0}, f32[128,512,28,28]{3,2,1,0}) tuple(reduce, mul) + } + ENTRY entry { + p0 = f32[128,512,28,28]{3,2,1,0} parameter(0) + ROOT fusion = (f32[512,28]{1,0}, f32[128,512,28,28]{3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_reduction + })")) + .ValueOrDie(); + const HloInstruction* reduce = + module->entry_computation()->root_instruction(); + ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsInputFusibleReduction(*reduce)); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc new file mode 100644 index 0000000000000000000000000000000000000000..743035a84eaeb41fafb336844a1a7a07b82af4db --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -0,0 +1,218 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/buffer_value.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/compiler/xla/service/hlo_scheduling.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace gpu { + +namespace { + +// An HLO partial ordering based on the actual stream assignment and thunk +// launch order. +class GpuHloOrdering : public PredecessorHloOrdering { + public: + GpuHloOrdering(const HloModule* module, + const StreamAssignment& stream_assignment, + const std::vector& thunk_launch_order); + ~GpuHloOrdering() override = default; + + // Only the entry computation can possibly be sequentially ordered, and only + // if we've assigned all instructions to a single stream. + const std::vector* SequentialOrder( + const HloComputation& computation) const override { + return &computation == module_->entry_computation() ? entry_sequence_.get() + : nullptr; + } + + string ToString() const override { return ToStringHelper("GpuHloOrdering"); } + + private: + std::unique_ptr> entry_sequence_; +}; + +GpuHloOrdering::GpuHloOrdering( + const HloModule* module, const StreamAssignment& stream_assignment, + const std::vector& thunk_launch_order) + : PredecessorHloOrdering(module) { + // The entry computation has a total order when there's only one stream. 
+ if (stream_assignment.StreamCount() == 1) { + entry_sequence_ = absl::make_unique>( + thunk_launch_order); + } + + // The ordering of instructions for the entry computation is determined by the + // total order of thunk launches, and stream assignment. Instructions are + // sequential within a stream and concurrent across streams. In addition, the + // GpuExecutable adds cross-stream dependency edges to ensure each instruction + // waits for its operands before executing. + // + // The predecessor map is built incrementally, in thunk launch order. We + // record the most-recently seen instructions per stream in + // 'last_instruction_per_stream'. This lets us quickly determine the + // same-stream predecessors of each instruction. + + // Compute the set of all instructions we will want to set reachability on. + auto predecessor_map = absl::make_unique( + module->entry_computation()->MakeInstructionPostOrder()); + + // The most recently visited instruction per stream. + std::vector last_instruction_per_stream( + stream_assignment.StreamCount(), nullptr); + + for (const HloInstruction* hlo : thunk_launch_order) { + predecessor_map->SetReachable(hlo, hlo); + if (stream_assignment.HasStreamAssigned(*hlo)) { + // Gather all instruction which are immediate predecessors of 'hlo' in the + // reachability graph. + std::vector immediate_preds; + immediate_preds.insert(immediate_preds.end(), hlo->operands().begin(), + hlo->operands().end()); + immediate_preds.insert(immediate_preds.end(), + hlo->control_predecessors().begin(), + hlo->control_predecessors().end()); + + // All ops already queued on the same instruction stream, and their + // transitive predecessors, are predecessors. 
+ const int stream_no = stream_assignment.StreamNumberForHlo(*hlo); + if (last_instruction_per_stream[stream_no] != nullptr) { + immediate_preds.push_back(last_instruction_per_stream[stream_no]); + } + predecessor_map->FastSetReachabilityToUnion(immediate_preds, hlo); + last_instruction_per_stream[stream_no] = hlo; + } else { + // Only parameters and constants don't have an assigned stream, since they + // don't require a thunk. These ops don't have any predecessors. + CHECK(hlo->opcode() == HloOpcode::kParameter || + hlo->opcode() == HloOpcode::kConstant); + CHECK_EQ(hlo->operand_count(), 0); + } + } + predecessors_.emplace(module->entry_computation(), + std::move(predecessor_map)); + + // The ordering of instructions in subcomputations is based solely on control + // and data dependencies. + // + // TODO(toddw): Each subcomputation is actually emitted as a function in DFS + // postorder, so we can do better and establish the total order here. We don't + // do that yet since it's hard to ensure that the order here is the order used + // by IrEmitterNested. And mismatched ordering bugs would be hard to find. + for (auto* computation : module->computations()) { + if (computation != module->entry_computation() && + !computation->IsFusionComputation()) { + predecessors_.emplace(computation, computation->ComputeReachability()); + } + } +} + +// Computes a topological launch_order that is close to a breadth-first +// order. This heuristic works well for graphs where concurrent kernels are +// located at the same layer. It can often reduce dependency between concurrent +// GEMMs due to intra-stream total orders. E.g. consider the following HLO +// graph where the numbers in the parens indicate the stream assigned to each +// HLO. +// +// A(0) -> D(0) -> E(1) +// | +// v +// B(0) +// | +// v +// C(0) +// +// If the total order is A,B,C,D,E, then C and E would be sequentialized +// because C completes before D starts in stream 0, and E depends on D. 
+// However, if the total order is A,B,D,C,E, then C and E can run +// concurrently. +void BFSLaunchOrder(const HloComputation* computation, + std::vector* launch_order) { + // This topological sort uses two data structures: + // 1. `incoming_edge_count` which keeps track of the number of incoming + // edges to each HLO; + // 2. `queue` which contains all HLOs with no incoming edges. + // + // The sorting algorithm repeatedly pops the top from the queue and deletes + // that HLO from the graph, making more HLOs incoming-edge free. + std::deque queue; + std::unordered_map incoming_edge_count; + for (const auto& hlo : computation->instructions()) { + if (hlo->operand_count() == 0) { + queue.push_back(hlo); + } else { + incoming_edge_count[hlo] = + std::set(hlo->operands().begin(), + hlo->operands().end()) + .size(); + } + } + + while (!queue.empty()) { + const HloInstruction* x = queue.front(); + queue.pop_front(); + launch_order->push_back(x); + for (const HloInstruction* y : x->users()) { + --incoming_edge_count[y]; + if (incoming_edge_count[y] == 0) { + queue.push_back(y); + } + } + } +} + +} // end namespace + +GpuHloSchedule::GpuHloSchedule() {} + +/* static */ +StatusOr> GpuHloSchedule::Build( + const HloModule& module, const StreamAssignment& stream_assignment, + int64 pointer_size) { + std::unique_ptr schedule(new GpuHloSchedule); + + // Initialize thunk_launch_order_, the total order of thunk launches. + const HloComputation* entry_computation = module.entry_computation(); + if (stream_assignment.StreamCount() == 1) { + // All kernels are launched on a single stream, so there's no loss of + // concurrency by optimizing for minimal memory usage. + TF_ASSIGN_OR_RETURN( + schedule->thunk_launch_order_, + ScheduleOneComputation( + *entry_computation, [pointer_size](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); + })); + } else { + // BFS tends to increase concurrency, but also increases memory usage. 
+ BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_); + } + + schedule->hlo_ordering_ = absl::make_unique( + &module, stream_assignment, schedule->thunk_launch_order_); + + return std::move(schedule); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h new file mode 100644 index 0000000000000000000000000000000000000000..30a0e7cecd202e83898d34e00b5b49684d1b1b68 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h @@ -0,0 +1,68 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SCHEDULE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SCHEDULE_H_ + +#include +#include + +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { +namespace gpu { + +// Determines the schedule of HLO instructions, represented by the total order +// of thunk launches, and the partial order of HLO instructions. The HLO +// instructions are only partially ordered, despite the total ordering of thunk +// launches, because thunks may be scheduled onto concurrent streams. 
This +// schedule is used by BufferAssigner to determine buffer liveness (i.e. to +// minimize allocations), and also by ThunkSchedule to determine the thunk +// launch order. +class GpuHloSchedule { + public: + // Constructs a GpuHloSchedule for the given module, based on the given + // stream assignment. + static StatusOr> Build( + const HloModule& module, const StreamAssignment& stream_assignment, + int64 pointer_size); + + // Returns the total order of thunk launches, represented in terms of HLO + // instructions. + const std::vector& ThunkLaunchOrder() const { + return thunk_launch_order_; + } + + // Returns the partial order of HLO instructions. This method may only be + // called once. The order is based on the total order of thunk launches, the + // stream assignment, and the data dependencies in the HLO DAG. + std::unique_ptr ConsumeHloOrdering() { + return std::move(hlo_ordering_); + } + + private: + GpuHloSchedule(); + + std::vector thunk_launch_order_; + std::unique_ptr hlo_ordering_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SCHEDULE_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0922e44a126eadab17d60d9ece53aae8d8f1c218 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -0,0 +1,404 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace gpu { + +class GpuHloScheduleTest : public HloTestBase { + protected: + using HloVec = std::vector; + + // Pre-canned shapes. + Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); + + static std::unique_ptr BuildGpuHloSchedule( + const HloModule& module, const StreamAssignment& streams) { + return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8) + .ConsumeValueOrDie(); + } + + std::unique_ptr CreateNewModule() { + HloModuleConfig config; + auto debug_options = GetDebugOptionsForTest(); + debug_options.set_xla_gpu_disable_multi_streaming(false); + config.set_debug_options(debug_options); + return absl::make_unique("test_module", config); + } + + HloVec RemoveHlo(const HloVec& input, + const std::unordered_set& remove) { + HloVec result(input); + result.erase(std::remove_if(result.begin(), result.end(), + [&remove](const HloInstruction* x) { + return remove.count(x) > 0; + }), + result.end()); + return result; + } +}; + +// Test of a single stream, where data dependencies fully determine the +// execution order. 
+TEST_F(GpuHloScheduleTest, SequentialMatMul) { + HloComputation::Builder builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); + HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); + HloInstruction* dot1 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); + HloInstruction* dot2 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(dot2)); + + std::unique_ptr streams = AssignStreams(*module); + EXPECT_EQ(streams->StreamNumberForHlo(*dot1), + streams->StreamNumberForHlo(*dot2)); + + auto schedule = BuildGpuHloSchedule(*module, *streams); + // Remove parameters, which are unordered. + EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), + HloVec({dot1, dot2})); + + // Parameters x,y,z are mutually unordered, while dot1 and dot2 are + // transitively ordered by operands. 
+ auto order = schedule->ConsumeHloOrdering(); + EXPECT_TRUE(order->ExecutesBefore(x, dot1)); + EXPECT_TRUE(order->ExecutesBefore(x, dot2)); + EXPECT_TRUE(order->ExecutesBefore(y, dot1)); + EXPECT_TRUE(order->ExecutesBefore(y, dot2)); + EXPECT_TRUE(order->ExecutesBefore(z, dot2)); + EXPECT_TRUE(order->ExecutesBefore(dot1, dot2)); + + EXPECT_FALSE(order->ExecutesBefore(x, x)); + EXPECT_FALSE(order->ExecutesBefore(x, y)); + EXPECT_FALSE(order->ExecutesBefore(x, z)); + EXPECT_FALSE(order->ExecutesBefore(y, x)); + EXPECT_FALSE(order->ExecutesBefore(y, y)); + EXPECT_FALSE(order->ExecutesBefore(y, z)); + EXPECT_FALSE(order->ExecutesBefore(z, x)); + EXPECT_FALSE(order->ExecutesBefore(z, y)); + EXPECT_FALSE(order->ExecutesBefore(z, z)); + EXPECT_FALSE(order->ExecutesBefore(z, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot1, x)); + EXPECT_FALSE(order->ExecutesBefore(dot1, y)); + EXPECT_FALSE(order->ExecutesBefore(dot1, z)); + EXPECT_FALSE(order->ExecutesBefore(dot1, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot2, x)); + EXPECT_FALSE(order->ExecutesBefore(dot2, y)); + EXPECT_FALSE(order->ExecutesBefore(dot2, z)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot2)); +} + +// Test of a single stream, where data dependencies do not fully determine the +// execution order, but the stream assignment does. 
+TEST_F(GpuHloScheduleTest, SequentialAdd) { + HloComputation::Builder builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); + HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); + HloInstruction* add1 = builder.AddInstruction( + HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, x, y)); + HloInstruction* add2 = builder.AddInstruction( + HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, y, z)); + HloInstruction* add3 = builder.AddInstruction( + HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(add3)); + + std::unique_ptr streams = AssignStreams(*module); + EXPECT_EQ(streams->StreamNumberForHlo(*add1), + streams->StreamNumberForHlo(*add2)); + EXPECT_EQ(streams->StreamNumberForHlo(*add1), + streams->StreamNumberForHlo(*add3)); + + auto schedule = BuildGpuHloSchedule(*module, *streams); + // Remove parameters, which are unordered. + EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), + HloVec({add1, add2, add3})); + + // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are + // transitively ordered by operands. 
+ auto order = schedule->ConsumeHloOrdering(); + EXPECT_TRUE(order->ExecutesBefore(x, add1)); + EXPECT_TRUE(order->ExecutesBefore(x, add2)); + EXPECT_TRUE(order->ExecutesBefore(x, add3)); + EXPECT_TRUE(order->ExecutesBefore(y, add1)); + EXPECT_TRUE(order->ExecutesBefore(y, add2)); + EXPECT_TRUE(order->ExecutesBefore(y, add3)); + EXPECT_TRUE(order->ExecutesBefore(z, add2)); + EXPECT_TRUE(order->ExecutesBefore(z, add3)); + EXPECT_TRUE(order->ExecutesBefore(add1, add3)); + EXPECT_TRUE(order->ExecutesBefore(add2, add3)); + // The HLO graph does not define an ordering for add1 and add2, but their + // assignment onto the same stream does define an ordering. + if (order->ExecutesBefore(add1, add2)) { + EXPECT_FALSE(order->ExecutesBefore(add2, add1)); + } else { + EXPECT_TRUE(order->ExecutesBefore(add2, add1)); + EXPECT_FALSE(order->ExecutesBefore(add1, add2)); + } + + EXPECT_FALSE(order->ExecutesBefore(x, x)); + EXPECT_FALSE(order->ExecutesBefore(x, y)); + EXPECT_FALSE(order->ExecutesBefore(x, z)); + EXPECT_FALSE(order->ExecutesBefore(y, x)); + EXPECT_FALSE(order->ExecutesBefore(y, y)); + EXPECT_FALSE(order->ExecutesBefore(y, z)); + EXPECT_FALSE(order->ExecutesBefore(z, x)); + EXPECT_FALSE(order->ExecutesBefore(z, y)); + EXPECT_FALSE(order->ExecutesBefore(z, z)); + EXPECT_FALSE(order->ExecutesBefore(z, add1)); + EXPECT_FALSE(order->ExecutesBefore(add1, x)); + EXPECT_FALSE(order->ExecutesBefore(add1, y)); + EXPECT_FALSE(order->ExecutesBefore(add1, z)); + EXPECT_FALSE(order->ExecutesBefore(add1, add1)); + EXPECT_FALSE(order->ExecutesBefore(add2, x)); + EXPECT_FALSE(order->ExecutesBefore(add2, y)); + EXPECT_FALSE(order->ExecutesBefore(add2, z)); + EXPECT_FALSE(order->ExecutesBefore(add2, add2)); +} + +// Test of two streams. 
+TEST_F(GpuHloScheduleTest, ConcurrentMatMul) { + HloComputation::Builder builder("entry_computation"); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); + HloInstruction* dot1 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); + HloInstruction* dot2 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, y, x)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, dot2)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(add)); + + std::unique_ptr streams = AssignStreams(*module); + EXPECT_NE(streams->StreamNumberForHlo(*dot1), + streams->StreamNumberForHlo(*dot2)); + + auto schedule = BuildGpuHloSchedule(*module, *streams); + // Remove parameters, which are unordered. + HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y}); + EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) || + thunk_launch_order == HloVec({dot2, dot1, add})); + + // Parameters x,y are mutually unordered, while dot1, dot2 and add are + // transitively ordered by operands. 
+ auto order = schedule->ConsumeHloOrdering(); + EXPECT_TRUE(order->ExecutesBefore(x, dot1)); + EXPECT_TRUE(order->ExecutesBefore(x, dot2)); + EXPECT_TRUE(order->ExecutesBefore(y, dot1)); + EXPECT_TRUE(order->ExecutesBefore(y, dot2)); + EXPECT_TRUE(order->ExecutesBefore(dot1, add)); + EXPECT_TRUE(order->ExecutesBefore(dot2, add)); + + EXPECT_FALSE(order->ExecutesBefore(x, x)); + EXPECT_FALSE(order->ExecutesBefore(x, y)); + EXPECT_FALSE(order->ExecutesBefore(y, x)); + EXPECT_FALSE(order->ExecutesBefore(y, y)); + EXPECT_FALSE(order->ExecutesBefore(dot1, x)); + EXPECT_FALSE(order->ExecutesBefore(dot1, y)); + EXPECT_FALSE(order->ExecutesBefore(dot1, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot1, dot2)); + EXPECT_FALSE(order->ExecutesBefore(dot2, x)); + EXPECT_FALSE(order->ExecutesBefore(dot2, y)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot2)); + EXPECT_FALSE(order->ExecutesBefore(add, x)); + EXPECT_FALSE(order->ExecutesBefore(add, y)); + EXPECT_FALSE(order->ExecutesBefore(add, dot1)); + EXPECT_FALSE(order->ExecutesBefore(add, dot2)); + EXPECT_FALSE(order->ExecutesBefore(add, add)); +} + +// Test of multiple streams. 
+TEST_F(GpuHloScheduleTest, LatticeMatMul) { + // d00 -- layer 0 + // / \ + // d10 d11 -- layer 1 + // / \ / \ + // d20 d21 d22 -- layer 2 + // \ / \ / + // d30 d31 -- layer 3 + // \ / + // d40 -- layer 4 + HloComputation::Builder builder("entry_computation"); + std::vector params; + params.reserve(6); + for (int i = 0; i < 6; ++i) { + params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( + i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i)))); + } + HloInstruction* d00 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); + HloInstruction* d10 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00)); + HloInstruction* d11 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4])); + HloInstruction* d20 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10)); + HloInstruction* d21 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11)); + HloInstruction* d22 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5])); + HloInstruction* d30 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21)); + HloInstruction* d31 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22)); + HloInstruction* d40 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build(d40)); + + std::unique_ptr streams = AssignStreams(*module); + // The two dots on layer 1 are concurrent. + EXPECT_NE(streams->StreamNumberForHlo(*d10), + streams->StreamNumberForHlo(*d11)); + // The three dots on layer 2 are concurrent. 
+ EXPECT_NE(streams->StreamNumberForHlo(*d20), + streams->StreamNumberForHlo(*d21)); + EXPECT_NE(streams->StreamNumberForHlo(*d20), + streams->StreamNumberForHlo(*d22)); + EXPECT_NE(streams->StreamNumberForHlo(*d21), + streams->StreamNumberForHlo(*d22)); + // The two dots on layer 3 are concurrent. + EXPECT_NE(streams->StreamNumberForHlo(*d30), + streams->StreamNumberForHlo(*d31)); + + // We don't check the thunk launch order, since there are many valid total + // orders, and it's annoying to express. + auto schedule = BuildGpuHloSchedule(*module, *streams); + + auto order = schedule->ConsumeHloOrdering(); + const HloVec all_params( + {params[0], params[1], params[2], params[3], params[4], params[5]}); + const HloVec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40}); + + // Parameters are mutually unordered, and never execute before ops. + for (const HloInstruction* param : all_params) { + for (const HloInstruction* param2 : all_params) { + EXPECT_FALSE(order->ExecutesBefore(param, param2)); + } + for (const HloInstruction* op : all_ops) { + EXPECT_FALSE(order->ExecutesBefore(op, param)); + } + } + + // Check ordering of params before ops. 
+ for (const HloInstruction* op : all_ops) { + if (op == d20 || op == d30 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(params[0], op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(params[0], op)); + } + if (op != d00 && op != d11 && op != d22) { + EXPECT_TRUE(order->ExecutesBefore(params[1], op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(params[1], op)); + } + EXPECT_TRUE(order->ExecutesBefore(params[2], op)); + EXPECT_TRUE(order->ExecutesBefore(params[3], op)); + if (op != d00 && op != d10 && op != d20) { + EXPECT_TRUE(order->ExecutesBefore(params[4], op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(params[4], op)); + } + if (op == d22 || op == d31 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(params[5], op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(params[5], op)); + } + } + + // Check ordering of ops before ops. + for (const HloInstruction* op : all_ops) { + if (op != d00) { + EXPECT_TRUE(order->ExecutesBefore(d00, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d00, op)); + } + + if (op == d20 || op == d21 || op == d30 || op == d31 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d10, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d10, op)); + } + + if (op == d21 || op == d22 || op == d30 || op == d31 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d11, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d11, op)); + } + + if (op == d30 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d20, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d20, op)); + } + + if (op == d30 || op == d31 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d21, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d21, op)); + } + + if (op == d31 || op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d22, op)); + } else { + EXPECT_FALSE(order->ExecutesBefore(d22, op)); + } + + if (op == d40) { + EXPECT_TRUE(order->ExecutesBefore(d30, op)); + EXPECT_TRUE(order->ExecutesBefore(d31, op)); + } else { + 
EXPECT_FALSE(order->ExecutesBefore(d30, op)); + EXPECT_FALSE(order->ExecutesBefore(d31, op)); + } + + EXPECT_FALSE(order->ExecutesBefore(d40, op)); + } +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc index 4944c41f7d8dc7a78a3cd094aee4d7087c74857e..4268fb2c7a813b3b53e4cd48746028a7b369f28e 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc @@ -34,9 +34,8 @@ StatusOr GpuHloSupportChecker::Run(HloModule* module) { return xla::Unimplemented( "GPU backend does not support HLO instruction %s with shape " "containing a sparse layout: %s", - instruction->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction->shape()) - .c_str()); + instruction->ToString(), + ShapeUtil::HumanStringWithLayout(instruction->shape())); } return Status::OK(); })); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h index d63e213d2b1efab4bcff75541cc5ab33d7a07976..bbb3340760c8330bd6570f33382f004315c6d0bd 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h @@ -28,9 +28,7 @@ class GpuHloSupportChecker : public HloPassInterface { GpuHloSupportChecker() = default; ~GpuHloSupportChecker() override = default; - tensorflow::StringPiece name() const override { - return "gpu_hlo_support_checker"; - } + absl::string_view name() const override { return "gpu_hlo_support_checker"; } // Note: always returns false (no instructions are ever modified by this // pass). 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 178457721a798a9fc46f36f3863b3c4b41d9a1e8..d033faee8d25ed81a1483f8314652ef999ab36c5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -31,52 +31,58 @@ limitations under the License. namespace xla { namespace gpu { -using stream_executor::dnn::DataLayout; -using stream_executor::dnn::FilterLayout; - -static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) { - int major, minor; - CHECK(stream_executor.GetDeviceDescription().cuda_compute_capability(&major, - &minor)); - return major >= 7; -} +using se::dnn::DataLayout; +using se::dnn::FilterLayout; // Returns (input, filter, output) layouts. static std::tuple HeuristicLayoutAssignment(const HloInstruction* instr, - stream_executor::StreamExecutor* stream_executor) { + se::StreamExecutor* stream_executor) { // DataLayout and FilterLayout uses weird enum names. Translations: // N <=> Batch or Output // C <=> Depth or Input // H <=> Y // W <=> X // - // Therefore kOutputInputYX means NHWC; kBatchDepthYX means NCHW. + // Therefore kOutputInputYX and kBatchDepthYX mean NCHW. + // + // If you have trouble keeping these straight, consider that all that matters + // is the location of the channel dim: Is it major (NCHW), or minor (NHWC)? + + constexpr auto kAllNCHW = + std::make_tuple(DataLayout::kBatchDepthYX, FilterLayout::kOutputInputYX, + DataLayout::kBatchDepthYX); + constexpr auto kAllNHWC = + std::make_tuple(DataLayout::kBatchYXDepth, FilterLayout::kOutputYXInput, + DataLayout::kBatchYXDepth); - // As of today, our empirical evidence is that cudnn 7.0 is faster on V100 x - // fp16 with the mostly-NHWC layout. The heuristic may change as cudnn version - // changes, as well as the hardware updates. + // If we're not Volta or not fp16, the decision is easy: Use NCHW. 
if (!(instr->operand(0)->shape().element_type() == xla::PrimitiveType::F16 && IsVoltaOrLater(*stream_executor))) { - return std::make_tuple(DataLayout::kBatchDepthYX, - FilterLayout::kOutputInputYX, - DataLayout::kBatchDepthYX); + return kAllNCHW; } + VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString(); - // For BackwardInput that has stride, full NHWC layouts run significantly - // slower than (NHWC, NCHW, NCHW) or (NHWC, NCHW, NHWC). + + // Empirically we've found with Volta and cudnn 7 that backward-input convs + // with stride are significantly faster with NCHW layouts. // - // TODO(timshen): more closely compare (NHWC, NCHW, NCHW) and (NHWC, NCHW, - // NHWC). + // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW), + // which on paper gives good performance. However, there are two observations: + // * a mixed layout combination is more cuDNN-bug prone, based on empirical + // evidence. + // * we've also observed that for mixed layouts, cuDNN transposes data back + // and forth from a different layout combination. If we end up with + // transposes anyway, we prefer to have them in XLA, as they can be fused. + // TODO(timshen): Figure out the exact condition. This may be achieved by + // auto-tuning layouts offline. if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget && window_util::HasStride(instr->window())) { - return std::make_tuple(DataLayout::kBatchYXDepth, - FilterLayout::kOutputInputYX, - DataLayout::kBatchDepthYX); + return kAllNCHW; } - return std::make_tuple(DataLayout::kBatchYXDepth, - FilterLayout::kOutputYXInput, - DataLayout::kBatchYXDepth); + + // For other Volta f16 convolutions, use NHWC. + return kAllNHWC; } // Adds layout constraints on the cudnn custom-call instruction. 
The layout @@ -159,11 +165,49 @@ Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall( Status GpuLayoutAssignment::AddBackendConstraints( LayoutConstraints* constraints) { - for (auto* instruction : constraints->computation()->instructions()) { + // Add convolution constraints in reverse postorder so that the earliest + // convolution layout propagates first. This reduces the likelihood of fusion + // nodes with copies. + auto post_order = constraints->computation()->MakeInstructionPostOrder(); + for (auto iterator = post_order.rbegin(); iterator != post_order.rend(); + ++iterator) { + HloInstruction* instruction = *iterator; if (IsCustomCallToDnnConvolution(*instruction)) { TF_RETURN_IF_ERROR( AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); } + + // For batched dot we require the default layout. + // TODO(b/112111608): This is overly conservative, the only real restriction + // is that batch dimensions must be major. + if (instruction->opcode() == HloOpcode::kDot && + ImplementedAsGemm(*instruction) && + instruction->dot_dimension_numbers().lhs_batch_dimensions_size() > 0) { + // Verify that the batch dims come before the row and col dims. + const DotDimensionNumbers& dim_nums = + instruction->dot_dimension_numbers(); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size(), + dim_nums.rhs_batch_dimensions_size()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, + ShapeUtil::Rank(instruction->shape())); + for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) { + CHECK_LT(batch_dim, ShapeUtil::Rank(instruction->shape()) - 2); + } + + // Set both inputs and the output to default layout. 
+ Shape op0_shape = instruction->operand(0)->shape(); + LayoutUtil::SetToDefaultLayout(&op0_shape); + Shape op1_shape = instruction->operand(1)->shape(); + LayoutUtil::SetToDefaultLayout(&op1_shape); + Shape output_shape = instruction->shape(); + LayoutUtil::SetToDefaultLayout(&output_shape); + TF_RETURN_IF_ERROR( + constraints->SetOperandLayout(op0_shape, instruction, 0)); + TF_RETURN_IF_ERROR( + constraints->SetOperandLayout(op1_shape, instruction, 1)); + TF_RETURN_IF_ERROR( + constraints->SetInstructionLayout(output_shape, instruction)); + } } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc index e48165c1426ea04839c245bc20b851a0f1710246..fbc8ddf599570b90e93eb463a1fd6c275b73711c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc @@ -15,13 +15,16 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -31,6 +34,8 @@ namespace xla { namespace gpu { namespace { +namespace op = xla::testing::opcode_matchers; + using LayoutAssignmentTest = HloTestBase; TEST_F(LayoutAssignmentTest, Elementwise) { @@ -115,7 +120,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) { for (const Shape& input_shape : AllLayoutsOf(shape)) { for (const Shape& result_shape : AllLayoutsOf(shape)) { - SCOPED_TRACE(tensorflow::strings::StrCat( + SCOPED_TRACE(absl::StrCat( "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape), ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape))); @@ -132,10 +137,10 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) { HloInstruction::CreateParameter(4, aux_shape, "variance")); auto* epsilon = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto* feature_index = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(kFeatureIndex))); + LiteralUtil::CreateR0(kFeatureIndex))); auto* batchnorm = builder.AddInstruction(HloInstruction::CreateCustomCall( shape, @@ -188,7 +193,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) { // Enumerate all 
combinations of shapes. for (const Shape& input_shape : AllLayoutsOf(shape)) { for (const Shape& result_shape : AllLayoutsOf(shape)) { - SCOPED_TRACE(tensorflow::strings::StrCat( + SCOPED_TRACE(absl::StrCat( "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape), ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape))); @@ -201,10 +206,10 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) { HloInstruction::CreateParameter(2, offset_scale_shape, "offset")); auto* epsilon = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto* feature_index = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(kFeatureIndex))); + LiteralUtil::CreateR0(kFeatureIndex))); auto* batchnorm = builder.AddInstruction(HloInstruction::CreateCustomCall( batchnorm_shape, {operand, scale, offset, epsilon, feature_index}, @@ -261,7 +266,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { for (const Shape& input_shape : AllLayoutsOf(shape)) { for (const Shape& result_shape : AllLayoutsOf(shape)) { for (int constrained_param_no : {0, 4}) { - SCOPED_TRACE(tensorflow::strings::StrCat( + SCOPED_TRACE(absl::StrCat( "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape), ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape))); @@ -278,10 +283,10 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { HloInstruction::CreateParameter(4, shape, "var")); auto* epsilon = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto* feature_index = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(kFeatureIndex))); + LiteralUtil::CreateR0(kFeatureIndex))); auto* batchnorm = builder.AddInstruction(HloInstruction::CreateCustomCall( @@ -327,6 +332,33 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { } } +TEST_F(LayoutAssignmentTest, DotLayout) { + const char* 
hlo_text = R"( + HloModule DotLayout + ENTRY dot { + p0 = f32[8,8,256,64]{3,1,2,0} parameter(0) + p1 = f32[8,8,256,64]{3,1,2,0} parameter(1) + ROOT dot.1330.10585 = f32[8,8,256,256]{3,2,1,0} dot(p0, p1), + lhs_batch_dims={0,1}, lhs_contracting_dims={3}, + rhs_batch_dims={0,1}, rhs_contracting_dims={3} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + ComputationLayout computation_layout( + module->entry_computation()->ComputeProgramShape()); + GpuLayoutAssignment layout_assignment(&computation_layout, + backend().default_stream_executor()); + EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); + + Shape expected_shape = + ShapeUtil::MakeShapeWithLayout(F32, {8, 8, 256, 64}, {3, 2, 1, 0}); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::ShapeWithLayout(expected_shape), + op::ShapeWithLayout(expected_shape))); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index 7bb8df6581b49b1bf8c84a972f715e8dc119d8de..f3c274429242d5c989146d14ea523b5910408cff 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -19,9 +19,12 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "llvm/IR/DataLayout.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,15 +37,14 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { +namespace gpu { // TODO(b/30467474) Once GPU infeed implementation settles, consider // folding back the cpu and gpu infeed implementations into a generic // one if possible. -GpuTransferManager::GpuTransferManager() - : GenericTransferManager( - se::cuda::kCudaPlatformId, - /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout) - .getPointerSize(0 /* default address space */)) {} +GpuTransferManager::GpuTransferManager(se::Platform::Id id, + unsigned pointer_size) + : GenericTransferManager(id, pointer_size) {} Status GpuTransferManager::TransferLiteralToInfeed( se::StreamExecutor* executor, const LiteralSlice& literal) { @@ -50,53 +52,28 @@ Status GpuTransferManager::TransferLiteralToInfeed( VLOG(2) << "Transferring literal to infeed with shape: " << ShapeUtil::HumanString(shape); - if (!ShapeUtil::IsTuple(shape)) { - int64 size = GetByteSizeRequirement(shape); - return TransferBufferToInfeed(executor, size, literal.untyped_data()); - } - - if (ShapeUtil::IsNestedTuple(shape)) { - return Unimplemented( - "Infeed with a nested tuple shape is not supported: %s", - ShapeUtil::HumanString(literal.shape()).c_str()); - } - // For a tuple, we transfer each of its elements to the device and // enqueue the resulting destination device addresses with the // infeed manager. 
- std::vector buffers; - buffers.reserve(ShapeUtil::TupleElementCount(shape)); - auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() { - for (gpu::InfeedBuffer* b : buffers) { - b->Done(); - } - }); - - for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - const Shape& tuple_element_shape = - ShapeUtil::GetTupleElementShape(shape, i); - int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape); - TF_ASSIGN_OR_RETURN( - gpu::InfeedBuffer * buffer, - TransferBufferToInfeedInternal(executor, tuple_element_size, - literal.untyped_data({i}))); - buffers.push_back(buffer); - } - - cleanup.release(); - return EnqueueBuffersToInfeed(executor, buffers); -} - -Status GpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor, - int64 size, - const void* source) { - TF_ASSIGN_OR_RETURN(gpu::InfeedBuffer * buffer, - TransferBufferToInfeedInternal(executor, size, source)); - return EnqueueBuffersToInfeed(executor, {buffer}); + ShapeTree buffer_tree(shape); + + TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( + shape, [&](const Shape& literal_subshape, const ShapeIndex& index) { + if (ShapeUtil::IsArray(literal_subshape)) { + int64 tuple_element_size = GetByteSizeRequirement(literal_subshape); + TF_ASSIGN_OR_RETURN( + *buffer_tree.mutable_element(index), + TransferBufferToInfeedInternal(executor, tuple_element_size, + literal.untyped_data(index))); + } + return Status::OK(); + })); + + return EnqueueBuffersToInfeed(executor, std::move(buffer_tree)); } Status GpuTransferManager::EnqueueBuffersToInfeed( - se::StreamExecutor* executor, std::vector buffers) { + se::StreamExecutor* executor, ShapeTree buffers) { gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager(); se::Stream* stream = infeed_manager->GetStream(executor); @@ -106,24 +83,21 @@ Status GpuTransferManager::EnqueueBuffersToInfeed( // possible. 
Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { - for (gpu::InfeedBuffer* b : buffers) { - b->Done(); - } return InternalError("Failed to complete data transfer on stream %p: %s", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } - infeed_manager->EnqueueBuffers(buffers); + infeed_manager->EnqueueDestination(std::move(buffers)); VLOG(2) << "Infeed data transferred"; return Status::OK(); } -StatusOr GpuTransferManager::TransferBufferToInfeedInternal( +StatusOr GpuTransferManager::TransferBufferToInfeedInternal( se::StreamExecutor* executor, int64 size, const void* source) { if (size > std::numeric_limits::max()) { - return InvalidArgument("Infeed shape is too large: needs %lld bytes", size); + return InvalidArgument("Infeed shape is too large: needs %d bytes", size); } if (size == 0) { @@ -136,23 +110,86 @@ StatusOr GpuTransferManager::TransferBufferToInfeedInternal( return InternalError("Failed to obtain a stream"); } - gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size); - stream->ThenMemcpy(buffer->device_memory(), source, size); + InfeedBuffer buffer(executor, size); + stream->ThenMemcpy(buffer.device_memory(), source, size); VLOG(2) << "Queued infeed data on stream " << stream; - return buffer; + return std::move(buffer); +} + +static void ShapeTreeToLiteral( + ShapeTree>* shape_tree) { + // This is a struct instead of a lambda for std::function-free recursion. 
+ struct Helper { + static void helper( + ShapeTree>* shape_tree, + ShapeIndex* index) { + const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index); + if (ShapeUtil::IsArray(shape)) { + (*shape_tree->mutable_element(*index))->WaitUntilAvailable(); + return; + } + + CHECK(ShapeUtil::IsTuple(shape)) + << ShapeUtil::HumanStringWithLayout(shape); + const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape); + index->push_back(0); + for (int64 i = 0; i < tuple_element_count; ++i) { + index->back() = i; + helper(shape_tree, index); + } + index->pop_back(); + } + }; + ShapeIndex index; + Helper::helper(shape_tree, &index); +} + +Status GpuTransferManager::TransferLiteralFromOutfeed( + se::StreamExecutor* /*executor*/, const Shape& literal_shape, + MutableBorrowingLiteral literal) { + ShapeTree> outfeed_buffers( + &literal_shape); + + // First create a tree of literal buffers that the device can write to. + outfeed_buffers.ForEachMutableElement( + [&](const ShapeIndex& index, + std::unique_ptr* buffer) { + const Shape& shape = ShapeUtil::GetSubshape(literal_shape, index); + // Do not transfer tuple index buffers. + if (ShapeUtil::IsTuple(shape)) { + return; + } + *buffer = absl::make_unique( + GetByteSizeRequirement(shape)); + (*buffer)->set_destination( + absl::make_unique(literal, index)); + }); + + // Give the tree of buffers to the outfeed manager. The device will fill it + // while we're waiting for it below. + gpu::OutfeedManager* outfeed_manager = gpu::GetOrCreateOutfeedManager(); + outfeed_manager->EnqueueDestination(&outfeed_buffers); + + // Now wait until the tree of buffers is written. 
+ ShapeTreeToLiteral(&outfeed_buffers); + return Status::OK(); } +} // namespace gpu } // namespace xla -static std::unique_ptr CreateGpuTransferManager() { - return xla::MakeUnique(); +static std::unique_ptr CreateNVPTXTransferManager() { + return absl::make_unique( + /*id=*/stream_executor::cuda::kCudaPlatformId, + /*pointer_size=*/llvm::DataLayout(xla::gpu::NVPTXCompiler::kDataLayout) + .getPointerSize(0 /* default address space */)); } static bool InitModule() { xla::TransferManager::RegisterTransferManager( - stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager); + stream_executor::cuda::kCudaPlatformId, &CreateNVPTXTransferManager); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h index 09f8227f508a3159f3def285898e15bfad544552..fa88816bc8b0bf41f05358c0089b381305ed3182 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_ #include #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" @@ -28,33 +29,36 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" namespace xla { +namespace gpu { // An implementation of the XLA GenericTransferManager that // handles GPU-specific infeed. class GpuTransferManager : public GenericTransferManager { public: - GpuTransferManager(); + GpuTransferManager(se::Platform::Id id, unsigned pointer_size); ~GpuTransferManager() override {} Status TransferLiteralToInfeed(se::StreamExecutor* executor, const LiteralSlice& literal) override; - Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, - const void* source) override; + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + MutableBorrowingLiteral literal) override; private: // Initiates the infeed data transfers. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. - StatusOr TransferBufferToInfeedInternal( + StatusOr TransferBufferToInfeedInternal( se::StreamExecutor* executor, int64 size, const void* source); // Enqueues infeed data buffers with the infeed manager after their // transfer completes. Status EnqueueBuffersToInfeed(se::StreamExecutor* executor, - std::vector buffers); + ShapeTree buffers); TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager); }; +} // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9c21e8edb2bdde03acb1fe6197a399724c9c8ab --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc @@ -0,0 +1,123 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" + +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace xla { +namespace gpu { +namespace { +void InitAndStartTimer(std::stack>* timers, + se::Stream* stream) { + timers->push(absl::make_unique(stream->parent())); + stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get()); +} + +uint64 GetCyclesTaken(std::stack>* timers, + const std::vector& sub_streams, + se::Stream* stream, double clock_rate_ghz) { + CHECK_GT(timers->size(), 0); + stream->ThenWaitFor(&sub_streams); + stream->ThenStopTimer(timers->top().get()); + stream->BlockHostUntilDone().IgnoreError(); + double nanoseconds = timers->top()->Nanoseconds(); + timers->pop(); + return static_cast(nanoseconds * clock_rate_ghz); +} +} // namespace + +HloExecutionProfiler::HloExecutionProfiler( + bool do_profile, HloExecutionProfile* profile, se::Stream* stream, + const std::vector& sub_streams, + const HloComputation* computation) + : do_profile_(do_profile), + profile_(profile), + stream_(stream), + 
sub_streams_(sub_streams), + computation_(computation) { + if (do_profile_) { + clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); + InitAndStartTimer(&timers_, stream); + } +} + +void HloExecutionProfiler::FinishExecution() { + CHECK(!finished_execution_) << "Call FinishExecution only once!"; + finished_execution_ = true; + if (do_profile_) { + profile_->set_total_cycles_executed( + *computation_, + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); + } +} + +void HloExecutionProfiler::StartHloComputation() { + if (do_profile_) { + InitAndStartTimer(&timers_, stream_); + } +} + +void HloExecutionProfiler::FinishHloComputation( + const HloComputation* computation) { + if (do_profile_) { + profile_->set_total_cycles_executed( + *computation, + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); + } +} + +void HloExecutionProfiler::StartHloInstruction() { + if (do_profile_) { + InitAndStartTimer(&timers_, stream_); + } +} + +void HloExecutionProfiler::FinishHloInstruction( + const HloInstruction* hlo_instruction) { + if (do_profile_) { + hlo_instructions_.erase(hlo_instruction); + profile_->SetCyclesTakenBy( + hlo_instruction, + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); + } +} + +std::unique_ptr +HloExecutionProfiler::MakeScopedInstructionProfiler( + const HloInstruction* hlo_instruction) { + if (do_profile_ && hlo_instruction != nullptr) { + // Make sure that we are not already measuring the time for the same + // 'hlo_instruction'. 
+ CHECK(hlo_instructions_.insert(hlo_instruction).second) + << hlo_instruction->name(); + } + return absl::make_unique(this, hlo_instruction); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..80cde75f2bbb555f514fffea58ad92edf92fd0d1 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h @@ -0,0 +1,110 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +class ScopedInstructionProfiler; + +// A helper class for profiling HLO in the course of GPU program execution. +// All of the profiling is guarded internally, to avoid the caller needing to +// have lots of conditionals sprinkled around. 
+class HloExecutionProfiler { + public: + // If profiling is enabled, start an execution timer running. + explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile, + se::Stream* stream, + const std::vector& sub_streams, + const HloComputation* computation); + + // If profiling is enabled, sets the total cycle count on the profile from the + // execution timer. + void FinishExecution(); + + // If profiling is enabled, starts a timer for a (sub)computation. + void StartHloComputation(); + + // If profiling is enabled stops the timer for a (sub)computation and records + // the time that the computation took to execute in the profile. + void FinishHloComputation(const HloComputation* computation); + + // If profiling is enabled, starts a per-operation timer. + void StartHloInstruction(); + + // If profiling is enabled, stops the per-operation timer and records the time + // that the hlo_instruction took to execute in the profile. + void FinishHloInstruction(const HloInstruction* hlo_instruction); + + // Returns a ScopedInstructionProfiler and triggers a call to + // StartHloInstruction(). Once the returned ScopedInstructionProfiler goes + // out of scope, it triggers a call to FinishHloInstruction(). + std::unique_ptr MakeScopedInstructionProfiler( + const HloInstruction* hlo_instruction); + + private: + const bool do_profile_; + double clock_rate_ghz_; + HloExecutionProfile* profile_; + se::Stream* stream_; + const std::vector& sub_streams_; + const HloComputation* computation_; + std::stack> timers_; + // Contains the HLO instructions for which we are currently measuring the + // time. + std::unordered_set hlo_instructions_; + bool finished_execution_ = false; +}; + +// This class can be used within the ExecuteOnStream() implementations of +// Thunks. It ensures that we always have a pair of matching +// StartHloInstruction() and FinishHloInstruction() calls to the profiler. 
+class ScopedInstructionProfiler { + public: + ScopedInstructionProfiler(HloExecutionProfiler* profiler, + const HloInstruction* hlo_instruction) + : profiler_(profiler), hlo_instruction_(hlo_instruction) { + if (hlo_instruction != nullptr) { + profiler->StartHloInstruction(); + } + } + ~ScopedInstructionProfiler() { + if (hlo_instruction_ != nullptr) { + profiler_->FinishHloInstruction(hlo_instruction_); + } + } + + private: + HloExecutionProfiler* profiler_; + const HloInstruction* hlo_instruction_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc deleted file mode 100644 index f766f968826d960a8e86308f2395301aaa09f1ae..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include -#include - -#include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h" - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/buffer_value.h" -#include "tensorflow/compiler/xla/service/hlo_reachability.h" -#include "tensorflow/compiler/xla/service/hlo_scheduling.h" -#include "tensorflow/compiler/xla/types.h" - -namespace xla { -namespace gpu { - -namespace { - -// An HLO partial ordering based on the actual stream assignment and thunk -// launch order. -class GpuHloOrdering : public PredecessorHloOrdering { - public: - GpuHloOrdering(const HloModule* module, - const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order); - ~GpuHloOrdering() override = default; - - // Only the entry computation can possibly be sequentially ordered, and only - // if we've assigned all instructions to a single stream. - const std::vector* SequentialOrder( - const HloComputation& computation) const override { - return &computation == module_->entry_computation() ? entry_sequence_.get() - : nullptr; - } - - string ToString() const override { return ToStringHelper("GpuHloOrdering"); } - - private: - std::unique_ptr> entry_sequence_; -}; - -GpuHloOrdering::GpuHloOrdering( - const HloModule* module, const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order) - : PredecessorHloOrdering(module) { - // The entry computation has a total order when there's only one stream. - if (stream_assignment.StreamCount() == 1) { - entry_sequence_ = - MakeUnique>(thunk_launch_order); - } - - // The ordering of instructions for the entry computation is determined by the - // total order of thunk launches, and stream assignment. Instructions are - // sequential within a stream and concurrent across streams. 
In addition, the - // GpuExecutable adds cross-stream dependency edges to ensure each instruction - // waits for its operands before executing. - // - // The predecessor map is built incrementally, in thunk launch order. We - // record the most-recently seen instructions per stream in - // 'last_instruction_per_stream'. This lets us quickly determine the - // same-stream predecessors of each instruction. - - // Compute the set of all instructions we will want to set reachability on. - auto predecessor_map = MakeUnique( - module->entry_computation()->MakeInstructionPostOrder()); - - // The most recently visited instruction per stream. - std::vector last_instruction_per_stream( - stream_assignment.StreamCount(), nullptr); - - for (const HloInstruction* hlo : thunk_launch_order) { - predecessor_map->SetReachable(hlo, hlo); - if (stream_assignment.HasStreamAssigned(*hlo)) { - // Gather all instruction which are immediate predecessors of 'hlo' in the - // reachability graph. - std::vector immediate_preds; - immediate_preds.insert(immediate_preds.end(), hlo->operands().begin(), - hlo->operands().end()); - immediate_preds.insert(immediate_preds.end(), - hlo->control_predecessors().begin(), - hlo->control_predecessors().end()); - - // All ops already queued on the same instruction stream, and their - // transitive predecessors, are predecessors. - const int stream_no = stream_assignment.StreamNumberForHlo(*hlo); - if (last_instruction_per_stream[stream_no] != nullptr) { - immediate_preds.push_back(last_instruction_per_stream[stream_no]); - } - predecessor_map->SetReachabilityToUnion(immediate_preds, hlo); - last_instruction_per_stream[stream_no] = hlo; - } else { - // Only parameters and constants don't have an assigned stream, since they - // don't require a thunk. These ops don't have any predecessors. 
- CHECK(hlo->opcode() == HloOpcode::kParameter || - hlo->opcode() == HloOpcode::kConstant); - CHECK_EQ(hlo->operand_count(), 0); - } - } - predecessors_.emplace(module->entry_computation(), - std::move(predecessor_map)); - - // The ordering of instructions in subcomputations is based solely on control - // and data dependencies. - // - // TODO(toddw): Each subcomputation is actually emitted as a function in DFS - // postorder, so we can do better and establish the total order here. We don't - // do that yet since it's hard to ensure that the order here is the order used - // by IrEmitterNested. And mismatched ordering bugs would be hard to find. - for (auto* computation : module->computations()) { - if (computation != module->entry_computation() && - !computation->IsFusionComputation()) { - predecessors_.emplace(computation, computation->ComputeReachability()); - } - } -} - -// Computes a topological launch_order that is close to a breadth-first -// order. This heuristic works well for graphs where concurrent kernels are -// located at the same layer. It can often reduce dependency between concurrent -// GEMMs due to intra-stream total orders. E.g. consider the following HLO -// graph where the numbers in the parens indicate the stream assigned to each -// HLO. -// -// A(0) -> D(0) -> E(1) -// | -// v -// B(0) -// | -// v -// C(0) -// -// If the total order is A,B,C,D,E, then C and E would be sequentialized -// because C completes before D starts in stream 0, and E depends on D. -// However, if the total order is A,B,D,C,E, then C and E can run -// concurrently. -void BFSLaunchOrder(const HloComputation* computation, - std::vector* launch_order) { - // This topological sort uses two data structures: - // 1. `incoming_edge_count` which keeps track of the number of incoming - // edges to each HLO; - // 2. `queue` which contains all HLOs with no incoming edges. 
- // - // The sorting algorithm repeatedly pops the top from the queue and deletes - // that HLO from the graph, making more HLOs incoming-edge free. - std::deque queue; - std::unordered_map incoming_edge_count; - for (const auto& hlo : computation->instructions()) { - if (hlo->operand_count() == 0) { - queue.push_back(hlo); - } else { - incoming_edge_count[hlo] = - std::set(hlo->operands().begin(), - hlo->operands().end()) - .size(); - } - } - - while (!queue.empty()) { - const HloInstruction* x = queue.front(); - queue.pop_front(); - launch_order->push_back(x); - for (const HloInstruction* y : x->users()) { - --incoming_edge_count[y]; - if (incoming_edge_count[y] == 0) { - queue.push_back(y); - } - } - } -} - -} // end namespace - -HloSchedule::HloSchedule() {} - -/* static */ -StatusOr> HloSchedule::Build( - const HloModule& module, const StreamAssignment& stream_assignment, - int64 pointer_size) { - std::unique_ptr schedule(new HloSchedule); - - // Initialize thunk_launch_order_, the total order of thunk launches. - const HloComputation* entry_computation = module.entry_computation(); - if (stream_assignment.StreamCount() == 1) { - // All kernels are launched on a single stream, so there's no loss of - // concurrency by optimizing for minimal memory usage. - TF_ASSIGN_OR_RETURN( - schedule->thunk_launch_order_, - CreateMemoryMinimizingSequence( - *entry_computation, [pointer_size](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); - })); - } else { - // BFS tends to increase concurrency, but also increases memory usage. 
- BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_); - } - - schedule->hlo_ordering_ = MakeUnique( - &module, stream_assignment, schedule->thunk_launch_order_); - - return std::move(schedule); -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/hlo_schedule.h deleted file mode 100644 index 1ce7a48ac8fcbbad0b3697845681582fe806b322..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_SCHEDULE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_SCHEDULE_H_ - -#include -#include - -#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_ordering.h" -#include "tensorflow/compiler/xla/statusor.h" - -namespace xla { -namespace gpu { - -// Determines the schedule of HLO instructions, represented by the total order -// of thunk launches, and the partial order of HLO instructions. The HLO -// instructions are only partially ordered, despite the total ordering of thunk -// launches, because thunks may be scheduled onto concurrent streams. 
This -// schedule is used by BufferAssigner to determine buffer liveness (i.e. to -// minimize allocations), and also by ThunkSchedule to determine the thunk -// launch order. -class HloSchedule { - public: - // Constructs an HloSchedule for the given module, based on the given stream - // assignment. - static StatusOr> Build( - const HloModule& module, const StreamAssignment& stream_assignment, - int64 pointer_size); - - // Returns the total order of thunk launches, represented in terms of HLO - // instructions. - const std::vector& ThunkLaunchOrder() const { - return thunk_launch_order_; - } - - // Returns the partial order of HLO instructions. This method may only be - // called once. The order is based on the total order of thunk lanches, the - // stream assignment, and the data dependencies in the HLO DAG. - std::unique_ptr ConsumeHloOrdering() { - return std::move(hlo_ordering_); - } - - private: - HloSchedule(); - - std::vector thunk_launch_order_; - std::unique_ptr hlo_ordering_; -}; - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_SCHEDULE_H_ diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc deleted file mode 100644 index e230d538cc2df826778e8d13eaaaf31ec81c57f0..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc +++ /dev/null @@ -1,404 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h" - -#include -#include - -#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/types.h" - -namespace xla { -namespace gpu { - -class HloScheduleTest : public HloTestBase { - protected: - using HloVec = std::vector; - - // Pre-canned shapes. - Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); - - static std::unique_ptr BuildHloSchedule( - const HloModule& module, const StreamAssignment& streams) { - return HloSchedule::Build(module, streams, /*pointer_size=*/8) - .ConsumeValueOrDie(); - } - - std::unique_ptr CreateNewModule() { - HloModuleConfig config; - auto debug_options = GetDebugOptionsForTest(); - debug_options.set_xla_gpu_disable_multi_streaming(false); - config.set_debug_options(debug_options); - return MakeUnique("test_module", VersionedComputationHandle(), - config); - } - - HloVec RemoveHlo(const HloVec& input, - const std::unordered_set& remove) { - HloVec result(input); - result.erase(std::remove_if(result.begin(), result.end(), - [&remove](const HloInstruction* x) { - return remove.count(x) > 0; - }), - result.end()); - return result; - } -}; - -// Test of a single stream, where data dependencies fully determine the -// execution order. 
-TEST_F(HloScheduleTest, SequentialMatMul) { - HloComputation::Builder builder("entry_computation"); - HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); - HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); - HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); - HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); - HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build(dot2)); - - std::unique_ptr streams = AssignStreams(*module); - EXPECT_EQ(streams->StreamNumberForHlo(*dot1), - streams->StreamNumberForHlo(*dot2)); - - auto schedule = BuildHloSchedule(*module, *streams); - // Remove parameters, which are unordered. - EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), - HloVec({dot1, dot2})); - - // Parameters x,y,z are mutually unordered, while dot1 and dot2 are - // transitively ordered by operands. 
- auto order = schedule->ConsumeHloOrdering(); - EXPECT_TRUE(order->ExecutesBefore(x, dot1)); - EXPECT_TRUE(order->ExecutesBefore(x, dot2)); - EXPECT_TRUE(order->ExecutesBefore(y, dot1)); - EXPECT_TRUE(order->ExecutesBefore(y, dot2)); - EXPECT_TRUE(order->ExecutesBefore(z, dot2)); - EXPECT_TRUE(order->ExecutesBefore(dot1, dot2)); - - EXPECT_FALSE(order->ExecutesBefore(x, x)); - EXPECT_FALSE(order->ExecutesBefore(x, y)); - EXPECT_FALSE(order->ExecutesBefore(x, z)); - EXPECT_FALSE(order->ExecutesBefore(y, x)); - EXPECT_FALSE(order->ExecutesBefore(y, y)); - EXPECT_FALSE(order->ExecutesBefore(y, z)); - EXPECT_FALSE(order->ExecutesBefore(z, x)); - EXPECT_FALSE(order->ExecutesBefore(z, y)); - EXPECT_FALSE(order->ExecutesBefore(z, z)); - EXPECT_FALSE(order->ExecutesBefore(z, dot1)); - EXPECT_FALSE(order->ExecutesBefore(dot1, x)); - EXPECT_FALSE(order->ExecutesBefore(dot1, y)); - EXPECT_FALSE(order->ExecutesBefore(dot1, z)); - EXPECT_FALSE(order->ExecutesBefore(dot1, dot1)); - EXPECT_FALSE(order->ExecutesBefore(dot2, x)); - EXPECT_FALSE(order->ExecutesBefore(dot2, y)); - EXPECT_FALSE(order->ExecutesBefore(dot2, z)); - EXPECT_FALSE(order->ExecutesBefore(dot2, dot1)); - EXPECT_FALSE(order->ExecutesBefore(dot2, dot2)); -} - -// Test of a single stream, where data dependencies do not fully determine the -// execution order, but the stream assignment does. 
-TEST_F(HloScheduleTest, SequentialAdd) { - HloComputation::Builder builder("entry_computation"); - HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); - HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); - HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); - HloInstruction* add1 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, x, y)); - HloInstruction* add2 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, y, z)); - HloInstruction* add3 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build(add3)); - - std::unique_ptr streams = AssignStreams(*module); - EXPECT_EQ(streams->StreamNumberForHlo(*add1), - streams->StreamNumberForHlo(*add2)); - EXPECT_EQ(streams->StreamNumberForHlo(*add1), - streams->StreamNumberForHlo(*add3)); - - auto schedule = BuildHloSchedule(*module, *streams); - // Remove parameters, which are unordered. - EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), - HloVec({add1, add2, add3})); - - // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are - // transitively ordered by operands. 
- auto order = schedule->ConsumeHloOrdering(); - EXPECT_TRUE(order->ExecutesBefore(x, add1)); - EXPECT_TRUE(order->ExecutesBefore(x, add2)); - EXPECT_TRUE(order->ExecutesBefore(x, add3)); - EXPECT_TRUE(order->ExecutesBefore(y, add1)); - EXPECT_TRUE(order->ExecutesBefore(y, add2)); - EXPECT_TRUE(order->ExecutesBefore(y, add3)); - EXPECT_TRUE(order->ExecutesBefore(z, add2)); - EXPECT_TRUE(order->ExecutesBefore(z, add3)); - EXPECT_TRUE(order->ExecutesBefore(add1, add3)); - EXPECT_TRUE(order->ExecutesBefore(add2, add3)); - // The HLO graph does not define an ordering for add1 and add2, but their - // assignment onto the same stream does define an ordering. - if (order->ExecutesBefore(add1, add2)) { - EXPECT_FALSE(order->ExecutesBefore(add2, add1)); - } else { - EXPECT_TRUE(order->ExecutesBefore(add2, add1)); - EXPECT_FALSE(order->ExecutesBefore(add1, add2)); - } - - EXPECT_FALSE(order->ExecutesBefore(x, x)); - EXPECT_FALSE(order->ExecutesBefore(x, y)); - EXPECT_FALSE(order->ExecutesBefore(x, z)); - EXPECT_FALSE(order->ExecutesBefore(y, x)); - EXPECT_FALSE(order->ExecutesBefore(y, y)); - EXPECT_FALSE(order->ExecutesBefore(y, z)); - EXPECT_FALSE(order->ExecutesBefore(z, x)); - EXPECT_FALSE(order->ExecutesBefore(z, y)); - EXPECT_FALSE(order->ExecutesBefore(z, z)); - EXPECT_FALSE(order->ExecutesBefore(z, add1)); - EXPECT_FALSE(order->ExecutesBefore(add1, x)); - EXPECT_FALSE(order->ExecutesBefore(add1, y)); - EXPECT_FALSE(order->ExecutesBefore(add1, z)); - EXPECT_FALSE(order->ExecutesBefore(add1, add1)); - EXPECT_FALSE(order->ExecutesBefore(add2, x)); - EXPECT_FALSE(order->ExecutesBefore(add2, y)); - EXPECT_FALSE(order->ExecutesBefore(add2, z)); - EXPECT_FALSE(order->ExecutesBefore(add2, add2)); -} - -// Test of two streams. 
-TEST_F(HloScheduleTest, ConcurrentMatMul) { - HloComputation::Builder builder("entry_computation"); - HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, f32_2x2_, /*name=*/"x")); - HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); - HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); - HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, y, x)); - HloInstruction* add = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, dot2)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build(add)); - - std::unique_ptr streams = AssignStreams(*module); - EXPECT_NE(streams->StreamNumberForHlo(*dot1), - streams->StreamNumberForHlo(*dot2)); - - auto schedule = BuildHloSchedule(*module, *streams); - // Remove parameters, which are unordered. - HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y}); - EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) || - thunk_launch_order == HloVec({dot2, dot1, add})); - - // Parameters x,y are mutually unordered, while dot1, dot2 and add are - // transitively ordered by operands. 
- auto order = schedule->ConsumeHloOrdering(); - EXPECT_TRUE(order->ExecutesBefore(x, dot1)); - EXPECT_TRUE(order->ExecutesBefore(x, dot2)); - EXPECT_TRUE(order->ExecutesBefore(y, dot1)); - EXPECT_TRUE(order->ExecutesBefore(y, dot2)); - EXPECT_TRUE(order->ExecutesBefore(dot1, add)); - EXPECT_TRUE(order->ExecutesBefore(dot2, add)); - - EXPECT_FALSE(order->ExecutesBefore(x, x)); - EXPECT_FALSE(order->ExecutesBefore(x, y)); - EXPECT_FALSE(order->ExecutesBefore(y, x)); - EXPECT_FALSE(order->ExecutesBefore(y, y)); - EXPECT_FALSE(order->ExecutesBefore(dot1, x)); - EXPECT_FALSE(order->ExecutesBefore(dot1, y)); - EXPECT_FALSE(order->ExecutesBefore(dot1, dot1)); - EXPECT_FALSE(order->ExecutesBefore(dot1, dot2)); - EXPECT_FALSE(order->ExecutesBefore(dot2, x)); - EXPECT_FALSE(order->ExecutesBefore(dot2, y)); - EXPECT_FALSE(order->ExecutesBefore(dot2, dot1)); - EXPECT_FALSE(order->ExecutesBefore(dot2, dot2)); - EXPECT_FALSE(order->ExecutesBefore(add, x)); - EXPECT_FALSE(order->ExecutesBefore(add, y)); - EXPECT_FALSE(order->ExecutesBefore(add, dot1)); - EXPECT_FALSE(order->ExecutesBefore(add, dot2)); - EXPECT_FALSE(order->ExecutesBefore(add, add)); -} - -// Test of multiple streams. 
-TEST_F(HloScheduleTest, LatticeMatMul) { - // d00 -- layer 0 - // / \ - // d10 d11 -- layer 1 - // / \ / \ - // d20 d21 d22 -- layer 2 - // \ / \ / - // d30 d31 -- layer 3 - // \ / - // d40 -- layer 4 - HloComputation::Builder builder("entry_computation"); - std::vector params; - params.reserve(6); - for (int i = 0; i < 6; ++i) { - params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); - } - HloInstruction* d00 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); - HloInstruction* d10 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00)); - HloInstruction* d11 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4])); - HloInstruction* d20 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10)); - HloInstruction* d21 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11)); - HloInstruction* d22 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5])); - HloInstruction* d30 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21)); - HloInstruction* d31 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22)); - HloInstruction* d40 = builder.AddInstruction( - HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build(d40)); - - std::unique_ptr streams = AssignStreams(*module); - // The two dots on layer 1 are concurrent. - EXPECT_NE(streams->StreamNumberForHlo(*d10), - streams->StreamNumberForHlo(*d11)); - // The three dots on layer 2 are concurrent. 
- EXPECT_NE(streams->StreamNumberForHlo(*d20), - streams->StreamNumberForHlo(*d21)); - EXPECT_NE(streams->StreamNumberForHlo(*d20), - streams->StreamNumberForHlo(*d22)); - EXPECT_NE(streams->StreamNumberForHlo(*d21), - streams->StreamNumberForHlo(*d22)); - // The two dots on layer 3 are concurrent. - EXPECT_NE(streams->StreamNumberForHlo(*d30), - streams->StreamNumberForHlo(*d31)); - - // We don't check the thunk launch order, since there are many valid total - // orders, and it's annoying to express. - auto schedule = BuildHloSchedule(*module, *streams); - - auto order = schedule->ConsumeHloOrdering(); - const HloVec all_params( - {params[0], params[1], params[2], params[3], params[4], params[5]}); - const HloVec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40}); - - // Parameters are mutually unordered, and never execute before ops. - for (const HloInstruction* param : all_params) { - for (const HloInstruction* param2 : all_params) { - EXPECT_FALSE(order->ExecutesBefore(param, param2)); - } - for (const HloInstruction* op : all_ops) { - EXPECT_FALSE(order->ExecutesBefore(op, param)); - } - } - - // Check ordering of params before ops. 
- for (const HloInstruction* op : all_ops) { - if (op == d20 || op == d30 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(params[0], op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(params[0], op)); - } - if (op != d00 && op != d11 && op != d22) { - EXPECT_TRUE(order->ExecutesBefore(params[1], op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(params[1], op)); - } - EXPECT_TRUE(order->ExecutesBefore(params[2], op)); - EXPECT_TRUE(order->ExecutesBefore(params[3], op)); - if (op != d00 && op != d10 && op != d20) { - EXPECT_TRUE(order->ExecutesBefore(params[4], op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(params[4], op)); - } - if (op == d22 || op == d31 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(params[5], op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(params[5], op)); - } - } - - // Check ordering of ops before ops. - for (const HloInstruction* op : all_ops) { - if (op != d00) { - EXPECT_TRUE(order->ExecutesBefore(d00, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d00, op)); - } - - if (op == d20 || op == d21 || op == d30 || op == d31 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d10, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d10, op)); - } - - if (op == d21 || op == d22 || op == d30 || op == d31 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d11, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d11, op)); - } - - if (op == d30 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d20, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d20, op)); - } - - if (op == d30 || op == d31 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d21, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d21, op)); - } - - if (op == d31 || op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d22, op)); - } else { - EXPECT_FALSE(order->ExecutesBefore(d22, op)); - } - - if (op == d40) { - EXPECT_TRUE(order->ExecutesBefore(d30, op)); - EXPECT_TRUE(order->ExecutesBefore(d31, op)); - } else { - 
EXPECT_FALSE(order->ExecutesBefore(d30, op)); - EXPECT_FALSE(order->ExecutesBefore(d31, op)); - } - - EXPECT_FALSE(order->ExecutesBefore(d40, op)); - } -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 061210352cf12e6802d066d311fd2cb481673f15..51627402b45f594dab3480129ba182d54d01b811 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -15,31 +15,32 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h" +#include "absl/strings/str_cat.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace gpu { -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; +using absl::StrAppend; +using absl::StrCat; void HloToIrBindings::EmitBasePointersForHlos( - tensorflow::gtl::ArraySlice io_hlos, - tensorflow::gtl::ArraySlice non_io_hlos) { + absl::Span io_hlos, + absl::Span non_io_hlos) { // I/O HLOs are bound to the arguments of the current IR function. 
I.e., // // void IrFunction(io_0, io_1, ..., io_{m-1}, temp_buffer_base) { - llvm::Function* function = ir_builder_->GetInsertBlock()->getParent(); + llvm::Function* function = b_->GetInsertBlock()->getParent(); CHECK_EQ(io_hlos.size() + 1, function->arg_size()); // An HLO can have duplicated operands. This data structure remembers which @@ -79,8 +80,8 @@ void HloToIrBindings::EmitBasePointersForHlos( const int64 offset = slice.offset(); CHECK_NE(nullptr, temp_buffer_base_); // Emit IR for GetTupleElement instruction and bind to emitted value. - llvm::Value* base_ptr = ir_builder_->CreateInBoundsGEP( - temp_buffer_base_, ir_builder_->getInt64(offset)); + llvm::Value* base_ptr = + b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)); BindHloToIrValue(*non_io_hlo, EmitGetTupleElement(non_io_hlo, base_ptr)); } @@ -108,15 +109,20 @@ void HloToIrBindings::EmitBasePointersForHlos( if (slice.allocation()->is_thread_local()) { llvm::Type* pointee_type = llvm_ir::ShapeToIrType(non_io_hlo->shape(), module_); - BindHloToIrValue(*non_io_hlo, - ir_builder_->CreateAlloca(pointee_type), index); + BindHloToIrValue(*non_io_hlo, b_->CreateAlloca(pointee_type), + index); + } else if (slice.allocation()->is_constant()) { + llvm::Value* global_for_constant = + module_->getGlobalVariable(llvm_ir::AsStringRef( + llvm_ir::ConstantBufferAllocationToGlobalName( + *slice.allocation()))); + BindHloToIrValue(*non_io_hlo, global_for_constant); } else { const int64 offset = slice.offset(); CHECK_NE(nullptr, temp_buffer_base_); BindHloToIrValue( *non_io_hlo, - ir_builder_->CreateInBoundsGEP(temp_buffer_base_, - ir_builder_->getInt64(offset)), + b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)), index); } }); @@ -129,15 +135,23 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte, if (gte->operand(0)->opcode() != HloOpcode::kGetTupleElement) { return llvm_ir::EmitGetTupleElement( gte->shape(), gte->tuple_index(), /*alignment=*/1, - 
GetTypedIrValue(*gte->operand(0), {}, base_ptr), ir_builder_, module_); + GetTypedIrValue(*gte->operand(0), {}, base_ptr), b_, module_); } return llvm_ir::EmitGetTupleElement( gte->shape(), gte->tuple_index(), /*alignment=*/1, - EmitGetTupleElement(gte->operand(0), base_ptr), ir_builder_, module_); + EmitGetTupleElement(gte->operand(0), base_ptr), b_, module_); +} + +// Returns true if `value` has a name that should not be changed. +static bool HasMeaningfulName(llvm::Value* value) { + if (auto* global = llvm::dyn_cast(value)) { + return global->getLinkage() != llvm::GlobalValue::PrivateLinkage; + } + return false; } llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, - const ShapeIndex& shape_index, + ShapeIndexView shape_index, llvm::Value* ir_value) { llvm::Type* pointee_type = llvm_ir::ShapeToIrType( ShapeUtil::GetSubshape(hlo.shape(), shape_index), module_); @@ -145,20 +159,24 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, llvm::Value* typed_ir_value; if (llvm::isa(ir_value)) { - typed_ir_value = llvm::ConstantExpr::getBitCast( + typed_ir_value = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( llvm::cast(ir_value), dest_type); } else { - typed_ir_value = - ir_builder_->CreateBitCast(ir_value, pointee_type->getPointerTo()); + typed_ir_value = b_->CreateBitCast(ir_value, pointee_type->getPointerTo()); + } + if (!HasMeaningfulName(ir_value)) { + ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw"))); + } + if (!HasMeaningfulName(typed_ir_value)) { + typed_ir_value->setName( + llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed"))); } - ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw"))); - typed_ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed"))); return typed_ir_value; } void HloToIrBindings::BindHloToIrValue(const HloInstruction& hlo, llvm::Value* ir_value, - const ShapeIndex& shape_index) { + ShapeIndexView shape_index) { VLOG(2) << "Binding " << 
hlo.ToString(); const Shape& hlo_shape = hlo.shape(); @@ -202,7 +220,7 @@ llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo, << " of " << hlo.ToString(); llvm_ir::IrArray ir_array(base_ptr, ShapeUtil::GetSubshape(hlo.shape(), shape_index)); - alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array); + alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array, shape_index); // The GPU backend emits one kernel per top-level HLO, and LLVM views // execution of one kernel as the "whole program" executed on the GPU. diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h index 3d34311b4368d17cb074aaf33c71fc865e96387e..c0edae530cedba45c897b07b7b9cc72eaaab397c 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/map_util.h" @@ -25,7 +26,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace gpu { @@ -36,22 +36,21 @@ class HloToIrBindings { public: HloToIrBindings(const HloModule& module, const BufferAssignment* buffer_assignment, - llvm::IRBuilder<>* ir_builder, llvm::Module* llvm_module, + llvm::IRBuilder<>* b, llvm::Module* llvm_module, bool is_nested) : buffer_assignment_(buffer_assignment), is_nested_(is_nested), - ir_builder_(ir_builder), + b_(b), module_(llvm_module), - alias_analysis_(module, *buffer_assignment_, - &ir_builder_->getContext()) {} + alias_analysis_(module, *buffer_assignment_, &b_->getContext()) {} void EmitBasePointersForHlos( - tensorflow::gtl::ArraySlice io_hlos, - tensorflow::gtl::ArraySlice non_io_hlos); + absl::Span io_hlos, + absl::Span non_io_hlos); // Rebinds the given HLO to the LLVM IR value that represent its address. void BindHloToIrValue(const HloInstruction& hlo, llvm::Value* ir_value, - const ShapeIndex& shape_index = {}); + ShapeIndexView shape_index = {}); // Unbinds all IR values that's defined in an LLVM function, e.g., function // arguments and stack variables. Global variables will be kept in bindings_. @@ -71,7 +70,7 @@ class HloToIrBindings { // A helper method that returns the base pointer of the IrArray containing the // output of "inst".at the given ShapeIndex. llvm::Value* GetBasePointer(const HloInstruction& hlo, - const ShapeIndex& shape_index = {}) const { + ShapeIndexView shape_index = {}) const { auto it = base_ptrs_.find(&hlo); CHECK(it != base_ptrs_.end()) << hlo.ToString(); return it->second.element(shape_index); @@ -97,14 +96,14 @@ class HloToIrBindings { // Returns an llvm typed ir representation of 'ir_value' based on 'hlo' shape. 
llvm::Value* GetTypedIrValue(const HloInstruction& hlo, - const ShapeIndex& shape_index, + ShapeIndexView shape_index, llvm::Value* ir_value); const BufferAssignment* buffer_assignment_; const bool is_nested_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; llvm::Module* module_; // Stores the underlying llvm::IrArray for each HloInstruction. diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc index ae310beefad0c81c17fd4140b441b3a19a002e2c..a4364b0deb6c97b7b580e18bf67d5f3a8fd3cc62 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc @@ -15,79 +15,16 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/core/platform/logging.h" +#include "absl/memory/memory.h" namespace xla { namespace gpu { -InfeedManager::InfeedManager() : host_to_device_executor_(nullptr) {} - -void InfeedManager::Reset() { - tensorflow::mutex_lock l(mu_); - CHECK(dequeued_buffer_.empty()); - for (auto buffer : enqueued_buffer_) { - buffer->Done(); - } - enqueued_buffer_.clear(); -} - -void InfeedManager::EnqueueBuffers(const std::vector& buffers) { - tensorflow::mutex_lock l(mu_); - bool was_empty = enqueued_buffer_.empty(); - for (gpu::InfeedBuffer* b : buffers) { - enqueued_buffer_.push_back(b); - } - if (was_empty) { - // This has the potential to suffer from the notified thread - // immediately trying and failing to acquire mu_, but seems - // preferable to the alternative of notifying outside the lock - // on every enqueue. 
- cv_.notify_one(); - } -} - -InfeedBuffer* InfeedManager::BlockingDequeueBuffer() { - bool became_empty = false; - InfeedBuffer* current_buffer; - { - tensorflow::mutex_lock l(mu_); - while (enqueued_buffer_.empty()) { - cv_.wait(l); - } - current_buffer = enqueued_buffer_.front(); - enqueued_buffer_.pop_front(); - dequeued_buffer_.insert(current_buffer); - if (enqueued_buffer_.empty()) { - became_empty = true; - } - } - if (became_empty) { - for (const auto& callback : on_empty_callbacks_) { - callback(); - } - } - return current_buffer; -} - -void InfeedManager::ReleaseBuffers(const std::vector& buffers) { - { - tensorflow::mutex_lock l(mu_); - for (gpu::InfeedBuffer* b : buffers) { - CHECK(ContainsKey(dequeued_buffer_, b)); - dequeued_buffer_.erase(b); - } - } - for (gpu::InfeedBuffer* b : buffers) { - b->Done(); - } -} - se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) { + tensorflow::mutex_lock l(host_to_device_stream_mu_); if (host_to_device_executor_ == nullptr) { host_to_device_executor_ = executor; - host_to_device_stream_ = MakeUnique(executor); + host_to_device_stream_ = absl::make_unique(executor); host_to_device_stream_->Init(); } @@ -100,10 +37,6 @@ se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) { return host_to_device_stream_.get(); } -void InfeedManager::RegisterOnEmptyCallback(std::function callback) { - on_empty_callbacks_.push_back(std::move(callback)); -} - InfeedManager* GetOrCreateInfeedManager() { static InfeedManager* manager = new InfeedManager; return manager; diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h index a3fc15cfe36a490f38daabca9ff36fbb1012aead..7e418882e051a77e10bd12000bbc9769980f5f14 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h @@ -20,12 +20,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ -#include -#include - +#include "tensorflow/compiler/xla/service/gpu/xfeed_queue.h" +#include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -47,90 +44,41 @@ namespace gpu { // the client. The client manages the memory of the buffer. class InfeedBuffer { public: + InfeedBuffer() = default; InfeedBuffer(se::StreamExecutor* executor, int64 length) - : executor_(executor), length_(length) { - device_memory_ = executor_->AllocateArray(length); - CHECK(!device_memory_.is_null()); + : device_memory_(executor, executor->AllocateArray(length)), + length_(length) { + CHECK(!device_memory_->is_null()); } - ~InfeedBuffer() { executor_->Deallocate(&device_memory_); } - int64 length() const { return length_; } - // Callback to signal that this buffer is consumed. This helps the - // client to manage memory for the infeed buffers. - void Done() { delete this; } - - se::DeviceMemoryBase* device_memory() { return &device_memory_; } + se::DeviceMemoryBase* device_memory() { return device_memory_.ptr(); } private: - se::StreamExecutor* executor_; // Not owned. - const int64 length_; - se::DeviceMemoryBase device_memory_; + se::ScopedDeviceMemory device_memory_; + int64 length_; }; // Client-side class used to enqueue infeed buffers. -class InfeedManager { +class InfeedManager : public XfeedQueue> { public: - InfeedManager(); - - // Calls the completion callback for any enqueued buffers that have - // not been dequeued by the runtime, and empties the infeed - // queue. Reset may not be called while a runtime computation is - // processing a dequeued buffer. The only safe way to ensure this - // condition is to call Reset when no computation is taking place. 
- void Reset(); - - // Adds a set of buffers to the infeed queue atomically. buffer->Done - // will be called when the buffer will no longer be accessed by the - // InfeedManager, either as a result of a call to Reset or because the - // runtime has dequeued and used the buffer. - void EnqueueBuffers(const std::vector& buffers); - - // Blocks until the infeed queue is non-empty, then returns the - // buffer at the head of the queue. Adds the current buffer to the - // to-be released set. - InfeedBuffer* BlockingDequeueBuffer(); - - // Releases a set of buffers from the to-be released set. - void ReleaseBuffers(const std::vector& buffers); - // Returns a cached stream associated with an executor. Allocates a // new stream on the first invocation. On subsequent invocations, if // the cached executor is not the same as the requested executor, // returns null. se::Stream* GetStream(se::StreamExecutor* executor); - // Registers a callback that will be called when 'enqueued_buffer_' becomes - // empty. - void RegisterOnEmptyCallback(std::function callback); - private: - // TODO(b/30467474): Revisit if this mutex becomes a point of - // contention. - tensorflow::mutex mu_; - - // Condition variable that is signaled every time a buffer is - // enqueued to an empty queue. - tensorflow::condition_variable cv_; - - // InfeedBuffer* queue contents are not owned, but buffer->Done must - // be called when the buffer is no longer needed by the runtime. - std::deque enqueued_buffer_; - - // Buffers that are dequeued and currently being processed by the - // runtime. Not owned. - tensorflow::gtl::FlatSet dequeued_buffer_; + // Mutex for serializing the creation of host_to_device_stream_. + tensorflow::mutex host_to_device_stream_mu_; // Cached host to device stream for queuing infeed data. - std::unique_ptr host_to_device_stream_; + std::unique_ptr host_to_device_stream_ + GUARDED_BY(host_to_device_stream_mu_); // Executor that the host_to_device_stream belongs to. Not owned. 
- se::StreamExecutor* host_to_device_executor_; - - // List of callbacks which will be called when 'enqueued_buffer_' becomes - // empty. - std::vector> on_empty_callbacks_; + se::StreamExecutor* host_to_device_executor_ = nullptr; }; // Singleton creator-or-accessor: Returns the GPU infeed manager. diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index ea34d5b30c91e8b809e3e17a904e27e589fd6b5f..8c3a026740851767855beae59d6a3c92f7a0d6bd 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -22,57 +23,82 @@ namespace xla { namespace gpu { InfeedThunk::InfeedThunk( - tensorflow::gtl::ArraySlice tuple_element_buffers, - const BufferAllocation::Slice& destination_buffer, + const ShapeTree& infeed_slices, const HloInstruction* hlo_instruction) - : Thunk(Kind::kInfeed, hlo_instruction), - tuple_element_buffers_(tuple_element_buffers.begin(), - tuple_element_buffers.end()), - destination_buffer_(destination_buffer) {} + : Thunk(Kind::kInfeed, hlo_instruction), infeed_slices_(infeed_slices) {} Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { - VLOG(2) << "Infeeding to GPU "; + se::Stream* stream, + HloExecutionProfiler* profiler) { + VLOG(2) << "Infeeding to GPU: " << hlo_instruction()->ToString(); + + auto op_profiler = 
profiler->MakeScopedInstructionProfiler(hlo_instruction()); + ShapeTree infeed_buffers = + GetOrCreateInfeedManager()->BlockingGetNextDestination(); + + { + // The infeed buffer has an extra outer tuple with a token. Adjust the index + // accordingly. + ShapeIndex index = {0}; + std::function*)> copy_tuple_contents = + [&](std::vector* tuple_element_addresses) { + const Shape& shape = ShapeUtil::GetSubshape(infeed_buffers.shape(), + ShapeIndexView(index, 1)); + // For the leaf buffers of the tuple copy the elements directly. + if (ShapeUtil::IsArray(shape)) { + const BufferAllocation::Slice& tuple_element_buffer = + infeed_slices_.element(index); + se::DeviceMemoryBase tuple_element_address = + buffer_allocations.GetDeviceAddress(tuple_element_buffer); - se::DeviceMemoryBase destination_address = - buffer_allocations.GetDeviceAddress(destination_buffer_); + InfeedBuffer* buffer = + infeed_buffers.mutable_element(ShapeIndexView(index, 1)); + stream->ThenMemcpy(&tuple_element_address, + *(buffer->device_memory()), buffer->length()); + tuple_element_addresses->push_back(tuple_element_address.opaque()); + return; + } + + const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape); + index.push_back(0); + std::vector inner_tuple_element_addresses; + for (int64 i = 0; i < tuple_element_count; ++i) { + index.back() = i; + copy_tuple_contents(&inner_tuple_element_addresses); + } + index.pop_back(); + + // Create a buffer of pointers for non-leaf buffers. 
+ CHECK_EQ(tuple_element_count, inner_tuple_element_addresses.size()); + auto host_size = inner_tuple_element_addresses.size() * sizeof(void*); + se::DeviceMemoryBase tuple_address = + buffer_allocations.GetDeviceAddress( + infeed_slices_.element(index)); + stream->ThenMemcpy(&tuple_address, + inner_tuple_element_addresses.data(), host_size); + tuple_element_addresses->push_back(tuple_address.opaque()); + }; - InfeedManager* infeed_manager = GetOrCreateInfeedManager(); - std::vector infeed_buffers; - if (ShapeUtil::IsTuple(hlo_instruction()->shape())) { - CHECK(!ShapeUtil::IsNestedTuple(hlo_instruction()->shape())); - // Transfer the tuple elements first. std::vector tuple_element_addresses; - for (BufferAllocation::Slice tuple_element_buffer : - tuple_element_buffers_) { - se::DeviceMemoryBase tuple_element_address = - buffer_allocations.GetDeviceAddress(tuple_element_buffer); - - InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer(); - infeed_buffers.push_back(buffer); - stream->ThenMemcpy(&tuple_element_address, *(buffer->device_memory()), - buffer->length()); - tuple_element_addresses.push_back(tuple_element_address.opaque()); - } - // Transfer the tuple outer buffer. - auto host_size = tuple_element_addresses.size() * sizeof(void*); - stream->ThenMemcpy(&destination_address, tuple_element_addresses.data(), - host_size); - } else { - InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer(); - infeed_buffers.push_back(buffer); - stream->ThenMemcpy(&destination_address, *(buffer->device_memory()), - buffer->length()); + copy_tuple_contents(&tuple_element_addresses); + CHECK_EQ(1, tuple_element_addresses.size()); } + // Construct top-level tuple of infeed containing the data and the token. Use + // a nullptr for the token, it should never be dereferenced. 
+ se::DeviceMemoryBase data_address = + buffer_allocations.GetDeviceAddress(infeed_slices_.element({0})); + void* infeed_addresses[] = {data_address.opaque(), nullptr}; + se::DeviceMemoryBase top_level_address = + buffer_allocations.GetDeviceAddress(infeed_slices_.element({})); + stream->ThenMemcpy(&top_level_address, infeed_addresses, 2 * sizeof(void*)); + Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError("Failed to complete data transfer on stream %p: %s", - stream, block_status.error_message().c_str()); + stream, block_status.error_message()); } - infeed_manager->ReleaseBuffers(infeed_buffers); - VLOG(2) << "Infeeding to GPU complete"; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index 93713cb12defd95bdd69cb0aa7ad7b4e37fc8fae..59487e245b78e66c45409fe712e86d3392e50580 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -32,23 +33,19 @@ namespace gpu { class InfeedThunk : public Thunk { public: // Constructs a InfeedThunk that copies data from the on-device - // infeed queue to the device buffer - // `destination_buffer`. `mem_size` is the size of the data in - // bytes. - InfeedThunk(tensorflow::gtl::ArraySlice - tuple_element_buffers, - const BufferAllocation::Slice& destination_buffer, + // infeed queue into the buffers in the given shape tree. 
+ InfeedThunk(const ShapeTree& infeed_slices, const HloInstruction* hlo_instruction); InfeedThunk(const InfeedThunk&) = delete; InfeedThunk& operator=(const InfeedThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: - const std::vector tuple_element_buffers_; - const BufferAllocation::Slice destination_buffer_; + const ShapeTree infeed_slices_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 36a1b82a26d84fb557c894f0bf122aef064b052e..4d5d8e99f88149aabfd0a4aeafc7e6724d29418d 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" @@ -26,7 +27,7 @@ namespace gpu { namespace { -bool IsFusile(const HloInstruction& hlo) { +bool IsFusible(const HloInstruction& hlo) { // Don't fuse get-tuple-element on GPU: We can, but it's slower than not // fusing. We never generate kernels for unfused GTEs. 
Instead, if an // unfused GTE is an input to a kernel (including a fusion kernel), we @@ -40,7 +41,8 @@ bool IsFusile(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kDynamicSlice || hlo.opcode() == HloOpcode::kDynamicUpdateSlice || hlo.opcode() == HloOpcode::kFusion || - hlo.opcode() == HloOpcode::kPad || + hlo.opcode() == HloOpcode::kGather || + hlo.opcode() == HloOpcode::kIota || hlo.opcode() == HloOpcode::kPad || hlo.opcode() == HloOpcode::kReduce || hlo.opcode() == HloOpcode::kReduceWindow || hlo.opcode() == HloOpcode::kReshape || @@ -72,20 +74,80 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) { } } +// This function limits the maximum number of operands to a fusion. +// +// There's a cap on how many parameters we can pass to a CUDA kernel, but +// exactly what that limit is is hazy, as it depends on (among other things) how +// much GPU constant memory is in use for other purposes. +// +// Moreover, we don't even know at the point that we're running fusion how many +// arguments the CUDA kernel for a fusion node will have: It depends on buffer +// assignment, where we will decide which of the fusion's operands live in XLA's +// big temp buffer versus in other allocations. +// +// As a heuristic, we simply cap the number of fusion operands plus outputs at +// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of +// parameters to the kernel, working around the correctness problem. +// +// This limit is also often good for performance. In a fusion with many +// operands, each GPU thread likely has to do a lot of work, and so possibly +// uses a lot of registers, thus limiting occupancy. +/*static*/ bool GpuInstructionFusion::FusionWouldBeTooLarge( + const HloInstruction* a, const HloInstruction* b) { + // Compute the number of outputs of the (possibly multi-output) fusion node + // we're considering creating. 
+ // + // This isn't precise; we may be off by one if + // - We're creating a multi-output fusion out of two non-MOFs. Creating a + // MOF adds a new buffer, namely, the tuple buffer. + // - We're merging two MOFs. In this case, we should count the tuple buffer + // only once. + // - WLOG there's an edge from `a` to `b` and `b` is the only consumer of + // `a`. In this case the result of `a` is not part of the output of the + // fusion. + // + // But because this is a heuristic and our limit + // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a + // big difference), we ignore this small inaccuracy in favor of simplicity. + int64 num_output_buffers = ShapeUtil::SubshapeCount(a->shape()) + + ShapeUtil::SubshapeCount(b->shape()); + + // The new fusion will have no more operands and outputs than + // producer_operands + consumer_operands - 1 + num_output_buffers + // (minus one because we may be fusing a producer->consumer edge between `a` + // and `b`). + // + // This fact may be enough to let us avoid having to compute the true total + // number of operands, which can be expensive. + if (a->operand_count() + b->operand_count() - 1 + num_output_buffers <= + kMaxOperandsAndOutputsPerFusion) { + return false; + } + + // Compute the precise number of operands to the new fusion. + tensorflow::gtl::FlatSet operands( + a->operands().begin(), a->operands().end()); + operands.insert(b->operands().begin(), b->operands().end()); + // If there's an edge between `a` and `b`, don't count it: We're fusing that + // producer -> consumer relationship. 
+ operands.erase(a); + operands.erase(b); + return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion; +} + bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); // Check if we can use output fusion for (A @ B) * alpha - if (consumer->operand_count() == 2 && - (producer->opcode() == HloOpcode::kDot || - (producer->opcode() == HloOpcode::kFusion && - producer->fused_expression_root()->opcode() == HloOpcode::kDot))) { + if (producer->opcode() == HloOpcode::kDot || + (producer->opcode() == HloOpcode::kFusion && + producer->fused_expression_root()->opcode() == HloOpcode::kDot)) { int64 other_operand_index = 1 - operand_index; - const HloInstruction* alpha = consumer->operand(other_operand_index); HloInstruction* op1 = nullptr; HloInstruction* op2 = nullptr; - if (consumer->opcode() == HloOpcode::kFusion && + if (consumer->operand_count() == 1 && + consumer->opcode() == HloOpcode::kFusion && consumer->fusion_kind() == HloInstruction::FusionKind::kLoop && Match(consumer->fused_expression_root(), match::Op() @@ -103,10 +165,12 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, op2->opcode() != HloOpcode::kBroadcast) { return false; } - if (IsIEEEFloatingPointScalarConstant(alpha)) { + if (IsIEEEFloatingPointScalarConstant(op2->operand(0))) { return true; } - } else if (consumer->opcode() == HloOpcode::kMultiply) { + } else if (consumer->operand_count() == 2 && + consumer->opcode() == HloOpcode::kMultiply) { + const HloInstruction* alpha = consumer->operand(other_operand_index); // Fuse if 'alpha' is a broadcast of a scalar constant. 
if (alpha->opcode() == HloOpcode::kBroadcast && alpha->dimensions().empty() && @@ -139,6 +203,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, IsIEEEFloatingPointScalarConstant(producer->operand(0)) && fused_parameter_users[0]->opcode() == HloOpcode::kMultiply; } + return false; } // Other output fusions are not currently supported on GPUs. @@ -157,6 +222,13 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, return false; } + // Do not fuse into reduce input fusions if the resulting kernel would suffer + // from poor data locality (due to unfriendly input layouts). + if (IsInputFusibleReduction(*consumer) && + !LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) { + return false; + } + // We can't fuse library calls, so if a user of such an op could become a // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for // further rationale. @@ -173,8 +245,21 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, return false; } - return IsFusile(*producer) && IsFusile(*consumer) && - InstructionFusion::ShouldFuse(consumer, operand_index); + // Fuse scalar constants into loop fusion nodes, this reduces the number of + // parameters and makes matching scalar broadcasts easier. + if (ShapeUtil::IsEffectiveScalar(producer->shape()) && + consumer->opcode() == HloOpcode::kFusion && + producer->opcode() == HloOpcode::kConstant) { + return true; + } + + if (!IsFusible(*producer) || !IsFusible(*consumer) || + !InstructionFusion::ShouldFuse(consumer, operand_index)) { + return false; + } + + // We put this check last because it's potentially expensive. 
+ return !FusionWouldBeTooLarge(consumer, producer); } bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer, diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h index f629d9ff2c7165b652369612c30979150f93bd24..c91f6343a69268ca687004dbe0ffbb863271a95c 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h @@ -27,6 +27,19 @@ class GpuInstructionFusion : public InstructionFusion { explicit GpuInstructionFusion(bool may_duplicate) : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {} + // Maximum number of operands plus outputs allowed on a single fusion node. + // Exposed publicly mainly for tests. + static constexpr int64 kMaxOperandsAndOutputsPerFusion = 64; + + // Determines whether the combination of `a` and `b` into a (possibly + // multi-output) fusion would be "too large" -- i.e., have more operands and + // outputs than is allowed. + // + // `ShouldFuse` and `ShouldFuseIntoMultiOutput` call this; it's public so that + // other fusion passes (e.g. GPU multi-output fusion) can also call this. + static bool FusionWouldBeTooLarge(const HloInstruction* a, + const HloInstruction* b); + static bool IsExpensive(const HloInstruction& instruction); bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index ec60f3a1673b26182a0603dbbbff0a1b80d650c1..bca775c4750dd3aa679846d54e29a9d277adad79 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -17,9 +17,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" namespace op = xla::testing::opcode_matchers; @@ -33,7 +33,7 @@ TEST_F(InstructionFusionTest, CostlyProducerAndOperandElementReusingConsumerNotFused) { HloComputation::Builder builder(TestName()); HloInstruction* const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0)); HloInstruction* broadcast2 = @@ -53,7 +53,7 @@ TEST_F(InstructionFusionTest, NonCostlyProducerAndOperandElementReusingConsumerFused) { HloComputation::Builder builder(TestName()); HloInstruction* const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); HloInstruction* negate1 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kNegate, const0)); HloInstruction* broadcast2 = @@ -73,7 +73,7 @@ TEST_F(InstructionFusionTest, CostlyProducerAndNonOperandElementReusingConsumerFused_Reshape) { HloComputation::Builder builder(TestName()); HloInstruction* const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0)); HloInstruction* reshape2 = builder.AddInstruction( @@ -92,7 +92,7 @@ TEST_F(InstructionFusionTest, 
CostlyProducerAndNonOperandElementReusingConsumerFused_Transpose) { HloComputation::Builder builder(TestName()); HloInstruction* const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(S32, {}), HloOpcode::kExp, const0)); HloInstruction* transpose2 = builder.AddInstruction( @@ -143,7 +143,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { // Tests that broadcasts fused into a fusion with a reduce root. TEST_F(InstructionFusionTest, BroadcastIntoReduce) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module add { @@ -168,11 +168,83 @@ TEST_F(InstructionFusionTest, BroadcastIntoReduce) { HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Fusion()); EXPECT_THAT(root->fused_expression_root(), - op::Reduce(op::Broadcast(op::Parameter()), op::Parameter())); + op::Reduce(op::Broadcast(op::Constant()), op::Constant())); +} + +TEST_F(InstructionFusionTest, DoNotFuseLayoutChangingOpWithReduce) { + auto module = ParseHloString(R"( + HloModule test_module + + add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + + ENTRY entry { + p0 = f32[16,16,16,16]{3,2,1,0} parameter(0) + copy = f32[16,16,16,16]{0,1,2,3} copy(p0) + constant.1 = f32[] constant(0) + ROOT reduce = f32[16] reduce(copy, constant.1), dimensions={0,1,2}, to_apply=add + })") + .ValueOrDie(); + + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); +} + +TEST_F(InstructionFusionTest, DoNotFuseLayoutChangingOpWithReduceFusion) { + auto module = ParseHloString(R"( + HloModule test_module + + add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + + fused_reduce { + p0.1 = f32[16,16,16,16]{0,1,2,3} 
parameter(0) + mul = f32[16,16,16,16]{0,1,2,3} multiply(p0.1, p0.1) + c0.1 = f32[] constant(0) + ROOT root = f32[] reduce(mul, c0.1), dimensions={0,1,2,3}, to_apply=add + } + + ENTRY entry { + p0 = f32[16,16,16,16]{3,2,1,0} parameter(0) + copy = f32[16,16,16,16]{0,1,2,3} copy(p0) + fusion = f32[] fusion(copy), kind=kInput, calls=fused_reduce + ROOT root = (f32[]) tuple(fusion) + })") + .ValueOrDie(); + + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); +} + +TEST_F(InstructionFusionTest, FuseLayoutChangingOpWithElementwise) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY entry { + p0 = f32[16,16,16,16]{3,2,1,0} parameter(0) + copy = f32[16,16,16,16]{0,1,2,3} copy(p0) + ROOT add = f32[16,16,16,16]{0,1,2,3} add(copy, copy) + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), op::Add(op::Copy(), op::Copy())); } TEST_F(InstructionFusionTest, BitcastIntoAdd) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY BroadcastIntoAdd { @@ -194,7 +266,7 @@ TEST_F(InstructionFusionTest, BitcastIntoAdd) { } TEST_F(InstructionFusionTest, AddIntoBitcast) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY BroadcastIntoAdd { @@ -216,7 +288,7 @@ TEST_F(InstructionFusionTest, AddIntoBitcast) { } TEST_F(InstructionFusionTest, DontFuseGTE) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY DontFuseGTE { p0 = (f32[10], f32[10]) parameter(0) @@ -232,7 +304,7 @@ TEST_F(InstructionFusionTest, DontFuseGTE) { } TEST_F(InstructionFusionTest, DotOutputFusion) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion 
{ alpha = f32[] constant(3) @@ -255,13 +327,13 @@ TEST_F(InstructionFusionTest, DotOutputFusion) { EXPECT_THAT( root->fused_expression_root(), op::Multiply(op::Dot(op::Parameter(), op::Transpose(op::Parameter())), - op::Broadcast(op::Parameter()))); + op::Broadcast(op::Constant()))); } // Compute sum(1/p0), where p0 has type f32, twice. Check that the division is // duplicated and fused into both reduces. TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module Add { lhs = f32[] parameter(0) @@ -292,7 +364,7 @@ TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) { // is *not* duplicated and fused into both reduces, because we say that integer // division is not cheap. TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module Add { lhs = s32[] parameter(0) @@ -317,7 +389,7 @@ TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) { } TEST_F(InstructionFusionTest, DotOutputFusionImpossible) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY NoOutputFusion { alpha = f32[] constant(3) @@ -339,7 +411,7 @@ TEST_F(InstructionFusionTest, DotOutputFusionImpossible) { EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop); EXPECT_THAT(root->fused_expression_root(), op::Multiply(op::Multiply(op::Parameter(), op::Parameter()), - op::Broadcast(op::Parameter()))); + op::Broadcast(op::Constant()))); } // Counts the HLO ops with a given op code in the specified module. 
@@ -365,13 +437,13 @@ static StatusOr FindHloInstruction( } return NotFound( "Computation '%s' does not contain an instruction with op code '%s'.", - computation.name().c_str(), HloOpcodeString(op).c_str()); + computation.name(), HloOpcodeString(op)); } TEST_F(InstructionFusionTest, MultiOutputFusion) { // sub --> add --> tuple // \---------------/ - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -403,7 +475,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion) { TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) { // tanh --> add --> tuple // \---------------/ - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -424,7 +496,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) { TEST_F(InstructionFusionTest, MultiOutputFusion2) { // sub --> add1 --\--------\ // \----------> add2 --> tuple - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -457,7 +529,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion2) { TEST_F(InstructionFusionTest, MultiOutputFusion3) { // sub --> add1 ----\--------\ // \ --> add2 --> add3 --> tuple - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -492,7 +564,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion3) { TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) { // sub --> mul ---\ // \--> call --> add --> tuple - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { c = f32[] constant(42) @@ -527,7 +599,7 @@ TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) { TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) { // sub[2,3] --> add[4,3] --> 
tuple([2,3], [4,3]) // \-------------------------/ - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[2,3]{1,0} parameter(0) @@ -548,7 +620,7 @@ TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) { } TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module add_computation { @@ -581,5 +653,60 @@ TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) { << module->ToString(); } +TEST_F(InstructionFusionTest, FuseScalarConstant) { + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY FuseScalarConstant { + p0 = f32[] parameter(0) + c0 = f32[] constant(1) + add1 = f32[] add(p0, c0) + b0 = f32[2]{0} broadcast(add1), dimensions={} + c1 = f32[2]{0} constant({1, 2}) + ROOT add2 = f32[2]{0} add(b0, c1) + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), + op::Add(op::Broadcast(op::Add(op::Parameter(), op::Constant())), + op::Parameter())); +} + +// Check that we limit the number of operands to fusions we create. +TEST_F(InstructionFusionTest, AvoidsLargeFusion) { + constexpr int64 kNumParams = 200; + ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion); + + // Compute p0 + p1 + ... + pN. 
+ HloComputation::Builder b(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {10, 100}); + auto param0 = + b.AddInstruction(HloInstruction::CreateParameter(0, shape, "p")); + auto sum = param0; + for (int64 i = 1; i < kNumParams; ++i) { + auto param = + b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p")); + sum = b.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param)); + } + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(b.Build()); + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + SCOPED_TRACE(module->ToString()); + for (const HloInstruction* instr : computation->instructions()) { + EXPECT_LE(instr->operand_count(), + GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion) + << instr->ToString(); + } +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 22e715099526c20532bb298e84e50457d89f615e..20d523abe0552f0bc22c365007c096666ec888f6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -38,40 +38,44 @@ namespace gpu { namespace { // Return whether the given shape is a matrix with no padding. -bool IsRank2WithNoPadding(const Shape& shape) { - return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape); +bool IsRank2WithNoPadding(const Shape& shape, int64 batch_dimensions_size) { + return ShapeUtil::Rank(shape) == batch_dimensions_size + 2 && + !LayoutUtil::IsPadded(shape); } // In a gemm operation where output = lhs * rhs, check whether the given shapes // are valid for the operation. 
bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, - const Shape& output_shape) { + const Shape& output_shape, + int64 batch_dimensions_size) { // The inputs and the output must // 1) be matrices with no padding and a non-zero number of elements, // 2) have an allowed element type. PrimitiveType output_primitive_type = output_shape.element_type(); bool type_is_allowed = (output_primitive_type == F16 || output_primitive_type == F32 || - output_primitive_type == F64); - return type_is_allowed && IsRank2WithNoPadding(lhs_shape) && - IsRank2WithNoPadding(rhs_shape) && - IsRank2WithNoPadding(output_shape) && - !ShapeUtil::HasZeroElements(lhs_shape) && - !ShapeUtil::HasZeroElements(rhs_shape); + output_primitive_type == F64 || output_primitive_type == C64); + return type_is_allowed && + IsRank2WithNoPadding(lhs_shape, batch_dimensions_size) && + IsRank2WithNoPadding(rhs_shape, batch_dimensions_size) && + IsRank2WithNoPadding(output_shape, batch_dimensions_size) && + !ShapeUtil::IsZeroElementArray(lhs_shape) && + !ShapeUtil::IsZeroElementArray(rhs_shape); } bool DotImplementedAsGemm(const HloInstruction& dot) { CHECK_EQ(dot.opcode(), HloOpcode::kDot); const Shape& lhs_shape = dot.operand(0)->shape(); const Shape& rhs_shape = dot.operand(1)->shape(); + const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers(); // If gemm can accept the operand shapes, use it rather than a custom // kernel. - if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape())) { + if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape(), + dim_numbers.lhs_batch_dimensions_size())) { // The size of the reduction dimension should match. The shape inference // guarantees this invariant, so the check here is for programming // errors. 
- const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers(); CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)), rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0))); return true; @@ -81,11 +85,6 @@ bool DotImplementedAsGemm(const HloInstruction& dot) { } // namespace bool ImplementedAsGemm(const HloInstruction& hlo) { - // We can only do this if the HLO is unnested. - if (hlo.parent() != hlo.GetModule()->entry_computation()) { - return false; - } - // For certain types of Dot, we can call pre-canned BLAS gemm. if (hlo.opcode() == HloOpcode::kDot) { return DotImplementedAsGemm(hlo); @@ -145,10 +144,12 @@ bool ImplementedAsLibraryCall(const HloInstruction& hlo) { IsCustomCallToDnnConvolution(hlo); } -static HloInstruction* CreateCudnnConv( - const char* call_target, const Shape& shape, HloInstruction* lhs, - HloInstruction* rhs, const Window& window, - const ConvolutionDimensionNumbers& dnums) { +static HloInstruction* CreateCudnnConv(const char* call_target, + const Shape& shape, HloInstruction* lhs, + HloInstruction* rhs, + const Window& window, + const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count) { HloComputation* computation = lhs->parent(); // This call returns a tuple of (conv_result, scratch_memory), where @@ -162,43 +163,38 @@ static HloInstruction* CreateCudnnConv( Shape call_shape = ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})}); - // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn - // algorithm to use, and a boolean indicating whether to use tensor cores. - // - // It's up to a later pass to choose the algorithm and decide whether to use - // tensor cores, so to indicate that we haven't yet made a choice, we speicfy - // -1 and false for those args. 
- HloInstruction* negative_one = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(-1))); - HloInstruction* false_constant = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloInstruction* custom_call = - computation->AddInstruction(HloInstruction::CreateCustomCall( - call_shape, {lhs, rhs, negative_one, false_constant}, call_target)); + HloInstruction* custom_call = computation->AddInstruction( + HloInstruction::CreateCustomCall(call_shape, {lhs, rhs}, call_target)); custom_call->set_window(window); custom_call->set_convolution_dimension_numbers(dnums); + custom_call->set_feature_group_count(feature_group_count); return custom_call; } -HloInstruction* CreateCudnnConvForward( - const Shape& shape, HloInstruction* input, HloInstruction* kernel, - const Window& window, const ConvolutionDimensionNumbers& dnums) { +HloInstruction* CreateCudnnConvForward(const Shape& shape, + HloInstruction* input, + HloInstruction* kernel, + const Window& window, + const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count) { return CreateCudnnConv(kCudnnConvForwardCallTarget, shape, input, kernel, - window, dnums); + window, dnums, feature_group_count); } HloInstruction* CreateCudnnConvBackwardInput( const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, - const Window& window, const ConvolutionDimensionNumbers& dnums) { + const Window& window, const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count) { return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, shape, output, - reverse_filter, window, dnums); + reverse_filter, window, dnums, feature_group_count); } HloInstruction* CreateCudnnConvBackwardFilter( const Shape& shape, HloInstruction* input, HloInstruction* output, - const Window& window, const ConvolutionDimensionNumbers& dnums) { + const Window& window, const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count) { return 
CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, shape, input, - output, window, dnums); + output, window, dnums, feature_group_count); } bool IsReductionToVector(const HloInstruction& reduce) { @@ -227,8 +223,8 @@ bool IsReductionToVector(const HloInstruction& reduce) { // This emits a device-side call to // "i32 vprintf(i8* fmt, arguments_type* arguments)" in the driver; see // http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls -llvm::Value* EmitPrintf(tensorflow::StringPiece fmt, - tensorflow::gtl::ArraySlice arguments, +llvm::Value* EmitPrintf(absl::string_view fmt, + absl::Span arguments, llvm::IRBuilder<>* builder) { std::vector argument_types; for (auto argument : arguments) { @@ -253,15 +249,17 @@ llvm::Value* EmitPrintf(tensorflow::StringPiece fmt, arguments_ptr}); } -llvm::Value* EmitShuffleDown(llvm::Value* value, llvm::Value* offset, - llvm::IRBuilder<>* builder) { +llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, + llvm::IRBuilder<>* builder) { int bit_width = value->getType()->getPrimitiveSizeInBits(); + llvm::Value* all_warps_mask = builder->getInt32(-1); // Special case for efficiency if (value->getType()->isFloatTy() && bit_width == 32) { return llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_shfl_down_f32, - {value, offset, builder->getInt32(kWarpSize - 1)}, {}, builder); + llvm::Intrinsic::nvvm_shfl_sync_down_f32, + {all_warps_mask, value, offset, builder->getInt32(kWarpSize - 1)}, {}, + builder); } // We must split values wider than 32 bits as the "shfl" instruction operates @@ -275,10 +273,11 @@ llvm::Value* EmitShuffleDown(llvm::Value* value, llvm::Value* offset, for (int i = 0; i < num_segments; ++i) { x = builder->CreateInsertElement( x, - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_shfl_down_i32, - {builder->CreateExtractElement(x, i), - offset, builder->getInt32(kWarpSize - 1)}, - {}, builder), + llvm_ir::EmitCallToIntrinsic( + 
llvm::Intrinsic::nvvm_shfl_sync_down_i32, + {all_warps_mask, builder->CreateExtractElement(x, i), offset, + builder->getInt32(kWarpSize - 1)}, + {}, builder), i); } return builder->CreateBitCast( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index 59455f389e733fee2d6cace7486f919a0c5e834e..59c65fc2686cd4a00a3770ebaedf637e8f556828 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -31,6 +31,12 @@ namespace gpu { constexpr int64 kWarpSize = 32; // Returns true if `hlo` will be implemented as a call to BLAS gemm. +// +// Precondition: `hlo` is in an "unnested context", meaning, it lives within the +// entry computation, within the either of a while loop's subcomputations, +// within any of a conditional's subcomputations, etc., but *does not* live +// within a reduce subcomputation, a map subcomputation, a fusion +// subcomputation, etc. It's OK if `hlo` *is* a fusion. bool ImplementedAsGemm(const HloInstruction& hlo); // A call to cuDNN for batch normalization is represented as CustomCall HLO with @@ -103,15 +109,20 @@ bool IsCustomCallToDnnConvolution(const HloInstruction& hlo); // // The created cudnn call will use the default cudnn algorithm and no scratch // space. 
-HloInstruction* CreateCudnnConvForward( - const Shape& shape, HloInstruction* input, HloInstruction* kernel, - const Window& window, const ConvolutionDimensionNumbers& dnums); +HloInstruction* CreateCudnnConvForward(const Shape& shape, + HloInstruction* input, + HloInstruction* kernel, + const Window& window, + const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count); HloInstruction* CreateCudnnConvBackwardInput( const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, - const Window& window, const ConvolutionDimensionNumbers& dnums); + const Window& window, const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count); HloInstruction* CreateCudnnConvBackwardFilter( const Shape& shape, HloInstruction* input, HloInstruction* output, - const Window& window, const ConvolutionDimensionNumbers& dnums); + const Window& window, const ConvolutionDimensionNumbers& dnums, + int64 feature_group_count); // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm // or cuDNN convolution. @@ -120,18 +131,22 @@ bool ImplementedAsLibraryCall(const HloInstruction& hlo); bool IsReductionToVector(const HloInstruction& reduce); // Emits call to "vprintf" with given format and arguments. -llvm::Value* EmitPrintf(tensorflow::StringPiece fmt, - tensorflow::gtl::ArraySlice arguments, +llvm::Value* EmitPrintf(absl::string_view fmt, + absl::Span arguments, llvm::IRBuilder<>* builder); // Emits code to shuffle data between threads of a warp. This has the same -// semantics as the PTX "shfl.down" instruction [0] but works for values of any -// size. The last operand of the emitted "shfl" is `kWarpSize - 1`. +// semantics as the PTX "shfl.sync.down" instruction but works for values that +// aren't 32 bits in size. The last operand of the emitted "shfl" is +// `kWarpSize - 1`. +// +// This function emits a "full-warp" shuffle, which all threads of a warp +// participate in. 
*Do not use this function from a divergent context:* You +// can't correctly do so on both Volta and earlier GPUs. // -// [0] -// http://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl -llvm::Value* EmitShuffleDown(llvm::Value* value, llvm::Value* offset, - llvm::IRBuilder<>* builder); +// https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync +llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, + llvm::IRBuilder<>* builder); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 1e0db2821a2c212d0f212ae94ab69231bc6053ea..ffca5d6549a8316a7c7b7946d9943f091c133d1b 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/algorithm/container.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" @@ -57,44 +58,30 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config, IrEmitterContext* ir_emitter_context, bool is_nested) : ir_emitter_context_(ir_emitter_context), module_(ir_emitter_context->llvm_module()), - ir_builder_(module_->getContext()), + b_(module_->getContext()), bindings_(ir_emitter_context->hlo_module(), - &ir_emitter_context->buffer_assignment(), &ir_builder_, module_, + &ir_emitter_context->buffer_assignment(), &b_, module_, is_nested), hlo_module_config_(hlo_module_config) { - ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags( + b_.setFastMathFlags(llvm_ir::GetFastMathFlags( /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_enable_fast_math())); + .xla_gpu_enable_fast_math())); } Status IrEmitter::DefaultAction(HloInstruction* hlo) { 
ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator; for (const HloInstruction* operand : hlo->operands()) { operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) { - return GetIrArray(*operand, *hlo) - .EmitReadArrayElement(index, &ir_builder_); + return GetIrArray(*operand, *hlo).EmitReadArrayElement(index, &b_); }; } return EmitTargetElementLoop( - *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_, + *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &b_, GetNestedComputer()) .MakeElementGenerator(hlo, operand_to_generator)); } Status IrEmitter::HandleConstant(HloInstruction* constant) { - const Literal& literal = constant->literal(); - llvm::Constant* initializer = - llvm_ir::ConvertLiteralToIrConstant(literal, module_); - llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( - *module_, initializer->getType(), - /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer, - /*Name=*/""); - VLOG(2) << "HandleConstant: " << constant->ToString() << std::endl - << " emitted_value: " << llvm_ir::DumpToString(*global_for_const) - << std::endl - << " its type: " - << llvm_ir::DumpToString(*global_for_const->getType()); - bindings_.BindHloToIrValue(*constant, global_for_const); return Status::OK(); } @@ -119,15 +106,10 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) { get_tuple_element->shape(), get_tuple_element->tuple_index(), // TODO(b/26344050): tighten the alignment here // based on the real element type. - /*alignment=*/1, GetBasePointer(*operand), &ir_builder_, module_)); + /*alignment=*/1, GetBasePointer(*operand), &b_, module_)); return Status::OK(); } -Status IrEmitter::HandleSort(HloInstruction*) { - // TODO(b/26783907): Implement sort on GPU. 
- return Unimplemented("sort"); -} - Status IrEmitter::HandleSend(HloInstruction*) { return Unimplemented("Send is not implemented on GPU"); } @@ -144,19 +126,22 @@ Status IrEmitter::HandleRecvDone(HloInstruction*) { return Unimplemented("Recv-done is not implemented on GPU"); } +Status IrEmitter::HandleScatter(HloInstruction*) { + return Unimplemented("Scatter is not implemented on GPUs."); +} + Status IrEmitter::HandleTuple(HloInstruction* tuple) { std::vector base_ptrs; for (const HloInstruction* operand : tuple->operands()) { base_ptrs.push_back(GetBasePointer(*operand)); } - llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &ir_builder_, - module_); + llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &b_, module_); return Status::OK(); } Status IrEmitter::EmitCallToNestedComputation( const HloComputation& nested_computation, - tensorflow::gtl::ArraySlice operands, llvm::Value* output) { + absl::Span operands, llvm::Value* output) { TF_RET_CHECK(nested_computation.num_parameters() > 0); llvm::Function*& emitted_function = computation_to_ir_function_[&nested_computation]; @@ -171,7 +156,7 @@ Status IrEmitter::EmitCallToNestedComputation( std::vector arguments(operands.begin(), operands.end()); arguments.push_back(output); arguments.push_back(bindings_.GetTempBufferBase()); - ir_builder_.CreateCall(emitted_function, arguments); + Call(emitted_function, arguments); return Status::OK(); } @@ -191,45 +176,44 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( HloOpcode root_opcode = computation.root_instruction()->opcode(); PrimitiveType element_type = computation.root_instruction()->shape().element_type(); - llvm::Value* source = ir_builder_.CreateLoad(source_address, "source"); + bool is_atomic_integral = element_type == S32 || element_type == U32 || + element_type == S64 || element_type == U64; + llvm::Value* source = Load(source_address, "source"); if (root_opcode == HloOpcode::kAdd) { // NVPTX supports atomicAdd on F32 and integer types. 
if (element_type == F32) { // F32 + F32 llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_atomic_load_add_f32, {output_address, source}, - {output_address->getType()}, &ir_builder_); + {output_address->getType()}, &b_); return true; } - if (primitive_util::IsIntegralType(element_type)) { + if (is_atomic_integral) { // integral + integral - ir_builder_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address, - source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(llvm::AtomicRMWInst::Add, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } } - // NVPTX supports atomicMax and atomicMin on only integer types. - if (root_opcode == HloOpcode::kMaximum && - primitive_util::IsIntegralType(element_type)) { + // NVPTX supports atomicMax and atomicMin only on integer types. + if (root_opcode == HloOpcode::kMaximum && is_atomic_integral) { // max(integral, integral) auto opcode = primitive_util::IsSignedIntegralType(element_type) ? llvm::AtomicRMWInst::Max : llvm::AtomicRMWInst::UMax; - ir_builder_.CreateAtomicRMW(opcode, output_address, source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(opcode, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } - if (root_opcode == HloOpcode::kMinimum && - primitive_util::IsIntegralType(element_type)) { + if (root_opcode == HloOpcode::kMinimum && is_atomic_integral) { // min(integral, integral) auto opcode = primitive_util::IsSignedIntegralType(element_type) ? llvm::AtomicRMWInst::Min : llvm::AtomicRMWInst::UMin; - ir_builder_.CreateAtomicRMW(opcode, output_address, source, - llvm::AtomicOrdering::SequentiallyConsistent); + AtomicRMW(opcode, output_address, source, + llvm::AtomicOrdering::SequentiallyConsistent); return true; } @@ -301,20 +285,20 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation, llvm::Type* element_address_type = element_type->getPointerTo(); int atomic_size = (element_size < 32) ? 
32 : element_size; - llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size); + llvm::Type* atomic_type = b_.getIntNTy(atomic_size); llvm::Type* atomic_address_type = atomic_type->getPointerTo(output_address_type->getPointerAddressSpace()); // cas_old_output_address and cas_new_output_address point to the scratch // memory where we store the old and new values for the repeated atomicCAS // operations. - llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca( - atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address"); - llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca( - atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address"); + llvm::Value* cas_old_output_address = + Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address"); + llvm::Value* cas_new_output_address = + Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address"); // Emit preparation code to the preheader. - llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock(); + llvm::BasicBlock* loop_preheader_bb = b_.GetInsertBlock(); llvm::Value* atomic_memory_address; // binop_output_address points to the scratch memory that stores the @@ -325,77 +309,65 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation, CHECK_EQ((element_size % sizeof(char)), 0); llvm::Type* address_int_type = module_->getDataLayout().getIntPtrType(output_address_type); - atomic_memory_address = - ir_builder_.CreatePtrToInt(output_address, address_int_type); + atomic_memory_address = PtrToInt(output_address, address_int_type); llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3); - llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask); + llvm::Value* offset = And(atomic_memory_address, mask); mask = llvm::ConstantInt::get(address_int_type, -4); - atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask); + atomic_memory_address = And(atomic_memory_address, mask); atomic_memory_address = - 
ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type); - binop_output_address = ir_builder_.CreateAdd( - ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type), - offset); + IntToPtr(atomic_memory_address, atomic_address_type); binop_output_address = - ir_builder_.CreateIntToPtr(binop_output_address, element_address_type); + Add(PtrToInt(cas_new_output_address, address_int_type), offset); + binop_output_address = IntToPtr(binop_output_address, element_address_type); } else { - atomic_memory_address = - ir_builder_.CreateBitCast(output_address, atomic_address_type); + atomic_memory_address = BitCast(output_address, atomic_address_type); binop_output_address = - ir_builder_.CreateBitCast(cas_new_output_address, element_address_type); + BitCast(cas_new_output_address, element_address_type); } // Use the value from the memory that atomicCAS operates on to initialize // cas_old_output. - llvm::Value* cas_old_output = - ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output"); - ir_builder_.CreateStore(cas_old_output, cas_old_output_address); + llvm::Value* cas_old_output = Load(atomic_memory_address, "cas_old_output"); + Store(cas_old_output, cas_old_output_address); llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock( - ir_builder_.GetInsertPoint(), "atomic_op_loop_exit"); - llvm::BasicBlock* loop_body_bb = - llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body", - ir_builder_.GetInsertBlock()->getParent()); - ir_builder_.SetInsertPoint(loop_body_bb); + b_.GetInsertPoint(), "atomic_op_loop_exit"); + llvm::BasicBlock* loop_body_bb = llvm::BasicBlock::Create( + b_.getContext(), "atomic_op_loop_body", b_.GetInsertBlock()->getParent()); + b_.SetInsertPoint(loop_body_bb); // Change preheader's successor from loop_exit_bb to loop_body_bb. loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb); // Emit the body of the loop that repeatedly invokes atomicCAS. 
// // Use cas_old_output to initialize cas_new_output. - cas_old_output = - ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output"); - ir_builder_.CreateStore(cas_old_output, cas_new_output_address); + cas_old_output = Load(cas_old_output_address, "cas_old_output"); + Store(cas_old_output, cas_new_output_address); // Emits code to calculate new_output = operation(old_output, source); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( computation, {binop_output_address, source_address}, binop_output_address)); - llvm::Value* cas_new_output = - ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output"); + llvm::Value* cas_new_output = Load(cas_new_output_address, "cas_new_output"); // Emit code to perform the atomicCAS operation // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output, // cas_new_output); - llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg( - atomic_memory_address, cas_old_output, cas_new_output, - llvm::AtomicOrdering::SequentiallyConsistent, - llvm::AtomicOrdering::SequentiallyConsistent); + llvm::Value* ret_value = + AtomicCmpXchg(atomic_memory_address, cas_old_output, cas_new_output, + llvm::AtomicOrdering::SequentiallyConsistent, + llvm::AtomicOrdering::SequentiallyConsistent); // Extract the memory value returned from atomicCAS and store it as // cas_old_output. - ir_builder_.CreateStore( - ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"), - cas_old_output_address); + Store(ExtractValue(ret_value, 0, "cas_old_output"), cas_old_output_address); // Extract the success bit returned from atomicCAS and generate a // conditional branch on the success bit. - ir_builder_.CreateCondBr( - ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb, - loop_body_bb); + CondBr(ExtractValue(ret_value, 1, "success"), loop_exit_bb, loop_body_bb); // Set the insertion point to the exit basic block so that the caller of // this method can continue emitting code to the right place. 
- SetToFirstInsertPoint(loop_exit_bb, &ir_builder_); + SetToFirstInsertPoint(loop_exit_bb, &b_); return Status::OK(); } @@ -406,8 +378,8 @@ Status IrEmitter::EmitAtomicOperationForNestedComputation( // TODO(b/30258929): We only accept binary computations so far. return Unimplemented( "We only support atomic functions with exactly two parameters, but " - "computation %s has %lld.", - computation.name().c_str(), computation.num_parameters()); + "computation %s has %d.", + computation.name(), computation.num_parameters()); } if (MaybeEmitDirectAtomicOperation(computation, output_address, @@ -421,46 +393,49 @@ Status IrEmitter::EmitAtomicOperationForNestedComputation( Status IrEmitter::HandleSelect(HloInstruction* select) { auto pred = select->operand(0); - auto on_true = select->operand(1); - auto on_false = select->operand(2); TF_RET_CHECK(pred->shape().element_type() == PRED); - - if (ShapeUtil::IsTuple(select->shape())) { - llvm_ir::EmitTupleSelect(GetIrArray(*select, *select), - GetIrArray(*pred, *select), - GetBasePointer(*on_true), - GetBasePointer(*on_false), &ir_builder_, module_); - return Status::OK(); - } - // We must not call the subclass `DefaultAction` method, lest its // `HandleSelect` call `IrEmitter::HandleSelect` and its `DefaultAction` // assume no handler has already been called. 
return IrEmitter::DefaultAction(select); } +Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) { + auto pred = tuple_select->operand(0); + auto on_true = tuple_select->operand(1); + auto on_false = tuple_select->operand(2); + TF_RET_CHECK(pred->shape().element_type() == PRED); + TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape())); + TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape())); + llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select), + GetIrArray(*pred, *tuple_select), + GetBasePointer(*on_true), GetBasePointer(*on_false), + &b_, module_); + return Status::OK(); +} + namespace { -llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* ir_builder) { - return ir_builder->CreateExtractValue(x, {0}); -} - -llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* ir_builder) { - return ir_builder->CreateExtractValue(x, {1}); -} - -std::pair MultiplyComplex( - llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder) { - llvm::Value* lhs_real = Real(lhs_value, ir_builder); - llvm::Value* lhs_imag = Imag(lhs_value, ir_builder); - llvm::Value* rhs_real = Real(rhs_value, ir_builder); - llvm::Value* rhs_imag = Imag(rhs_value, ir_builder); - llvm::Value* real_result1 = ir_builder->CreateFMul(lhs_real, rhs_real); - llvm::Value* real_result2 = ir_builder->CreateFMul(lhs_imag, rhs_imag); - llvm::Value* real_result = ir_builder->CreateFSub(real_result1, real_result2); - llvm::Value* imag_result1 = ir_builder->CreateFMul(lhs_real, rhs_imag); - llvm::Value* imag_result2 = ir_builder->CreateFMul(lhs_imag, rhs_real); - llvm::Value* imag_result = ir_builder->CreateFAdd(imag_result1, imag_result2); +llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* b) { + return b->CreateExtractValue(x, {0}); +} + +llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* b) { + return b->CreateExtractValue(x, {1}); +} + +std::pair MultiplyComplex(llvm::Value* lhs_value, + llvm::Value* rhs_value, + llvm::IRBuilder<>* b) { + llvm::Value* lhs_real = 
Real(lhs_value, b); + llvm::Value* lhs_imag = Imag(lhs_value, b); + llvm::Value* rhs_real = Real(rhs_value, b); + llvm::Value* rhs_imag = Imag(rhs_value, b); + llvm::Value* real_result1 = b->CreateFMul(lhs_real, rhs_real); + llvm::Value* real_result2 = b->CreateFMul(lhs_imag, rhs_imag); + llvm::Value* real_result = b->CreateFSub(real_result1, real_result2); + llvm::Value* imag_result1 = b->CreateFMul(lhs_real, rhs_imag); + llvm::Value* imag_result2 = b->CreateFMul(lhs_imag, rhs_real); + llvm::Value* imag_result = b->CreateFAdd(imag_result1, imag_result2); return {real_result, imag_result}; } } // namespace @@ -474,23 +449,29 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { const Shape& lhs_shape = lhs_instruction->shape(); const Shape& rhs_shape = rhs_instruction->shape(); + const DotDimensionNumbers& dnums = dot->dot_dimension_numbers(); + CHECK_EQ(dnums.lhs_batch_dimensions_size(), + dnums.rhs_batch_dimensions_size()); + // TODO(b/110211620): Convert to use i32 index_type when it is possible. + llvm::Type* index_type = b_.getInt64Ty(); + llvm_ir::IrArray::Index element_index(index_type); if (ShapeUtil::IsScalar(lhs_shape) && ShapeUtil::IsScalar(rhs_shape)) { // If the operands are scalar, don't emit any loops. 
llvm::Value* lhs_value = - lhs_array.EmitReadArrayElement(/*index=*/{}, &ir_builder_); + lhs_array.EmitReadArrayElement(/*index=*/element_index, &b_); llvm::Value* rhs_value = - rhs_array.EmitReadArrayElement(/*index=*/{}, &ir_builder_); + rhs_array.EmitReadArrayElement(/*index=*/element_index, &b_); llvm::Value* result; if (ShapeUtil::ElementIsComplex(lhs_shape)) { - auto value = MultiplyComplex(lhs_value, rhs_value, &ir_builder_); + auto value = MultiplyComplex(lhs_value, rhs_value, &b_); result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType()); - result = ir_builder_.CreateInsertValue(result, value.first, {0}); - result = ir_builder_.CreateInsertValue(result, value.second, {1}); + result = InsertValue(result, value.first, {0}); + result = InsertValue(result, value.second, {1}); } else { - result = ir_builder_.CreateFMul(lhs_value, rhs_value); + result = FMul(lhs_value, rhs_value); } - target_array.EmitWriteArrayElement(/*index=*/{}, result, &ir_builder_); + target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_); return Status::OK(); } @@ -506,9 +487,15 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { const int64 lhs_reduction_dimension = ShapeUtil::GetDimensionNumber(lhs_shape, -1); const int64 rhs_reduction_dimension = - ShapeUtil::Rank(rhs_shape) >= 2 + ShapeUtil::Rank(rhs_shape) >= 2 + dnums.lhs_batch_dimensions_size() ? ShapeUtil::GetDimensionNumber(rhs_shape, -2) - : 0; + : dnums.lhs_batch_dimensions_size(); + + // Check that the batch dims don't cover the last two dims. + for (int64 batch_dim : dnums.lhs_batch_dimensions()) { + CHECK_NE(lhs_reduction_dimension, batch_dim); + CHECK_NE(rhs_reduction_dimension, batch_dim); + } // Verify the reduction dimension in the two operands are the same size. 
TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) == @@ -517,11 +504,18 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { // Create loop nests which loop through the LHS operand dimensions and the RHS // operand dimensions. The reduction dimension of the LHS and RHS are handled // in a separate innermost loop which performs the sum of products. - llvm_ir::ForLoopNest loop_nest(IrName(dot), &ir_builder_); - llvm_ir::IrArray::Index lhs_index = EmitOperandArrayLoopNest( - lhs_array, lhs_reduction_dimension, "lhs", &loop_nest); - llvm_ir::IrArray::Index rhs_index = EmitOperandArrayLoopNest( - rhs_array, rhs_reduction_dimension, "rhs", &loop_nest); + llvm_ir::ForLoopNest loop_nest(IrName(dot), &b_); + llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest( + lhs_array, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs"); + llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest( + rhs_array, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs"); + + // We don't have to iterate over the batch dimensions in both arrays, simplify + // the loop nest of the rhs. + for (int i = 0; i != dnums.lhs_batch_dimensions_size(); ++i) { + DCHECK(absl::c_linear_search(dnums.lhs_batch_dimensions(), i)); + rhs_index[i] = lhs_index[i]; + } // Create the reduction loop which does the sum of products reduction. std::unique_ptr reduction_loop = loop_nest.AddLoop( @@ -541,7 +535,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry( accum_type, // The pointee type of the alloca instruction. "accum_address", // The name of the alloca instruction. - &ir_builder_); + &b_); // Initialize the accumulator in the preheader to zero. 
new llvm::StoreInst( @@ -555,59 +549,58 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { // updated_accum = accum + lhs_element * rhs_element // *accum_address = updated_accum TF_RET_CHECK(!reduction_loop->GetBodyBasicBlock()->empty()); - ir_builder_.SetInsertPoint( + b_.SetInsertPoint( &*reduction_loop->GetBodyBasicBlock()->getFirstInsertionPt()); - llvm::Value* lhs_element = - lhs_array.EmitReadArrayElement(lhs_index, &ir_builder_); - llvm::Value* rhs_element = - rhs_array.EmitReadArrayElement(rhs_index, &ir_builder_); - llvm::Value* accum = ir_builder_.CreateLoad(accum_address); + llvm::Value* lhs_element = lhs_array.EmitReadArrayElement(lhs_index, &b_); + llvm::Value* rhs_element = rhs_array.EmitReadArrayElement(rhs_index, &b_); + llvm::Value* accum = Load(accum_address); llvm::Value* updated_accum; if (ShapeUtil::ElementIsComplex(lhs_shape)) { - auto value = MultiplyComplex(lhs_element, rhs_element, &ir_builder_); - llvm::Value* accum_real = Real(accum, &ir_builder_); - llvm::Value* real_sum = ir_builder_.CreateFAdd(accum_real, value.first); - updated_accum = ir_builder_.CreateInsertValue(accum, real_sum, {0}); - llvm::Value* accum_imag = Imag(accum, &ir_builder_); - llvm::Value* imag_sum = ir_builder_.CreateFAdd(accum_imag, value.second); - updated_accum = ir_builder_.CreateInsertValue(updated_accum, imag_sum, {1}); + auto value = MultiplyComplex(lhs_element, rhs_element, &b_); + llvm::Value* accum_real = Real(accum, &b_); + llvm::Value* real_sum = FAdd(accum_real, value.first); + updated_accum = InsertValue(accum, real_sum, {0}); + llvm::Value* accum_imag = Imag(accum, &b_); + llvm::Value* imag_sum = FAdd(accum_imag, value.second); + updated_accum = InsertValue(updated_accum, imag_sum, {1}); } else { - llvm::Value* product = ir_builder_.CreateFMul(lhs_element, rhs_element); - updated_accum = ir_builder_.CreateFAdd(accum, product); + llvm::Value* product = FMul(lhs_element, rhs_element); + updated_accum = FAdd(accum, product); } - 
ir_builder_.CreateStore(updated_accum, accum_address); + Store(updated_accum, accum_address); // After the reduction loop exits, store the accumulator into the target // address. The index into the target address is the concatenation of the rhs // and lhs indexes with the reduction dimensions removed. The terms from the // rhs index are the lower dimensions in the index so we add them first. - llvm_ir::IrArray::Index target_index; + llvm_ir::IrArray::Index target_index(index_type); for (size_t dimension = 0; dimension < lhs_index.size(); ++dimension) { if (dimension != lhs_reduction_dimension) { target_index.push_back(lhs_index[dimension]); } } - for (size_t dimension = 0; dimension < rhs_index.size(); ++dimension) { + // Skip over the batch dimensions to not have them in the index twice. + for (size_t dimension = dnums.lhs_batch_dimensions_size(); + dimension < rhs_index.size(); ++dimension) { if (dimension != rhs_reduction_dimension) { target_index.push_back(rhs_index[dimension]); } } - SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &b_); target_array.EmitWriteArrayElement( target_index, - ir_builder_.CreateLoad( - accum_address), // The value written to the target array. - &ir_builder_); + Load(accum_address), // The value written to the target array. + &b_); // Set the IR builder insert point to the exit basic block of the outer most // loop. This ensures later instructions are inserted after this loop nest. - ir_builder_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); + b_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); return Status::OK(); } Status IrEmitter::HandleConvolution(HloInstruction* convolution) { - if (ShapeUtil::HasZeroElements(convolution->shape())) { + if (ShapeUtil::IsZeroElementArray(convolution->shape())) { // Emit no code for an empty output. 
return Status::OK(); } @@ -617,7 +610,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { } Status IrEmitter::HandleFft(HloInstruction* fft) { - if (ShapeUtil::HasZeroElements(fft->shape())) { + if (ShapeUtil::IsZeroElementArray(fft->shape())) { // Emit no code for an empty output. return Status::OK(); } @@ -634,20 +627,22 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) { } Status IrEmitter::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support variadic reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Variadic reduce is not supported on GPU"); + } auto arg = reduce->operand(0); auto init_value = reduce->operand(1); - tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); + absl::Span dimensions(reduce->dimensions()); HloComputation* function = reduce->to_apply(); return EmitTargetElementLoop( *reduce, [=](const llvm_ir::IrArray::Index& index) -> StatusOr { // Initialize an accumulator with init_value. llvm::AllocaInst* accumulator_addr = - ir_builder_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType( + Alloca(llvm_ir::PrimitiveTypeToIrType( reduce->shape().element_type(), module_)); - ir_builder_.CreateStore( - ir_builder_.CreateLoad(GetBasePointer(*init_value)), - accumulator_addr); + Store(Load(GetBasePointer(*init_value)), accumulator_addr); // The enclosing loops go over all the target elements. Now we have to // compute the actual target element. For this, we build a new loop nest @@ -655,12 +650,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { // AddLoopsForShapeOnDimensions will return an Index where induction // Value*s are placed for each dimension in dimensions, and all the rest // are nullptrs. 
- llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_); + llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); const llvm_ir::IrArray::Index reduced_dims_index = loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, "reduction_dim"); - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_); + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); // Build a full index for the input argument, using reduced_dims_index // as the base. In reduced_dims_index only the reduction dimensions are @@ -679,13 +674,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { // Apply the reduction function to the loaded value. llvm::Value* input_address = - GetIrArray(*arg, *reduce) - .EmitArrayElementAddress(input_index, &ir_builder_); + GetIrArray(*arg, *reduce).EmitArrayElementAddress(input_index, &b_); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *function, {accumulator_addr, input_address}, accumulator_addr)); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_); - return ir_builder_.CreateLoad(accumulator_addr); + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); + return Load(accumulator_addr); }); } @@ -698,8 +692,8 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { for (HloInstruction* operand : fusion->operands()) { parameter_arrays.push_back(GetIrArray(*operand, *fusion)); } - GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_, - &ir_builder_, GetNestedComputer()); + GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_, &b_, + GetNestedComputer()); FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter); TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter)); @@ -729,24 +723,6 @@ Status IrEmitter::HandleOutfeed(HloInstruction*) { return Unimplemented("Outfeed is not supported on GPU."); } -Status IrEmitter::HandleRng(HloInstruction* random) { - ElementalIrEmitter::HloToElementGeneratorMap 
operand_to_generator; - for (const HloInstruction* operand : random->operands()) { - operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) { - return GetIrArray(*operand, *random) - .EmitReadArrayElement(index, &ir_builder_); - }; - } - // Emits a single-threaded loop because the loop body generated by the element - // generator for Rng can't be parallelized (b/32333178). - return llvm_ir::LoopEmitter( - GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_, - GetNestedComputer()) - .MakeElementGenerator(random, operand_to_generator), - GetIrArray(*random, *random), &ir_builder_) - .EmitLoop(IrName(random)); -} - Status IrEmitter::HandleBatchNormInference(HloInstruction*) { return Unimplemented( "The GPU backend does not implement BatchNormInference directly. It " @@ -770,52 +746,22 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) { "to a cudnn CustomCall using CudnnBatchNormRewriter."); } -llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest( - const llvm_ir::IrArray& operand_array, int64 reduction_dimension, - tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) { - // Prepares the dimension list we will use to emit the loop nest. Outermost - // loops are added first. Add loops in major-to-minor order, and skip the - // reduction dimension. - std::vector dimensions; - const Shape& shape = operand_array.GetShape(); - for (int i = 0; i < LayoutUtil::MinorToMajor(shape).size(); ++i) { - int64 dimension = LayoutUtil::Major(shape.layout(), i); - if (dimension != reduction_dimension) { - dimensions.push_back(dimension); - } - } - - // Create loop nest with one for-loop for each dimension of the - // output. - llvm_ir::IrArray::Index index = - loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix); - // Verify every dimension except the reduction dimension was set in the index. 
- for (size_t dimension = 0; dimension < index.size(); ++dimension) { - if (dimension == reduction_dimension) { - DCHECK_EQ(nullptr, index[dimension]); - } else { - DCHECK_NE(nullptr, index[dimension]); - } - } - return index; -} - StatusOr IrEmitter::ComputeNestedElement( const HloComputation& computation, - tensorflow::gtl::ArraySlice parameter_elements) { + absl::Span parameter_elements) { llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType( computation.root_instruction()->shape().element_type(), module_), - "return_buffer", &ir_builder_); + "return_buffer", &b_); std::vector parameter_buffers; for (llvm::Value* parameter_element : parameter_elements) { parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry( - parameter_element->getType(), "parameter_buffer", &ir_builder_)); - ir_builder_.CreateStore(parameter_element, parameter_buffers.back()); + parameter_element->getType(), "parameter_buffer", &b_)); + Store(parameter_element, parameter_buffers.back()); } TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers, return_buffer)); - return ir_builder_.CreateLoad(return_buffer); + return Load(return_buffer); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index e55dfc6dae844ceb1d28ad389d133c80823bad9a..579268f07185fd2d8ec74750f1bf833101149437 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -22,6 +22,8 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" @@ -35,13 +37,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -64,7 +65,8 @@ namespace gpu { // IrEmitterUnnested, but the code is generated using FusedIrEmitter, which is // not a subclass of gpu::IrEmitter, and in fact is better understood as an IR // generator generator. See comments on that class. -class IrEmitter : public DfsHloVisitorWithDefault { +class IrEmitter : public DfsHloVisitorWithDefault, + public IrBuilderMixin { public: IrEmitter(const IrEmitter&) = delete; IrEmitter& operator=(const IrEmitter&) = delete; @@ -79,7 +81,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleCrossReplicaSum(HloInstruction* crs) override; Status HandleInfeed(HloInstruction* infeed) override; Status HandleOutfeed(HloInstruction* outfeed) override; - Status HandleSort(HloInstruction* sort) override; Status HandleSend(HloInstruction* send) override; Status HandleSendDone(HloInstruction* send_done) override; Status HandleRecv(HloInstruction* recv) override; @@ -87,17 +88,20 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleParameter(HloInstruction* parameter) override; Status HandleReduce(HloInstruction* reduce) override; Status HandleTuple(HloInstruction* tuple) override; + Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* 
tuple_select) override; Status HandleFusion(HloInstruction* fusion) override; Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction* custom_call) override; - Status HandleRng(HloInstruction* random) override; Status HandleBatchNormInference(HloInstruction* batch_norm) override; Status HandleBatchNormTraining(HloInstruction* batch_norm) override; Status HandleBatchNormGrad(HloInstruction* batch_norm) override; Status FinishVisit(HloInstruction* root) override { return Status::OK(); } + llvm::IRBuilder<>* builder() { return &b_; } + protected: // Constructs an IrEmitter with the given IrEmitter context. // ir_emitter_context is owned by the caller and should outlive the IrEmitter @@ -139,9 +143,9 @@ class IrEmitter : public DfsHloVisitorWithDefault { // Emits a call in IR to the given nested computation with the given operands // and output. If no IR function has been previously emitted for the // computation, also emits such a function. - Status EmitCallToNestedComputation( - const HloComputation& nested_computation, - tensorflow::gtl::ArraySlice operands, llvm::Value* output); + Status EmitCallToNestedComputation(const HloComputation& nested_computation, + absl::Span operands, + llvm::Value* output); // Emits an atomic operation that implements `nested_computation` in the // sequentially consistent memory model. `output_address` and `source_address` @@ -161,7 +165,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { // The following fields track the IR emission state. According to LLVM memory // management rules, their memory is owned by the module. - llvm::IRBuilder<> ir_builder_; + llvm::IRBuilder<> b_; // Mapping from HLO to its underlying LLVM value. HloToIrBindings bindings_; @@ -170,17 +174,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { const HloModuleConfig& hlo_module_config_; private: - // Emits a series of nested loops for iterating over an operand array in the - // dot operation. 
Loops are constructed in major to minor dimension layout - // order. No loop is emitted for the given reduction_dimension. The function - // returns an IrArray index for the given operand_array containing the indvars - // of the loops. All dimensions of the index are filled except for the - // reduction dimension. name_suffix is the string to append to the names of - // LLVM constructs (eg, basic blocks) constructed by this method. - llvm_ir::IrArray::Index EmitOperandArrayLoopNest( - const llvm_ir::IrArray& operand_array, int64 reduction_dimension, - tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest); - // A helper method for EmitAtomicOperationForNestedComputation. Certain // computations, such as floating-point addition and integer maximization, can // be simply implemented using an LLVM atomic instruction. If "computation" is @@ -197,9 +190,16 @@ class IrEmitter : public DfsHloVisitorWithDefault { llvm::Value* output_address, llvm::Value* source_address); + // A helper method for HandleSort(). It adds the inner comparison loop where + // we compare elements pointed to by 'keys_index' and 'compare_keys_index'. + void EmitCompareLoop(int64 dimension_to_sort, + const llvm_ir::IrArray::Index& keys_index, + const llvm_ir::IrArray::Index& compare_keys_index, + const llvm_ir::IrArray& keys_array); + StatusOr ComputeNestedElement( const HloComputation& computation, - tensorflow::gtl::ArraySlice parameter_elements); + absl::Span parameter_elements); // Emits an atomic operation that implements `nested_computation` in the // sequentially consistent memory model. 
`output_address` and `source_address` diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc index bb47a4280541ce2806472aa9365bb0ef38c0c3b3..5c827e5f9cf3e1c04af444dae338a2ec411ce372 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc @@ -70,10 +70,10 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation( argument_dereferenceable_bytes.push_back(root_size); } // The base pointer of the memory block for all pre-allocated temp buffers. - argument_types.push_back(ir_builder_.getInt8PtrTy()); + argument_types.push_back(b_.getInt8PtrTy()); llvm::FunctionType* function_type = - llvm::FunctionType::get(ir_builder_.getVoidTy(), argument_types, false); + llvm::FunctionType::get(b_.getVoidTy(), argument_types, false); llvm::Function* function = llvm::Function::Create( function_type, // The function type. llvm::GlobalValue::InternalLinkage, // The linkage type. @@ -96,8 +96,7 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation( llvm::BasicBlock::Create(function->getContext(), "entry", function); // Emit a "return void" at entry_bb's end, and sets the insert point before // that return instruction. - ir_builder_.SetInsertPoint( - llvm::ReturnInst::Create(function->getContext(), entry_bb)); + b_.SetInsertPoint(llvm::ReturnInst::Create(function->getContext(), entry_bb)); std::vector non_io_hlos; for (const auto* hlo : nested_computation.instructions()) { @@ -120,25 +119,24 @@ Status IrEmitterNested::EmitTargetElementLoop( // For MOF we give the loop emitter an array for every output it should // generate. 
if (hlo.IsMultiOutputFusion()) { + const int64 num_elems = ShapeUtil::TupleElementCount(hlo.shape()); std::vector target_arrays; - for (int64 i = 0, e = ShapeUtil::TupleElementCount(hlo.shape()); i != e; - ++i) { + target_arrays.reserve(num_elems); + for (int64 i = 0; i != num_elems; ++i) { target_arrays.push_back(GetIrArray(hlo, hlo, {i})); } TF_RETURN_IF_ERROR( - llvm_ir::LoopEmitter(element_generator, target_arrays, &ir_builder_) - .EmitLoop()); + llvm_ir::LoopEmitter(element_generator, target_arrays, &b_).EmitLoop()); std::vector tuple_operand_ptrs; + tuple_operand_ptrs.reserve(num_elems); for (const llvm_ir::IrArray& array : target_arrays) { tuple_operand_ptrs.push_back(array.GetBasePointer()); } - llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_, - module_); + llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_); return Status::OK(); } - return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), - &ir_builder_) + return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), &b_) .EmitLoop(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ae4e305b8013b9b2415f9b708cb3ba8057475c8f..389a98facb9b553a91342bb7fc42642179aaf698 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -21,6 +21,12 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" @@ -28,10 +34,11 @@ limitations under the License. 
#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" +#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" @@ -47,29 +54,33 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h" #include "tensorflow/compiler/xla/service/gpu/memset_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h" #include "tensorflow/compiler/xla/service/gpu/while_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" 
-#include "tensorflow/compiler/xla/service/llvm_ir/ops.h" +#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -77,12 +88,12 @@ namespace gpu { namespace { +using absl::InlinedVector; +using absl::nullopt; +using absl::optional; +using absl::StrCat; +using llvm_ir::IrArray; using llvm_ir::IrName; -using tensorflow::gtl::ArraySlice; -using tensorflow::gtl::InlinedVector; -using tensorflow::gtl::nullopt; -using tensorflow::gtl::optional; -using tensorflow::strings::StrCat; // If a dimensions is smaller than this, untiled transposition may be more // efficient. @@ -162,43 +173,9 @@ Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { return DfsHloVisitor::Postprocess(hlo); } -namespace { -bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment, - const HloInstruction& hlo) { - // `hlo` needs to satisfy the following conditions to be implemented as a - // host-to-device cuMemcpy. - // - // 1. `hlo` is a kCopy instruction. - // 2. `hlo`'s only operand is a kConstant instruction. - // 3. `hlo` and its operand have the same shape (thus the same layout too). - // 4. The address of `hlo`'s buffer is known at runtime (without dereferencing - // pointers in a tuple). 
- return hlo.opcode() == HloOpcode::kCopy && - hlo.operand(0)->opcode() == HloOpcode::kConstant && - ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) && - buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok(); -} - -bool ImplementedAsDeviceToDeviceMemcpy( - const BufferAssignment& buffer_assignment, const HloInstruction& hlo) { - // `hlo` needs to satisfy three conditions to be implemented as a - // device-to-device cuMemcpy. - // - // 1. `hlo` is a kCopy instruction. - // 2. `hlo` and its operand have the same shape (thus the same layout too). - // 3. `hlo` and its operand have a statically-known buffer assignment - // (constants do not, for instance), which means the source buffer also - // resides on the device. - return hlo.opcode() == HloOpcode::kCopy && - ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) && - buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok() && - buffer_assignment.GetUniqueTopLevelSlice(hlo.operand(0)).ok(); -} -} // namespace - llvm::Function* IrEmitterUnnested::BuildKernelPrototype( const HloInstruction& inst, - tensorflow::gtl::ArraySlice args) { + absl::Span args) { // Compute the kernel name. The opcode string may contain "-" which cannot be // in a PTX function name, so sanitize the name before uniquifying it. 
string kernel_name = ir_emitter_context_->name_uniquer()->GetUniqueName( @@ -209,7 +186,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( llvm::LLVMContext& context = module->getContext(); llvm::FunctionType* kernel_type = llvm::FunctionType::get( /*Result=*/llvm::Type::getVoidTy(context), - std::vector(args.size(), ir_builder_.getInt8PtrTy()), + std::vector(args.size(), b_.getInt8PtrTy()), /*isVarArg=*/false); llvm::Function* kernel = llvm::Function::Create(kernel_type, llvm::GlobalValue::ExternalLinkage, @@ -224,9 +201,20 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( ++arg_it; kernel->addDereferenceableAttr(arg_no + 1, alloc->size()); + + const int64 alignment = [&] { + if (alloc->is_entry_computation_parameter()) { + return kEntryParameterAlignBytes; + } else if (alloc->is_constant()) { + return kConstantBufferAlignBytes; + } else { + return kXlaAllocatedBufferAlignBytes; + } + }(); + kernel->addParamAttr( - arg_no, llvm::Attribute::get(context, llvm::Attribute::Alignment, - kCudaMallocAlignBytes)); + arg_no, + llvm::Attribute::get(context, llvm::Attribute::Alignment, alignment)); if (alloc->IsPreallocatedTempBuffer()) { fn_arg->setName("temp_buf"); @@ -245,7 +233,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( nvvm_annotations_node->addOperand(llvm::MDNode::get( context, {llvm::ConstantAsMetadata::get(kernel), llvm::MDString::get(context, "kernel"), - llvm::ConstantAsMetadata::get(ir_builder_.getInt32(1))})); + llvm::ConstantAsMetadata::get(b_.getInt32(1))})); // Update the insert point to the entry basic block. llvm::BasicBlock* entry_bb = @@ -253,7 +241,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( // Emit a "return void" at entry_bb's end, and set the insert point before // that return instruction. 
- ir_builder_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb)); + b_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb)); return kernel; } @@ -281,6 +269,69 @@ int ComputeMaxUnrollFactor(const HloInstruction* hlo) { // Cannot unroll. return 1; } + +// Returns the llvm type for the indices used in the kernel that contains the +// hlo instruction. Such indices include the index for the parallel loop and +// the indices for the tensors accessed by the kernel. The return type is i32 +// iff the following conditions are met: +// . The launch_size of the kernel is within the range of i32. +// . The sizes of all the tensors accessed within the kernel are within the +// range of i32. +// Otherwise, the return type is i64. +llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size, + llvm::IRBuilder<>* b) { + // Find the unnested hlo instructon for which the kernel is generated for. + const HloInstruction* unnested_hlo = hlo; + const HloComputation* computation = hlo->parent(); + if (computation->IsFusionComputation()) { + unnested_hlo = computation->FusionInstruction(); + } + + auto shape_in_range = [&](const Shape& s) { + bool in_range = true; + ShapeUtil::ForEachSubshape( + s, [&](const Shape& sub_shape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(sub_shape) && + !IsInt32(ShapeUtil::ElementsIn(sub_shape))) { + in_range = false; + } + }); + + return in_range; + }; + + llvm::Type* i64_ty = b->getInt64Ty(); + // Check launch dimension + if (!IsInt32(launch_size)) { + return i64_ty; + } + + // Check the size of result tensors + if (!shape_in_range(unnested_hlo->shape())) { + return i64_ty; + } + + auto hlo_shape_in_range = [&](const HloInstruction* operand) -> bool { + return shape_in_range(operand->shape()); + }; + + // Check the size of input tensors + if (!absl::c_all_of(unnested_hlo->operands(), hlo_shape_in_range)) { + return i64_ty; + } + + // Check the size of the internal result tensors + if (unnested_hlo->opcode() 
== HloOpcode::kFusion) { + if (!absl::c_all_of( + unnested_hlo->fused_instructions_computation()->instructions(), + hlo_shape_in_range)) { + return i64_ty; + } + } + + return b->getInt32Ty(); +} + } // namespace Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) { @@ -290,21 +341,18 @@ Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) { unroll_factor = ComputeMaxUnrollFactor(hlo); } - thunk_sequence_->emplace_back(BuildKernelThunk(hlo, unroll_factor)); + thunk_sequence_->emplace_back(BuildKernelThunk( + hlo, /*implements_whole_instruction=*/true, unroll_factor)); return IrEmitter::DefaultAction(hlo); } Status IrEmitterUnnested::HandleDot(HloInstruction* dot) { - const DotDimensionNumbers& dnums = dot->dot_dimension_numbers(); - if (dnums.lhs_batch_dimensions_size() > 0 || - dnums.rhs_batch_dimensions_size() > 0) { - return Unimplemented("Dot with batch dimensions not implemented."); - } if (ImplementedAsGemm(*dot)) { thunk_sequence_->emplace_back(BuildGemmThunk(dot)); return Status::OK(); } - thunk_sequence_->emplace_back(BuildKernelThunk(dot)); + thunk_sequence_->emplace_back( + BuildKernelThunk(dot, /*implements_whole_instruction=*/true)); return IrEmitter::HandleDot(dot); } @@ -314,7 +362,8 @@ Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) { } Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) { - thunk_sequence_->emplace_back(BuildKernelThunk(convolution)); + thunk_sequence_->emplace_back( + BuildKernelThunk(convolution, /*implements_whole_instruction=*/true)); return IrEmitter::HandleConvolution(convolution); } @@ -336,7 +385,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { int64 feature_index_value = feature_index->literal().Get({}); thunk_sequence_->emplace_back( - MakeUnique( + absl::make_unique( /*operand=*/GetAllocationSlice(*custom_call->operand(0)), /*scale=*/GetAllocationSlice(*custom_call->operand(1)), 
/*offset=*/GetAllocationSlice(*custom_call->operand(2)), @@ -366,7 +415,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { auto output_mean = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); auto output_inv_stddev = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie(); thunk_sequence_->emplace_back( - MakeUnique( + absl::make_unique( /*operand=*/GetAllocationSlice(*custom_call->operand(0)), /*scale=*/GetAllocationSlice(*custom_call->operand(1)), /*offset=*/GetAllocationSlice(*custom_call->operand(2)), @@ -396,19 +445,20 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { auto output_grad_scale = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); auto output_grad_offset = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie(); - thunk_sequence_->emplace_back(MakeUnique( - /*operand=*/GetAllocationSlice(*custom_call->operand(0)), - /*scale=*/GetAllocationSlice(*custom_call->operand(1)), - /*mean=*/GetAllocationSlice(*custom_call->operand(2)), - /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)), - /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, - /*output_grad_data=*/output_grad_data, - /*output_grad_scale=*/output_grad_scale, - /*output_grad_offset=*/output_grad_offset, - /*output_tuple=*/GetAllocationSlice(*custom_call), - /*hlo=*/custom_call)); + thunk_sequence_->emplace_back( + absl::make_unique( + /*operand=*/GetAllocationSlice(*custom_call->operand(0)), + /*scale=*/GetAllocationSlice(*custom_call->operand(1)), + /*mean=*/GetAllocationSlice(*custom_call->operand(2)), + /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)), + /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)), + /*epsilon=*/epsilon_value, + /*feature_index=*/feature_index_value, + /*output_grad_data=*/output_grad_data, + /*output_grad_scale=*/output_grad_scale, + /*output_grad_offset=*/output_grad_offset, + 
/*output_tuple=*/GetAllocationSlice(*custom_call), + /*hlo=*/custom_call)); return Status::OK(); } @@ -423,19 +473,12 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - const HloInstruction* algorithm_inst = custom_call->operand(2); - CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString(); - int64 algorithm = algorithm_inst->literal().Get({}); - - const HloInstruction* tensor_ops_enabled_inst = custom_call->operand(3); - CHECK(tensor_ops_enabled_inst->IsConstant()) - << tensor_ops_enabled_inst->ToString(); - bool tensor_ops_enabled = tensor_ops_enabled_inst->literal().Get({}); - + TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, + custom_call->backend_config()); const auto& target = custom_call->custom_call_target(); std::unique_ptr thunk; if (target == kCudnnConvForwardCallTarget) { - thunk = MakeUnique( + thunk = absl::make_unique( CudnnConvKind::kForward, /*input_buffer=*/lhs_slice, /*filter_buffer=*/rhs_slice, @@ -446,9 +489,10 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); + custom_call->feature_group_count(), backend_config.algorithm(), + backend_config.tensor_ops_enabled(), custom_call); } else if (target == kCudnnConvBackwardInputCallTarget) { - thunk = MakeUnique( + thunk = absl::make_unique( CudnnConvKind::kBackwardInput, /*input_buffer=*/conv_result_slice, /*filter_buffer=*/rhs_slice, @@ -459,9 +503,10 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); 
+ custom_call->feature_group_count(), backend_config.algorithm(), + backend_config.tensor_ops_enabled(), custom_call); } else if (target == kCudnnConvBackwardFilterCallTarget) { - thunk = MakeUnique( + thunk = absl::make_unique( CudnnConvKind::kBackwardFilter, /*input_buffer=*/lhs_slice, /*filter_buffer=*/conv_result_slice, @@ -472,7 +517,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); + custom_call->feature_group_count(), backend_config.algorithm(), + backend_config.tensor_ops_enabled(), custom_call); } else { LOG(FATAL) << "Unexpected custom call target: " << custom_call->custom_call_target(); @@ -502,32 +548,45 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { switch (root->opcode()) { case HloOpcode::kTuple: case HloOpcode::kReduce: { + if (root->opcode() == HloOpcode::kReduce && + ShapeUtil::IsTuple(root->shape())) { + // TODO(b/112040122): Support variadic reduce. + return Unimplemented("Variadic reduce is not supported on GPU"); + } VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString(); std::vector> thunks; - ArraySlice reduces = + absl::Span output_instructions = root->opcode() == HloOpcode::kTuple ? root->operands() - : ArraySlice(&root, 1); + : absl::Span(&root, 1); // For multi-output fusion emit an initializer for each tuple element. // Otherwise it's sufficient to just initialize the single output. - for (int i = 0, e = reduces.size(); i != e; ++i) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr initializer_thunk, - BuildInitializerThunk( - fusion, reduces[i] == root ? 
ShapeIndex() : ShapeIndex({i}))); - thunks.push_back(std::move(initializer_thunk)); + HloInstruction* first_reduce = nullptr; + for (int i = 0, e = output_instructions.size(); i != e; ++i) { + if (output_instructions[i]->opcode() == HloOpcode::kReduce) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr initializer_thunk, + BuildInitializerThunk(fusion, output_instructions[i] == root + ? ShapeIndex() + : ShapeIndex({i}))); + thunks.push_back(std::move(initializer_thunk)); + first_reduce = + first_reduce == nullptr ? output_instructions[i] : first_reduce; + } } - thunks.push_back(BuildKernelThunk(fusion)); + CHECK(first_reduce != nullptr); + thunks.push_back( + BuildKernelThunk(fusion, /*implements_whole_instruction=*/false)); thunk_sequence_->emplace_back( - MakeUnique(std::move(thunks), fusion)); - std::vector parameter_arrays; + absl::make_unique(std::move(thunks), fusion)); + std::vector parameter_arrays; for (HloInstruction* operand : fusion->operands()) { parameter_arrays.push_back(GetIrArray(*operand, *fusion)); } GpuElementalIrEmitter elemental_emitter( - hlo_module_config_, ir_emitter_context_->llvm_module(), - &ir_builder_, GetNestedComputer()); + hlo_module_config_, ir_emitter_context_->llvm_module(), &b_, + GetNestedComputer()); FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter); TF_RETURN_IF_ERROR(root->Accept(&fused_emitter)); @@ -536,29 +595,49 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { // fusion is a special case of that. InlinedVector input_gens; InlinedVector init_value_gens; + std::vector> + extra_output_gens; InlinedVector reducers; - for (const HloInstruction* reduce : reduces) { - CHECK_EQ(HloOpcode::kReduce, reduce->opcode()); - // TODO(kramerb): CHECK that layouts are equal. Currently this - // breaks multioutputfusion_test. The test has pre-fused - // instructions, but layout_assignment will not assign any layouts - // for instructions inside of a fused computation. It just removes - // the layouts instead. 
- CHECK(ShapeUtil::Compatible(reduces[0]->shape(), reduce->shape())); - CHECK(ShapeUtil::Compatible(reduces[0]->operand(0)->shape(), - reduce->operand(0)->shape())); - CHECK(ShapeUtil::Compatible(reduces[0]->operand(1)->shape(), - reduce->operand(1)->shape())); - CHECK(reduces[0]->dimensions() == reduce->dimensions()); - input_gens.push_back(fused_emitter.GetGenerator(reduce->operand(0))); - init_value_gens.push_back( - fused_emitter.GetGenerator(reduce->operand(1))); - reducers.push_back(reduce->to_apply()); + InlinedVector reduce_output_shapes; + for (int i = 0, e = output_instructions.size(); i != e; ++i) { + const HloInstruction* inst = output_instructions[i]; + ShapeIndex output_shape_index; + if (root->opcode() == HloOpcode::kTuple) { + output_shape_index = {i}; + } + if (inst->opcode() == HloOpcode::kReduce) { + CHECK(IsReductionToVector(*inst)) + << "Only reductions to vector are supported"; + // Shapes, layouts and dimensions must be the same for all reduces + // inside of this fusion. + CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape())); + CHECK(ShapeUtil::Equal(first_reduce->operand(0)->shape(), + inst->operand(0)->shape())); + CHECK(ShapeUtil::Equal(first_reduce->operand(1)->shape(), + inst->operand(1)->shape())); + CHECK(first_reduce->dimensions() == inst->dimensions()); + input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0))); + init_value_gens.push_back( + fused_emitter.GetGenerator(inst->operand(1))); + reducers.push_back(inst->to_apply()); + reduce_output_shapes.push_back(std::move(output_shape_index)); + } else { + // For extra outputs we can relax shape equality to allow different + // types (with the same number of elements). Layouts still have to + // match. 
+ CHECK(ShapeUtil::CompatibleIgnoringElementType( + first_reduce->operand(0)->shape(), inst->shape())); + CHECK(LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(), + inst->shape().layout())); + extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst), + std::move(output_shape_index)); + } } - const Shape& input_shape = reduces[0]->operand(0)->shape(); - return EmitReductionToVector(reduces[0], input_shape, input_gens, - init_value_gens, reduces[0]->dimensions(), - reducers); + const Shape& input_shape = first_reduce->operand(0)->shape(); + return EmitReductionToVector(first_reduce, input_shape, input_gens, + init_value_gens, + first_reduce->dimensions(), reducers, + reduce_output_shapes, extra_output_gens); } default: LOG(FATAL) << "Bad opcode for input fusion: " @@ -572,21 +651,22 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { // touching the un-updated elements. // Set up kernel thunk and fused ir emitter. - thunk_sequence_->emplace_back(BuildKernelThunk(fusion)); - std::vector operand_arrays; + thunk_sequence_->emplace_back( + BuildKernelThunk(fusion, /*implements_whole_instruction=*/true)); + std::vector operand_arrays; for (HloInstruction* operand : fusion->operands()) { operand_arrays.push_back(GetIrArray(*operand, *fusion)); } GpuElementalIrEmitter elemental_emitter(hlo_module_config_, ir_emitter_context_->llvm_module(), - &ir_builder_, GetNestedComputer()); + &b_, GetNestedComputer()); // Shape of the dynamic-update-slice's "update" operand. Shape update_shape = root->operand(1)->shape(); // Array to write into. Because this is an in-place operation, this is the // same as operand 0's array. 
- llvm_ir::IrArray output_array = GetIrArray(*fusion, *fusion); + IrArray output_array = GetIrArray(*fusion, *fusion); LaunchDimensions launch_dimensions = CalculateLaunchDimensions( update_shape, ir_emitter_context_->device_description()); @@ -597,357 +677,69 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { return llvm_ir::EmitParallelFusedDynamicUpdateSliceInPlace( fusion, operand_arrays, output_array, &elemental_emitter, - launch_dimensions, &ir_builder_); + launch_dimensions, &b_); } + if (ImplementedAsGemm(*fusion)) { thunk_sequence_->emplace_back(BuildGemmThunk(fusion)); return Status::OK(); } - CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop); - int unroll_factor = ComputeMaxUnrollFactor(fusion); - - thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor)); - return IrEmitter::HandleFusion(fusion); -} - -namespace { - -// Returns the indices of the first elements of all consecutive subarrays of the -// given array. For example: -// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4} -std::vector ConsecutiveSegments(tensorflow::gtl::ArraySlice xs) { - std::vector is = {0}; - for (size_t i = 1; i < xs.size(); ++i) { - if (1 != xs[i] - xs[i - 1]) { - is.push_back(i); - } - } - return is; -} - -// Merges the sequences of dimensions of the given shape which start at the -// given indices `segs`. -Shape MergeDimensions(tensorflow::gtl::ArraySlice segs, - const Shape& shape) { - std::vector dimensions; - for (size_t i = 1; i <= segs.size(); ++i) { - dimensions.push_back(std::accumulate( - shape.dimensions().begin() + segs[i - 1], - shape.dimensions().begin() + - (segs.size() == i ? 
shape.dimensions().size() : segs[i]), - 1, std::multiplies())); - } - return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(), - dimensions); -} + CHECK_EQ(fusion->fusion_kind(), HloInstruction::FusionKind::kLoop); -// Returns whether the given shapes and permutation are a 0-2-1 transpose, and -// if so, the normalized and rank-reduced shapes. The shapes must have the same -// dimensions, so this considers layout only. -// -// This function recognizes higher-rank transposes which are elementwise -// equivalent to a 0-2-1 transpose. -std::tuple IsTranspose021(const Shape& a, const Shape& b) { - CHECK(ShapeUtil::Compatible(a, b)); - std::vector perm(a.dimensions().size()); - { - auto layout_a_orig = LayoutUtil::MinorToMajor(a); - std::vector layout_a(layout_a_orig.rbegin(), layout_a_orig.rend()); - auto layout_b_orig = LayoutUtil::MinorToMajor(b); - std::vector layout_b(layout_b_orig.rbegin(), layout_b_orig.rend()); - for (size_t i = 0; i < perm.size(); ++i) { - perm[i] = PositionInContainer(layout_b, layout_a[i]); - } + if (CheckAndEmitHloWithTile021(fusion)) { + return Status::OK(); } - auto segs = ConsecutiveSegments(perm); - Shape norm_a = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a); - Shape norm_b = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(b); - if (3 == segs.size() && 0 == perm[0]) { - Shape reduced_a = MergeDimensions(segs, norm_a); - Shape reduced_b = ShapeUtil::MakeShapeWithDescendingLayout( - b.element_type(), - Permute({0, 2, 1}, AsInt64Slice(reduced_a.dimensions()))); - return std::make_tuple(true, reduced_a, reduced_b); - } - return std::make_tuple(false, ShapeUtil::MakeNil(), ShapeUtil::MakeNil()); -} - -// Returns whether the given shapes are potentially of a 0-2-1 transpose. -// As 0-2-1 is a self-inverse permutation, which shape is input or output is -// arbitrary. 
-bool AreShapesForTranspose021(const Shape& a, const Shape& b) { - return 3 == b.dimensions().size() && - ShapeUtil::Compatible( - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a), - ShapeUtil::PermuteDimensions( - {0, 2, 1}, - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - b))); -} -// Emits a tiled 0-2-1 transpose, assuming both input and output lain out from -// major to minor. The x- and y- dimensions are tiled in square tiles of edge -// length `tile_size`. Each thread block of `tile_size` x `num_rows` threads -// transposes one tile: each thread copies a row from the input to a shared -// memory tile, then copies a column from the shared memory tile to the output. -// -// `tile_size` should usually be same as warp size. -// -// Returns (number of tiles = number of thread blocks needed). -// -// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient -// to launch fewer blocks so each transposes many tiles, and -// in any case, the number of blocks we can launch is limited. -// -// This is the same algorithm in CUDA: -// https://github.com/tensorflow/tensorflow/blob/d2693c8a70567cc78b2e8a9ac8020d321620ca83/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc#L189 -int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output, - const int64 tile_size, const int64 num_rows, - llvm::IRBuilder<>* builder) { - // Adds `addend` to the given `dim` of `index`. 
- auto offset_dim = [builder](llvm_ir::IrArray::Index index, - llvm::Value* addend, int64 dim) { - index[dim] = builder->CreateAdd(index[dim], addend); - return index; - }; - - CHECK(AreShapesForTranspose021(input.GetShape(), output.GetShape())); - - Shape input_shape = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - input.GetShape()); - Shape output_shape = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - output.GetShape()); - input = input.CastToShape(input_shape, builder); - output = output.CastToShape(output_shape, builder); - - llvm::Type* tile_type = llvm::ArrayType::get( - llvm::ArrayType::get(input.GetElementLlvmType(), tile_size), - // One extra here to avoid share memory bank conflict - tile_size + 1); - auto* tile = new llvm::GlobalVariable( - *builder->GetInsertBlock()->getParent()->getParent(), tile_type, - /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage, - llvm::UndefValue::get(tile_type), "tile", nullptr, - llvm::GlobalValue::NotThreadLocal, - /*AddressSpace=*/3 /* GPU shared memory */); - - // let x = threadIdx.x - llvm::Value* x = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder); - llvm_ir::AddRangeMetadata(0, num_rows * tile_size, - static_cast(x)); - x = builder->CreateIntCast(x, builder->getInt64Ty(), /*isSigned=*/true, - "thread.id.x"); - - // computing logical thread ids - // logical_x = x % tile_size - auto logical_x = builder->CreateURem(x, builder->getInt64(tile_size)); - - // logical_y = x / tile_size - auto logical_y = builder->CreateUDiv(x, builder->getInt64(tile_size)); - - // `emit_cp` emits equivalent to following pseudocode: - // if (tile_size == tile_width && tile_size == tile_height) { - // unroll for (i in range(0, tile_size, num_rows)) { - // emit_cp_element(index + {0, i, 0}, y + logical_y); - // } - // } else if (x < tile_width) { - // tile_height_upperbound = ceil(tile_height / num_rows) * num_rows; - // for (i in range(0, 
tile_height_upperbound, num_rows)) { - // y_loc = i + logical_y; - // if (y_loc < tile_height) - // emit_cp_element(index + {0, i, 0}, y_loc); - // } - // } - // - // We use this to emit both the copy from input to tile and the copy from tile - // to output. - // - // `index` is the origin of the row or column in the input or output array. - // - // `emit_cp_element(index, y)` emits code to copy a single element between the - // tile and the input or output array, where `y` is the `y`-position in the - // tile, whether which is row or column is a function of whether we're copying - // from input or to output, and `index` is the index into the input or output - // array. - auto emit_cp_tile = [builder, tile_size, &offset_dim, num_rows, logical_x, - logical_y]( - std::function - emit_cp_element, - llvm::Value* tile_width, llvm::Value* tile_height, - const llvm_ir::IrArray::Index& index, - const string& loop_name) { - llvm_ir::LlvmIfData if_not_last_row = llvm_ir::EmitIfThenElse( - builder->CreateAnd( - builder->CreateICmpEQ(builder->getInt64(tile_size), tile_width), - builder->CreateICmpEQ(builder->getInt64(tile_size), tile_height)), - "not_last_row", builder); - builder->SetInsertPoint(if_not_last_row.true_block->getTerminator()); - for (int64 i = 0; i < tile_size; i += num_rows) { - auto source_idx = offset_dim(index, builder->getInt64(i), /*dim=*/1); - auto y_loc = builder->CreateAdd(builder->getInt64(i), logical_y); - emit_cp_element(source_idx, y_loc); - } - builder->SetInsertPoint(if_not_last_row.false_block->getTerminator()); - llvm_ir::LlvmIfData if_in_tile = llvm_ir::EmitIfThenElse( - builder->CreateICmpULT(logical_x, tile_width), "x_in_tile", builder); - builder->SetInsertPoint(if_in_tile.true_block->getTerminator()); - - // tile_height_upper_bound = ceil(tile_height / num_rows) * num_rows - auto tile_height_upper_bound = builder->CreateMul( - builder->CreateUDiv( - builder->CreateAdd(tile_height, builder->getInt64(num_rows - 1)), - 
builder->getInt64(num_rows)), - builder->getInt64(num_rows)); - - auto loop = llvm_ir::ForLoop::EmitForLoop( - loop_name, builder->getInt64(0), tile_height_upper_bound, - builder->getInt64(num_rows), builder); - llvm_ir::SetToFirstInsertPoint(loop->GetHeaderBasicBlock(), builder); - builder->SetInsertPoint(loop->GetBodyBasicBlock()->getTerminator()); - - auto y_loc = builder->CreateAdd(loop->GetIndVarValue(), logical_y); - auto if_y_in_tile = llvm_ir::EmitIfThenElse( - builder->CreateICmpULT(y_loc, tile_height), "y_in_tile", builder); - builder->SetInsertPoint(if_y_in_tile.true_block->getTerminator()); - - emit_cp_element(offset_dim(index, loop->GetIndVarValue(), /*dim=*/1), - y_loc); - builder->SetInsertPoint(if_not_last_row.after_block->getTerminator()); - }; - - auto input_dims_in_tiles = input_shape.dimensions(); - // Unpermuted dimensions are untiled. - for (int i = 1; i < 3; ++i) { - input_dims_in_tiles[i] = - CeilOfRatio(input_dims_in_tiles[i], tile_size); - } - int64 num_tiles = - std::accumulate(input_dims_in_tiles.begin(), input_dims_in_tiles.end(), 1, - std::multiplies()); - const llvm_ir::IrArray::Index input_tile_index( - /*linear=*/builder->CreateIntCast( - llvm_ir::AddRangeMetadata( - 0, num_tiles, - static_cast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, - builder))), - builder->getInt64Ty(), /*isSigned=*/true, "block.id.x"), - ShapeUtil::MakeShapeWithDescendingLayout( - PRED /*arbitrary*/, AsInt64Slice(input_dims_in_tiles)), - builder); - const llvm_ir::IrArray::Index input_tile_origin = ({ - llvm_ir::IrArray::Index index = input_tile_index; - for (int i = 1; i < 3; ++i) { - index[i] = builder->CreateMul(index[i], builder->getInt64(tile_size), - "tile_origin." 
+ std::to_string(i)); - } - index; - }); - const llvm_ir::IrArray::Index input_index = - offset_dim(offset_dim(input_tile_origin, logical_x, /*dim=*/2), logical_y, - /*dim=*/1); - std::vector tile_dims(input_shape.dimensions().size()); - // Only last row or column may not have full size. - for (int i = 1; i < 3; ++i) { - tile_dims[i] = builder->CreateSelect( - builder->CreateICmpEQ(input_tile_index[i], - builder->getInt64(input_dims_in_tiles[i] - 1)), - builder->getInt64(input_shape.dimensions(i) - - (input_dims_in_tiles[i] - 1) * tile_size), - builder->getInt64(tile_size), "tile_size"); - } - - // Load data from input memory to shared memory tile. - emit_cp_tile( - // tile[y, x] = input_array[index] - [builder, tile, &input, logical_x](const llvm_ir::IrArray::Index& index, - llvm::Value* y) { - builder->CreateStore( - input.EmitReadArrayElement(index, builder, "input_element"), - builder->CreateGEP(tile, {builder->getInt64(0), y, logical_x})); - }, - tile_dims[2], tile_dims[1], input_index, "input"); + int unroll_factor = ComputeMaxUnrollFactor(fusion); - // Wait for all threads to reach this point, lest we copy a value from tile to - // output before the other thread copies it from input to tile. - // This is `__syncthreads` in CUDA. - llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, builder); - - const llvm_ir::IrArray::Index output_tile_index( - Permute({0, 2, 1}, input_tile_index.multidim())); - const llvm_ir::IrArray::Index output_tile_origin( - Permute({0, 2, 1}, input_tile_origin.multidim())); - const llvm_ir::IrArray::Index output_index = - offset_dim(offset_dim(output_tile_origin, logical_x, /*dim=*/2), - logical_y, /*dim=*/1); - - // Store data from shared memory tile to output memory. 
- emit_cp_tile( - // output_array[index] = tile[x, y] - [builder, tile, &output, logical_x](const llvm_ir::IrArray::Index& index, - llvm::Value* y) { - output.EmitWriteArrayElement( - index, - builder->CreateLoad( - builder->CreateGEP(tile, {builder->getInt64(0), logical_x, y}), - "output_element"), - builder); - }, - tile_dims[1], tile_dims[2], output_index, "output"); - - return num_tiles; + thunk_sequence_->emplace_back(BuildKernelThunk( + fusion, /*implements_whole_instruction=*/true, unroll_factor)); + return IrEmitter::HandleFusion(fusion); } -} // namespace - Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) { - if (ImplementedAsHostToDeviceMemcpy(ir_emitter_context_->buffer_assignment(), - *copy)) { - thunk_sequence_->emplace_back(BuildHostToDeviceCopyThunk(copy)); - return Status::OK(); - } - if (ImplementedAsDeviceToDeviceMemcpy( - ir_emitter_context_->buffer_assignment(), *copy)) { + CHECK(ShapeUtil::Compatible(copy->operand(0)->shape(), copy->shape())); + const BufferAssignment& buffer_assignment = + ir_emitter_context_->buffer_assignment(); + if (LayoutUtil::Equal(copy->operand(0)->shape().layout(), + copy->shape().layout()) && + buffer_assignment.GetUniqueTopLevelSlice(copy->operand(0)).ok()) { thunk_sequence_->emplace_back(BuildDeviceToDeviceCopyThunk(copy)); return Status::OK(); } - bool is_transpose_021; - Shape reduced_input_shape, reduced_output_shape; - std::tie(is_transpose_021, reduced_input_shape, reduced_output_shape) = - IsTranspose021(copy->operand(0)->shape(), copy->shape()); - if (is_transpose_021 && - reduced_input_shape.dimensions(1) >= kMinDimensionToTransposeTiled && - reduced_input_shape.dimensions(2) >= kMinDimensionToTransposeTiled) { - thunk_sequence_->emplace_back(BuildKernelThunk(copy)); - VLOG(3) << "Emitting tiled 0-2-1 transposition"; - constexpr int64 tile_size = 32; - constexpr int64 num_rows = 8; - int64 num_tiles = EmitTranspose021Tiled( - GetIrArray(*copy->operand(0), *copy) - .CastToShape(reduced_input_shape, 
&ir_builder_), - GetIrArray(*copy, *copy) - .CastToShape(reduced_output_shape, &ir_builder_), - tile_size, num_rows, &ir_builder_); - UpdateLaunchDimensions(LaunchDimensions(num_tiles, num_rows * tile_size), - LastThunk(), ir_emitter_context_->llvm_module()); + if (CheckAndEmitHloWithTile021(copy)) { return Status::OK(); } return IrEmitter::HandleCopy(copy); } +Status IrEmitterUnnested::EmitExtraOutputsForReduce( + const HloInstruction* reduce, const IrArray::Index& index, + absl::Span> + extra_output_gens) { + for (int i = 0; i != extra_output_gens.size(); ++i) { + const HloInstruction* output = reduce->parent()->FusionInstruction(); + llvm::Value* extra_output_address = + GetIrArray(*output, *output, extra_output_gens[i].second) + .EmitArrayElementAddress(index, &b_, + "extra_output_element_address"); + TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, + extra_output_gens[i].first(index)); + Store(extra_output_ir_value, extra_output_address); + } + return Status::OK(); +} + Status IrEmitterUnnested::EmitReductionToScalar( HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers) { + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens) { // Number of elements processed by a single thread. 
constexpr int64 kTileSize = 16; int64 num_elems = ShapeUtil::ElementsIn(input_shape); @@ -959,6 +751,18 @@ Status IrEmitterUnnested::EmitReductionToScalar( int64 num_tiles = RoundUpToNearest(CeilOfRatio(num_elems, kTileSize), kWarpSize); + Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( + reduce->shape().element_type(), {num_tiles}, {0}); + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + tiled_input_shape, ir_emitter_context_->device_description()); + + llvm::Type* index_ty = + GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_); + + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + // Check whether every thread will process a full tile's worth of elements // without reading outside the bounds of the input. If this is true, we can // skip some bounds checks in the final algorithm. @@ -995,112 +799,107 @@ Status IrEmitterUnnested::EmitReductionToScalar( // // RoundUpToNextMultipleOf(Ceil(num_elems / kTileSize), warpSize), // // // // and threads_per_block is a multiple of warpSize. - // reduce_kernel<<>>(); - // - auto loop_body_emitter = - [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + // reduce_kernel // + auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status { const int num_reduces = reducers.size(); llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); std::vector partial_reduction_result_addresses; for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( - element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); + llvm::Value* partial_reduction_result_address = + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." 
+ llvm::Twine(i)); TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](llvm_ir::IrArray::Index({}))); - ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); + init_value_gens[i](IrArray::Index(index_ty))); + Store(init_ir_value, partial_reduction_result_address); partial_reduction_result_addresses.push_back( partial_reduction_result_address); } llvm::Value* x_in_tiles = tile_index[0]; + x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); // Emit an inner for-loop that reduces the elements in the tile. auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status { std::unique_ptr tile_element_loop = - llvm_ir::ForLoop::EmitForLoop("element_id_in_tile", - ir_builder_.getInt64(0), - ir_builder_.getInt64(kTileSize), - ir_builder_.getInt64(1), &ir_builder_); + llvm_ir::ForLoop::EmitForLoop( + "element_id_in_tile", index_typed_constant(0), + index_typed_constant(kTileSize), index_typed_constant(1), &b_); // Emit the body of the partial reduction loop. llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), - &ir_builder_); - llvm::Value* x = ir_builder_.CreateNSWAdd( - ir_builder_.CreateNSWMul(x_in_tiles, ir_builder_.getInt64(kTileSize)), - tile_element_loop->GetIndVarValue()); + &b_); + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileSize)), + tile_element_loop->GetIndVarValue()); // Unless we know the tile is entirely in bounds, we have to emit a // x-in-bounds check before reading from the input. if (!tile_in_bounds) { llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - ir_builder_.CreateICmpULT(x, ir_builder_.getInt64(num_elems)), - "x_in_bounds", &ir_builder_); + ICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds", &b_); // Emit code that reads the input element and accumulates it to // the partial reduction result. 
- llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); } - llvm_ir::IrArray::Index input_index( - /*linear=*/x, input_shape, &ir_builder_); - llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type); + + IrArray::Index input_index( + /*linear=*/x, input_shape, &b_); + llvm::Value* input_address = Alloca(element_ir_type); for (int i = 0; i != num_reduces; ++i) { TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, input_gens[i](input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); + Store(input_ir_value, input_address); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], input_address}, partial_reduction_result_addresses[i])); } - return Status::OK(); + return EmitExtraOutputsForReduce(reduce, input_index, extra_output_gens); }; // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's // immediately beyond the tile. - llvm::Value* x_end = ir_builder_.CreateNSWAdd( - ir_builder_.getInt64(kTileSize), - ir_builder_.CreateNSWMul(x_in_tiles, ir_builder_.getInt64(kTileSize))); + llvm::Value* x_end = + NSWAdd(index_typed_constant(kTileSize), + NSWMul(x_in_tiles, index_typed_constant(kTileSize))); // The tile is entirely in bound if all_threads_in_bounds or // x_end <= num_elems. 
- llvm::Value* tile_in_bounds = ir_builder_.CreateOr( - ir_builder_.CreateICmpULE(x_end, ir_builder_.getInt64(num_elems)), - ir_builder_.getInt1(all_threads_in_bounds)); + llvm::Value* tile_in_bounds = + Or(ICmpULE(x_end, index_typed_constant(num_elems)), + b_.getInt1(all_threads_in_bounds)); llvm_ir::LlvmIfData if_tile_in_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, - &ir_builder_); + llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_); TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, - &ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, &b_); TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false)); // After the if-then-else statement on tile_in_bounds, emit calls to // shfl_down that accumulate the partial reduction results of all threads // from the warp. - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, - &ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, &b_); int bit_width = llvm_ir::GetSizeInBits(element_ir_type); // bitcast cannot be applied to aggregate types (even packed ones), so we // instead bitcast addresses of load/store to intN* of the same bit-width. llvm::Type* shuffle_ir_type = element_ir_type->isStructTy() - ? ir_builder_.getIntNTy(bit_width) + ? 
b_.getIntNTy(bit_width) : element_ir_type; for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1; shuffle_distance /= 2) { - llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca( - element_ir_type, nullptr, "result_from_other_lane"); + llvm::Value* result_from_other_lane = + Alloca(element_ir_type, nullptr, "result_from_other_lane"); for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( - ir_builder_.CreateBitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); - ir_builder_.CreateStore( - EmitShuffleDown(partial_reduction_result, - ir_builder_.getInt32(shuffle_distance), - &ir_builder_), - ir_builder_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); + llvm::Value* partial_reduction_result = + Load(BitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); + CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0) + << "Requires block size a multiple of the warp size, otherwise we " + "will read undefined elements."; + Store(EmitFullWarpShuffleDown(partial_reduction_result, + b_.getInt32(shuffle_distance), &b_), + BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], result_from_other_lane}, @@ -1114,28 +913,22 @@ Status IrEmitterUnnested::EmitReductionToScalar( // Emit an atomic operation that accumulates the partial reduction result of // lane 0 (which holds the partially accumulated result for its warp) to the // output element. 
- llvm::Value* lane_id = ir_builder_.CreateURem( - x_in_tiles, ir_builder_.getInt64(kWarpSize), "lane_id"); + llvm::Value* lane_id = + URem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id"); llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - ir_builder_.CreateICmpEQ(lane_id, ir_builder_.getInt64(0)), - "lane_id_is_zero", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, - &ir_builder_); + ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); + llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); for (int i = 0; i != num_reduces; ++i) { - ShapeIndex output_shape_index; - if (output->IsMultiOutputFusion()) { - output_shape_index = {i}; - } llvm::Value* output_address = - GetIrArray(*output, *output, output_shape_index) + GetIrArray(*output, *output, reduce_output_shapes[i]) .EmitArrayElementAddress( - llvm_ir::IrArray::Index( - /*linear=*/ir_builder_.getInt64(0), + IrArray::Index( + /*linear=*/b_.getInt64(0), ShapeUtil::GetSubshape(output->shape(), - output_shape_index), - &ir_builder_), - &ir_builder_, "output_element_address"); + reduce_output_shapes[i]), + &b_), + &b_, "output_element_address"); TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( *reducers[i], output_address, partial_reduction_result_addresses[i])); } @@ -1143,27 +936,26 @@ Status IrEmitterUnnested::EmitReductionToScalar( }; // Emit a parallel loop that iterates through all input tiles, one per thread. 
- Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( - reduce->shape().element_type(), {num_tiles}, {0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); CHECK(LastThunk()->kind() == Thunk::Kind::kSequential); UpdateLaunchDimensions( launch_dimensions, static_cast(LastThunk())->thunks().back().get(), ir_emitter_context_->llvm_module()); return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &ir_builder_) - .EmitLoop(IrName(reduce)); + launch_dimensions, &b_) + .EmitLoop(IrName(reduce), index_ty); } Status IrEmitterUnnested::EmitColumnReduction( int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers) { - // Divide the input matrix into tiles of size Kx1. For example, when the - // input matrix is 4x4 and K=2, the tiled matrix looks like + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens) { + // Divide the input matrix into tiles of size KxL. For example, when the + // input matrix is 4x4, K=2, and L=1 the tiled matrix looks like // // 0123 // 0123 @@ -1175,85 +967,129 @@ Status IrEmitterUnnested::EmitColumnReduction( // // We choose 128 as the tile size based on empirical evidence. It's big enough // to reduce the amount of atomic adds in the end, maximizing the memory - // bandwidth. - constexpr int64 kTileSize = 128; + // bandwidth. A tile width of 2 allows for high memory bandwidth utilization + // on 16b input data. + constexpr int64 kTileHeight = 128; + constexpr int64 kTileWidth = 2; - // If the height is not a multiple of the tile size, we pad the bottom of the + // If the height is not a multiple of kTileHeight, we pad the bottom of the // input matrix. 
- const int64 height_in_tiles = CeilOfRatio(height, kTileSize); + const int64 height_in_tiles = CeilOfRatio(height, kTileHeight); + // If width is not a multiple of kTileWidth the rightmost thread will process + // fewer input elements. + const int64 width_in_tiles = CeilOfRatio(width, kTileWidth); + Shape tiled_input_shape = + ShapeUtil::MakeShapeWithLayout(reduce->shape().element_type(), + {height_in_tiles, width_in_tiles}, {1, 0}); + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + tiled_input_shape, ir_emitter_context_->device_description()); + + // TODO(b/110211620): Convert to use i32 index_type when it is possible. + llvm::Type* index_ty = b_.getInt64Ty(); + + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x; - // linear_index < height_in_tiles * width; + // linear_index < height_in_tiles * width_in_tiles; // linear_index += blockDim.x * gridDim.x) { - // y_in_tiles = linear_index / width; - // x = linear_index % width; + // y_in_tiles = linear_index / width_in_tiles; + // x_in_tiles = linear_index % width_in_tiles; // - // partial_result = init_value; - // if (height % kTileSize == 0 || - // y_in_tiles * kTileSize + kTileSize <= height) { - // for (element_id_in_tile : range(kTileSize)) { - // y = y_in_tiles * kTileSize + element_id_in_tile; - // partial_result = Reducer(partial_result, input[y][x]); + // partial_results[kTileWidth] = init_values; + // tile_in_y_bounds = height % kTileHeight == 0 || + // y_in_tiles * kTileHeight + kTileHeight <= height; + // tile_in_x_bounds = width % kTileWidth == 0 || + // x_in_tiles * kTileWidth + kTileWidth <= width; + // // The implementation handles y and x bound checks separately. 
+ // if (tile_in_y_bounds && tile_in_x_bounds) { + // for (y_offset : range(kTileHeight)) { + // y = y_in_tiles * kTileHeight + y_offset; + // for (x_offset : range(kTileWidth)) { + // x = x_in_tiles * kTileWidth + x_offset; + // partial_result[x_offset] = Reducer(partial_result[x_offset], input[y][x]); + // } // } // } else { - // for (element_id_in_tile : range(kTileSize)) { - // y = y_in_tiles * kTileSize + element_id_in_tile; - // if (y < height) { - // partial_result = Reducer(partial_result, input[y][x]); + // for (y_offset : range(kTileHeight)) { + // y = y_in_tiles * kTileHeight + y_offset; + // for (x_offset : range(kTileWidth)) { + // x = x_in_tiles * kTileWidth + x_offset; + // if (y < height && x < width) { + // partial_result[x_offset] = Reducer(partial_result[x_offset], input[y][x]); + // } // } // } // } - // AtomicReducer(&output[x], partial_result); + // for (x_offset : range(kTileWidth)) { + // AtomicReducer(&output[x_in_tiles * kTileWidth + x_offset], partial_result[x_offset]); + // } // } - auto loop_body_emitter = - [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status { const int num_reduces = reducers.size(); // Emit the loop body that reduces one tile. llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); std::vector partial_reduction_result_addresses; for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( - element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result."
+ llvm::Twine(i)); - TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](llvm_ir::IrArray::Index({}))); - ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); - partial_reduction_result_addresses.push_back( - partial_reduction_result_address); + for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { + llvm::Value* partial_reduction_result_address = + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + + llvm::Twine(i * kTileWidth + x_offset)); + TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, + init_value_gens[i](IrArray::Index(index_ty))); + Store(init_ir_value, partial_reduction_result_address); + partial_reduction_result_addresses.push_back( + partial_reduction_result_address); + } } // Emit an inner for-loop that partially reduces the elements in the given // tile. llvm::Value* y_in_tiles = tile_index[0]; - llvm::Value* x = tile_index[1]; + llvm::Value* x_in_tiles = tile_index[1]; - auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status { + y_in_tiles = ZExtOrTrunc(y_in_tiles, index_ty); + x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty); + + auto emit_tile_element_loop = [=](bool tile_in_y_bounds, + bool tile_in_x_bounds) -> Status { std::unique_ptr tile_element_loop = - llvm_ir::ForLoop::EmitForLoop("element_id_in_tile", - ir_builder_.getInt64(0), - ir_builder_.getInt64(kTileSize), - ir_builder_.getInt64(1), &ir_builder_); + llvm_ir::ForLoop::EmitForLoop( + "element_id_in_tile", index_typed_constant(0), + index_typed_constant(kTileHeight), index_typed_constant(1), &b_); // Emit the body of the partial reduction loop. 
llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), - &ir_builder_); - llvm::Value* y = ir_builder_.CreateNSWAdd( - ir_builder_.CreateNSWMul(y_in_tiles, ir_builder_.getInt64(kTileSize)), - tile_element_loop->GetIndVarValue()); - // Unless we know the tile is entirely in bounds, we have to emit a - // y-in-bounds check before reading from the input. - if (!tile_in_bounds) { + &b_); + llvm::Value* y = + NSWAdd(NSWMul(y_in_tiles, index_typed_constant(kTileHeight)), + tile_element_loop->GetIndVarValue()); + + // Unless we know that y is in bounds, we have to emit a check before + // reading from the input. + if (!tile_in_y_bounds) { llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - ir_builder_.CreateICmpULT(y, ir_builder_.getInt64(height)), - "y_in_bounds", &ir_builder_); + ICmpULT(y, index_typed_constant(height)), "y_in_bounds", &b_); // Emit code that reads the input element and accumulates it to // the partial reduction result. - llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); } - llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type); - { + for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), + index_typed_constant(x_offset)); + // Unless we know that x is in bounds, we have to emit a check before + // reading from the input. + if (!tile_in_x_bounds) { + llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( + ICmpULT(x, index_typed_constant(width)), "x_in_bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_); + } + llvm::Value* input_address = Alloca(element_ir_type); // {y,x} is an index to input_matrix_shape [height,width]. We need to // convert that to an index to input_shape (the shape of the operand of // "reduce"). 
This conversion is composed of a transposition from @@ -1269,97 +1105,145 @@ Status IrEmitterUnnested::EmitColumnReduction( const Shape input_matrix_shape = ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(), {height, width}); - const llvm_ir::IrArray::Index input_matrix_index( - {y, x}, input_matrix_shape, &ir_builder_); - const llvm_ir::IrArray::Index input_index = + const IrArray::Index input_matrix_index({y, x}, input_matrix_shape, + &b_); + const IrArray::Index input_index = input_matrix_index .SourceIndexOfReshape(input_matrix_shape, - normalized_input_shape, &ir_builder_) + normalized_input_shape, &b_) .SourceIndexOfTranspose(normalized_input_shape, input_shape, - transpose_dimension_mapping, - &ir_builder_); + transpose_dimension_mapping, &b_); for (int i = 0; i != num_reduces; ++i) { TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, input_gens[i](input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); + Store(input_ir_value, input_address); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], - {partial_reduction_result_addresses[i], input_address}, - partial_reduction_result_addresses[i])); + {partial_reduction_result_addresses[i * kTileWidth + x_offset], + input_address}, + partial_reduction_result_addresses[i * kTileWidth + x_offset])); + TF_RETURN_IF_ERROR(EmitExtraOutputsForReduce(reduce, input_index, + extra_output_gens)); } - return Status::OK(); } + return Status::OK(); }; - // y_end = kTileSize + y_in_tiles * kTileSize, i.e., the y location that's - // immediately beyond the tile. 
- llvm::Value* y_end = ir_builder_.CreateNSWAdd( - ir_builder_.getInt64(kTileSize), - ir_builder_.CreateNSWMul(y_in_tiles, ir_builder_.getInt64(kTileSize))); - llvm::Value* tile_in_bounds = ir_builder_.CreateOr( - ir_builder_.CreateICmpULE(y_end, ir_builder_.getInt64(height)), - ir_builder_.getInt1(height % kTileSize == 0)); - // The tile is entirely in bound if "height" is a multiple of kTileSize or + // y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location + // that's immediately beyond the tile. + llvm::Value* y_end = + NSWAdd(index_typed_constant(kTileHeight), + NSWMul(y_in_tiles, index_typed_constant(kTileHeight))); + // x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location + // that's immediately beyond the tile. + llvm::Value* x_end = + NSWAdd(index_typed_constant(kTileWidth), + NSWMul(x_in_tiles, index_typed_constant(kTileWidth))); + llvm::Value* tile_in_y_bounds = + Or(ICmpULE(y_end, index_typed_constant(height)), + b_.getInt1(height % kTileHeight == 0)); + llvm::Value* tile_in_x_bounds = + Or(ICmpULE(x_end, index_typed_constant(width)), + b_.getInt1(width % kTileWidth == 0)); + // The tile is in y bounds if "height" is a multiple of kTileHeight or // y_end <= height. - llvm_ir::LlvmIfData if_tile_in_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, - &ir_builder_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true)); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, - &ir_builder_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false)); - - // After the if-then-else statement on tile_in_bounds, emit atomic - // operations to accumulate the partial reduction result to the output - // element. 
- llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, - &ir_builder_); + llvm_ir::LlvmIfData if_tile_in_y_bounds_data = + llvm_ir::EmitIfThenElse(tile_in_y_bounds, "tile_in_y_bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block, &b_); + // The tile is in x bounds if "width" is a multiple of kTileWidth or + // x_end <= width. + llvm_ir::LlvmIfData if_tile_in_x_bounds_data = + llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_); + TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true, + /*tile_in_x_bounds=*/true)); + llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_); + TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true, + /*tile_in_x_bounds=*/false)); + llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block, &b_); + if_tile_in_x_bounds_data = + llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_); + TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false, + /*tile_in_x_bounds=*/true)); + llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_); + TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false, + /*tile_in_x_bounds=*/false)); + + // After the nested if-then-else statement on tile_in_y_bounds and + // tile_in_x_bounds, emit atomic operations to accumulate the partial + // reduction result to the output element. + llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block, &b_); const HloInstruction* output = reduce->IsFused() ? 
reduce->parent()->FusionInstruction() : reduce; for (int i = 0; i != num_reduces; ++i) { - ShapeIndex output_shape_index; - if (output->IsMultiOutputFusion()) { - output_shape_index = {i}; + for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) { + llvm::Value* x = + NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)), + index_typed_constant(x_offset)); + llvm::Value* output_address = + GetIrArray(*output, *output, reduce_output_shapes[i]) + .EmitArrayElementAddress( + IrArray::Index( + x, + ShapeUtil::GetSubshape(output->shape(), + reduce_output_shapes[i]), + &b_), + &b_, "output_element_address"); + TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, + partial_reduction_result_addresses[i * kTileWidth + x_offset])); } - llvm::Value* output_address = - GetIrArray(*output, *output, output_shape_index) - .EmitArrayElementAddress( - llvm_ir::IrArray::Index( - x, - ShapeUtil::GetSubshape(output->shape(), - output_shape_index), - &ir_builder_), - &ir_builder_, "output_element_address"); - TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, partial_reduction_result_addresses[i])); } return Status::OK(); }; // Emit a parallel loop that iterate through all input tiles. 
- Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( - reduce->shape().element_type(), {height_in_tiles, width}, {1, 0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); CHECK(LastThunk()->kind() == Thunk::Kind::kSequential); UpdateLaunchDimensions( launch_dimensions, static_cast(LastThunk())->thunks().back().get(), ir_emitter_context_->llvm_module()); return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &ir_builder_) - .EmitLoop(IrName(reduce)); + launch_dimensions, &b_) + .EmitLoop(IrName(reduce), index_ty); +} + +static std::pair ComputeTilingSchemeForReduction( + int64 depth, int64 width, int64 kWarpSize) { + constexpr int64 kTargetNumElementsPerThread = 64; + int64 x_tile_size = kTargetNumElementsPerThread; + int64 z_tile_size = 1; + + // Only tile along the x dimension with tile size kTargetNumElementsPerThread + // if doing so doesn't require a slow version of loop with bound check on each + // dimension. A more sophisticated heuristics is to enable tile along the + // x dimension with tile size kTargetNumElementsPerThread when either width is + // a factor of (kWarpSize * kTargetNumElementsPerThread) or width is big + // enough so that only a small fraction of the threads execute the slow + // version of loop with bound check. 
+ if (width % (kWarpSize * kTargetNumElementsPerThread) != 0) { + x_tile_size = 8; + z_tile_size = 8; + while (depth % z_tile_size != 0) { + z_tile_size -= 1; + } + } + + return std::pair(x_tile_size, z_tile_size); } Status IrEmitterUnnested::EmitRowReduction( int64 depth, int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers) { + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens) { // A naive algorithm is: - // 1. Divide the input tensor into tiles of size 1x1xK. + // 1. Divide the x dimension of the input tensor into tiles of size 1x1xX. // 2. Partially reduces each tile to a scalar using one thread. // 3. Accumulates that scalar to the output vector using atomic operations. // @@ -1370,15 +1254,15 @@ Status IrEmitterUnnested::EmitRowReduction( // int y = linear_index / width_in_tiles % height; // int z = linear_index / (height * width_in_tiles); // float partial_result = 0; - // for (element_id_in_tile : range(kTileSize)) { - // int x = x_in_tiles * kTileSize + element_id_in_tile; + // for (element_id_in_tile : range(x_tile_size)) { + // int x = x_in_tiles * x_tile_size + element_id_in_tile; // if (x < width) - // partial_result = reducer(partial_result, input[z][y][z]); + // partial_result = reducer(partial_result, input[z][y][x]); // } // AtomicReducer(&output[y], partial_result); // } // - // Three optimizations are performed. + // Four optimizations are performed. // // 1. To coalesce global memory accesses, dilate the tile with a factor of 32 // (i.e. the warp size). For example, suppose the width is 8x32=256. Instead @@ -1405,29 +1289,46 @@ Status IrEmitterUnnested::EmitRowReduction( // element_id_in_tile, which makes the code more friendly to optimizations // such as LICM. // + // 4. 
When the width is too small and x_tile_size is less than the target + // number of elements per thread and use a small factor of depth as + // z_tile_size to increase the number of elements calculated by each + // partial sum. This can reduce the needed number of dynamic shfl_down and + // atomic operations. + // // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x; // linear_index < depth * height * width_in_tiles; // linear_index += blockDim.x * gridDim.x) { // int x_in_tiles = linear_index % width_in_tiles; // int y = linear_index / width_in_tiles % height; - // int z = linear_index / (height * width_in_tiles); + // int z_in_tiles = linear_index / (height * width_in_tiles); // int warp_id = x_in_tiles / warpSize; // int lane_id = x_in_tiles % warpSize; // float partial_result = 0; // int x = warp_id * kTileSize * warpSize + lane_id; - // if (width % (kTileSize * warpSize) == 0 || - // x + (kTileSize - 1) * warpSize < width) { - // // The entire tile is in bounds. - // for (int element_id_in_tile = 0; element_id_in_tile < kTileSize; - // ++element_id_in_tile, x += warpSize) { - // partial_result = Reducer(partial_result, input[z][y][x]); + // if (width % (x_tile_size * warpSize) == 0 || + // x + (x_tile_size - 1) * warpSize < width) { + // // The entire x_tile is in bounds. + // for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size; + // ++element_id_in_z_tile) { + // z = z_in_tiles * z_tile_size + element_id_in_z_tile; + // int tx = x; + // for (int element_id_in_x_tile = 0; + // element_id_in_x_tile < x_tile_size; + // ++element_id_in_x_tile, tx += warpSize) { + // partial_result = Reducer(partial_result, input[z][y][tx]); + // } // } // } else { // // The tile is partially in bounds. 
- // for (int element_id_in_tile = 0; element_id_in_tile < kTileSize; - // ++element_id_in_tile, x += warpSize) { - // if (x < width) - // partial_result = Reducer(partial_result, input[z][y][x]); + // for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size; + // ++element_id_in_z_tile) { + // z = z_in_tiles * z_tile_size + element_id_in_z_tile; + // int tx = x; + // for (int element_id_in_x_tile = 0; element_id_in_x_tile < + // x_tile_size; ++element_id_in_tile, tx += warpSize) { + // if (tx < width) + // partial_result = Reducer(partial_result, input[z][y][tx]); + // } // } // } // for (shuffle_distance = 16; shuffle_distance > 0; shuffle_distance /= 2) @@ -1438,162 +1339,195 @@ Status IrEmitterUnnested::EmitRowReduction( // AtomicReducer(&output[y], partial_result); // } // - // Choose 8 as the tile size, which matches Eigen's RowReduceKernel. - constexpr int64 kTileSize = 8; + + int64 x_tile_size; + int64 z_tile_size; + std::tie(x_tile_size, z_tile_size) = + ComputeTilingSchemeForReduction(depth, width, kWarpSize); + // Round the width in tiles up to the nearest multiple of kWarpSize, so that // the use of shfl_down is valid. 
const int64 width_in_tiles = - RoundUpToNearest(CeilOfRatio(width, kTileSize), kWarpSize); + RoundUpToNearest(CeilOfRatio(width, x_tile_size), kWarpSize); + Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( + reduce->shape().element_type(), + {depth / z_tile_size, height, width_in_tiles}, {2, 1, 0}); + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + tiled_input_shape, ir_emitter_context_->device_description()); + llvm::Type* index_ty = + GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_); - auto loop_body_emitter = - [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + + auto loop_body_emitter = [=](const IrArray::Index& tile_index) { const int num_reduces = reducers.size(); - // Emit the loop body that reduces one tile. llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType( input_shape.element_type(), ir_emitter_context_->llvm_module()); std::vector partial_reduction_result_addresses; for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( - element_ir_type, /*ArraySize=*/nullptr, - "partial_reduction_result." + llvm::Twine(i)); + llvm::Value* partial_reduction_result_address = + Alloca(element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(i)); TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, - init_value_gens[i](llvm_ir::IrArray::Index({}))); - ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); + init_value_gens[i](IrArray::Index(index_ty))); + Store(init_ir_value, partial_reduction_result_address); partial_reduction_result_addresses.push_back( partial_reduction_result_address); } - // Emit an inner for-loop that partially reduces the elements in the given - // tile. 
- llvm::Value* z = tile_index[0]; + llvm::Value* z_tile = tile_index[0]; llvm::Value* y = tile_index[1]; llvm::Value* x_tile = tile_index[2]; - llvm::Value* warp_id = ir_builder_.CreateUDiv( - x_tile, ir_builder_.getInt64(kWarpSize), "warp_id"); - llvm::Value* lane_id = ir_builder_.CreateURem( - x_tile, ir_builder_.getInt64(kWarpSize), "lane_id"); - - // The x-location of the last element in this tile. - // last_x = lane_id + warpSize * (kTileSize - 1 + warp_id * kTileSize); - llvm::Value* last_x = ir_builder_.CreateNSWAdd( - lane_id, - ir_builder_.CreateNSWMul( - ir_builder_.getInt64(kWarpSize), - ir_builder_.CreateNSWAdd( - ir_builder_.getInt64(kTileSize - 1), - ir_builder_.CreateNSWMul(warp_id, - ir_builder_.getInt64(kTileSize))))); - auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status { - std::unique_ptr tile_element_loop = - llvm_ir::ForLoop::EmitForLoop("element_id_in_tile", - ir_builder_.getInt64(0), - ir_builder_.getInt64(kTileSize), - ir_builder_.getInt64(1), &ir_builder_); + x_tile = ZExtOrTrunc(x_tile, index_ty); - // Emit the body of the partial reduction loop. - llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), - &ir_builder_); - // x = lane_id + warpSize * (element_id_in_tile + warp_id * kTileSize); - llvm::Value* x = ir_builder_.CreateNSWAdd( - lane_id, - ir_builder_.CreateNSWMul( - ir_builder_.getInt64(kWarpSize), - ir_builder_.CreateNSWAdd( - tile_element_loop->GetIndVarValue(), - ir_builder_.CreateNSWMul(warp_id, - ir_builder_.getInt64(kTileSize))))); + llvm::Value* warp_id = + UDiv(x_tile, index_typed_constant(kWarpSize), "warp_id"); + llvm::Value* lane_id = + URem(x_tile, index_typed_constant(kWarpSize), "lane_id"); - // Unless we know the tile is entirely in bounds, we have to emit a - // x-in-bounds check before reading from the input. 
- if (!tile_in_bounds) { - llvm_ir::LlvmIfData if_x_in_bounds_data = llvm_ir::EmitIfThenElse( - ir_builder_.CreateICmpULT(x, ir_builder_.getInt64(width)), - "x_in_bounds", &ir_builder_); - - // Points ir_builder_ to the then-block. - llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block, - &ir_builder_); - } - - // Emit code that reads the input element and accumulates it to the - // partial reduction result. - llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type); - { - // {z,y,x} is an index to input_3d_tensor_shape [depth,height,width]. We - // need to convert that to an index to input_shape (the shape of the - // operand of "reduce"). This conversion is composed of a transposition - // from input_shape to normalized_input_shape and a reshape from - // normalized_input_shape to input_3d_tensor_shape. - const Shape normalized_input_shape = - ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( - input_shape); - auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape); - const std::vector transpose_dimension_mapping( - input_shape_min2maj.rbegin(), input_shape_min2maj.rend()); - const Shape input_3d_tensor_shape = - ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(), - {depth, height, width}); - const llvm_ir::IrArray::Index input_3d_tensor_index( - {z, y, x}, input_3d_tensor_shape, &ir_builder_); - const llvm_ir::IrArray::Index input_index = - input_3d_tensor_index - .SourceIndexOfReshape(input_3d_tensor_shape, - normalized_input_shape, &ir_builder_) - .SourceIndexOfTranspose(normalized_input_shape, input_shape, - transpose_dimension_mapping, - &ir_builder_); - for (int i = 0; i != num_reduces; ++i) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, - input_gens[i](input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducers[i], - {partial_reduction_result_addresses[i], input_address}, - 
partial_reduction_result_addresses[i])); - } + // The x-location of the last element in this z-x-tile. + // last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size); + llvm::Value* last_x = NSWAdd( + lane_id, + NSWMul(index_typed_constant(kWarpSize), + NSWAdd(index_typed_constant(x_tile_size - 1), + NSWMul(warp_id, index_typed_constant(x_tile_size))))); + + KernelSupportLibrary ksl( + &b_, + /*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll, + /*prevent_vectorization=*/false); + + // Emit a for-loop that partially reduces the elements in the given + // z-x-tile. + auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds, + int64 x_tile_loop_bound) -> Status { + auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status { + llvm::Value* z = + NSWAdd(z_indvar, NSWMul(index_typed_constant(z_tile_size), z_tile)); + TF_RETURN_IF_ERROR(ksl.For( + "x_tile", + /*start=*/index_typed_constant(0), + /*end=*/index_typed_constant(x_tile_loop_bound), + /*step=*/1, [&](llvm::Value* x_indvar) -> Status { + // x = lane_id + + // warpSize * (element_id_in_x_tile + warp_id * x_tile_size); + llvm::Value* x = NSWAdd( + lane_id, + NSWMul(index_typed_constant(kWarpSize), + NSWAdd(x_indvar, + NSWMul(warp_id, llvm::ConstantInt::get( + index_ty, x_tile_size))))); + + // Unless we know the x-tile is entirely in bounds, we have to + // emit a x-in-bounds check before reading from the input. + if (!x_tile_in_bounds) { + llvm_ir::LlvmIfData if_x_in_bounds_data = + llvm_ir::EmitIfThenElse( + ICmpULT(x, index_typed_constant(width)), "x_in_bounds", + &b_); + // Points b_ to the then-block. + llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block, + &b_); + } + + // Emit code that reads the input element and accumulates it + // to the partial reduction result. + llvm::Value* input_address = Alloca(element_ir_type); + { + // {z,y,x} is an index to input_3d_tensor_shape + // [depth,height,width]. 
We need to convert that to an index + // to input_shape (the shape of the operand of "reduce"). + // This conversion is composed of a transposition from + // input_shape to normalized_input_shape and a reshape from + // normalized_input_shape to input_3d_tensor_shape. + const Shape normalized_input_shape = ShapeUtil:: + MakeShapeWithDescendingLayoutAndSamePhysicalLayout( + input_shape); + auto input_shape_min2maj = + LayoutUtil::MinorToMajor(input_shape); + const std::vector transpose_dimension_mapping( + input_shape_min2maj.rbegin(), input_shape_min2maj.rend()); + const Shape input_3d_tensor_shape = + ShapeUtil::MakeShapeWithDescendingLayout( + input_shape.element_type(), {depth, height, width}); + const IrArray::Index input_3d_tensor_index( + {z, y, x}, input_3d_tensor_shape, &b_); + const IrArray::Index input_index = + input_3d_tensor_index + .SourceIndexOfReshape(input_3d_tensor_shape, + normalized_input_shape, &b_) + .SourceIndexOfTranspose( + normalized_input_shape, input_shape, + transpose_dimension_mapping, &b_); + + for (int i = 0; i != num_reduces; ++i) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, + input_gens[i](input_index)); + Store(input_ir_value, input_address); + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], input_address}, + partial_reduction_result_addresses[i])); + } + return EmitExtraOutputsForReduce(reduce, input_index, + extra_output_gens); + } + })); return Status::OK(); - } - }; + }; - llvm::Value* tile_in_bounds = ir_builder_.CreateOr( - ir_builder_.getInt1(width % (kTileSize * kWarpSize) == 0), - ir_builder_.CreateICmpULT(last_x, ir_builder_.getInt64(width))); - llvm_ir::LlvmIfData if_tile_in_bounds_data = - llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, - &ir_builder_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true)); - 
llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, - &ir_builder_); - TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false)); + return ksl.For("z_tile", + /*start=*/index_typed_constant(0), + /*end=*/index_typed_constant(z_tile_size), + /*step=*/1, emit_z_tile_element_loop); + }; - // After the if-then-else statement on tile_in_bounds, emit calls to - // shfl_down that accumulate the partial reduction results of all threads - // from the warp. - llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, - &ir_builder_); + llvm::Value* tile_in_bounds = + Or(b_.getInt1(width % (x_tile_size * kWarpSize) == 0), + ICmpULT(last_x, index_typed_constant(width))); + + TF_RETURN_IF_ERROR( + ksl.If(tile_in_bounds, + /*true_block_generator=*/ + [&]() -> Status { + return emit_z_x_tile_element_loop(/*x_tile_in_bounds=*/true, + x_tile_size); + }, + /*false_block_generator=*/ + [&]() -> Status { + return emit_z_x_tile_element_loop( + /*x_tile_in_bounds=*/false, + CeilOfRatio(width % (x_tile_size * kWarpSize), kWarpSize)); + })); + + // After accumulating the elements of the z_x_tile, emit calls to + // shfl_down that accumulate the partial reduction results of all + // threads in a warp. int bit_width = llvm_ir::GetSizeInBits(element_ir_type); // bitcast cannot be applied to aggregate types (even packed ones), so we // instead bitcast addresses of load/store to intN* of the same bit-width. llvm::Type* shuffle_ir_type = element_ir_type->isStructTy() - ? ir_builder_.getIntNTy(bit_width) + ? 
b_.getIntNTy(bit_width) : element_ir_type; for (int shuffle_distance = 16; shuffle_distance >= 1; shuffle_distance /= 2) { - llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca( - element_ir_type, nullptr, "result_from_other_lane"); + llvm::Value* result_from_other_lane = + Alloca(element_ir_type, nullptr, "result_from_other_lane"); for (int i = 0; i != num_reduces; ++i) { - llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( - ir_builder_.CreateBitCast(partial_reduction_result_addresses[i], - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); - ir_builder_.CreateStore( - EmitShuffleDown(partial_reduction_result, - ir_builder_.getInt32(shuffle_distance), - &ir_builder_), - ir_builder_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); + llvm::Value* partial_reduction_result = + Load(BitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); + CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0) + << "Requires block size a multiple of the warp size, otherwise we " + "will read undefined elements."; + Store(EmitFullWarpShuffleDown(partial_reduction_result, + b_.getInt32(shuffle_distance), &b_), + BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo())); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *reducers[i], {partial_reduction_result_addresses[i], result_from_other_lane}, @@ -1608,44 +1542,42 @@ Status IrEmitterUnnested::EmitRowReduction( // lane 0 (which holds the partially accumulated result for its warp) to the // output element. 
llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse( - ir_builder_.CreateICmpEQ(lane_id, ir_builder_.getInt64(0)), - "lane_id_is_zero", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, - &ir_builder_); + ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_); + llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_); for (int i = 0; i != num_reduces; ++i) { - ShapeIndex output_shape_index; - if (output->IsMultiOutputFusion()) { - output_shape_index = {i}; - } llvm::Value* output_address = - GetIrArray(*output, *output, output_shape_index) + GetIrArray(*output, *output, reduce_output_shapes[i]) .EmitArrayElementAddress( - llvm_ir::IrArray::Index( - y, - ShapeUtil::GetSubshape(output->shape(), - output_shape_index), - &ir_builder_), - &ir_builder_, "output_element_address"); - TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, partial_reduction_result_addresses[i])); + IrArray::Index(y, + ShapeUtil::GetSubshape( + output->shape(), reduce_output_shapes[i]), + &b_), + &b_, "output_element_address"); + // We don't need to emit atomic operations if there is only one tile of + // results. 'depth' is the z dimension, 'width' is the x dimension. + if (z_tile_size >= depth && x_tile_size >= width) { + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {output_address, partial_reduction_result_addresses[i]}, + output_address)); + } else { + TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, + partial_reduction_result_addresses[i])); + } } return Status::OK(); }; // Emit a parallel loop that iterates through every input tiles. 
- Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout( - reduce->shape().element_type(), {depth, height, width_in_tiles}, - {2, 1, 0}); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - tiled_input_shape, ir_emitter_context_->device_description()); CHECK(LastThunk()->kind() == Thunk::Kind::kSequential); UpdateLaunchDimensions( launch_dimensions, static_cast(LastThunk())->thunks().back().get(), ir_emitter_context_->llvm_module()); return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape, - launch_dimensions, &ir_builder_) - .EmitLoop(IrName(reduce)); + launch_dimensions, &b_) + .EmitLoop(IrName(reduce), index_ty); } // Figures out whether `reduce` is a row or column reduction, and which @@ -1656,10 +1588,13 @@ Status IrEmitterUnnested::EmitRowReduction( // elementwise. Status IrEmitterUnnested::EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice dimensions_to_reduce, - tensorflow::gtl::ArraySlice reducers) { + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span dimensions_to_reduce, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens) { // This emission requires "reduce" to have an input layout. It is either set // by LayoutAssignment (for a top-level kReduce) or by InstructionFusion (for // a fused kReduce). @@ -1695,7 +1630,8 @@ Status IrEmitterUnnested::EmitReductionToVector( // dimension of the input is to keep. if (input_dims_to_keep.empty()) { return EmitReductionToScalar(reduce, input_shape, input_gens, - init_value_gens, reducers); + init_value_gens, reducers, + reduce_output_shapes, extra_output_gens); } else if (input_dims_to_keep.front() == LayoutUtil::Minor(input_shape.layout(), 0)) { // Column reduction. 
Treat the result of "input" as a matrix whose width @@ -1713,7 +1649,8 @@ Status IrEmitterUnnested::EmitReductionToVector( } } return EmitColumnReduction(height, width, reduce, input_shape, input_gens, - init_value_gens, reducers); + init_value_gens, reducers, reduce_output_shapes, + extra_output_gens); } else { // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a // 3D tensor. The size of dimension 1 (the height) is the size of the @@ -1739,51 +1676,58 @@ Status IrEmitterUnnested::EmitReductionToVector( } const int64 height = ShapeUtil::ElementsIn(reduce->shape()); return EmitRowReduction(depth, height, width, reduce, input_shape, - input_gens, init_value_gens, reducers); + input_gens, init_value_gens, reducers, + reduce_output_shapes, extra_output_gens); } } Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support multi-output reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Multi-output reduce is not supported on GPU"); + } auto input = reduce->operand(0); auto init_value = reduce->operand(1); - tensorflow::gtl::ArraySlice dimensions_to_reduce(reduce->dimensions()); + absl::Span dimensions_to_reduce(reduce->dimensions()); HloComputation* reducer = reduce->to_apply(); // HandleReduce specializes reduction from a multi-dimensional array to a 1D // array. The specialized version requires an initializer thunk that // initializes the output array to the initial value of the reduce. 
- if (IsReductionToVector(*reduce) && - // NVPTX backend can't do atomic cmpxchg any narrower than 32 bits - 32 <= primitive_util::BitWidth(reduce->shape().element_type())) { + if (IsReductionToVector(*reduce)) { TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, BuildInitializerThunk(reduce)); std::vector> thunks; thunks.push_back(std::move(initializer_thunk)); - thunks.push_back(BuildKernelThunk(reduce)); + thunks.push_back( + BuildKernelThunk(reduce, /*implements_whole_instruction=*/false)); thunk_sequence_->emplace_back( - MakeUnique(std::move(thunks), reduce)); + absl::make_unique(std::move(thunks), reduce)); return EmitReductionToVector( - reduce, input->shape(), {[&](const llvm_ir::IrArray::Index& index) { - return GetIrArray(*input, *reduce) - .EmitReadArrayElement(index, &ir_builder_); + reduce, input->shape(), {[&](const IrArray::Index& index) { + return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_); }}, - {[&](const llvm_ir::IrArray::Index& index) { + {[&](const IrArray::Index& index) { return GetIrArray(*init_value, *reduce) - .EmitReadArrayElement(index, &ir_builder_); + .EmitReadArrayElement(index, &b_); }}, - dimensions_to_reduce, {reducer}); + dimensions_to_reduce, {reducer}, {{}}, {}); } - thunk_sequence_->emplace_back(BuildKernelThunk(reduce)); + thunk_sequence_->emplace_back( + BuildKernelThunk(reduce, /*implements_whole_instruction=*/true)); return IrEmitter::HandleReduce(reduce); } Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) { bool all_tuple_elements_have_buffer = - c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) { - return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation( - tuple_element); + absl::c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) { + return ir_emitter_context_->buffer_assignment() + .GetUniqueTopLevelSlice(tuple_element) + .ok(); }); + // TODO(b/111689850): This logic isn't quite correct. 
+ // // Tuples (especially tuples that are the final result of a computation) can // be so huge that if we were to emit a kernel that took each tuple element as // a parameter, we would exceed the max allowable number of parameters to a @@ -1791,19 +1735,20 @@ Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) { // buffer, we collect their buffer addresses in a host array, and then copy // that array to the tuple's buffer. // - // Some tuple elements (e.g. const or bitcast of const) might not have a - // buffer -- their contents are stored in code. In that case, we fall back to - // emitting kernels which have access to their buffer addresses in code. + // Some tuple elements might not have an unambiguous buffer (like the result + // of a select-tuple). In that case, we fall back to emitting kernels which + // have access to their buffer addresses in code. if (all_tuple_elements_have_buffer) { std::vector tuple_element_buffers; for (const HloInstruction* tuple_element : tuple->operands()) { tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element)); } - thunk_sequence_->emplace_back(MakeUnique( + thunk_sequence_->emplace_back(absl::make_unique( tuple_element_buffers, GetAllocationSlice(*tuple), tuple)); return Status::OK(); } - thunk_sequence_->emplace_back(BuildKernelThunk(tuple)); + thunk_sequence_->emplace_back( + BuildKernelThunk(tuple, /*implements_whole_instruction=*/true)); return IrEmitter::HandleTuple(tuple); } @@ -1828,9 +1773,10 @@ Status IrEmitterUnnested::HandleSelectAndScatter( BuildInitializerThunk(select_and_scatter)); std::vector> thunks; thunks.push_back(std::move(initializer_thunk)); - thunks.push_back(BuildKernelThunk(select_and_scatter)); - thunk_sequence_->emplace_back( - MakeUnique(std::move(thunks), select_and_scatter)); + thunks.push_back(BuildKernelThunk(select_and_scatter, + /*implements_whole_instruction=*/false)); + thunk_sequence_->emplace_back(absl::make_unique( + std::move(thunks), select_and_scatter)); // 
TODO(b/31410564): Implement dilation rate for select-and-scatter. if (window_util::HasDilation(window)) { @@ -1838,6 +1784,14 @@ Status IrEmitterUnnested::HandleSelectAndScatter( "Dilation for SelectAndScatter not implemented on GPU."); } + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + source->shape(), ir_emitter_context_->device_description()); + llvm::Type* index_type = GetIndexTypeForKernel( + select_and_scatter, launch_dimensions.launch_bound(), &b_); + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_type, c); + }; + // kSelectAndScatter is implemented as two kernel launches: the first launch // initializes the output array to the given initial value, // and the second accumulates the "source" matrix to the @@ -1857,114 +1811,106 @@ Status IrEmitterUnnested::HandleSelectAndScatter( // selected_index = I // initialized_flag = true // output(selected_index) = scatter(output(selected_index), source(S)) - auto loop_body_emitter = - [=](const llvm_ir::IrArray::Index& source_index) -> Status { + auto loop_body_emitter = [=](const IrArray::Index& source_index) -> Status { // Allocate space to keep the currently selected value, its index, and a // boolean flag if the value is initialized. The initialized_flag is set // false. 
llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(operand_element_type, ir_emitter_context_->llvm_module()), - "selected_value_address", &ir_builder_); + "selected_value_address", &b_); llvm::Value* selected_index_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount( - ir_builder_.getInt64Ty(), ir_builder_.getInt32(rank), - "selected_index_address", &ir_builder_); + index_type, index_typed_constant(rank), "selected_index_address", + &b_); llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry( - ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_); - ir_builder_.CreateStore(ir_builder_.getInt1(false), - initialized_flag_address); + b_.getInt1Ty(), "initialized_flag_address", &b_); + Store(b_.getInt1(false), initialized_flag_address); // Create the inner loop to iterate over the window. - llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), - &ir_builder_); + llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_, + index_type); std::vector window_size; for (const auto& dim : window.dimensions()) { window_size.push_back(dim.size()); CHECK_GT(dim.size(), 0); } - const llvm_ir::IrArray::Index window_index = window_loops.AddLoopsForShape( + const IrArray::Index window_index = window_loops.AddLoopsForShape( ShapeUtil::MakeShape(operand_element_type, window_size), "window"); llvm_ir::SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(), - &ir_builder_); + &b_); // Compute the operand index to visit and evaluate the condition whether the // operand index is within the bounds. The unsigned comparison includes // checking whether the operand index >= 0. 
- llvm_ir::IrArray::Index operand_index(source_index.size()); - llvm::Value* in_bounds_condition = ir_builder_.getInt1(true); + IrArray::Index operand_index(index_type, source_index.size()); + llvm::Value* in_bounds_condition = b_.getInt1(true); for (int64 i = 0; i < rank; ++i) { - llvm::Value* strided_index = ir_builder_.CreateNSWMul( - source_index[i], ir_builder_.getInt64(window.dimensions(i).stride())); - operand_index[i] = ir_builder_.CreateNSWSub( - ir_builder_.CreateNSWAdd(strided_index, window_index[i]), - ir_builder_.getInt64(window.dimensions(i).padding_low())); - llvm::Value* index_condition = ir_builder_.CreateICmpULT( + llvm::Value* strided_index = NSWMul( + source_index[i], index_typed_constant(window.dimensions(i).stride())); + operand_index[i] = + NSWSub(NSWAdd(strided_index, window_index[i]), + index_typed_constant(window.dimensions(i).padding_low())); + llvm::Value* index_condition = ICmpULT( operand_index[i], - ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - in_bounds_condition = - ir_builder_.CreateAnd(in_bounds_condition, index_condition); + index_typed_constant(ShapeUtil::GetDimension(operand->shape(), i))); + in_bounds_condition = And(in_bounds_condition, index_condition); } CHECK(in_bounds_condition != nullptr); // Only need to do something if the operand index is within the bounds. // First check if the initialized_flag is set. 
llvm_ir::LlvmIfData if_in_bounds = - llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_); + llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); + llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &b_); llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse( - ir_builder_.CreateLoad(initialized_flag_address), "initialized", - &ir_builder_); + Load(initialized_flag_address), "initialized", &b_); // If the initialized_flag is false, initialize the selected value and index // with the currently visiting operand. - llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_); - const auto save_operand_index = [&]( - const llvm_ir::IrArray::Index& operand_index) { + llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &b_); + const auto save_operand_index = [&](const IrArray::Index& operand_index) { for (int64 i = 0; i < rank; ++i) { llvm::Value* selected_index_address_slot = - ir_builder_.CreateInBoundsGEP(selected_index_address, - {ir_builder_.getInt32(i)}); - ir_builder_.CreateStore(operand_index[i], selected_index_address_slot); + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + Store(operand_index[i], selected_index_address_slot); } }; - llvm_ir::IrArray operand_array = GetIrArray(*operand, *select_and_scatter); + IrArray operand_array = GetIrArray(*operand, *select_and_scatter); llvm::Value* operand_data = - operand_array.EmitReadArrayElement(operand_index, &ir_builder_); - ir_builder_.CreateStore(operand_data, selected_value_address); + operand_array.EmitReadArrayElement(operand_index, &b_); + Store(operand_data, selected_value_address); save_operand_index(operand_index); - ir_builder_.CreateStore(ir_builder_.getInt1(true), - initialized_flag_address); + Store(b_.getInt1(true), initialized_flag_address); // If the initialized_flag is true, call the `select` function to // potentially update the selected value and index with 
the currently // visiting operand. - llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &b_); const Shape output_shape = ShapeUtil::MakeShape(PRED, {}); llvm::Value* operand_address = - operand_array.EmitArrayElementAddress(operand_index, &ir_builder_); + operand_array.EmitArrayElementAddress(operand_index, &b_); llvm::Value* select_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry( llvm_ir::PrimitiveTypeToIrType(PRED, ir_emitter_context_->llvm_module()), - "select_return_buffer", &ir_builder_); + "select_return_buffer", &b_); TF_RETURN_IF_ERROR(EmitCallToNestedComputation( *select_and_scatter->select(), {selected_value_address, operand_address}, select_return_buffer)); - llvm::Value* result = ir_builder_.CreateLoad(select_return_buffer); + llvm::Value* result = Load(select_return_buffer); // If the 'select' function returns false, update the selected value and the // index to the currently visiting operand. - llvm::Value* cond = ir_builder_.CreateICmpNE( + llvm::Value* cond = ICmpNE( result, llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType( PRED, ir_emitter_context_->llvm_module()), 0), "boolean_predicate"); llvm_ir::LlvmIfData if_select_lhs = - llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_); - llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_); - ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address), - selected_value_address); + llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_); + llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &b_); + Store(Load(operand_address), selected_value_address); save_operand_index(operand_index); // After iterating over the window elements, scatter the source element to @@ -1972,27 +1918,24 @@ Status IrEmitterUnnested::HandleSelectAndScatter( // location is computed by calling the `scatter` function with the source // value and the current output value. 
llvm_ir::SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), - &ir_builder_); - llvm_ir::IrArray::Index selected_index; + &b_); + IrArray::Index selected_index(operand_index.GetType()); for (int64 i = 0; i < rank; ++i) { - llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP( - selected_index_address, {ir_builder_.getInt32(i)}); - selected_index.push_back( - ir_builder_.CreateLoad(selected_index_address_slot)); + llvm::Value* selected_index_address_slot = + InBoundsGEP(selected_index_address, {b_.getInt32(i)}); + selected_index.push_back(Load(selected_index_address_slot)); } llvm::Value* source_value_address = GetIrArray(*source, *select_and_scatter) - .EmitArrayElementAddress(source_index, &ir_builder_); + .EmitArrayElementAddress(source_index, &b_); llvm::Value* output_value_address = GetIrArray(*select_and_scatter, *select_and_scatter) - .EmitArrayElementAddress(selected_index, &ir_builder_); + .EmitArrayElementAddress(selected_index, &b_); return EmitAtomicOperationForNestedComputation( *select_and_scatter->scatter(), output_value_address, source_value_address); }; - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - source->shape(), ir_emitter_context_->device_description()); UpdateLaunchDimensions( launch_dimensions, // IrEmitterUnnested implements kSelectAndScatter as a SequentialThunk @@ -2002,8 +1945,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter( static_cast(LastThunk())->thunks().back().get(), ir_emitter_context_->llvm_module()); return ParallelLoopEmitter(loop_body_emitter, source->shape(), - launch_dimensions, &ir_builder_) - .EmitLoop(IrName(select_and_scatter)); + launch_dimensions, &b_) + .EmitLoop(IrName(select_and_scatter), index_type); } Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { @@ -2012,33 +1955,147 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { condition->root_instruction()->shape().element_type() == PRED) << "While condition computation must 
return bool"; // Build ForThunk for conformant while loops, otherwise build WhileThunk. - auto result = CanTransformWhileToFor(xla_while); - if (result.ok()) { - auto tuple = result.ConsumeValueOrDie(); - // loop_trip_count = (limit - start + increment - 1) / increment - const int64 loop_trip_count = - (std::get<1>(tuple) - std::get<0>(tuple) + std::get<2>(tuple) - 1) / - std::get<2>(tuple); - thunk_sequence_->emplace_back(BuildForThunk(xla_while, loop_trip_count)); + // TODO(b/112163966): Move trip count computation earlier in the pipeline. + if (auto loop_trip_count = ComputeWhileLoopTripCount(xla_while)) { + thunk_sequence_->emplace_back(BuildForThunk(xla_while, *loop_trip_count)); VLOG(3) << "Built ForThunk for while: " << xla_while->name(); } else { thunk_sequence_->emplace_back(BuildWhileThunk(xla_while)); - VLOG(3) << "Built WhileThunk for while: " << xla_while->name() - << " while-to-for transform status: " << result.status(); + VLOG(3) << "Built WhileThunk for while: " << xla_while->name(); } return Status::OK(); } -Status IrEmitterUnnested::HandleRng(HloInstruction* random) { - thunk_sequence_->push_back(BuildKernelThunk(random)); - return IrEmitter::HandleRng(random); +Status IrEmitterUnnested::HandleRng(HloInstruction* rng) { + // Build the kernel to generate the random numbers. + // + // Unroll the kernel so that the duplicated computation that calculates the + // 128 bit sample can be optimized away by LLVM. 
+ thunk_sequence_->emplace_back( + BuildKernelThunk(rng, /*implements_whole_instruction=*/false, + ComputeMaxUnrollFactor(rng))); + ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator; + for (const HloInstruction* operand : rng->operands()) { + operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) { + return GetIrArray(*operand, *rng).EmitReadArrayElement(index, &b_); + }; + } + TF_RETURN_IF_ERROR(EmitTargetElementLoop( + *rng, GpuElementalIrEmitter(hlo_module_config_, module_, &b_, + GetNestedComputer()) + .MakeElementGenerator(rng, operand_to_generator))); + std::unique_ptr rng_thunk = std::move(thunk_sequence_->back()); + thunk_sequence_->pop_back(); + + // Emit a kernel to increment the global state for Philox RNG algorithm. + thunk_sequence_->emplace_back( + BuildKernelThunk(rng, /*implements_whole_instruction=*/false)); + llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_); + std::unique_ptr increment_seed_thunk = + std::move(thunk_sequence_->back()); + thunk_sequence_->pop_back(); + + // Build the SequentialThunk for the RNG hlo. + std::vector> thunks; + thunks.reserve(2); + thunks.push_back(std::move(rng_thunk)); + thunks.push_back(std::move(increment_seed_thunk)); + thunk_sequence_->emplace_back( + absl::make_unique(std::move(thunks), rng)); + + return Status::OK(); } Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { - thunk_sequence_->push_back(BuildKernelThunk(select)); + thunk_sequence_->push_back( + BuildKernelThunk(select, /*implements_whole_instruction=*/true)); return IrEmitter::HandleSelect(select); } +Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { + std::vector> thunks; + auto keys = sort->operand(0); + auto values = sort->operand_count() > 1 ? 
sort->operand(1) : nullptr; + ShapeIndex keys_shape_index({}); + ShapeIndex values_shape_index({}); + if (values != nullptr) { + keys_shape_index = ShapeIndex({0}); + values_shape_index = ShapeIndex({1}); + } + auto keys_destination = GetAllocationSlice(*sort, keys_shape_index); + auto values_destination = GetAllocationSlice(*sort, values_shape_index); + + if (keys_destination != GetAllocationSlice(*keys)) { + thunks.push_back(absl::make_unique( + /*source_address=*/GetAllocationSlice(*keys), + /*destination_buffer=*/keys_destination, + /*mem_size=*/ShapeUtil::ByteSizeOf(keys->shape()), nullptr)); + } + if (values != nullptr && values_destination != GetAllocationSlice(*values)) { + // TODO(b/26783907): Figure out why we never seem to share buffers for + // key/value sort. + thunks.push_back(absl::make_unique( + /*source_address=*/GetAllocationSlice(*values), + /*destination_buffer=*/values_destination, + /*mem_size=*/ShapeUtil::ByteSizeOf(values->shape()), nullptr)); + } + + int64 dimension_to_sort = sort->dimensions(0); + int64 dimension_to_sort_bound = keys->shape().dimensions(dimension_to_sort); + int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); + auto index_type = b_.getInt64Ty(); + + // Naive C++ code for the outer loops: + // + // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound); + // ++stage) { + // int64 first_xor_mask = (1LL << (stage + 1)) - 1; + // SortInPlace(first_xor_mask); + // for (int64 mask = stage - 1; mask >= 0; --mask) { + // int64 later_xor_mask = 1LL << mask; + // SortInPlace(later_xor_mask); + // } + // } + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + + for (int64 stage = 0; stage < num_stages; ++stage) { + for (int64 mask = stage; mask >= 0; --mask) { + thunks.push_back( + BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + LaunchDimensions launch_dimensions = CalculateLaunchDimensions( + keys->shape(), 
ir_emitter_context_->device_description()); + UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), + ir_emitter_context_->llvm_module()); + + llvm::Value* xor_mask; + if (mask == stage) { + xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1); + } else { + xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask); + } + + TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace( + dimension_to_sort, GetIrArray(*sort, *sort, keys_shape_index), + values != nullptr ? absl::make_optional( + GetIrArray(*sort, *sort, values_shape_index)) + : absl::nullopt, + IrName(sort), xor_mask, &b_, &launch_dimensions)); + } + } + + thunk_sequence_->emplace_back( + absl::make_unique(std::move(thunks), sort)); + return Status::OK(); +} + +Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) { + thunk_sequence_->push_back( + BuildKernelThunk(tuple_select, /*implements_whole_instruction=*/true)); + return IrEmitter::HandleTupleSelect(tuple_select); +} + Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { if (hlo_module_config_.replica_count() != 1) { // TODO(b/33011107): Support nontrivial cross replica sum on GPU. 
@@ -2056,7 +2113,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { if (crs->operand_count() == 1) { CHECK(ShapeUtil::IsArray(crs->operand(0)->shape())) << "Operands to cross-replica-sum must be arrays: " << crs->ToString(); - thunk_sequence_->push_back(MakeUnique( + thunk_sequence_->push_back(absl::make_unique( /*source_address=*/GetAllocationSlice(*crs->operand(0)), /*destination_buffer=*/GetAllocationSlice(*crs), /*mem_size=*/ShapeUtil::ByteSizeOf(crs->shape()), crs)); @@ -2071,17 +2128,21 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { tuple_element_buffers.push_back(ir_emitter_context_->buffer_assignment() .GetUniqueSlice(crs, {i}) .ValueOrDie()); - thunks.push_back(MakeUnique( + thunks.push_back(absl::make_unique( /*source_address=*/GetAllocationSlice(*crs->operand(i)), /*destination_buffer=*/tuple_element_buffers.back(), - /*mem_size=*/ShapeUtil::ByteSizeOf(crs->operand(i)->shape()), crs)); + /*mem_size=*/ShapeUtil::ByteSizeOf(crs->operand(i)->shape()), nullptr)); } // Output a tuple of the buffers above. - thunks.push_back(MakeUnique(tuple_element_buffers, - GetAllocationSlice(*crs), crs)); + thunks.push_back(absl::make_unique( + tuple_element_buffers, GetAllocationSlice(*crs), nullptr)); thunk_sequence_->push_back( - MakeUnique(std::move(thunks), crs)); + absl::make_unique(std::move(thunks), crs)); + return Status::OK(); +} + +Status IrEmitterUnnested::HandleAfterAll(HloInstruction* gen_token) { return Status::OK(); } @@ -2090,6 +2151,11 @@ Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) { return Status::OK(); } +Status IrEmitterUnnested::HandleOutfeed(HloInstruction* outfeed) { + thunk_sequence_->emplace_back(BuildOutfeedThunk(outfeed)); + return Status::OK(); +} + // Figures out how to access the buffers for all subshapes of hlo's operands and // for hlo itself (i.e. all the buffers produced by HLO). 
// @@ -2177,11 +2243,6 @@ GetHloBufferSlices(const HloInstruction* hlo, // Adds entries for all subshapes of instr to `slices`. auto add_slices_for = [&](const HloInstruction* instr) { - // GPU constants don't have buffers; don't bother looking for one. - if (instr->IsConstant()) { - return; - } - ShapeUtil::ForEachSubshape( instr->shape(), [&](const Shape& /*shape*/, const ShapeIndex& index) { if (slices.count({instr, index})) { @@ -2208,13 +2269,9 @@ GetHloBufferSlices(const HloInstruction* hlo, return slices; } -Status IrEmitterUnnested::HandleGather(HloInstruction* gather) { - // TODO(b/72710576): Gather is not implemented on GPUs - return Unimplemented("Gather is not implemented on GPUs."); -} - std::unique_ptr IrEmitterUnnested::BuildKernelThunk( - const HloInstruction* inst, int unroll_factor) { + const HloInstruction* inst, bool implements_whole_instruction, + int unroll_factor) { const BufferAssignment& buffer_assn = ir_emitter_context_->buffer_assignment(); @@ -2231,7 +2288,7 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( for (const auto& kv : hlo_slices) { buffers_needed.insert(kv.second.first.allocation()); } - tensorflow::gtl::optional temp_buffer; + absl::optional temp_buffer; for (const BufferAllocation& alloc : buffer_assn.Allocations()) { if (alloc.IsPreallocatedTempBuffer()) { if (!temp_buffer.has_value()) { @@ -2247,21 +2304,25 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( // We'll pass a pointer to each of the elements of `buffers` to our kernel, in // this order. 
- std::vector buffers(buffers_needed.begin(), - buffers_needed.end()); - std::sort(buffers.begin(), buffers.end(), + std::vector non_constant_buffers; + absl::c_copy_if(buffers_needed, std::back_inserter(non_constant_buffers), + [](const BufferAllocation* allocation) { + return !allocation->is_constant(); + }); + + std::sort(non_constant_buffers.begin(), non_constant_buffers.end(), [](const BufferAllocation* a, const BufferAllocation* b) { return a->index() < b->index(); }); - llvm::Function* kernel = BuildKernelPrototype(*inst, buffers); + llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers); // Build a map from a BufferAllocation to the corresponding argument in our // kernel. std::unordered_map kernel_args; { auto arg_it = kernel->arg_begin(); - auto buffers_it = buffers.begin(); + auto buffers_it = non_constant_buffers.begin(); for (; arg_it != kernel->arg_end(); ++arg_it, ++buffers_it) { kernel_args[*buffers_it] = arg_it; } @@ -2279,18 +2340,24 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( << " is found in slice " << slice.ToString() << " at GTE index " << gte_index.ToString(); - llvm::Value* loc = - ir_builder_.CreateInBoundsGEP(kernel_args.at(slice.allocation()), - {ir_builder_.getInt64(slice.offset())}); + llvm::Value* loc; + if (slice.allocation()->is_constant()) { + loc = ir_emitter_context_->llvm_module()->getGlobalVariable( + llvm_ir::AsStringRef(llvm_ir::ConstantBufferAllocationToGlobalName( + *slice.allocation()))); + CHECK_NE(loc, nullptr); + } else { + loc = InBoundsGEP(kernel_args.at(slice.allocation()), + {b_.getInt64(slice.offset())}); + } // If gte_index is nonempty, we have to dereference `loc` to get to the // value we're ultimately interested in. 
llvm::Type* int8_double_pointer = - llvm::PointerType::get(ir_builder_.getInt8PtrTy(), /*AddressSpace=*/0); + llvm::PointerType::get(b_.getInt8PtrTy(), /*AddressSpace=*/0); for (int64 idx : gte_index) { - loc = ir_builder_.CreateBitCast(loc, int8_double_pointer); - loc = ir_builder_.CreateLoad( - ir_builder_.CreateInBoundsGEP(loc, {ir_builder_.getInt64(idx)})); + loc = BitCast(loc, int8_double_pointer); + loc = Load(InBoundsGEP(loc, {b_.getInt64(idx)})); } bindings_.BindHloToIrValue(*instr, loc, index); @@ -2302,18 +2369,19 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( bindings_.SetTempBufferBase(kernel_args.at(*temp_buffer)); } else { bindings_.SetTempBufferBase( - llvm::ConstantPointerNull::get(ir_builder_.getInt8PtrTy())); + llvm::ConstantPointerNull::get(b_.getInt8PtrTy())); } - return MakeUnique(buffers, llvm_ir::AsString(kernel->getName()), - inst, unroll_factor); + return absl::make_unique( + non_constant_buffers, llvm_ir::AsString(kernel->getName()), + implements_whole_instruction ? 
inst : nullptr, unroll_factor); } std::unique_ptr IrEmitterUnnested::BuildHostToDeviceCopyThunk( const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); CHECK_EQ(HloOpcode::kConstant, operand->opcode()); - return MakeUnique( + return absl::make_unique( /*source_address=*/operand->literal().untyped_data(), /*destination_buffer=*/GetAllocationSlice(*inst), /*mem_size=*/ @@ -2325,7 +2393,7 @@ std::unique_ptr IrEmitterUnnested::BuildHostToDeviceCopyThunk( std::unique_ptr IrEmitterUnnested::BuildDeviceToDeviceCopyThunk( const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); - return MakeUnique( + return absl::make_unique( /*source_address=*/GetAllocationSlice(*operand), /*destination_buffer=*/GetAllocationSlice(*inst), /*mem_size=*/ @@ -2338,17 +2406,31 @@ std::unique_ptr IrEmitterUnnested::BuildInfeedThunk( const HloInstruction* inst) { CHECK_EQ(HloOpcode::kInfeed, inst->opcode()); - std::vector tuple_element_buffers; - for (int64 i = 0; i < inst->shape().tuple_shapes_size(); ++i) { - BufferAllocation::Slice buffer = ir_emitter_context_->buffer_assignment() - .GetUniqueSlice(inst, {i}) - .ConsumeValueOrDie(); - tuple_element_buffers.push_back(buffer); - } + ShapeTree slices(inst->shape()); + slices.ForEachMutableElement( + [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { + *slice = ir_emitter_context_->buffer_assignment() + .GetUniqueSlice(inst, index) + .ConsumeValueOrDie(); + }); + return absl::make_unique(slices, inst); +} - return MakeUnique( - tuple_element_buffers, - /*destination_buffer=*/GetAllocationSlice(*inst), inst); +std::unique_ptr IrEmitterUnnested::BuildOutfeedThunk( + const HloInstruction* inst) { + CHECK_EQ(HloOpcode::kOutfeed, inst->opcode()); + + ShapeTree slices(inst->operand(0)->shape()); + slices.ForEachMutableElement( + [&](const ShapeIndex& index, BufferAllocation::Slice* slice) { + auto status_or_slice = + ir_emitter_context_->buffer_assignment().GetUniqueSlice( + inst->operand(0), 
index); + if (status_or_slice.ok()) { + *slice = status_or_slice.ConsumeValueOrDie(); + } + }); + return absl::make_unique(std::move(slices), inst); } namespace { @@ -2371,7 +2453,7 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( if (inst->opcode() == HloOpcode::kDot) { const HloInstruction* lhs = inst->operand(0); const HloInstruction* rhs = inst->operand(1); - return MakeUnique( + return absl::make_unique( GetAllocationSlice(*lhs), // The buffer assigned to LHS. GetAllocationSlice(*rhs), // The buffer assigned to RHS. GetAllocationSlice(*inst), // The output buffer. @@ -2393,7 +2475,9 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( if (alpha->opcode() == HloOpcode::kBroadcast) { alpha = alpha->operand(0); } - alpha = inst->operand(alpha->parameter_number()); + if (alpha->opcode() == HloOpcode::kParameter) { + alpha = inst->operand(alpha->parameter_number()); + } // TODO(b/74185543): Remove the following if block once we support fusion // with a non-constant as well. Then we will just always use the constant // on the device. @@ -2411,7 +2495,7 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( const HloInstruction* rhs = inst->operand(rhs_parameter->parameter_number()); - return MakeUnique( + return absl::make_unique( GetAllocationSlice(*lhs), // The buffer assigned to LHS. GetAllocationSlice(*rhs), // The buffer assigned to RHS. GetAllocationSlice(*inst), // The output buffer. 
@@ -2428,26 +2512,30 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( std::unique_ptr IrEmitterUnnested::BuildFftThunk( const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); - return MakeUnique(inst->fft_type(), inst->fft_length(), - /*input_buffer=*/GetAllocationSlice(*operand), - /*output_buffer=*/GetAllocationSlice(*inst), - /*input_shape=*/operand->shape(), - /*output_shape=*/inst->shape(), inst); + return absl::make_unique( + inst->fft_type(), inst->fft_length(), + /*input_buffer=*/GetAllocationSlice(*operand), + /*output_buffer=*/GetAllocationSlice(*inst), + /*input_shape=*/operand->shape(), + /*output_shape=*/inst->shape(), inst); } StatusOr> IrEmitterUnnested::BuildInitializerThunk( const HloInstruction* hlo, const ShapeIndex& index) { bool fused = HloOpcode::kFusion == hlo->opcode(); const HloInstruction* inst = fused ? hlo->fused_expression_root() : hlo; - const HloInstruction* init_value = [&] { + const HloInstruction* init_value_operand = [&] { switch (inst->opcode()) { case HloOpcode::kSelectAndScatter: return inst->operand(2); case HloOpcode::kReduce: return inst->operand(1); case HloOpcode::kTuple: - CHECK(hlo->IsMultiOutputFusion() && - inst->operand(index.back())->opcode() == HloOpcode::kReduce); + CHECK(hlo->IsMultiOutputFusion()) + << ": " << hlo->ToString() << " is not a multi-output fusion."; + CHECK(inst->operand(index.back())->opcode() == HloOpcode::kReduce) + << ": Found '" << inst->operand(index.back())->opcode() << "' in " + << inst->ToString() << " but expected 'reduce'."; // For multi-output fusion look through the tuple. 
return inst->operand(index.back())->operand(1); default: @@ -2456,10 +2544,16 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( } }(); + const HloInstruction* init_value = init_value_operand; if (fused && init_value->opcode() == HloOpcode::kParameter) { init_value = hlo->operand(init_value->parameter_number()); } + // Initializer thunks don't implement a whole instruction, and we want to + // profile the whole instruction instead of the individual thunks it consists + // of. Therefore we pass nullptr as the HloInstruction* to the thunks we + // generate below. + // // In the common case, the initializer is a constant. In this case, emit a // device-memset call if we can. Currently StreamExecutor only supports // zeroing and 32-bit memsets. @@ -2470,27 +2564,29 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( // Are all the bytes of this scalar equal to 0? If so, we can create a // MemzeroThunk. - ArraySlice literal_bytes( + absl::Span literal_bytes( reinterpret_cast(literal.untyped_data()), num_bytes); - if (c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) { - return {MakeUnique(GetAllocationSlice(*hlo, index), hlo)}; + if (absl::c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) { + return {absl::make_unique(GetAllocationSlice(*hlo, index), + nullptr)}; } // If the literal is 8 or 16 bits wide, we can emit a 32-bit memset by // repeating the literal 4 or 2 times, so long as the destination buffer is // an even multiple of 32 bits long. 
+ const Shape& output_shape = ShapeUtil::GetSubshape(hlo->shape(), index); if ((num_bytes == 1 || num_bytes == 2) && - ShapeUtil::ByteSizeOf(hlo->shape()) % 4 == 0) { + ShapeUtil::ByteSizeOf(output_shape) % 4 == 0) { uint16 pattern16; if (num_bytes == 1) { uint8 b = literal_bytes.front(); pattern16 = uint16{b} | (uint16{b} << 8); } else { - pattern16 = literal_bytes.front(); + memcpy(&pattern16, literal_bytes.data(), sizeof(pattern16)); } uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16); - return {MakeUnique( - pattern32, GetAllocationSlice(*hlo, index), hlo)}; + return {absl::make_unique( + pattern32, GetAllocationSlice(*hlo, index), nullptr)}; } // If the literal is an even multiple of 32 bits wide, we can emit a 32-bit @@ -2500,20 +2596,41 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( literal_bytes.size() - 4) == 0) { uint32 word; memcpy(&word, literal_bytes.data(), sizeof(word)); - return {MakeUnique( - word, GetAllocationSlice(*hlo, index), hlo)}; + return {absl::make_unique( + word, GetAllocationSlice(*hlo, index), nullptr)}; } } // Otherwise fall back to our slow initializer code. - std::unique_ptr kernel_thunk = BuildKernelThunk(hlo); - TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk( - *hlo, - [=](const llvm_ir::IrArray::Index& index) { - return GetIrArray(*init_value, *hlo) - .EmitReadArrayElement(index, &ir_builder_); - }, - kernel_thunk.get())); + std::unique_ptr kernel_thunk = + BuildKernelThunk(hlo, /*implements_whole_instruction=*/false); + LaunchDimensions launch_dimensions = + CalculateLaunchDimensions(ShapeUtil::GetSubshape(hlo->shape(), index), + ir_emitter_context_->device_description()); + UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), + ir_emitter_context_->llvm_module()); + // If the init_value was fused into this reduce we have to generate it first. 
+ if (fused && init_value_operand->opcode() != HloOpcode::kParameter) { + CHECK_EQ(HloOpcode::kConstant, init_value_operand->opcode()); + + const Literal& literal = init_value_operand->literal(); + llvm::Constant* initializer = + llvm_ir::ConvertLiteralToIrConstant(literal, module_); + + llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( + *module_, initializer->getType(), + /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer, + /*Name=*/""); + global_for_const->setAlignment(kConstantBufferAlignBytes); + bindings_.BindHloToIrValue(*init_value_operand, global_for_const); + } + TF_RETURN_IF_ERROR(ParallelLoopEmitter( + [=](const IrArray::Index& index) { + return GetIrArray(*init_value, *hlo) + .EmitReadArrayElement(index, &b_); + }, + GetIrArray(*hlo, *hlo, index), launch_dimensions, &b_) + .EmitLoop(IrName(hlo))); // Clean up state left behind by emitting the loop above. (This is normally // done in IrEmitterUnnested::Postprocess().) @@ -2537,8 +2654,7 @@ Status CheckHloBuffersShareAllocation( if (slice_a != slice_b) { return InternalError( "instruction %s %s does not share allocation with instruction %s %s", - a->ToString().c_str(), slice_a.ToString().c_str(), - b->ToString().c_str(), slice_b.ToString().c_str()); + a->ToString(), slice_a.ToString(), b->ToString(), slice_b.ToString()); } return Status::OK(); } @@ -2623,15 +2739,15 @@ std::unique_ptr IrEmitterUnnested::BuildWhileThunk( HloComputation* condition = hlo->while_condition(); IrEmitterUnnested ir_emitter_condition(hlo_module_config_, condition, ir_emitter_context_); - TF_CHECK_OK(condition->root_instruction()->Accept(&ir_emitter_condition)); + TF_CHECK_OK(condition->Accept(&ir_emitter_condition)); // Generate thunk sequence for while 'body'. 
HloComputation* body = hlo->while_body(); IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, ir_emitter_context_); - TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body)); + TF_CHECK_OK(body->Accept(&ir_emitter_body)); - return MakeUnique( + return absl::make_unique( GetAllocationSlice(*condition->root_instruction()), // cond result ir_emitter_condition.ConsumeThunkSequence(), ir_emitter_body.ConsumeThunkSequence(), hlo); @@ -2647,10 +2763,10 @@ std::unique_ptr IrEmitterUnnested::BuildForThunk( HloComputation* body = hlo->while_body(); IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, ir_emitter_context_); - TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body)); + TF_CHECK_OK(body->Accept(&ir_emitter_body)); - return MakeUnique(loop_limit, - ir_emitter_body.ConsumeThunkSequence(), hlo); + return absl::make_unique( + loop_limit, ir_emitter_body.ConsumeThunkSequence(), hlo); } std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( @@ -2663,14 +2779,14 @@ std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( HloComputation* true_computation = hlo->true_computation(); IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation, ir_emitter_context_); - TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true)); + TF_CHECK_OK(true_computation->Accept(&ir_emitter_true)); HloComputation* false_computation = hlo->false_computation(); IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation, ir_emitter_context_); - TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false)); + TF_CHECK_OK(false_computation->Accept(&ir_emitter_false)); - return MakeUnique( + return absl::make_unique( GetAllocationSlice(*hlo->operand(0)), GetAllocationSlice(*hlo->operand(1)), GetAllocationSlice(*hlo->operand(2)), @@ -2696,37 +2812,586 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk( ir_emitter_context_->llvm_module()); if (!hlo.IsMultiOutputFusion()) { return 
ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo), - launch_dimensions, &ir_builder_, unroll_factor) - .EmitLoop(IrName(&hlo)); + launch_dimensions, &b_, unroll_factor) + .EmitLoop( + IrName(&hlo), + GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_)); } - // For multiple outputs fusion, we need to emit each operand and the root. - std::vector output_arrays; + // For multioutput fusion, we need to emit each operand and the root. + std::vector output_arrays; for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) { output_arrays.push_back(GetIrArray(hlo, hlo, {i})); } - TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, output_arrays, - launch_dimensions, &ir_builder_, - unroll_factor) - .EmitLoop(IrName(&hlo))); + TF_RETURN_IF_ERROR( + ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions, + &b_, unroll_factor) + .EmitLoop(IrName(&hlo), + GetIndexTypeForKernel( + &hlo, launch_dimensions.launch_bound(), &b_))); std::vector tuple_operand_ptrs; for (int64 i = 0; i < output_arrays.size(); ++i) { tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer()); } - ir_builder_.SetInsertPoint(ir_builder_.GetInsertBlock()->getTerminator()); - llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_, - module_); + b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); + llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_); return Status::OK(); } Status IrEmitterUnnested::EmitTargetElementLoop( const HloInstruction& hlo, const llvm_ir::ElementGenerator& element_generator) { - CHECK(Thunk::Kind::kKernel == LastThunk()->kind()); + CHECK_EQ(Thunk::Kind::kKernel, LastThunk()->kind()); return EmitTargetElementLoopInThunk(hlo, element_generator, static_cast(LastThunk())); } +int IrEmitterUnnested::ConstructIrArrayForOutputs( + const HloInstruction& hlo, std::vector* output_arrays) { + int64 num_outputs = 1; + if (hlo.IsMultiOutputFusion()) { + num_outputs = 
ShapeUtil::TupleElementCount(hlo.shape()); + output_arrays->reserve(num_outputs); + for (int64 i = 0; i < num_outputs; ++i) { + output_arrays->push_back(GetIrArray(hlo, hlo, {i})); + } + } else { + output_arrays->push_back(GetIrArray(hlo, hlo)); + } + return num_outputs; +} + +int IrEmitterUnnested::ConstructIrArrayForInputs( + const HloInstruction& hlo, std::vector* param_arrays) { + int64 num_params = hlo.operands().size(); + param_arrays->reserve(num_params); + for (const HloInstruction* param : hlo.operands()) { + param_arrays->push_back(GetIrArray(*param, hlo)); + } + return num_params; +} + +int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape( + const HloInstruction& hlo, const std::vector& output_arrays, + absl::Span reduced_output_dims, + std::vector* output_reduced_shapes, + std::vector* output_in_reduced_shape_arrays) { + int64 num_outputs = 1; + if (hlo.IsMultiOutputFusion()) { + num_outputs = ShapeUtil::TupleElementCount(hlo.shape()); + output_in_reduced_shape_arrays->reserve(num_outputs); + output_reduced_shapes->reserve(num_outputs); + for (int64 i = 0; i < num_outputs; ++i) { + output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( + ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(), + reduced_output_dims)); + output_in_reduced_shape_arrays->push_back( + output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_)); + } + } else { + output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( + hlo.shape().element_type(), reduced_output_dims)); + output_in_reduced_shape_arrays->push_back( + output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_)); + } + return num_outputs; +} + +int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape( + const HloInstruction& hlo, const std::vector& param_arrays, + const std::vector& param_buffers, + absl::Span reduced_output_dims, + std::vector* param_reduced_shapes, + std::vector* param_in_reduced_shape_arrays) { + int64 
num_params = hlo.operands().size(); + param_in_reduced_shape_arrays->reserve(num_params); + param_reduced_shapes->reserve(num_params); + for (int64 id = 0; id < num_params; ++id) { + if (param_buffers[id] == nullptr) { + param_reduced_shapes->push_back(Shape()); + param_in_reduced_shape_arrays->push_back(IrArray()); + continue; + } + const HloInstruction* param = hlo.operand(id); + param_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( + param->shape().element_type(), + Permute({0, 2, 1}, reduced_output_dims))); + param_in_reduced_shape_arrays->push_back( + param_arrays[id].CastToShape((*param_reduced_shapes)[id], &b_)); + } + return num_params; +} + +namespace { + +// Reads thread_idx.x and converts it to a (y,x) coordinate, assuming that the +// thread lives within a square tile of size tile_size (so thread blocks are of +// size tile_size * tile_size). +std::tuple CalculateYXCoordinateWithinTile( + llvm::IRBuilder<>* builder, llvm::Value* tile_size, + int64 threads_per_tile) { + // Calculate the starting element coordinate within a tile for the current + // thread, (y, x) from thread_id. + llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder); + llvm_ir::AddRangeMetadata(0, threads_per_tile, + llvm::cast(thread_id)); + thread_id = builder->CreateIntCast(thread_id, tile_size->getType(), + /*isSigned=*/true, "thread.id.x"); + auto x = builder->CreateURem(thread_id, tile_size); + auto y = builder->CreateUDiv(thread_id, tile_size); + return std::make_tuple(y, x); +} + +// Reads block_idx.x, casts it to type index_ty, and adds the assumption that +// it's in the range [0, num_blocks).
+llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty, + int64 num_blocks) { + llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, builder); + llvm_ir::AddRangeMetadata(0, num_blocks, + llvm::cast(block_id)); + return builder->CreateIntCast(block_id, index_ty, /*isSigned=*/true, + "block.id.x"); +} + +// Emits code to process up to (tile_size/num_rows) elements in a tile, given +// `emit_elem_function` is the function to emit code to process one element, `y` +// and `x` are the coordinates for the first element to process, and `index` is +// the index for the origin of the tile. Emits bounds check to ensure that each +// processed element is within the boundary defined by `tile_width` and +// `tile_height`. +void EmitTiledElementalCodeWithBoundsCheck( + int64 tile_size, int64 num_rows, const IrArray::Index& index, + const string& loop_name, KernelSupportLibrary* ksl, + llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x, + llvm::Value* tile_width, llvm::Value* tile_height, + const std::function& + emit_elem_function) { + llvm::Type* index_ty = tile_width->getType(); + // Emits a constant value with index type. + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + // Adds `addend` to the given `dim` of `index`. 
+ auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) { + index[dim] = builder->CreateAdd(index[dim], addend); + return index; + }; + + auto emit_full_tile = [&] { + for (int64 i = 0; i < tile_size; i += num_rows) { + auto source_idx = offset_dim(index, index_typed_constant(i), /*dim=*/1); + auto y_loc = builder->CreateAdd(index_typed_constant(i), y); + emit_elem_function(source_idx, y_loc); + } + }; + + auto emit_last_row = [&] { + ksl->IfReturnVoid("x_in_tile", builder->CreateICmpULT(x, tile_width), [&] { + // tile_height_upper_bound = + // ceil(tile_height / num_rows) * num_rows + auto tile_height_upper_bound = builder->CreateMul( + builder->CreateUDiv( + builder->CreateAdd(tile_height, + index_typed_constant(num_rows - 1)), + index_typed_constant(num_rows)), + index_typed_constant(num_rows)); + ksl->ForReturnVoid( + loop_name, /*start=*/index_typed_constant(0), + /*end=*/tile_height_upper_bound, + /*step=*/index_typed_constant(num_rows), [&](llvm::Value* y_indvar) { + auto y_loc = builder->CreateAdd(y_indvar, y); + ksl->IfReturnVoid( + "y_in_tile", builder->CreateICmpULT(y_loc, tile_height), [&] { + emit_elem_function(offset_dim(index, y_indvar, /*dim=*/1), + y_loc); + }); + }); + }); + }; + ksl->IfReturnVoid( + "full_tile", + builder->CreateAnd( + builder->CreateICmpEQ(index_typed_constant(tile_size), tile_width), + builder->CreateICmpEQ(index_typed_constant(tile_size), tile_height)), + emit_full_tile, emit_last_row); +} +} // namespace + +// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose +// algorithm to improve the memory access patterns for the input parameters +// which have a shape that is a 0-2-1 transpose of the output tensors. +// +// For the purpose of tiling, the output tensors have a logical shape of three +// components 0-2-1 while the relevant input parameters have a logical shape of +// three components 0-1-2 in the order major to minor. 
The x- and y- dimensions +// of the tensors are tiled in square tiles of edge length `kTileSize`. Each +// thread block of `kTileSize` x `kNumRows` threads transposes one tile: each +// thread copies kTileSize/kNumRows elements from the input to a shared memory +// tile, then the otherwise "regular hlo kernel" reads from the shared memory +// instead of the original input. +// +// This is similar to the following CUDA algorithm in TensorFlow: +// https://goo.gl/MStRV6. +// +// `kTileSize` should usually be the same as the warp size. We currently choose 32 for +// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`. +// +// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient +// to launch fewer blocks so each transposes many tiles. +LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( + HloInstruction* hlo, absl::Span reduced_output_dims, + absl::Span tiled_param_ids) { + // Parameters for the tiling algorithm. + constexpr int64 kTileSize = 32; + constexpr int64 kNumRows = 4; + constexpr int64 kThreadsPerTile = kTileSize * kNumRows; + + // Construct IrArrays for the inputs and outputs. + std::vector output_arrays; + int64 num_outputs = ConstructIrArrayForOutputs(*hlo, &output_arrays); + std::vector param_arrays; + int64 num_params = ConstructIrArrayForInputs(*hlo, &param_arrays); + + // Allocate shared memory buffers to store the tiled inputs. + std::vector param_shmem_buffers(num_params, nullptr); + for (int64 id : tiled_param_ids) { + const HloInstruction* param = hlo->operand(id); + // Add 1 to the minor dimension to reduce shared memory bank conflicts.
+ llvm::Type* tile_type = llvm::ArrayType::get( + llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType( + param->shape().element_type(), module_), + kTileSize + 1), + kTileSize); + const int kNVPTXSharedMemoryAddrSpace = 3; + auto* tile_base_ptr = new llvm::GlobalVariable( + *b_.GetInsertBlock()->getParent()->getParent(), tile_type, + /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage, + llvm::UndefValue::get(tile_type), + llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr, + llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace); + param_shmem_buffers[id] = tile_base_ptr; + VLOG(3) << "Added shmem buffer for parameter " << id << ": " + << llvm_ir::DumpToString(*tile_base_ptr); + } + + // The 0-2-1 shape of the tiling scheme is the reduced shape of the HLO result + // for the purpose of tiling. Calculate the logical output dimensions in the + // tile from the reduced output dimensions. + std::vector output_dims_in_tiles = std::vector( + reduced_output_dims.begin(), reduced_output_dims.end()); + CHECK_EQ(output_dims_in_tiles.size(), 3); + for (int i = 1; i < 3; ++i) { + output_dims_in_tiles[i] = + CeilOfRatio(output_dims_in_tiles[i], kTileSize); + } + const int64 num_tiles = + absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies()); + LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile); + + llvm::Type* index_ty = + GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_); + auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { + return llvm::ConstantInt::get(index_ty, c); + }; + + // Cast each output IrArray to its corresponding reduced shape and keep the + // reduced shape live during IR emission. 
+ std::vector output_in_reduced_shape_arrays; + std::vector output_reduced_shapes; + CHECK_EQ(ConstructOutputReducedShapeAndCastOutputIrArrayToShape( + *hlo, output_arrays, reduced_output_dims, &output_reduced_shapes, + &output_in_reduced_shape_arrays), + num_outputs); + + // For each tiled parameter, cast its input IrArray to the corresponding + // reduced shape and keep the reduced shape live during IR emission. + std::vector param_in_reduced_shape_arrays; + std::vector param_reduced_shapes; + CHECK_EQ(ConstructInputReducedShapeAndCastInputIrArrayToShape( + *hlo, param_arrays, param_shmem_buffers, reduced_output_dims, + &param_reduced_shapes, &param_in_reduced_shape_arrays), + num_params); + + // Calculate the starting element coordinate within a tile for the current + // thread, (y, x) from thread_id. + llvm::Value* x; + llvm::Value* y; + std::tie(y, x) = CalculateYXCoordinateWithinTile( + &b_, index_typed_constant(kTileSize), kThreadsPerTile); + + // Calculate the index for the current output tile from block_id. + const IrArray::Index output_tile_index( + GetBlockIdx(&b_, index_ty, num_tiles), + ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/, + output_dims_in_tiles), + &b_); + + // Output tile origin is the index for the first element of the current output + // tile. + const IrArray::Index output_tile_origin = [&] { + IrArray::Index index = output_tile_index; + for (int i = 1; i < 3; ++i) { + index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize), + "tile_origin." + std::to_string(i)); + } + return index; + }(); + + // Calculate the input tile origin from the output tile origin. + const IrArray::Index input_tile_origin( + Permute({0, 2, 1}, output_tile_origin.multidim())); + + // Calculate the current output tile bounds in each of the logical dimensions. + std::vector output_tile_bounds(3); + for (int i = 1; i < 3; ++i) { + // Only last row or column may not have full size.
+ output_tile_bounds[i] = + Select(ICmpEQ(output_tile_index[i], + index_typed_constant(output_dims_in_tiles[i] - 1)), + index_typed_constant(reduced_output_dims[i] - + (output_dims_in_tiles[i] - 1) * kTileSize), + index_typed_constant(kTileSize), "kTileSize"); + } + + KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); + + // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck. + auto emit_tiled_elemental_code_with_bounds_check = + [&](const IrArray::Index& index, const string& loop_name, + llvm::Value* tile_width, llvm::Value* tile_height, + const std::function& + emit_elem_function) { + EmitTiledElementalCodeWithBoundsCheck( + kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width, + tile_height, emit_elem_function); + }; + + // Adds `addend` to the given `dim` of `index`. + auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) { + index[dim] = Add(index[dim], addend); + return index; + }; + const IrArray::Index input_index = + offset_dim(offset_dim(input_tile_origin, x, /*dim=*/2), y, /*dim=*/1); + + // Copy input parameter values to shared memory buffers: + // tile[y, x] = input[index] + emit_tiled_elemental_code_with_bounds_check( + input_index, "input", output_tile_bounds[1], output_tile_bounds[2], + [&](const IrArray::Index& index, llvm::Value* y_loc) { + for (int64 id : tiled_param_ids) { + IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id]; + llvm::Value* shmem_buffer = param_shmem_buffers[id]; + // TODO(jlebar): Add AA metadata to this store. Tile buffers are + // global variables, so LLVM can't infer much about it. + Store(input_in_logical_shape.EmitReadArrayElement(index, &b_, + "input_element"), + GEP(shmem_buffer, {index_typed_constant(0), y_loc, x})); + } + }); + + // Wait for all threads to reach this point, lest we copy a value from tile to + // output before the other thread copies it from input to tile. + // This is `__syncthreads` in CUDA. 
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_); + + llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); + + const IrArray::Index output_index = + offset_dim(offset_dim(output_tile_origin, x, /*dim=*/2), y, /*dim=*/1); + + // Write to output[index] by emitting code like normal, except that values for + // the tiled parameters are read from the shmem buffers. + if (hlo->opcode() == HloOpcode::kCopy) { + emit_tiled_elemental_code_with_bounds_check( + output_index, "output", output_tile_bounds[2], output_tile_bounds[1], + [&](const IrArray::Index& index, llvm::Value* y_loc) { + // TODO(jlebar): Add AA metadata to this load. + llvm::Instruction* load_from_shmem_buffer = + Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}), + "output_element"); + output_in_reduced_shape_arrays[0].EmitWriteArrayElement( + index, load_from_shmem_buffer, &b_); + }); + } else { + CHECK_EQ(hlo->opcode(), HloOpcode::kFusion); + emit_tiled_elemental_code_with_bounds_check( + output_index, "output", output_tile_bounds[2], output_tile_bounds[1], + [&](const IrArray::Index& index, llvm::Value* y_loc) { + GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_, + GetNestedComputer()); + FusedIrEmitter fused_emitter(param_arrays, &elem_emitter); + tiled_param_info.set_y(y_loc); + fused_emitter.SetTiledParameterInfo(&tiled_param_info); + TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter)); + IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex( + index, output_reduced_shapes[0], output_arrays[0].GetShape(), + &b_); + const llvm_ir::ElementGenerator& output_generator = + fused_emitter.GetRootGenerator(); + llvm::Value* output_value = + output_generator(untiled_index).ValueOrDie(); + if (hlo->IsMultiOutputFusion()) { + CHECK(output_value->getType()->isStructTy()); + CHECK_EQ(output_value->getType()->getStructNumElements(), + output_in_reduced_shape_arrays.size()); + for (int64 i = 0; i < 
output_in_reduced_shape_arrays.size(); ++i) { + output_in_reduced_shape_arrays[i].EmitWriteArrayElement( + index, ExtractValue(output_value, i), &b_); + } + } else { + output_in_reduced_shape_arrays[0].EmitWriteArrayElement( + index, output_value, &b_); + } + }); + } + + // For multioutput fusion, emit a tuple with all the individual outputs. + if (hlo->IsMultiOutputFusion()) { + std::vector tuple_operand_ptrs; + for (int64 i = 0; i < output_arrays.size(); ++i) { + tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer()); + } + llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), tuple_operand_ptrs, &b_, + module_); + } + + return launch_dimensions; +} + +bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) { + HloOpcode opcode = hlo->opcode(); + CHECK(opcode == HloOpcode::kFusion || opcode == HloOpcode::kCopy); + CHECK(opcode != HloOpcode::kFusion || + hlo->fusion_kind() == HloInstruction::FusionKind::kLoop) + << "Only loop fusions are supported."; + + const Shape& output_shape = hlo->IsMultiOutputFusion() + ? ShapeUtil::GetSubshape(hlo->shape(), {0}) + : hlo->shape(); + + // If the output_shape is reduced to 021 shape, find all the parameters of the + // hlo that are in the corresponding 012 shape. + std::vector params_012; + optional> reduced_dims_021; + for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); + ++operand_idx) { + HloInstruction* operand = hlo->mutable_operand(operand_idx); + auto find_transpose_result = + llvm_ir::FindTranspose021(operand->shape(), output_shape); + if (!find_transpose_result.has_value()) { + continue; + } + const std::vector& curr_reduced_dims_021 = *find_transpose_result; + if (!reduced_dims_021.has_value()) { + reduced_dims_021 = curr_reduced_dims_021; + } + if (!absl::c_equal(*reduced_dims_021, curr_reduced_dims_021)) { + // There is more than one possible transpose. Instead of picking one + // transpose, we simply give up here. 
+ return false; + } + params_012.push_back(operand_idx); + } + + if (!reduced_dims_021.has_value()) { + return false; + } + + if ((*reduced_dims_021)[1] < kMinDimensionToTransposeTiled || + (*reduced_dims_021)[2] < kMinDimensionToTransposeTiled) { + return false; + } + + // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the + // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb + // shared memory per SM. (This is increased to 96kb in Volta, but we don't + // use this, in part because it eats into our L1 cache space.) + // + // For correctness we need to ensure that we don't make more than 48kb worth + // of shmem tiles per block. And for performance, we'd probably like to use + // significantly less, so that we can fit more than one block at a time on a + // gpu core. + // + // We say without benchmarks that we want at least 3 threads/block, + // corresponding to 3 shmem tiles if the elements are 32 bits wide. We choose + // which params get the shmem transpose treatment arbitrarily; it's not clear + // if there's a Right Choice. + // + // This is only sound if tiled transposes are the only place where we use + // shared memory in fusions. If in the future other fusible ops use shared + // memory, we'll have to adjust this heuristic. + constexpr int kMinBlocksPerCore = 3; + constexpr int64 kShmemPerCore = 48 * 1024; + int64 shmem_used = 0; + for (int64 i = 0; i < params_012.size(); ++i) { + const HloInstruction* operand = hlo->operand(params_012[i]); + shmem_used += + 32 * 33 * + ShapeUtil::ByteSizeOfPrimitiveType(operand->shape().element_type()); + + if (kMinBlocksPerCore * shmem_used > kShmemPerCore) { + // Erase this element and everything after it from params_012. 
+ params_012.resize(i); + break; + } + } + + VLOG(3) << "EmitHlo021Tile Emitting hlo tile 0-2-1" << hlo->ToString(); + thunk_sequence_->emplace_back( + BuildKernelThunk(hlo, /*implements_whole_instruction=*/true)); + const LaunchDimensions launch_dimensions = + EmitHlo021Tile(hlo, *reduced_dims_021, params_012); + UpdateLaunchDimensions(launch_dimensions, LastThunk(), + ir_emitter_context_->llvm_module()); + + return true; +} + +Status IrEmitterUnnested::EmitConstantGlobals() { + for (const BufferAllocation& allocation : + ir_emitter_context_->buffer_assignment().Allocations()) { + if (!allocation.is_constant()) { + continue; + } + + const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation); + const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal); + llvm::ArrayType* global_type = + llvm::ArrayType::get(b_.getInt8Ty(), allocation.size()); + llvm::Constant* initializer = + should_emit_initializer + ? llvm_ir::ConvertLiteralToIrConstant(literal, module_) + : llvm::ConstantAggregateZero::get(global_type); + if (should_emit_initializer) { + VLOG(3) << "Emitted initializer for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + } + + // These globals will be looked up by name by GpuExecutable so we need to + // give them an external linkage. Not all of their uses are visible in the + // LLVM IR (e.g. TupleThunk) so we can't give them a linkage that merely + // preserves their names (like available_externally), we also need to ensure + // that they stick around even if they're "unused". + // + // We may have to be more clever here in the future if we notice that + // we're keeping around too many globals because of their linkage.
+ llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( + global_type, /*isConstant=*/should_emit_initializer, + llvm::GlobalValue::ExternalLinkage, + /*Initializer=*/initializer, + llvm_ir::AsStringRef( + llvm_ir::ConstantBufferAllocationToGlobalName(allocation))); + global_for_const->setAlignment(kConstantBufferAlignBytes); + ir_emitter_context_->llvm_module()->getGlobalList().push_back( + global_for_const); + } + + return Status::OK(); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index b41eaa303b0aad104ad0369438e192fa404d7878..084462330ed20108a9ec850b4cbc588afe77cc01 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" namespace xla { namespace gpu { @@ -67,16 +68,19 @@ class IrEmitterUnnested : public IrEmitter { Status HandleDot(HloInstruction* dot) override; Status HandleFft(HloInstruction* fft) override; Status HandleFusion(HloInstruction* fusion) override; - Status HandleGather(HloInstruction* gather) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element) override; Status HandleReduce(HloInstruction* reduce) override; Status HandleSelectAndScatter(HloInstruction* instruction) override; Status HandleTuple(HloInstruction* tuple) override; Status HandleWhile(HloInstruction* xla_while) override; Status HandleInfeed(HloInstruction* xla_infeed) override; + Status HandleOutfeed(HloInstruction* outfeed) override; Status HandleRng(HloInstruction* random) override; Status HandleSelect(HloInstruction* select) override; + Status HandleSort(HloInstruction* sort) override; + Status 
HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleCrossReplicaSum(HloInstruction* crs) override; + Status HandleAfterAll(HloInstruction* gen_token) override; Status EmitTargetElementLoop( const HloInstruction& hlo, @@ -88,6 +92,9 @@ class IrEmitterUnnested : public IrEmitter { const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter, KernelThunk* thunk); + // Emits LLVM global variables corresponding to constant instructions. + Status EmitConstantGlobals(); + private: // Builds the appropriate thunk for the instruction hlo and returns the owning // pointer to it. The caller needs to make sure `inst` outlives the lifetime @@ -98,7 +105,13 @@ class IrEmitterUnnested : public IrEmitter { // This kernel takes as arguments pointers to the given buffer allocations. llvm::Function* BuildKernelPrototype( const HloInstruction& inst, - tensorflow::gtl::ArraySlice args); + absl::Span args); + + // Helper for writing extra outputs from inside a reduce kernel. + Status EmitExtraOutputsForReduce( + const HloInstruction* reduce, const llvm_ir::IrArray::Index& index, + absl::Span> + extra_output_gens); // EmitColumnReduction and EmitRowReduction emit code for column and row // reduction of a matrix and/or 3D tensor. Row and column reduction have @@ -108,33 +121,42 @@ class IrEmitterUnnested : public IrEmitter { // Emits code that reduces a matrix of shape [height x width] to a vector of // [width]. Other parameters have the same meaning as those of // `EmitReductionToVector`. Note that input shape might not be - // [height x width], but can be bitcast to [height x weight] with "height" + // [height x width], but can be bitcast to [height x width] with "height" // being the major dimension. 
Status EmitColumnReduction( int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers); + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens); // Emits code that reduces a 3D tensor of shape [depth x height x width] to a // vector of shape [height]. Other parameters have the same meaning as those // of `EmitReductionToVector`. Note that input shape might not be - // [depth x height x width], but can be bitcast to [depth x height x weight] + // [depth x height x width], but can be bitcast to [depth x height x width] // with "depth" being the most major dimension. Status EmitRowReduction( int64 depth, int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers); + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens); // Emits code that reduces a tensor of arbitrary rank to a scalar. Status EmitReductionToScalar( HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice reducers); + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens); // Figures out whether `reduce` is a row or column reduction, and which // dimensions to reduce, and calls either `EmitRowReduction` or @@ -147,20 +169,70 @@ class IrEmitterUnnested : public IrEmitter { // Multiple reduces can be emitted in the same loop, assuming they have the // same input and output shapes, and the same reduce dimensions. 
// + // extra_output_gens can contain extra generators for intermediate outputs. + // These must have the same shape as the reduce input as they are computed + // when the reduce inputs are being read. + // // Prerequisite: `IsReductionToVector(*reduce)` Status EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, - tensorflow::gtl::ArraySlice input_gens, - tensorflow::gtl::ArraySlice init_value_gens, - tensorflow::gtl::ArraySlice dimensions_to_reduce, - tensorflow::gtl::ArraySlice reducers); + absl::Span input_gens, + absl::Span init_value_gens, + absl::Span dimensions_to_reduce, + absl::Span reducers, + absl::Span reduce_output_shapes, + absl::Span> + extra_output_gens); + + // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel + // for the hlo instruction. + bool CheckAndEmitHloWithTile021(HloInstruction* hlo); + // Emits a kernel for the hlo instruction using a 0-2-1 tiling algorithm and + // returns the launch dimensions for the kernel. This is a helper to support + // the implementation of CheckAndEmitHloWithTile021. + LaunchDimensions EmitHlo021Tile(HloInstruction* hlo, + absl::Span reduced_output_dims, + absl::Span tiled_param_ids); + // Generates the IrArray for each output of hlo and returns the number of + // outputs. + int ConstructIrArrayForOutputs(const HloInstruction& hlo, + std::vector* output_arrays); + // Generates the IrArray for each input of hlo and returns the number of + // inputs. + int ConstructIrArrayForInputs(const HloInstruction& hlo, + std::vector* param_arrays); + // For each output of the `hlo` instruction, constructs the reduced shape for + // the output with the given `reduced_output_dims` and cast the original + // output IrArray element in `output_arrays` to the reduced shape. Returns + // the number of outputs. 
+ int ConstructOutputReducedShapeAndCastOutputIrArrayToShape( + const HloInstruction& hlo, + const std::vector& output_arrays, + absl::Span reduced_output_dims, + std::vector* output_reduced_shapes, + std::vector* output_in_reduced_shape_arrays); + // For each input of the `hlo` instruction, checks its value in + // `param_buffers` to find out whether the input has a reduced shape. If the + // input has a reduced shape, constructs the reduced shape for the input and + // casts the original input IrArray in `param_arrays` to the reduced shape. + // Return the total number of inputs. + int ConstructInputReducedShapeAndCastInputIrArrayToShape( + const HloInstruction& hlo, + const std::vector& param_arrays, + const std::vector& param_buffers, + absl::Span reduced_output_dims, + std::vector* param_reduced_shapes, + std::vector* param_in_reduced_shape_arrays); // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned // Thunk object. The kernel implementation will be unrolled if unroll_factor - // is greater than one. - std::unique_ptr BuildKernelThunk(const HloInstruction* inst, - int unroll_factor = 1); + // is greater than one. 'implements_whole_instruction' specifies whether this + // KernelThunk implements the whole 'inst' HloInstruction. In some cases + // 'inst' will be implemented by a sequence of Thunks. + std::unique_ptr BuildKernelThunk( + const HloInstruction* inst, bool implements_whole_instruction, + int unroll_factor = 1); // Returns a FftThunk that calls cuFFT to implement `inst`. std::unique_ptr BuildFftThunk(const HloInstruction* inst); @@ -181,10 +253,14 @@ class IrEmitterUnnested : public IrEmitter { std::unique_ptr BuildDeviceToDeviceCopyThunk( const HloInstruction* inst); - // Returns an InfeedThunk that performs device-to-device memcpy to implement + // Returns an InfeedThunk that performs a host-to-device memcpy to implement // `inst`. 
std::unique_ptr BuildInfeedThunk(const HloInstruction* inst); + // Returns an OutfeedThunk that performs a device-to-host memcpy to implement + // `inst`. + std::unique_ptr BuildOutfeedThunk(const HloInstruction* inst); + // Returns a WhileThunk that invokes thunk sequences for 'condition' and // 'body' sub-computations of while instruction 'hlo'. std::unique_ptr BuildWhileThunk(const HloInstruction* hlo); diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index f56c1ce69f11ed79c8be76834269f29de93a9645..e09b8fbd3ba275e14accbf88c21f3d10f34198d9 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -15,21 +15,22 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { namespace gpu { -KernelThunk::KernelThunk( - tensorflow::gtl::ArraySlice args, - const string& kernel_name, const HloInstruction* hlo_instruction, - int unroll_factor) +KernelThunk::KernelThunk(absl::Span args, + const string& kernel_name, + const HloInstruction* hlo_instruction, + int unroll_factor) : Thunk(Kind::kKernel, hlo_instruction), args_(args.begin(), args.end()), kernel_name_(kernel_name), @@ -40,11 +41,7 @@ Status KernelThunk::Initialize(const GpuExecutable& executable, tensorflow::mutex_lock lock(mutex_); if (!loader_spec_) { loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size())); - 
tensorflow::StringPiece ptx = executable.ptx(); - // Convert tensorflow::StringPiece to se::port::StringPiece because - // StreamExecutor uses the latter. - loader_spec_->AddCudaPtxInMemory( - se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_); + loader_spec_->AddCudaPtxInMemory(executable.ptx(), kernel_name_); if (!executable.cubin().empty()) { loader_spec_->AddCudaCubinInMemory( @@ -62,7 +59,7 @@ Status KernelThunk::Initialize(const GpuExecutable& executable, if (kernel_cache_.end() == it) { it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first; if (!executor->GetKernel(*loader_spec_, &it->second)) { - return InternalError("Unable to load kernel %s", kernel_name_.c_str()); + return InternalError("Unable to load kernel %s", kernel_name_); } } @@ -75,7 +72,8 @@ void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) { } Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { + se::Stream* stream, + HloExecutionProfiler* profiler) { // Load the kernel. se::StreamExecutor* executor = stream->parent(); LaunchDimensions launch_dimensions; @@ -93,18 +91,19 @@ Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, VLOG(3) << "Launching " << kernel->name(); // Launch the kernel with potentially multiple blocks and threads. 
static constexpr int kKernelArgsLimit = 1024; - auto kernel_args = MakeUnique>(); + auto kernel_args = absl::make_unique>(); for (const BufferAllocation* arg : args_) { const auto& buf = buffer_allocations.GetDeviceAddress(arg->index()); kernel_args->add_device_memory_argument(buf); VLOG(3) << " Arg: alloc #" << arg->index() << ": " << buf.opaque() << " (" << buf.size() << "B)"; } + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); if (!stream->parent()->Launch( stream, se::ThreadDim(launch_dimensions.threads_per_block()), se::BlockDim(launch_dimensions.block_count()), *kernel, *kernel_args)) { - return InternalError("Unable to launch kernel %s", kernel_name_.c_str()); + return InternalError("Unable to launch kernel %s", kernel_name_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 7def27e189b66747569344a3dbe5c0c446f903be..f63db5c3696f8f3bbd5956724240b2b06b4f1b98 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -20,13 +20,14 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -46,7 +47,7 @@ class KernelThunk : public Thunk { // Constructs a thunk for the given kernel. 
// // `hlo_instruction` is as in Thunk. Other arguments are as the class members. - KernelThunk(tensorflow::gtl::ArraySlice args, + KernelThunk(absl::Span args, const string& kernel_name, const HloInstruction* hlo_instruction, int unroll_factor); KernelThunk(const KernelThunk&) = delete; @@ -62,7 +63,8 @@ class KernelThunk : public Thunk { // Executes the kernel for the thunk on "stream", which must be non-null. Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: // Buffers passed to the kernel as arguments. diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index 7de8f9e1ee922bdbf65fd1299702482e1843f17e..698d2d51cc81a6c87f6578f1f35cdb47cf6bb4f2 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -17,12 +17,12 @@ cc_library( name = "llvm_gpu_backend", srcs = [ "dump_ir_pass.cc", - "gpu_backend_lib.cc", + "nvptx_backend_lib.cc", "utils.cc", ], hdrs = [ "dump_ir_pass.h", - "gpu_backend_lib.h", + "nvptx_backend_lib.h", "utils.h", ], deps = [ @@ -34,6 +34,10 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@llvm//:amdgpu_code_gen", "@llvm//:analysis", "@llvm//:bit_reader", "@llvm//:bit_writer", diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc index 12a8a59488bfdd6ce55f762926cd63ba56bf9d7f..85bc58cb445627695a46171db64cd8a1f10e0fc8 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc +++ 
b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc @@ -15,14 +15,14 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" #include "llvm/IR/Module.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -86,10 +86,11 @@ void IrDumpingPassManager::run(llvm::Module &module) { const llvm::PassInfo *PI = llvm::PassRegistry::getPassRegistry()->getPassInfo(P->getPassID()); const string basename = ReplaceFilenameExtension( - tensorflow::io::Basename(input_filename_), - tensorflow::strings::Printf( + absl::string_view(tensorflow::io::Basename(input_filename_)), + absl::StrFormat( "pass-%02d.before.%s.ll", i, - (PI == nullptr ? "unknown" : PI->getPassArgument().data()))); + absl::string_view(PI == nullptr ? "unknown" + : PI->getPassArgument().data()))); llvm::legacy::PassManager::add( new DumpIrPass(tensorflow::io::JoinPath(output_dir_, basename))); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc deleted file mode 100644 index a4e4e85bf3d2c197cfc691b7fca0920aa6571729..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ /dev/null @@ -1,506 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" - -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/util.h" - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/CodeGen/CommandFlags.inc" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Linker/Linker.h" -#include "llvm/PassRegistry.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/AlwaysInliner.h" -#include "llvm/Transforms/IPO/Internalize.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" -#include "llvm/Transforms/Scalar.h" 
-#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/tracing.h" - -namespace xla { -namespace gpu { -namespace { - -// Default inline threshold value to use in llvm. -const int kDefaultInlineThreshold = 1100; - -// Gets the libdevice filename for a particular compute capability. When -// presented with a GPU we don't recognize, we just return the libdevice from -// compute_20. -static string GetLibdeviceFilename(const string& libdevice_dir_path, - std::pair compute_capability) { - // Since CUDA 9.0, all GPU versions are included in a single file - const char* unified_libdevice_filename = "libdevice.10.bc"; - std::vector unified_libdevice_files; - const Status status = tensorflow::Env::Default()->GetMatchingPaths( - tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), - &unified_libdevice_files); - if (status.ok() && unified_libdevice_files.size() == 1) { - return unified_libdevice_filename; - } - // There are only four libdevice files: compute_{20,30,35,50}. Each GPU - // version gets mapped to one of these. Note in particular that sm_60 and - // sm_61 map to libdevice.compute_30. - static auto* m = new std::map, int>({{{2, 0}, 20}, - {{2, 1}, 20}, - {{3, 0}, 30}, - {{3, 2}, 30}, - {{3, 5}, 35}, - {{3, 7}, 35}, - {{5, 0}, 50}, - {{5, 2}, 50}, - {{5, 3}, 50}, - {{6, 0}, 30}, - {{6, 1}, 30}, - {{6, 2}, 30}}); - int libdevice_version = 20; - auto it = m->find(compute_capability); - if (it != m->end()) { - libdevice_version = it->second; - } else { - LOG(WARNING) << "Unknown compute capability (" << compute_capability.first - << ", " << compute_capability.second << ") ." 
- << "Defaulting to libdevice for compute_" << libdevice_version; - } - return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version, - ".10.bc"); -} - -// Gets the GPU name as it's known to LLVM for a given compute capability. If -// we see an unrecognized compute capability, we return "sm_30". -static string GetSmName(std::pair compute_capability) { - static auto* m = new std::map, int>({{{2, 0}, 20}, - {{2, 1}, 21}, - {{3, 0}, 30}, - {{3, 2}, 32}, - {{3, 5}, 35}, - {{3, 7}, 37}, - {{5, 0}, 50}, - {{5, 2}, 52}, - {{5, 3}, 53}, - {{6, 0}, 60}, - {{6, 1}, 61}, - {{6, 2}, 62}, - // TODO: Change this to 70 once LLVM NVPTX supports it - {{7, 0}, 60}}); - int sm_version = 30; - auto it = m->find(compute_capability); - if (it != m->end()) { - sm_version = it->second; - } else { - LOG(WARNING) << "Unknown compute capability (" << compute_capability.first - << ", " << compute_capability.second << ") ." - << "Defaulting to telling LLVM that we're compiling for sm_" - << sm_version; - } - return tensorflow::strings::StrCat("sm_", sm_version); -} - -// Convenience function for producing a name of a temporary compilation product -// from the input filename. -string MakeNameForTempProduct(const std::string& input_filename, - tensorflow::StringPiece extension) { - return ReplaceFilenameExtension( - tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension); -} - -// Initializes LLVM passes. Uses the PassRegistry mechanism. 
-void InitializePasses(llvm::PassRegistry* pass_registry) { - llvm::initializeCore(*pass_registry); - llvm::initializeCodeGen(*pass_registry); - llvm::initializeScalarOpts(*pass_registry); - llvm::initializeObjCARCOpts(*pass_registry); - llvm::initializeVectorization(*pass_registry); - llvm::initializeIPO(*pass_registry); - llvm::initializeAnalysis(*pass_registry); - llvm::initializeTransformUtils(*pass_registry); - llvm::initializeInstCombine(*pass_registry); - llvm::initializeInstrumentation(*pass_registry); - llvm::initializeTarget(*pass_registry); - llvm::initializeCodeGenPreparePass(*pass_registry); -} - -// Returns the TargetMachine, given a triple. -std::unique_ptr GetTargetMachine( - llvm::Triple triple, tensorflow::StringPiece cpu_name, - const HloModuleConfig& hlo_module_config) { - std::string error; - const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); - if (target == nullptr) { - LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'" - << " -- " << error; - return nullptr; - } - - TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); - llvm_ir::SetTargetOptions( - /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_enable_fast_math(), - &target_options); - - // Enable FMA synthesis. - target_options.AllowFPOpFusion = FPOpFusion::Fast; - - // Set the verbose assembly options. - target_options.MCOptions.AsmVerbose = false; - - // The selection of codegen optimization level is copied from function - // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp. 
- CodeGenOpt::Level codegen_opt_level; - switch (hlo_module_config.debug_options().xla_backend_optimization_level()) { - case 1: - codegen_opt_level = CodeGenOpt::Less; - break; - case 2: - codegen_opt_level = CodeGenOpt::Default; - break; - case 3: - codegen_opt_level = CodeGenOpt::Aggressive; - break; - default: - codegen_opt_level = CodeGenOpt::None; - } - return WrapUnique(target->createTargetMachine( - triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options, - Optional(RelocModel), Optional(CMModel), - codegen_opt_level)); -} - -// Adds the standard LLVM optimization passes, based on the speed optimization -// level (opt_level) and size optimization level (size_level). Both module -// and function-level passes are added, so two pass managers are passed in and -// modified by this function. -void AddOptimizationPasses(unsigned opt_level, unsigned size_level, - llvm::TargetMachine* target_machine, - llvm::legacy::PassManagerBase* module_passes, - llvm::legacy::FunctionPassManager* function_passes) { - PassManagerBuilder builder; - builder.OptLevel = opt_level; - builder.SizeLevel = size_level; - - if (opt_level > 1) { - builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); - } else { - // Only inline functions marked with "alwaysinline". - builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); - } - - builder.DisableUnitAtATime = false; - builder.DisableUnrollLoops = opt_level == 0; - builder.LoopVectorize = opt_level > 0; - builder.SLPVectorize = opt_level > 1 && size_level < 2; - - // NVPTX's early-as-possible passes include NVVM reflect. - target_machine->adjustPassManager(builder); - - builder.populateFunctionPassManager(*function_passes); - builder.populateModulePassManager(*module_passes); -} - -// Emits the given module to a bit code file. 
-void EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) { - std::error_code error_code; - llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code, - llvm::sys::fs::F_None); - if (error_code) { - LOG(FATAL) << "opening bitcode file for writing: " << error_code.message(); - } - - llvm::WriteBitcodeToFile(module, outfile.os()); - outfile.keep(); -} - -// Emits the given module to PTX. target_machine is an initialized TargetMachine -// for the NVPTX target. -string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { - std::string ptx; // need a std::string instead of a ::string. - { - llvm::raw_string_ostream stream(ptx); - llvm::buffer_ostream pstream(stream); - // The extension is stripped by IrDumpingPassManager, so we need to - // get creative to add a suffix. - string module_id(llvm_ir::AsString(module->getModuleIdentifier())); - IrDumpingPassManager codegen_passes( - ReplaceFilenameExtension(tensorflow::io::Basename(module_id), - "-nvptx.dummy"), - "", false); - codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( - llvm::Triple(module->getTargetTriple()))); - - target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, - llvm::TargetMachine::CGFT_AssemblyFile); - codegen_passes.run(*module); - } - - return ptx; -} - -// LLVM has an extensive flags mechanism of its own, which is only accessible -// through the command line. Internal libraries within LLVM register parsers for -// flags, with no other way to configure them except pass these flags. -// To do this programmatically, we invoke ParseCommandLineOptions manually with -// a "fake argv". -// Note: setting flags with this method is stateful, since flags are just -// static globals within LLVM libraries. 
-void FeedLLVMWithFlags(const std::vector& cl_opts) { - std::vector fake_argv = {""}; - for (const string& cl_opt : cl_opts) { - fake_argv.push_back(cl_opt.c_str()); - } - llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); -} - -// Returns whether the module could use any libdevice functions. This function -// may have false positives -- the module might not use libdevice even if this -// function returns true. -bool CouldNeedLibdevice(const llvm::Module& module) { - for (const llvm::Function& function : module.functions()) { - // This is a conservative approximation -- not all such functions are in - // libdevice. - if (!function.isIntrinsic() && function.isDeclaration()) { - return true; - } - } - return false; -} - -// Links libdevice into the given module if the module needs libdevice. -Status LinkLibdeviceIfNecessary(llvm::Module* module, - std::pair compute_capability, - const string& libdevice_dir_path) { - if (!CouldNeedLibdevice(*module)) { - return Status::OK(); - } - - llvm::Linker linker(*module); - string libdevice_path = tensorflow::io::JoinPath( - libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path, - compute_capability)); - TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); - VLOG(1) << "Linking with libdevice from: " << libdevice_path; - std::unique_ptr libdevice_module = - LoadIRModule(libdevice_path, &module->getContext()); - if (linker.linkInModule( - std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, - [](Module& M, const StringSet<>& GVS) { - internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { - return !GV.hasName() || (GVS.count(GV.getName()) == 0); - }); - })) { - return tensorflow::errors::Internal(tensorflow::strings::StrCat( - "Error linking libdevice from ", libdevice_path)); - } - return Status::OK(); -} - -StatusOr CompileModuleToPtx(llvm::Module* module, - std::pair compute_capability, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { 
- // If the module has no functions or globals, there's nothing to compile. Just - // return an empty string. - if (module->empty() && module->global_empty()) { - VLOG(2) << "Module '" << llvm_ir::AsString(module->getName()) - << "' is empty. Skipping compilation."; - return string(); - } - // Link the input module with libdevice, to pull in implementations of some - // builtins. - TF_RETURN_IF_ERROR( - LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); - - // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass - // can access it. - module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - hlo_module_config.debug_options().xla_gpu_ftz()); - - // If ftz is enabled, set it as an attribute on every function in the module. - if (hlo_module_config.debug_options().xla_gpu_ftz()) { - for (llvm::Function& fn : *module) { - fn.addFnAttr("nvptx-f32ftz", "true"); - } - } - - IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); - - // Add an appropriate TargetLibraryInfo pass for the module's triple. - llvm::TargetLibraryInfoWrapperPass* tliwp = - new llvm::TargetLibraryInfoWrapperPass( - llvm::Triple(module->getTargetTriple())); - module_passes.add(tliwp); - - // Try to fetch the target triple from the module. If not present, set a - // default target triple. - llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); - if (target_triple.getArch() == llvm::Triple::UnknownArch) { - LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple("nvptx64-unknown-unknown"); - } - - // Figure out the exact name of the processor as known to the NVPTX backend - // from the gpu_architecture flag. 
- std::unique_ptr target_machine = GetTargetMachine( - target_triple, GetSmName(compute_capability), hlo_module_config); - module_passes.add(llvm::createTargetTransformInfoWrapperPass( - target_machine->getTargetIRAnalysis())); - - // The LLVM IR verifier performs sanity checking on the IR. This helps - // discover problems and report them in a meaningful manner, rather than let - // later passes report obscure assertions because of unfulfilled invariants. - module_passes.add(llvm::createVerifierPass()); - - // Create the function-level pass manager. It needs data layout information - // too. - llvm::legacy::FunctionPassManager function_passes(module); - - int32 opt_level = - hlo_module_config.debug_options().xla_backend_optimization_level(); - - CHECK_GE(opt_level, 2) - << "The XLA GPU backend doesn't support unoptimized code generation"; - - AddOptimizationPasses(opt_level, - /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes); - - // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA - // again after the standard optimization passes [http://b/13329423]. - // TODO(jingyue): SROA may further expose more optimization opportunities such - // as more precise alias analysis and more function inlining (SROA may change - // the inlining cost of a function). For now, running SROA already emits good - // enough code for the evaluated benchmarks. We may want to run more - // optimizations later. - if (opt_level > 0) { - // LLVM's optimizer turns on SROA when the optimization level is greater - // than 0. We mimic this behavior here. - module_passes.add(llvm::createSROAPass()); - } - - // Verify that the module is well formed after optimizations ran. - module_passes.add(llvm::createVerifierPass()); - - // Done populating the pass managers. Now run them. 
- - function_passes.doInitialization(); - for (auto func = module->begin(); func != module->end(); ++func) { - function_passes.run(*func); - } - function_passes.doFinalization(); - module_passes.run(*module); - - // Finally, produce PTX. - return EmitModuleToPTX(module, target_machine.get()); -} - -// One-time module initializer. -// Must be called only once -- DO NOT CALL DIRECTLY. -void GPUBackendInit(const HloModuleConfig& hlo_module_config) { - // Feed all customized flags here, so we can override them with llvm_cl_opts - // without redeploy the compiler for development purpose. - - // This flag tunes a threshold in branch folding. The default threshold, which - // is one, is not suitable for CUDA programs where branches are more expensive - // than for CPU programs. Setting the threshold to 2 improves the latency of - // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the - // latency of other benchmarks so far. - // - // I also tried setting this threshold to other values: - // * 3-6 gives similar results as 2; - // * >6 start hurting the performance of at least dot product kernels. - // - // TODO(jingyue): The current threshold only considers the numbr of IR - // instructions which do not accurately reflect the true cost. We need a - // better cost model. - FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); - // TODO(b/22073864): Increase limit when scan memory dependency. - // This helps to reduce more redundant load instructions. - // - // The specific value is currently large enough for s3d in shoc benchmark, - // which contains a lot of load instructions and many arithmetic instructions - // between those loads. - FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); - - llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); - - // Initialize the NVPTX target; it's the only target we link with, so call its - // specific initialization functions instead of the catch-all InitializeAll*. 
- LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); - - // Initialize the LLVM optimization passes. - llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); - InitializePasses(registry); -} - -} // namespace - -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { - static std::once_flag backend_init_flag; - std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); - - string ptx; - { - tensorflow::tracing::ScopedActivity activity( - "Compiling IR", llvm_ir::AsString(module->getName()), - /*is_expensive=*/true); - XLA_SCOPED_LOGGING_TIMER("Compile module " + - llvm_ir::AsString(module->getName())); - TF_ASSIGN_OR_RETURN( - ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, - libdevice_dir_path)); - } - return ptx; -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h deleted file mode 100644 index 0a345191d34e6f40db043c559a67a44a6748321c..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -// LLVM-based compiler backend. -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ - -#include -#include - -#include "llvm/IR/Module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" - -namespace xla { -namespace gpu { - -// Compiles the argument module and returns it. libdevice_dir_path is the parent -// directory of the libdevice bitcode libraries. The contents of the module may -// be changed. -// -// The Compile.* interfaces each create their own llvm::LLVMContext objects for -// thread safety, but note that LLVM's multithreaded support is very -// preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path); - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_GPU_BACKEND_LIB_H_ diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc new file mode 100644 index 0000000000000000000000000000000000000000..8751e3a9c2a4c8da46d3ecd8437629450d4a2ba2 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -0,0 +1,506 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h" + +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CodeGen/CommandFlags.inc" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/Internalize.h" +#include 
"llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tracing.h" + +namespace xla { +namespace gpu { +namespace { + +// Default inline threshold value to use in llvm. +const int kDefaultInlineThreshold = 1100; + +// Gets the libdevice filename for a particular compute capability. When +// presented with a GPU we don't recognize, we just return the libdevice from +// compute_20. +static string GetLibdeviceFilename(const string& libdevice_dir_path, + std::pair compute_capability) { + // Since CUDA 9.0, all GPU versions are included in a single file + const char* unified_libdevice_filename = "libdevice.10.bc"; + std::vector unified_libdevice_files; + const Status status = tensorflow::Env::Default()->GetMatchingPaths( + tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), + &unified_libdevice_files); + if (status.ok() && unified_libdevice_files.size() == 1) { + return unified_libdevice_filename; + } + // There are only four libdevice files: compute_{20,30,35,50}. Each GPU + // version gets mapped to one of these. Note in particular that sm_60 and + // sm_61 map to libdevice.compute_30. + static auto* m = new std::map, int>({{{2, 0}, 20}, + {{2, 1}, 20}, + {{3, 0}, 30}, + {{3, 2}, 30}, + {{3, 5}, 35}, + {{3, 7}, 35}, + {{5, 0}, 50}, + {{5, 2}, 50}, + {{5, 3}, 50}, + {{6, 0}, 30}, + {{6, 1}, 30}, + {{6, 2}, 30}}); + int libdevice_version = 20; + auto it = m->find(compute_capability); + if (it != m->end()) { + libdevice_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << ") ." 
+ << "Defaulting to libdevice for compute_" << libdevice_version; + } + return absl::StrCat("libdevice.compute_", libdevice_version, ".10.bc"); +} + +// Gets the GPU name as it's known to LLVM for a given compute capability. If +// we see an unrecognized compute capability, we return "sm_30". +static string GetSmName(std::pair compute_capability) { + static auto* m = new std::map, int>({ + {{3, 0}, 30}, + {{3, 2}, 32}, + {{3, 5}, 35}, + {{3, 7}, 37}, + {{5, 0}, 50}, + {{5, 2}, 52}, + {{5, 3}, 53}, + {{6, 0}, 60}, + {{6, 1}, 61}, + {{6, 2}, 62}, + {{7, 0}, 70}, + {{7, 2}, 72}, + }); + int sm_version = 30; + auto it = m->find(compute_capability); + if (it != m->end()) { + sm_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << ") ." + << "Defaulting to telling LLVM that we're compiling for sm_" + << sm_version; + } + return absl::StrCat("sm_", sm_version); +} + +// Convenience function for producing a name of a temporary compilation product +// from the input filename. +string MakeNameForTempProduct(const std::string& input_filename, + absl::string_view extension) { + return ReplaceFilenameExtension(absl::string_view(tensorflow::io::Basename( + llvm_ir::AsString(input_filename))), + extension); +} + +// Initializes LLVM passes. Uses the PassRegistry mechanism. 
+void InitializePasses(llvm::PassRegistry* pass_registry) { + llvm::initializeCore(*pass_registry); + llvm::initializeCodeGen(*pass_registry); + llvm::initializeScalarOpts(*pass_registry); + llvm::initializeObjCARCOpts(*pass_registry); + llvm::initializeVectorization(*pass_registry); + llvm::initializeIPO(*pass_registry); + llvm::initializeAnalysis(*pass_registry); + llvm::initializeTransformUtils(*pass_registry); + llvm::initializeInstCombine(*pass_registry); + llvm::initializeInstrumentation(*pass_registry); + llvm::initializeTarget(*pass_registry); + llvm::initializeCodeGenPreparePass(*pass_registry); +} + +// Returns the TargetMachine, given a triple. +std::unique_ptr GetTargetMachine( + llvm::Triple triple, absl::string_view cpu_name, + const HloModuleConfig& hlo_module_config) { + std::string error; + const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); + if (target == nullptr) { + LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'" + << " -- " << error; + return nullptr; + } + + TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); + llvm_ir::SetTargetOptions( + /*fast_math_enabled=*/hlo_module_config.debug_options() + .xla_gpu_enable_fast_math(), + &target_options); + + // Enable FMA synthesis. + target_options.AllowFPOpFusion = FPOpFusion::Fast; + + // Set the verbose assembly options. + target_options.MCOptions.AsmVerbose = false; + + // The selection of codegen optimization level is copied from function + // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp. 
+ CodeGenOpt::Level codegen_opt_level; + switch (hlo_module_config.debug_options().xla_backend_optimization_level()) { + case 1: + codegen_opt_level = CodeGenOpt::Less; + break; + case 2: + codegen_opt_level = CodeGenOpt::Default; + break; + case 3: + codegen_opt_level = CodeGenOpt::Aggressive; + break; + default: + codegen_opt_level = CodeGenOpt::None; + } + return absl::WrapUnique(target->createTargetMachine( + triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options, + Optional(RelocModel), Optional(CMModel), + codegen_opt_level)); +} + +// Adds the standard LLVM optimization passes, based on the speed optimization +// level (opt_level) and size optimization level (size_level). Both module +// and function-level passes are added, so two pass managers are passed in and +// modified by this function. +void AddOptimizationPasses(unsigned opt_level, unsigned size_level, + llvm::TargetMachine* target_machine, + llvm::legacy::PassManagerBase* module_passes, + llvm::legacy::FunctionPassManager* function_passes) { + PassManagerBuilder builder; + builder.OptLevel = opt_level; + builder.SizeLevel = size_level; + + if (opt_level > 1) { + builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); + } else { + // Only inline functions marked with "alwaysinline". + builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); + } + + builder.DisableUnitAtATime = false; + builder.DisableUnrollLoops = opt_level == 0; + builder.LoopVectorize = opt_level > 0; + builder.SLPVectorize = opt_level > 1 && size_level < 2; + + // NVPTX's early-as-possible passes include NVVM reflect. + target_machine->adjustPassManager(builder); + + builder.populateFunctionPassManager(*function_passes); + builder.populateModulePassManager(*module_passes); +} + +// Emits the given module to a bit code file. 
+void EmitBitcodeToFile(const Module& module, absl::string_view filename) { + std::error_code error_code; + llvm::ToolOutputFile outfile(string(filename).c_str(), error_code, + llvm::sys::fs::F_None); + if (error_code) { + LOG(FATAL) << "opening bitcode file for writing: " << error_code.message(); + } + + llvm::WriteBitcodeToFile(module, outfile.os()); + outfile.keep(); +} + +// Emits the given module to PTX. target_machine is an initialized TargetMachine +// for the NVPTX target. +string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { + std::string ptx; // need a std::string instead of a ::string. + { + llvm::raw_string_ostream stream(ptx); + llvm::buffer_ostream pstream(stream); + // The extension is stripped by IrDumpingPassManager, so we need to + // get creative to add a suffix. + string module_id(llvm_ir::AsString(module->getModuleIdentifier())); + IrDumpingPassManager codegen_passes( + ReplaceFilenameExtension( + absl::string_view(tensorflow::io::Basename(module_id)), + "-nvptx.dummy"), + "", false); + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple()))); + + target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, + llvm::TargetMachine::CGFT_AssemblyFile); + codegen_passes.run(*module); + } + + return ptx; +} + +// LLVM has an extensive flags mechanism of its own, which is only accessible +// through the command line. Internal libraries within LLVM register parsers for +// flags, with no other way to configure them except pass these flags. +// To do this programmatically, we invoke ParseCommandLineOptions manually with +// a "fake argv". +// Note: setting flags with this method is stateful, since flags are just +// static globals within LLVM libraries. 
+void FeedLLVMWithFlags(const std::vector& cl_opts) { + std::vector fake_argv = {""}; + for (const string& cl_opt : cl_opts) { + fake_argv.push_back(cl_opt.c_str()); + } + llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); +} + +// Returns whether the module could use any libdevice functions. This function +// may have false positives -- the module might not use libdevice even if this +// function returns true. +bool CouldNeedLibdevice(const llvm::Module& module) { + for (const llvm::Function& function : module.functions()) { + // This is a conservative approximation -- not all such functions are in + // libdevice. + if (!function.isIntrinsic() && function.isDeclaration()) { + return true; + } + } + return false; +} + +// Links libdevice into the given module if the module needs libdevice. +Status LinkLibdeviceIfNecessary(llvm::Module* module, + std::pair compute_capability, + const string& libdevice_dir_path) { + if (!CouldNeedLibdevice(*module)) { + return Status::OK(); + } + + llvm::Linker linker(*module); + string libdevice_path = tensorflow::io::JoinPath( + libdevice_dir_path, + GetLibdeviceFilename(libdevice_dir_path, compute_capability)); + TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); + VLOG(1) << "Linking with libdevice from: " << libdevice_path; + std::unique_ptr libdevice_module = + LoadIRModule(libdevice_path, &module->getContext()); + if (linker.linkInModule( + std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, + [](Module& M, const StringSet<>& GVS) { + internalizeModule(M, [&GVS](const GlobalValue& GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + })) { + return tensorflow::errors::Internal( + absl::StrCat("Error linking libdevice from ", libdevice_path)); + } + return Status::OK(); +} + +StatusOr CompileModuleToPtx(llvm::Module* module, + std::pair compute_capability, + const HloModuleConfig& hlo_module_config, + const string& libdevice_dir_path) { + // If the module 
has no functions or globals, there's nothing to compile. Just + // return an empty string. + if (module->empty() && module->global_empty()) { + VLOG(2) << "Module '" << llvm_ir::AsString(module->getName()) + << "' is empty. Skipping compilation."; + return string(); + } + // Link the input module with libdevice, to pull in implementations of some + // builtins. + TF_RETURN_IF_ERROR( + LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); + + // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass + // can access it. + module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", + hlo_module_config.debug_options().xla_gpu_ftz()); + + // If ftz is enabled, set it as an attribute on every function in the module. + if (hlo_module_config.debug_options().xla_gpu_ftz()) { + for (llvm::Function& fn : *module) { + fn.addFnAttr("nvptx-f32ftz", "true"); + } + } + + IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); + + // Add an appropriate TargetLibraryInfo pass for the module's triple. + llvm::TargetLibraryInfoWrapperPass* tliwp = + new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple())); + module_passes.add(tliwp); + + // Try to fetch the target triple from the module. If not present, set a + // default target triple. + llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); + if (target_triple.getArch() == llvm::Triple::UnknownArch) { + LOG(WARNING) << "target triple not found in the module"; + target_triple = llvm::Triple("nvptx64-unknown-unknown"); + } + + // Figure out the exact name of the processor as known to the NVPTX backend + // from the gpu_architecture flag. + std::unique_ptr target_machine = GetTargetMachine( + target_triple, GetSmName(compute_capability), hlo_module_config); + module_passes.add(llvm::createTargetTransformInfoWrapperPass( + target_machine->getTargetIRAnalysis())); + + // The LLVM IR verifier performs sanity checking on the IR. 
This helps + // discover problems and report them in a meaningful manner, rather than let + // later passes report obscure assertions because of unfulfilled invariants. + module_passes.add(llvm::createVerifierPass()); + + // Create the function-level pass manager. It needs data layout information + // too. + llvm::legacy::FunctionPassManager function_passes(module); + + int32 opt_level = + hlo_module_config.debug_options().xla_backend_optimization_level(); + + CHECK_GE(opt_level, 2) + << "The XLA GPU backend doesn't support unoptimized code generation"; + + AddOptimizationPasses(opt_level, + /*size_level=*/0, target_machine.get(), &module_passes, + &function_passes); + + // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA + // again after the standard optimization passes [http://b/13329423]. + // TODO(jingyue): SROA may further expose more optimization opportunities such + // as more precise alias analysis and more function inlining (SROA may change + // the inlining cost of a function). For now, running SROA already emits good + // enough code for the evaluated benchmarks. We may want to run more + // optimizations later. + if (opt_level > 0) { + // LLVM's optimizer turns on SROA when the optimization level is greater + // than 0. We mimic this behavior here. + module_passes.add(llvm::createSROAPass()); + } + + // Verify that the module is well formed after optimizations ran. + module_passes.add(llvm::createVerifierPass()); + + // Done populating the pass managers. Now run them. + + function_passes.doInitialization(); + for (auto func = module->begin(); func != module->end(); ++func) { + function_passes.run(*func); + } + function_passes.doFinalization(); + module_passes.run(*module); + + // Finally, produce PTX. + return EmitModuleToPTX(module, target_machine.get()); +} + +// One-time module initializer. +// Must be called only once -- DO NOT CALL DIRECTLY. 
+void GPUBackendInit(const HloModuleConfig& hlo_module_config) { + // Feed all customized flags here, so we can override them with llvm_cl_opts + // without redeploy the compiler for development purpose. + + // This flag tunes a threshold in branch folding. The default threshold, which + // is one, is not suitable for CUDA programs where branches are more expensive + // than for CPU programs. Setting the threshold to 2 improves the latency of + // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the + // latency of other benchmarks so far. + // + // I also tried setting this threshold to other values: + // * 3-6 gives similar results as 2; + // * >6 start hurting the performance of at least dot product kernels. + // + // TODO(jingyue): The current threshold only considers the numbr of IR + // instructions which do not accurately reflect the true cost. We need a + // better cost model. + FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); + // TODO(b/22073864): Increase limit when scan memory dependency. + // This helps to reduce more redundant load instructions. + // + // The specific value is currently large enough for s3d in shoc benchmark, + // which contains a lot of load instructions and many arithmetic instructions + // between those loads. + FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); + + llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); + + // Initialize the NVPTX target; it's the only target we link with, so call its + // specific initialization functions instead of the catch-all InitializeAll*. + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + + // Initialize the LLVM optimization passes. 
+ llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); + InitializePasses(registry); +} + +} // namespace + +StatusOr CompileToPtx(llvm::Module* module, + std::pair compute_capability, + const HloModuleConfig& hlo_module_config, + const string& libdevice_dir_path) { + static std::once_flag backend_init_flag; + std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); + + string ptx; + { + tensorflow::tracing::ScopedActivity activity( + "Compiling IR", llvm_ir::AsString(module->getName()), + /*is_expensive=*/true); + XLA_SCOPED_LOGGING_TIMER("Compile module " + + llvm_ir::AsString(module->getName())); + TF_ASSIGN_OR_RETURN( + ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, + libdevice_dir_path)); + } + return ptx; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h new file mode 100644 index 0000000000000000000000000000000000000000..9654175bfafbb2521743e7894188abe5b5a15217 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -0,0 +1,47 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// LLVM-based compiler backend. 
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace gpu { + +// Compiles the argument module and returns it. libdevice_dir_path is the parent +// directory of the libdevice bitcode libraries. The contents of the module may +// be changed. +// +// The Compile.* interfaces each create their own llvm::LLVMContext objects for +// thread safety, but note that LLVM's multithreaded support is very +// preliminary; multithreaded use is not recommended at this time. +StatusOr CompileToPtx(llvm::Module* module, + std::pair compute_capability, + const HloModuleConfig& hlo_module_config, + const string& libdevice_dir_path); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LLVM_GPU_BACKEND_NVPTX_BACKEND_LIB_H_ diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc index 9ef9bc3a50fc76f83f05e19163ab339f2da6ef3c..3b2c3591d95ee5a319c82336e9b500d14f88734f 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc @@ -17,13 +17,13 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Support/SourceMgr.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace { @@ -52,14 +52,13 @@ std::unique_ptr LoadIRModule(const string& filename, return module; } -string ReplaceFilenameExtension(tensorflow::StringPiece filename, - tensorflow::StringPiece new_extension) { +string ReplaceFilenameExtension(absl::string_view filename, + absl::string_view new_extension) { auto pos = filename.rfind('.'); - tensorflow::StringPiece stem = - pos == tensorflow::StringPiece::npos - ? filename - : tensorflow::StringPiece(filename.data(), pos); - return tensorflow::strings::StrCat(stem, ".", new_extension); + absl::string_view stem = pos == absl::string_view::npos + ? filename + : absl::string_view(filename.data(), pos); + return absl::StrCat(stem, ".", new_extension); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h index a6daeca95a6da66cb31b82805a6896f57cb80354..60f4926849cd3e8ad144f657f9feb3c3e1ea25e2 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h @@ -18,8 +18,8 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" namespace llvm { class LLVMContext; @@ -41,8 +41,8 @@ std::unique_ptr LoadIRModule(const string& filename, // // For example: // ReplaceFilenameExtension("/foo/baz.txt", "cc") --> "/foo/baz.cc" -string ReplaceFilenameExtension(tensorflow::StringPiece filename, - tensorflow::StringPiece new_extension); +string ReplaceFilenameExtension(absl::string_view filename, + absl::string_view new_extension); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc index d4100a898b5bb9eec382c34932c2db104c9e985b..9fd6cf7157ecd659e7eb1d2c5228eca931ff6a01 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc @@ -14,21 +14,27 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/memset_thunk.h" + +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/stream_executor/stream_executor.h" namespace xla { namespace gpu { Status MemzeroThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemZero(&dest_data, dest_data.size()); return Status::OK(); } Status Memset32BitValueThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase dest_data = 
buffer_allocations.GetDeviceAddress(dest_); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemset32(&dest_data, value_, dest_data.size()); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h index 51c332d287d139335b356fc66411b5ffaa448b5a..d1fec0bd76b8a80f4a1e1c2e818f248997da7a75 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MEMSET_THUNK_H_ #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/status.h" @@ -36,7 +37,8 @@ class MemzeroThunk : public Thunk { : Thunk(Kind::kMemzero, hlo), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const BufferAllocation::Slice dest_; @@ -52,7 +54,8 @@ class Memset32BitValueThunk : public Thunk { : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: uint32 value_; diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..c21f76f6eb1874bfa5a1d296c78ea0e3b9261eca --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -0,0 +1,266 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace gpu { + +GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {} + +// Returns true if instr1 and instr2 have shapes that allow them to share one +// multi-output fusion: two reduces must have equal output shapes, and the +// representative element shapes must match up to floating-point precision. +bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1, + HloInstruction* instr2) { + // Selects the instruction whose shape represents `instr`: `instr` itself, the + // root of a single-output fusion, or an operand of a multi-output fusion root. + auto get_element_instr = + [&](const HloInstruction* instr) -> const HloInstruction* { + const HloInstruction* element_instr = instr; + if (instr->opcode() == HloOpcode::kFusion) { + auto fused_expression_root = instr->fused_expression_root(); + if (instr->IsMultiOutputFusion()) { + // If possible, we want to pick a reduce operand of the fusion root, + // because it has the most constraints.
+ for (const auto* inst : fused_expression_root->operands()) { + if (IsReductionToVector(*inst)) { + return inst; + } + } + return fused_expression_root->operands()[0]; + } else { + element_instr = fused_expression_root; + } + } + return element_instr; + }; + + auto get_element_shape = [&](const HloInstruction* element_instr) { + // Special handling of kReduce instructions -- the fusion + // applies to the first operand. + if (IsReductionToVector(*element_instr)) { + return element_instr->operand(0)->shape(); + } + return element_instr->shape(); + }; + + // The shapes in all tuple operands should agree, unless it is a reduce. + // In that case, the operand of the reduce needs to have the same shape + // as the other tuple operands, but also we need to compare the output + // shapes of the reduces. + auto* element_instr_1 = get_element_instr(instr1); + auto* element_instr_2 = get_element_instr(instr2); + if (element_instr_1->opcode() == HloOpcode::kReduce && + element_instr_2->opcode() == HloOpcode::kReduce && + !ShapeUtil::Equal(element_instr_1->shape(), element_instr_2->shape())) { + return false; + } + // The elementwise output shapes must be the same (including layout); the + // comparison ignores differences in floating-point precision. + return ShapeUtil::EqualIgnoringFpPrecision( + get_element_shape(element_instr_1), get_element_shape(element_instr_2)); +} + +bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) { + // We can fuse reduces and loop fusions. Elementwise instructions can be fused + // with any other instruction. + // TODO(b/112957171): This should use the same isFusible logic as + // instruction_fusion.
+ return instr->IsFusible() && + (IsInputFusibleReduction(*instr) || + (instr->opcode() == HloOpcode::kFusion && + instr->fusion_kind() == HloInstruction::FusionKind::kLoop) || + instr->IsElementwise()); +} + +// Estimates the benefit of fusing instr1 and instr2 as the total byte size of +// the profitable operands of instr2 that are also profitable operands of +// instr1. +int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1, + HloInstruction* instr2) { + tensorflow::gtl::FlatSet<HloInstruction*> in_list; + for (auto instr : instr1->operands()) { + if (!IsProfitableOperand(instr)) { + continue; + } + in_list.insert(instr); + } + int64 profit = 0; + for (auto instr : instr2->operands()) { + if (!IsProfitableOperand(instr) || in_list.count(instr) == 0) { + continue; + } + profit += ShapeUtil::ByteSizeOf(instr->shape()); + } + VLOG(2) << "Fusing instr1=" << instr1->name() << " instr2=" << instr2->name() + << ", the profit is =" << profit; + return profit; +} + +// Returns true if the base-class legality check passes, the fusion kinds are +// compatible, and the fused instruction would not be too large. +bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, + HloInstruction* instr2) { + if (!MultiOutputFusion::LegalToFuse(instr1, instr2)) { + return false; + } + + // If we're fusing fusions only do it if the fusion kind matches. Loop fusions + // merge into bigger loop fusions and input (reduce) fusions become fusions + // with multiple reduce outputs. We could fuse reduce and loop fusions + // together too (the result being an input fusion) if we find cases where this + // improves things. Also disable fusing standalone input-fusible reduces into + // loop fusions. + CHECK(instr1->opcode() == HloOpcode::kFusion); + if ((instr2->opcode() == HloOpcode::kFusion && + instr1->fusion_kind() != instr2->fusion_kind()) || + (IsReductionToVector(*instr2) && + instr1->fusion_kind() == HloInstruction::FusionKind::kLoop)) { + return false; + } + + // Do this check last, as it may be expensive.
+ return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2); +} + +// Fuses elementwise or loop-fusion producers into the input-fusible reductions +// that consume them. Returns true if at least one fusion was performed. +bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { + bool changed = false; + RecomputeReachability(); + + tensorflow::gtl::FlatSet<HloInstruction*> to_fuse; + // Keep a list of the instructions to fuse after making all the fusion + // decisions. We first aggressively add instructions to potential_fusion_list, + // then filter out instructions that will be no longer fusible because of + // reachability change. This avoids recalculating reachability on a large set + // of instructions. + std::vector<std::pair<HloInstruction*, HloInstruction*>> + potential_fusion_list; + std::vector<std::pair<HloInstruction*, HloInstruction*>> fusion_list; + std::vector<HloInstruction*> instrs_to_update_reachability; + + // For each reduce or reduce multi-output fusion, try to fuse it with one of + // its elementwise or loop-fusion operands. + for (HloInstruction* consumer : computation()->MakeInstructionPostOrder()) { + if (consumer->user_count() == 0) { + VLOG(3) << consumer->name() << " has no users."; + continue; + } + if (!IsInputFusibleReduction(*consumer)) { + VLOG(3) << consumer->name() << " is not an input-fusible reduction."; + continue; + } + VLOG(3) << consumer->name() + << " is a fusion candidate.
Looking for fuseable operands."; + + auto consumer_operands = consumer->operands(); + for (size_t i = 0; i < consumer_operands.size(); ++i) { + HloInstruction* producer = consumer_operands[i]; + if (!producer->IsFusible()) { + VLOG(3) << producer->name() << " is not fusible."; + continue; + } + const bool is_loop_fusion = + producer->opcode() == HloOpcode::kFusion && + producer->fusion_kind() == HloInstruction::FusionKind::kLoop; + if (!producer->IsElementwise() && !is_loop_fusion) { + VLOG(3) << producer->name() << " is not a loop fusion."; + continue; + } + if (!ShapesCompatibleForFusion(producer, consumer)) { + VLOG(3) << producer->name() << " has an incompatible shape."; + continue; + } + if (!LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) { + VLOG(3) << producer->name() << " has inputs with mixed layouts."; + continue; + } + // If we have already decided to fuse this producer, skip it. + if (ContainsKey(to_fuse, producer)) { + VLOG(3) << producer->name() << " will be fused with another consumer."; + continue; + } + // Do not fuse a producer if the other operands of the fusion are + // reachable from the producer, this would create a cycle. + if (absl::c_any_of(consumer_operands, [&](HloInstruction* operand) { + return producer != operand && + reachability()->IsReachable(producer, operand); + })) { + VLOG(3) << producer->name() << " would introduce a cycle when fused."; + // Give up on the remaining operands of this consumer. + break; + } + to_fuse.insert(producer); + potential_fusion_list.emplace_back(producer, consumer); + instrs_to_update_reachability.push_back(producer); + instrs_to_update_reachability.push_back(consumer); + // At most one producer is selected per consumer. + break; + } + } + + // Filter out pairs that will be no longer fusible because of reachability + // change.
+ for (auto& fusion_pair : potential_fusion_list) { + HloInstruction* producer = fusion_pair.first; + HloInstruction* consumer = fusion_pair.second; + if (!absl::c_any_of(consumer->operands(), [&](HloInstruction* operand) { + return producer != operand && + reachability()->IsReachable(producer, operand); + })) { + UpdateReachability(producer, consumer, instrs_to_update_reachability); + fusion_list.push_back(fusion_pair); + } + } + + // Perform the fusions that survived the filtering above. + for (auto fusions_to_create : fusion_list) { + HloInstruction* producer = fusions_to_create.first; + HloInstruction* consumer = fusions_to_create.second; + if (consumer->opcode() != HloOpcode::kFusion) { + // Fusing with a reduce (fusion) always results in an input fusion. + HloInstruction* input_fusion = + computation()->AddInstruction(HloInstruction::CreateFusion( + consumer->shape(), HloInstruction::FusionKind::kInput, consumer)); + VLOG(2) << "Fuse producer " << producer->name() << " and its consumer " + << consumer->name() << " into " << input_fusion->name(); + TF_CHECK_OK(computation()->ReplaceInstruction(consumer, input_fusion)); + if (producer->opcode() == HloOpcode::kFusion) { + input_fusion->MergeFusionInstructionIntoMultiOutput(producer); + } else { + input_fusion->FuseInstructionIntoMultiOutput(producer); + } + } else { + VLOG(2) << "Fuse producer " << producer->name() << " into its consumer " + << consumer->name(); + + if (producer->opcode() == HloOpcode::kFusion) { + consumer->MergeFusionInstructionIntoMultiOutput(producer); + } else { + consumer->FuseInstructionIntoMultiOutput(producer); + } + } + changed = true; + } + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..f0b4d67ab8463a39161f71908746cad9e2a8670a --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h @@ -0,0 +1,56 @@ +/* Copyright
2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ + +#include "tensorflow/compiler/xla/service/multi_output_fusion.h" + +namespace xla { +namespace gpu { + +// Multi-output fusion of sibling and producer-consumer instructions for the +// GPU backend. +class GpuMultiOutputFusion : public MultiOutputFusion { + public: + GpuMultiOutputFusion(); + + protected: + // Tests whether instr1 and instr2 have compatible shapes, so that they can + // legally be fused. + bool ShapesCompatibleForFusion(HloInstruction* instr1, + HloInstruction* instr2) override; + + // Candidates are input-fusible reductions, loop fusions and elementwise ops. + bool IsFusible(HloInstruction* instr) override; + + // This function estimates the amount of memory reads saved by merging + // instr1 and instr2 into one multi-output fusion instruction. For a fusion + // instruction, all the operands need to be loaded from memory. If we merge + // instr1 and instr2, common operands will not be loaded twice. The profit is + // estimated as the size of the common operands between instr1 and instr2. + int64 GetProfit(HloInstruction* instr1, HloInstruction* instr2) override; + + // Test if it's legal to fuse instr1 and instr2 into one fusion instruction.
+ bool LegalToFuse(HloInstruction* instr1, HloInstruction* instr2) override; + + // Fuses elementwise or loop-fusion producers into consuming reductions. + bool DoProducerConsumerMultiOutputFusion() override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c822c94f1b102e02be4a13a35892a2c181702383 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -0,0 +1,625 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h" + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { +namespace gpu { + +namespace op = xla::testing::opcode_matchers; + +using MultiOutputFusionTest = HloTestBase; + +// HLO prelude prepended to every test module: scalar add and multiply +// computations that the tests use as reducers. +const char kModulePrefix[] = R"( + HloModule test_module + + scalar_add_computation { + scalar_lhs.0 = f32[] parameter(0) + scalar_rhs.0 = f32[] parameter(1) + ROOT add.0 = f32[] add(scalar_lhs.0, scalar_rhs.0) + } + scalar_mul_computation { + scalar_lhs.1 = f32[] parameter(0) + scalar_rhs.1 = f32[] parameter(1) + ROOT mul.1 = f32[] multiply(scalar_lhs.1, scalar_rhs.1) + })"; + +TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) { + // Fusion with reduce instruction root and a sibling reduce instruction + // sharing the same input param.
+ auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation { + p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1) + const.1 = f32[] parameter(0) + ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation + } + + ENTRY entry { + p0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + const.2 = f32[] constant(1) + fusion = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation + reduce.2 = f32[512]{0} reduce(p1, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation + ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion, reduce.2) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce())); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceInputShapes) { + // The two reduce fusions reduce inputs of different shapes (f32[6400] vs. + // f32[64,100]), so no fusion is expected. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p1.1 = f32[6400]{0} parameter(1) + mul = f32[6400]{0} multiply(p1.1, p1.1) + const.1 = f32[] parameter(0) + ROOT reduce.1 = f32[] reduce(mul, const.1), dimensions={0}, to_apply=scalar_add_computation + } + + fused_computation_2 { + p1.2 = f32[6400]{0} parameter(1) + r1 = f32[64,100]{0,1} reshape(p1.2) + const.2 = f32[] parameter(0) + ROOT reduce.2 = f32[] reduce(r1, const.2), dimensions={1,0}, to_apply=scalar_mul_computation + } + + ENTRY entry { + p0 = f32[] parameter(0) + p1 = f32[6400]{0} parameter(1) + fusion.1 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_1 + fusion.2 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_2 + ROOT root = (f32[], f32[]) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); +
ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceOutputShapes) { + // The two reduce fusions produce different output shapes (f32[] vs. + // f32[10]), so no fusion is expected. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p1.1 = f32[10,10]{1,0} parameter(1) + mul = f32[10,10]{1,0} multiply(p1.1, p1.1) + const.1 = f32[] parameter(0) + ROOT reduce.1 = f32[] reduce(mul, const.1), dimensions={0,1}, to_apply=scalar_add_computation + } + + fused_computation_2 { + p1.2 = f32[10,10]{1,0} parameter(1) + const.2 = f32[10]{0} parameter(0) + ROOT reduce.2 = f32[10]{0} reduce(p1.2, const.2), dimensions={0}, to_apply=scalar_mul_computation + } + + ENTRY entry { + p0 = f32[] parameter(0) + p1.3 = f32[10,10]{1,0} parameter(1) + fusion.1 = f32[] fusion(p0, p1.3), kind=kInput, calls=fused_computation_1 + p2 = f32[] parameter(2) + fusion.2 = f32[10]{0} fusion(p2, p1.3), kind=kInput, calls=fused_computation_2 + ROOT root = (f32[], f32[10]{0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceFusions) { + // Two sibling fusions with reduce instruction roots sharing the same input + // param.
+ auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1) + const.1 = f32[] parameter(0) + ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation + } + + fused_computation_2 { + p1.2 = f32[128,512,28,28]{3,2,1,0} parameter(1) + const.2 = f32[] parameter(0) + ROOT reduce.2 = f32[512]{0} reduce(p1.2, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation + } + + ENTRY entry { + p0 = f32[] parameter(0) + p1 = f32[128,512,28,28]{3,2,1,0} parameter(1) + fusion.1 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_1 + fusion.2 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_2 + ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce())); +} + +TEST_F(MultiOutputFusionTest, + MultiOutputFusionSiblingReduceAndReduceMultiOutputFusion) { + // Multi-output fusion with two reduce instructions root and a sibling reduce + // instruction sharing the same input param.
+ auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation (p0: f32[128,512,28,28]) -> (f32[512], f32[512]) { + const.1 = f32[] constant(1) + p0.1 = f32[128,512,28,28]{3,2,1,0} parameter(0) + mul = f32[128,512,28,28]{3,2,1,0} multiply(f32[128,512,28,28]{3,2,1,0} p0.1, f32[128,512,28,28]{3,2,1,0} p0.1) + reduce.1 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} mul, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation + reduce.2 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} p0.1, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation + ROOT tuple = (f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} reduce.1, f32[512]{0} reduce.2) + } + + ENTRY entry (p0: f32[128,512,28,28]) -> (f32[512], f32[512], f32[512]) { + p0 = f32[128,512,28,28]{3,2,1,0} parameter(0) + const = f32[] constant(1) + fusion = (f32[512]{0}, f32[512]{0}) fusion(f32[128,512,28,28]{3,2,1,0} p0), kind=kInput, calls=fused_computation + get-tuple-element = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=0 + get-tuple-element.1 = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=1 + reduce.3 = f32[512]{0} reduce(p0, const), dimensions={0,2,3}, to_apply=scalar_add_computation + ROOT root = (f32[512]{0}, f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} get-tuple-element, f32[512]{0} get-tuple-element.1, f32[512]{0} reduce.3) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce(), op::Reduce())); +} + +TEST_F(MultiOutputFusionTest, + MultiOutputFusionSiblingFusionCheckAgainstReduceOperand) { + // Verify that if we already have a multi-output fusion that we prefer to pick + // a reduce op from its operands for
checking shape compatibility. + // NOTE(review): fusion.1 is declared with result shape (f32[10,10], f32[10]) + // while the root tuple of fused_computation_1 is (f32[10,10], f32[]) -- + // confirm which shape is intended. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p1.1 = f32[10,10]{1,0} parameter(1) + mul = f32[10,10]{1,0} multiply(p1.1, p1.1) + const.1 = f32[] parameter(0) + reduce.1 = f32[] reduce(p1.1, const.1), dimensions={0,1}, to_apply=scalar_add_computation + ROOT tuple = (f32[10,10], f32[]) tuple(mul, reduce.1) + } + + fused_computation_2 { + p1.2 = f32[10,10]{1,0} parameter(1) + const.2 = f32[10] parameter(0) + ROOT reduce.2 = f32[10] reduce(p1.2, const.2), dimensions={0}, to_apply=scalar_mul_computation + } + + ENTRY entry { + p0 = f32[] parameter(0) + p1 = f32[10,10]{1,0} parameter(1) + p2 = f32[10]{0} parameter(2) + fusion.1 = (f32[10,10], f32[10]) fusion(p0, p1), kind=kInput, calls=fused_computation_1 + get-tuple-element.1 = f32[10,10] get-tuple-element((f32[10,10], f32[10]) fusion.1), index=0 + get-tuple-element.2 = f32[] get-tuple-element((f32[10,10], f32[10]) fusion.1), index=1 + fusion.2 = f32[10] fusion(p2, p1), kind=kInput, calls=fused_computation_2 + ROOT root = (f32[10,10], f32[], f32[10]) tuple(get-tuple-element.1, get-tuple-element.2, fusion.2) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionTwoLoops) { + // Two sibling loop fusions sharing the operand p0 are expected to be fused. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + fused_computation_2 { + p0.2 = f32[6400]{0} parameter(0) + const.2 = f32[] constant(1) + ROOT div = f32[6400]{0} divide(p0.2, const.2) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2 + ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +
SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Multiply(), op::Divide())); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionLoopReduceToInputFusion) { + // Fusing a reduce into a loop fusion would require changing the fusion kind. + // That's not supported yet. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + const.2 = f32[] constant(0) + reduce = f32[] reduce(p0, const.2), dimensions={0}, to_apply=scalar_add_computation + ROOT root = (f32[6400]{0}, f32[]) tuple(fusion.1, reduce) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionLoopElementwise) { + // A loop fusion and a sibling elementwise divide sharing the operand p0 are + // expected to be fused. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[6400]{0} parameter(0) + ROOT mul = f32[6400]{0} multiply(p0.1, p0.1) + } + + ENTRY entry { + p0 = f32[6400]{0} parameter(0) + fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1 + const.2 = f32[] constant(1) + div = f32[6400]{0} divide(p0, const.2) + ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, div) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Multiply(), op::Divide())); +} + +TEST_F(MultiOutputFusionTest,
MultiOutputFusionSiblingLoopsDifferentShapes) { + // Sibling loop fusions with different output shapes are expected to stay + // unfused. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + ROOT mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1) + } + + fused_computation_2 { + p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + const.2 = f32[] constant(0) + ROOT reduce = f32[8,1,5,1,1]{4,3,2,1,0} reduce(p0.2, const.2), dimensions={3}, to_apply=scalar_add_computation + } + + ENTRY entry { + p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + fusion.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[8,1,5,1,1]{4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2 + ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,1,1]{4,3,2,1,0}) tuple(fusion.1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingLoopAndMultiOutputLoop) { + // A multi-output loop fusion and a sibling loop fusion with the same output + // shape are expected to be fused. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1) + exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1) + ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp) + } + + fused_computation_2 { + p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + const.2 = f32[] constant(0) + ROOT add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, const.2) + } + + ENTRY entry { + p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2 + gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0 + gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0}
get-tuple-element(fusion.1), index=1 + ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* fusion = + module->entry_computation()->root_instruction()->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Multiply(), op::Exp(), op::Add())); +} + +TEST_F(MultiOutputFusionTest, + MultiOutputFusionSiblingLoopAndMultiOutputLoopDifferentShapes) { + // A multi-output loop fusion and a sibling loop fusion with a different + // output shape are expected to stay unfused. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_computation_1 { + p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1) + exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1) + ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp) + } + + fused_computation_2 { + p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + const.2 = f32[] constant(0) + ROOT reduce = f32[8,1,5,1,1]{4,3,2,1,0} reduce(p0.2, const.2), dimensions={3}, to_apply=scalar_add_computation + } + + ENTRY entry { + p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0) + fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1 + fusion.2 = f32[8,1,5,1,1]{4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2 + gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0 + gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1 + ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,1,1]{4,3,2,1,0}) tuple(gte0, gte1, fusion.2) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest,
ProducerConsumerFusionElementwiseAndReduce) { + // An elementwise producer (exp) feeding a reduce is expected to be fused with + // that reduce into one multi-output fusion. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + exp = f32[2,2,2]{2,1,0} exponential(p0) + reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add_computation + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, exp) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement())); + const HloInstruction* fusion = root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Exp())); +} + +TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduce) { + // A loop-fusion producer feeding a reduce is expected to be fused with that + // reduce into one multi-output fusion. + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_add { + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + p1.1 = f32[2,2,2]{2,1,0} parameter(1) + ROOT add = f32[2,2,2]{2,1,0} add(p0.1, p1.1) + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + c0 = f32[] constant(0) + add = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_add + reduce = f32[2,2]{1,0} reduce(add, c0), dimensions={2}, to_apply=scalar_add_computation + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, add) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement())); + const HloInstruction* fusion = root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Add())); +} +
+TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_select { + p1.1 = f32[2,2,2]{2,1,0} parameter(1) + c0 = f32[] constant(0) + broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={} + greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast) + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast) + } + + fused_reduce { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + c1 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0.2, c1), dimensions={2}, to_apply=scalar_add_computation + mul = f32[2,2,2]{2,1,0} multiply(p0.2, p0.2) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add_computation + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2) + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + select = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select + fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce + gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0 + gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1 + ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(gte1, gte1, select) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement(), + op::GetTupleElement())); + const HloInstruction* fusion = root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce(), op::Select())); +} + +TEST_F(MultiOutputFusionTest, 
ProducerConsumerFusionDoNotFuseLoopReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_element_wise { + p0.1 = f32[2,2,2]{2,1,0} parameter(0) + p1.1 = f32[2,2,2]{2,1,0} parameter(1) + ROOT root = f32[2,2,2]{2,1,0} add(p0.1, p1.1) + } + + fused_reduce { + p0.2 = f32[2,2,2]{2,1,0} parameter(0) + mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2) + c1 = f32[] constant(0) + ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={1}, to_apply=scalar_add_computation + } + + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + p1 = f32[2,2,2]{2,1,0} parameter(1) + element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_element_wise + fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(element_wise), kind=kLoop, calls=fused_reduce + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +TEST_F(MultiOutputFusionTest, + ProducerConsumerFusionFp16LoopFusionAndReduceFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + fused_select { + p1.1 = f16[2,2,2]{2,1,0} parameter(1) + c0 = f16[] constant(0) + broadcast = f16[2,2,2]{2,1,0} broadcast(f16[] c0), dimensions={} + greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast) + p0.1 = f16[2,2,2]{2,1,0} parameter(0) + ROOT select = f16[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f16[2,2,2]{2,1,0} p0.1, f16[2,2,2]{2,1,0} broadcast) + } + fused_reduce { + p0.2 = f16[2,2,2]{2,1,0} parameter(0) + convert = f32[2,2,2]{2,1,0} convert(p0.2) + c1 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(convert, c1), dimensions={2}, to_apply=scalar_add_computation + mul = f32[2,2,2]{2,1,0} multiply(convert, convert) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add_computation + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, 
r2) + } + ENTRY reduce { + p0 = f16[2,2,2]{2,1,0} parameter(0) + p1 = f16[2,2,2]{2,1,0} parameter(1) + select = f16[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select + fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce + gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0 + gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1 + ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) tuple(gte1, gte1, select) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement(), + op::GetTupleElement())); + const HloInstruction* fusion = root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce(), op::Select())); +} + +TEST_F(MultiOutputFusionTest, + ProducerConsumerFusionReduceUnfriendlyLoopFusion) { + auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( + mixed_input_layouts_computation { + p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1) + copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1) + c0 = f16[] constant(0) + broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={} + greater-than = pred[128,1024,32,32]{1,3,2,0} greater-than(copy, broadcast) + ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1, broadcast) + } + fused_reduce { + p0.2 = f16[128,1024,32,32]{1,3,2,0} parameter(0) + convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2) + c0.2 = f32[] constant(0) + ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3}, to_apply=scalar_add_computation + } + ENTRY reduce { + p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0) + p1 = f16[128,1024,32,32]{1,3,2,0} parameter(1) + loop_fusion = 
f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop, calls=mixed_input_layouts_computation + reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput, calls=fused_reduce + ROOT root = (f32[1024]{0}, f16[128,1024,32,32]{1,3,2,0}) tuple(reduce_fusion, loop_fusion) + })")) + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + +// Check that we limit the number of operands to fusions we create. +TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) { + constexpr int64 kNumParams = 200; + ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion); + + // Compute + // p0 * p1, + // p0 * p1 + p1 * p2 + // p0 * p1 + p1 * p2 + p2 * p3 + // ... + // where each of the (pi * pj)'s is represented as a fusion node so that + // multi-output fusion will pay attention to it. + auto module = CreateNewModule(); + HloComputation::Builder b(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {10, 100}); + + std::vector params; + for (int64 i = 0; i < kNumParams; ++i) { + params.push_back( + b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p"))); + } + + // Creates a fusion node that calculates x*y. 
+ auto make_fusion = [&](HloInstruction* x, HloInstruction* y) { + HloComputation::Builder sub_builder("subcomp"); + auto* p0 = sub_builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "p")); + auto* p1 = sub_builder.AddInstruction( + HloInstruction::CreateParameter(1, shape, "p")); + sub_builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1)); + HloComputation* subcomp = + module->AddEmbeddedComputation(sub_builder.Build()); + return HloInstruction::CreateFusion( + shape, HloInstruction::FusionKind::kLoop, {x, y}, subcomp); + }; + + auto* sum = b.AddInstruction(make_fusion(params[0], params[1])); + for (int64 i = 2; i < kNumParams; ++i) { + sum = b.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, sum, + b.AddInstruction(make_fusion(params[i - 1], params[i])))); + } + auto computation = module->AddEntryComputation(b.Build()); + EXPECT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + for (const HloInstruction* instr : computation->instructions()) { + EXPECT_LE(instr->operand_count() + ShapeUtil::SubshapeCount(instr->shape()), + GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion) + << instr->ToString(); + } +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..f6325b33680629b7e3d3814b088582a5007de6dc --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -0,0 +1,841 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" + +#include +#include +#include +#include // NOLINT(build/c++11): only using std::call_once, not mutex. +#include + +#include "absl/memory/memory.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/batchnorm_expander.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/buffer_liveness.h" +#include "tensorflow/compiler/xla/service/call_inliner.h" +#include "tensorflow/compiler/xla/service/conditional_simplifier.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include 
"tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h" +#include "tensorflow/compiler/xla/service/gpu/pad_insertion.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" +#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" +#include "tensorflow/compiler/xla/service/reshape_mover.h" +#include 
"tensorflow/compiler/xla/service/scatter_expander.h" +#include "tensorflow/compiler/xla/service/transpose_folding.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" +#include "tensorflow/compiler/xla/service/while_loop_simplifier.h" +#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/subprocess.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" + +namespace xla { +namespace gpu { + +/* static */ const char* NVPTXCompiler::kTargetTriple = "nvptx64-nvidia-cuda"; +/* static */ const char* NVPTXCompiler::kDataLayout = + "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"; + +namespace { + +namespace tracing = tensorflow::tracing; + +// Returns the directory containing nvvm libdevice files. config_cuda_data_dir +// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the +// HloModule being compiled. +string GetLibdeviceDir(const string& config_cuda_data_dir) { + std::vector potential_libdevice_dirs; + if (!config_cuda_data_dir.empty()) { + potential_libdevice_dirs.push_back(config_cuda_data_dir); + } + potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot()); + + // Tries all potential libdevice directories in the order they are inserted. + // Returns the first directory that exists in the file system. 
+ for (const string& potential_libdevice_dir : potential_libdevice_dirs) { + if (tensorflow::Env::Default()->IsDirectory(potential_libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << potential_libdevice_dir; + return potential_libdevice_dir; + } + VLOG(2) << "Unable to find potential libdevice dir " + << potential_libdevice_dir; + } + + // Last resort: maybe in the current folder. + return "."; +} + +// Runs optimization passes on the given HLO module. +// +// It takes a compiler pointer, as passes may compile and execute HLOs on the +// fly for cuDNN verification or other purposes. +Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator, + Compiler* compiler) { + { + HloPassPipeline pipeline("optimization"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pipeline.AddPass(); + ReducePrecisionInsertion::AddPasses( + &pipeline, hlo_module->config().debug_options(), + ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION); + + // TODO(b/64094172): make Call work on GPU instead of inlining. + pipeline.AddPass(); + // Convert BF16 operations to F32 operations so that the GPU backend can + // support BF16 operations without directly implementing a BF16 lowering for + // most ops. + pipeline.AddPass(BF16, F32); + + { + auto& pass = + pipeline.AddPass>("simplification"); + pass.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + + // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls + // where possible. Not every batchnorm op can be implemented as a call to + // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. 
+ if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { + pass.AddPass(); + } + pass.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + + // BatchNormExpander can create zero-sized ops, so zero-sized HLO + // elimination has to come after that pass. + pipeline.AddPass(); + + pipeline.AddPass(); + + pass.AddPass( + /*is_layout_sensitive=*/false, + [](const Shape&, const Shape&) { return false; }); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(); + } + + pipeline.AddPass( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return ImplementedAsGemm(dot) ? candidate_operands + : TransposeFolding::OperandIndices{}; + }, + TransposeFolding::NeverFoldTranspose); + pipeline.AddPass(/*is_layout_sensitive=*/false); + pipeline.AddPass(); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + { + // Convert convolutions into CustomCalls to cudnn, then canonicalize them + // (PadInsertion). + HloPassPipeline pipeline("conv_canonicalization"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pipeline.AddPass(); + // CudnnConvolutionRewriter may add instructions of the form + // reverse(constant), which it expects will be simplified by constant + // folding. + pipeline.AddPass(); + pipeline.AddPass(); + if (IsVoltaOrLater(*stream_exec)) { + pipeline.AddPass(); + // PadForTensorCores leaves behind unnecessary tuple/get-tuple-element + // pairs that TupleSimplifier fixes. 
+ pipeline.AddPass(); + } + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + { + // Run layout assignment in a separate pipeline from + // "post-layout-assignment" because we want everything after layout + // assignment to have a layout-sensitive invariant-checker, but + // HloPassPipeline also runs its invariant checker before any passes are + // run, meaning, the pipeline that contains layout assignment cannot contain + // a layout-sensitive verifier! + HloPassPipeline pipeline("layout assignment"); + pipeline.AddPass( + hlo_module->mutable_entry_computation_layout(), stream_exec); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + { + HloPassPipeline pipeline("post-layout_assignment"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false); + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + pipeline.AddPass>( + /*is_layout_sensitive=*/true, + /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { + return true; + }); + + // Choose the fastest algorithm for each conv. + // + // We pick the algorithm before fusion so we can generate better HLO. After + // CudnnConvolutionRewriter, our convolutions are CustomCalls which return a + // tuple (conv_result, scratch_memory), and each conv uses 0 bytes of + // scratch: + // + // customcall = (f32[...], f32[0]) + // return gte(customcall, 0) + // + // The algorithm picker then chooses the best algorithm, and potentially + // increases the scratch space. It replaces customcall with new_tuple, + // giving us the following: + // + // new_customcall = (f32[...], f32[N]) + // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) + // return gte(new_tuple, 0) + // + // The new tuple and gte instructions can then be simplified away, because + // nobody is expected to use the scratch value.
+ // + // However, if we were to run CudnnConvolutionAlgorithmPicker after fusion + // the gte(customcall, 0) would probably already be fused into a fusion node. We + // can't simplify across HloComputation boundaries, so in this case we + // wouldn't be able to simplify away the new_tuple bits. + pipeline.AddPass( + stream_exec, device_allocator, compiler); + // Clean up new_tuple described above. + pipeline.AddPass(); + + pipeline.AddPass(/*is_layout_sensitive=*/true); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + { + HloPassFix fusion("fusion"); + fusion.AddInvariantChecker(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false); + fusion.AddPass(/*may_duplicate=*/false); + fusion.AddPass(/*may_duplicate=*/true); + fusion.AddPass(); + fusion.AddPass(); + fusion.AddPass(/*is_layout_sensitive=*/true, + /*only_fusion_computations=*/true); + fusion.AddPass(); + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + + HloPassPipeline reduce_pipeline("reduce-precision"); + reduce_pipeline.AddInvariantChecker( + /*layout_sensitive=*/true, /*allow_mixed_precision=*/false); + ReducePrecisionInsertion::AddPasses( + &reduce_pipeline, hlo_module->config().debug_options(), + ReducePrecisionInsertion::PassTiming::AFTER_FUSION); + StatusOr reduce_result = reduce_pipeline.Run(hlo_module); + TF_RETURN_IF_ERROR(reduce_result.status()); + + if (reduce_result.ValueOrDie()) { + // Do another fusion pass, with the expectation that we may be able to + // fuse the new ReducePrecision operations. + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + } + } + + return Status::OK(); +} + +// Modifies the given HLO module so that it will be accepted by IrEmitter. +// Unlike optimization passes, these passes are necessary for correctness. +Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { + // In some cases, we have to place the result of an instruction in a temporary + // buffer.
For instance, the buffer that holds an external parameter is + // assumed immutable at this point, and should not be reused for output + // (b/27180329). Therefore, in that case, we set the output to be a copy of + // the parameter. + HloPassPipeline pipeline("GPU-ir-emit-prepare"); + pipeline.AddInvariantChecker(/*layout_sensitive=*/true, + /*allow_mixed_precision=*/false); + + // Copy insertion should be performed immediately before IR emission to avoid + // inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes an + // instruction which materializes a value). DCE must be run immediately before + // (and sometime after) copy insertion, to avoid dead code from interfering + // with the rewrites. + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + return pipeline.Run(hlo_module).status(); +} + +// Prints a warning if the ptxas at ptxas_path has known bugs. +// +// Only prints a warning the first time it's called for a particular value of +// ptxas_path. +void WarnIfBadPtxasVersion(const string& ptxas_path) { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + static std::unordered_set* seen_ptxas_paths GUARDED_BY(mu) = + new std::unordered_set(); + + tensorflow::mutex_lock lock(mu); + if (!seen_ptxas_paths->insert(ptxas_path).second) { + // Already checked this ptx binary, nothing to do. 
+ return; + } + + tensorflow::SubProcess ptxas; + ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"}); + ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE); + if (!ptxas.Start()) { + LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version"; + return; + } + + string out; + int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out, + /*stderr_output=*/nullptr); + if (exit_code != 0) { + LOG(WARNING) << "Running " << ptxas_path << " --version returned " + << exit_code; + return; + } + + int64 vmaj, vmin, vdot; + string vmaj_str, vmin_str, vdot_str; + if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, + &vmin_str, &vdot_str) || + !absl::SimpleAtoi(vmaj_str, &vmaj) || + !absl::SimpleAtoi(vmin_str, &vmin) || + !absl::SimpleAtoi(vdot_str, &vdot)) { + LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path + << " --version:\n" + << out; + return; + } + + // We need ptxas >= 9.0 as a hard requirement, because we compile targeting + // PTX 6.0. An older ptxas will just fail to compile any of our code. + // + // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some + // address calculations with large offsets (e.g. "load ptr + large_constant"), + // b/70245379. + // + // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way + // that appears related to address calculations, b/111107644. ptxas 9.2.88 + // appears to work, as far as we can tell. + if (vmaj < 9) { + LOG(ERROR) + << "You are using ptxas 8.x, but XLA requires ptxas 9.x (and strongly " + "prefers >= 9.2.88). Compilation of XLA kernels below will likely " + "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas " + "binary is sufficient."; + } else if ((vmaj < 9 || vmin < 2 || vdot < 88)) { + LOG(WARNING) + << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "." + << vdot + << ", which older than 9.2.88. 
ptxas 9.x before 9.2.88 is known to " + "miscompile XLA code, leading to incorrect results or " + "invalid-address errors.\n\nYou do not need to update to CUDA " + "9.2.88; cherry-picking the ptxas binary is sufficient."; + } +} + +// Prints a warning if the ptx->sass JIT in the driver has known bugs. +// +// Using such a driver is only a problem if we fail to use ptxas to compile our ptx +// and have to use the driver instead, so you should only call this function if +// we're going to use the driver JIT. +// +// Only prints a warning the first time it's called. +void WarnIfBadDriverJITVersion() { + static std::once_flag run_once; + std::call_once(run_once, [] { + auto version_or_status = se::cuda::Diagnostician::FindKernelDriverVersion(); + if (!version_or_status.ok()) { + LOG(WARNING) << "Couldn't read CUDA driver version."; + return; + } + se::cuda::DriverVersion version = version_or_status.ValueOrDie(); + + // The following versions of the driver JIT miscompile some address + // calculations with large offsets (e.g. "load ptr + large_constant"), + // b/70245379: + // + // - 384.x before 384.108 + // - 387.x before 387.40 + // - 390.x before 390.10. + // + // In addition, only >= 396.20 contains ptxas >= 9.2.88, which contains the + // fix for the "large multioutput fusions" miscompile, b/111107644. + if (version < std::make_tuple(396, 20, 0)) { + LOG(WARNING) + << "*** WARNING *** Invoking the PTX->SASS JIT from driver version " + << se::cuda::DriverVersionToString(version) + << ", which is older than 396.20.0. These versions are known to " + "miscompile XLA code, leading to incorrect results or " + "invalid-address errors.\nXLA only uses the driver JIT if it " + "cannot find ptxas; you don't need to update your driver if " + "you can point XLA to ptxas 9.2.88 or newer."; + } + }); +} + +// Compiles the given PTX string using ptxas and returns the resulting machine +// code (i.e. a cubin) as a byte array.
+StatusOr> CompilePtx(const string& ptx, int cc_major, + int cc_minor) { + tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true); + const string ptxas_path = + tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas"); + VLOG(2) << "Using ptxas at " << ptxas_path; + auto env = tensorflow::Env::Default(); + TF_RETURN_IF_ERROR(env->FileExists(ptxas_path)); + + WarnIfBadPtxasVersion(ptxas_path); + + // Write ptx into a temporary file. + string ptx_path; + if (!env->LocalTempFilename(&ptx_path)) { + return InternalError("couldn't get temp PTX file name"); + } + auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { + TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); + }); + + TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx)); + VLOG(2) << "ptx written to: " << ptx_path; + + // Invoke ptxas and collect its output. + string cubin_path; + if (!env->LocalTempFilename(&cubin_path)) { + return InternalError("couldn't get temp CUBIN file name"); + } + auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] { + // CUBIN file may never be created, so the failure to delete it should not + // produce TF error. 
+ tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError(); + }); + tensorflow::SubProcess ptxas_info_dumper; + std::vector ptxas_args = { + ptxas_path, ptx_path, "-o", cubin_path, + absl::StrCat("-arch=sm_", cc_major, cc_minor)}; + if (VLOG_IS_ON(2)) { + ptxas_args.push_back("-v"); + } + ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args); + ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR, + tensorflow::ACTION_PIPE); + if (!ptxas_info_dumper.Start()) { + return InternalError("Failed to launch ptxas"); + } + string stderr_output; + int exit_status = ptxas_info_dumper.Communicate( + /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output); + XLA_LOG_LINES(tensorflow::INFO, stderr_output); + if (exit_status != 0) { + return InternalError("ptxas exited with non-zero error code %d", + exit_status); + } + + // Read in the result of compilation and return it as a byte vector. + string cubin; + TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), + cubin_path, &cubin)); + std::vector cubin_vector(cubin.begin(), cubin.end()); + return cubin_vector; +} + +} // namespace + +NVPTXCompiler::NVPTXCompiler() + : pointer_size_(llvm::DataLayout(kDataLayout) + .getPointerSize(0 /* default address space */)) {} + +StatusOr> NVPTXCompiler::RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { + // We dump the post-optimization HLO in RunBackend so no need to dump it here. 
+ VLOG(2) << "*** HLO Before Optimization"; + XLA_VLOG_LINES(2, module->ToString()); + + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses"); + tracing::ScopedActivity activity("HLO Transforms", module->name(), + /*is_expensive=*/true); + TF_RETURN_IF_ERROR( + OptimizeHloModule(module.get(), stream_exec, device_allocator, this)); + return std::move(module); +} + +StatusOr> NVPTXCompiler::RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend"); + + TF_RET_CHECK(stream_exec != nullptr); + + TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get())); + + llvm::LLVMContext llvm_context; + std::string buffer; + llvm::raw_string_ostream error(buffer); + llvm::DiagnosticPrinterRawOStream printer(error); + auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info, + void* Context) { + auto printer = static_cast(Context); + diag_info.print(*printer); + }; + llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer); + + llvm::Module llvm_module(module->name().c_str(), llvm_context); + // Set the target triple and the data layout. + llvm_module.setTargetTriple(kTargetTriple); + llvm_module.setDataLayout(kDataLayout); + + // Determine the HLO schedule, which is an ordering of HLO instructions. This + // is used by buffer assignment to enable buffer reuse, and the same ordering + // must also be used to determine the thunk launch schedule. + std::unique_ptr stream_assignment = AssignStreams(*module); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. 
+ TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer_assignment, + BufferAssigner::Run( + module.get(), hlo_schedule->ConsumeHloOrdering(), + BufferSizeBytesFunction(), + /*color_alignment=*/ + [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; }, + /*allow_input_output_aliasing=*/false, + /*allocate_buffers_for_constants=*/true)); + // BufferAssignment::Stats::ToString() and BufferAssignment::ToString() + // include headers, so no need for us to print them ourselves. + XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString()); + XLA_VLOG_LINES(2, buffer_assignment->ToString()); + VLOG(2) << "*** HLO After Optimization"; + XLA_VLOG_LINES(2, module->ToString()); + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); + if (!xla_dump_optimized_hlo_proto_to.empty()) { + HloProto proto = MakeHloProto(*module, *buffer_assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( + proto, xla_dump_optimized_hlo_proto_to, module->name())); + } + + IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), + &stream_exec->GetDeviceDescription(), + &llvm_module); + + HloComputation* entry_computation = module->entry_computation(); + IrEmitterUnnested ir_emitter(module->config(), entry_computation, + &ir_emitter_context); + + TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); + + { + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - IR emission"); + TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter)); + } + + if (user_pre_optimization_hook_) { + TF_CHECK_OK(user_pre_optimization_hook_(llvm_module)); + } + string ir_module_string_before_opt; + const bool embed_ir_in_executable = + module->config().debug_options().xla_embed_ir_in_executable(); + if (VLOG_IS_ON(2) || embed_ir_in_executable) { + ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module); + VLOG(2) << "LLVM module before optimizations:"; + XLA_VLOG_LINES(2, ir_module_string_before_opt); + } 
+ + const string& ir_dump_directory = + module->config().debug_options().xla_dump_ir_to(); + + if (!ir_dump_directory.empty()) { + TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory( + /*directory_name=*/ir_dump_directory, + /*hlo_module_name=*/module->name(), llvm_module, + /*optimized=*/false)); + } + + { + XLA_SCOPED_LOGGING_TIMER( + "NVPTXCompiler::RunBackend - Running LLVM verifier"); + + std::string err; + llvm::raw_string_ostream err_stream(err); + + // verifyModule() returns true if the module is broken. + TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) + << "Invalid LLVM IR before optimizations:\n" + << err_stream.str() + << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " + "Rerun with --xla_dump_ir_to to get the IR. "; + } + + string libdevice_dir; + { + tensorflow::mutex_lock lock(mutex_); + + // Find the directory containing libdevice. To avoid searching for it every + // time, we have a one-element cache, keyed on the module's config's + // cuda_data_dir. 
+ const auto& config_cuda_data_dir = + module->config().debug_options().xla_gpu_cuda_data_dir(); + if (cached_libdevice_dir_.empty() || + cached_cuda_data_dir_ != config_cuda_data_dir) { + cached_cuda_data_dir_ = config_cuda_data_dir; + cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir); + } + libdevice_dir = cached_libdevice_dir_; + } + int cc_major, cc_minor; + if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor)) { + LOG(WARNING) + << "Couldn't get compute capability for device; assuming sm_20."; + cc_major = 2; + cc_minor = 0; + } + + string ptx; + { + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); + TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, + module->config(), libdevice_dir)); + } + + if (!ir_dump_directory.empty()) { + TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory( + /*directory_name=*/ir_dump_directory, + /*hlo_module_name=*/module->name(), llvm_module, + /*optimized=*/true)); + } + + if (user_post_optimization_hook_) { + TF_CHECK_OK(user_post_optimization_hook_(llvm_module)); + } + VLOG(2) << "LLVM module after optimizations:"; + XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module)); + VLOG(2) << "PTX:"; + XLA_VLOG_LINES(2, ptx); + + // Write PTX to IR dump directory, if IR dumping was requested. 
+ if (!ir_dump_directory.empty()) { + const string ptx_outfile = tensorflow::io::JoinPath( + ir_dump_directory, absl::StrCat(module->name(), ".ptx")); + auto status = [&] { + auto* env = tensorflow::Env::Default(); + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory)); + TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_outfile, ptx)); + return Status::OK(); + }(); + if (!status.ok()) { + LOG(WARNING) << "Couldn't dump PTX for module " << module->name() + << " to " << ptx_outfile << ": " << status; + } + } + + const std::vector cubin = + CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor); + + auto thunk_schedule = absl::make_unique( + ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment), + hlo_schedule->ThunkLaunchOrder()); + VLOG(2) << "Printing the thunk schedule..."; + XLA_VLOG_LINES(2, thunk_schedule->ToString()); + + std::unique_ptr profile_index_map; + std::unique_ptr profile_printer; + + if (module->config().hlo_profiling_enabled()) { + HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); + cost_analysis.set_bytes_per_second( + stream_exec->GetDeviceDescription().memory_bandwidth()); + TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis)); + profile_index_map = absl::make_unique(*module); + profile_printer = + CreateHloProfilePrinterData(*profile_index_map, cost_analysis); + } + + auto* gpu_executable = new GpuExecutable( + ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), + std::move(profile_printer), std::move(profile_index_map)); + if (embed_ir_in_executable) { + DCHECK_NE("", ir_module_string_before_opt); + gpu_executable->set_ir_module_string(ir_module_string_before_opt); + } + return std::unique_ptr(gpu_executable); +} + +std::vector NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx, + int cc_major, + int cc_minor) { + XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult"); + tracing::ScopedActivity 
activity("PTX->CUBIN", /*is_expensive=*/true); + bool inserted; + decltype(compilation_cache_.begin()) iter; + // Pointers into compilation_cache_ where the ptx and (optional) cubin are + // stored. + const string* cache_ptx = nullptr; + CompilationCacheValue* cache_value = nullptr; + + { + tensorflow::mutex_lock lock(mutex_); + std::tie(iter, inserted) = compilation_cache_.emplace( + std::piecewise_construct, + std::forward_as_tuple(ptx, cc_major, cc_minor), + std::forward_as_tuple()); + cache_ptx = &iter->first.ptx; + cache_value = &iter->second; + } + + // Compile the ptx if it wasn't in the cache before we called this function. + // Other threads asking for the same compilation key will block on + // cache_value->mutex_ until compilation is done. + { + tensorflow::mutex_lock lock(cache_value->mutex_); + if (inserted) { + CHECK(!cache_value->compilation_done); + if (!ptx.empty()) { + StatusOr> maybe_cubin = + CompilePtx(*cache_ptx, cc_major, cc_minor); + if (maybe_cubin.ok()) { + cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie(); + VLOG(2) << "Compiled PTX size:" << ptx.size() + << " CUBIN size: " << cache_value->cubin_data.size(); + } else { + bool log_warning = true; + if (maybe_cubin.status().code() == + tensorflow::error::Code::NOT_FOUND) { + // Missing ptxas is expected in some environments where CUDA SDK + // binaries are not available. We don't want to spam logs with + // identical warnings in this case. + + // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N + // for more general usage. + static std::atomic warning_done(false); + log_warning = !warning_done.exchange(true); + } + if (log_warning) { + LOG(WARNING) + << "Failed to compile ptx to cubin. Will attempt to let " + "GPU driver compile the ptx. " + << maybe_cubin.status(); + } + + // We're going to use the driver to JIT our PTX->SASS, so warn if + // the JIT in the driver has known bugs. 
+ WarnIfBadDriverJITVersion(); + } + } + cache_value->compilation_done = true; + cache_value->compilation_done_cv_.notify_all(); + } else { + while (!cache_value->compilation_done) { + cache_value->compilation_done_cv_.wait(lock); + } + } + } + + CHECK(cache_value != nullptr); + CHECK(cache_value->compilation_done); + return cache_value->cubin_data; +} + +StatusOr>> +NVPTXCompiler::CompileAheadOfTime( + std::vector> module, + const AotCompilationOptions& options) { + return Unimplemented( + "not yet implemented: NVPTXCompiler::CompileAheadOfTime"); +} + +se::Platform::Id NVPTXCompiler::PlatformId() const { + return se::cuda::kCudaPlatformId; +} + +} // namespace gpu +} // namespace xla + +static bool InitModule() { + xla::Compiler::RegisterCompilerFactory( + stream_executor::cuda::kCudaPlatformId, + []() { return absl::make_unique(); }); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h new file mode 100644 index 0000000000000000000000000000000000000000..8e97774750344bfc141daa7d752300762c708613 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -0,0 +1,155 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/llvm_compiler.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace xla { +namespace gpu { + +// The GPU compiler generates efficient GPU executables. +class NVPTXCompiler : public LLVMCompiler { + public: + NVPTXCompiler(); + ~NVPTXCompiler() override {} + + // Bring in + // StatusOr>> Compile( + // std::vector> modules, + // std::vector> + // stream_execs) + using LLVMCompiler::Compile; + + StatusOr> RunHloPasses( + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; + + StatusOr> RunBackend( + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; + + StatusOr>> + CompileAheadOfTime(std::vector> module, + AotCompilationOptions const& options) override; + + se::Platform::Id PlatformId() const override; + + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { + // Capture just the pointer size, not the entire NVPTXCompiler object. + int64 pointer_size = pointer_size_; + return [pointer_size](const Shape& shape) { + return ShapeUtil::ByteSizeOf(shape, pointer_size); + }; + } + + // The triple that represents our target. 
+ static const char* kTargetTriple; + + // The data layout of the emitted module. Copied from computeDataLayout in + // NVPTXTargetMachine.cpp. + static const char* kDataLayout; + + private: + // The size in bytes of a pointer. Used by ShapeSizeBytesFunction. + const int64 pointer_size_; + + tensorflow::mutex mutex_; + + // When compiling an HLO module, we need to find a path to the nvvm libdevice + // files. We search in the module's config.debug_options().cuda_data_dir() + // and in tensorflow::LibdeviceRoot(), the latter of which is a constant. + // + // We cache the cuda_data_dir() and the result of our search, so that if the + // next module we have to compile has the same cuda_data_dir(), we can skip + // the search. + string cached_cuda_data_dir_ GUARDED_BY(mutex_); + string cached_libdevice_dir_ GUARDED_BY(mutex_); + + // Tries to compile the given ptx string to cubin. Returns a vector with the + // compiled cubin. If compilation was unsuccessful, returns an empty vector. + std::vector CompilePtxOrGetCachedResult(const string& ptx, + int cc_major, int cc_minor); + + // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor} + // -> cubin so we don't recompile the same ptx twice. This is important for + // some interactive workflows. (We also cache at the HLO level, but sometimes + // we can't realize that two modules are the same until we lower to ptx.) + // + // Compilation of distinct PTX happens in parallel. If more than one thread + // attempts to compile the same PTX, the fist thread to obtain + // cache_value_->mutex_ performs the compilation. The rest wait() on + // cache_value_->compilation_done_cv_ until the compilation is done. + // + // If compiling the ptx fails, we return an empty cubin, cross our fingers, + // and leave compilation up to the driver. 
+ struct CompilationCacheKey { + CompilationCacheKey(std::string ptx, int cc_major, int cc_minor) + : ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {} + string ptx; + int cc_major; + int cc_minor; + }; + struct CompilationCacheHash { + size_t operator()(const CompilationCacheKey& key) const { + return tensorflow::Hash64Combine( + tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major), + key.cc_minor); + } + }; + struct CompilationCacheEq { + size_t operator()(const CompilationCacheKey& a, + const CompilationCacheKey& b) const { + return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor && + a.ptx == b.ptx; + } + }; + struct CompilationCacheValue { + bool compilation_done = false; + std::vector cubin_data; + // mutex and condition variable to serialize compilation completing. + tensorflow::mutex mutex_; + tensorflow::condition_variable compilation_done_cv_; + }; + + // Don't even think about switching this to FlatMap; iterator stability is + // critical here. + std::unordered_map + compilation_cache_ GUARDED_BY(mutex_); + + TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc b/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fa170964e974a6535307d7a21eb3e7760d02536 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h" + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { +namespace gpu { + +OutfeedManager* GetOrCreateOutfeedManager() { + static auto* manager = new OutfeedManager; + return manager; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_manager.h b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..160ba4b691f818ff01b41b8603c11853ea12c253 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_MANAGER_H_ + +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/gpu/xfeed_queue.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" + +namespace xla { +namespace gpu { + +// TODO(b/30467474) Once GPU outfeed implementation settles, consider +// folding back the cpu and gpu outfeed implementations into a generic +// one if possible. + +// Defines a buffer holding the destination for an outfeed in host memory and a +// notification when that triggers when the transfer is done. +class OutfeedBuffer { + public: + OutfeedBuffer(int64 length) : length_(length) {} + + // Waits for the device transfer to be finished. + void WaitUntilAvailable() { done_.WaitForNotification(); } + + int64 length() const { return length_; } + void set_destination(std::unique_ptr destination) { + destination_ = std::move(destination); + } + MutableBorrowingLiteral* destination() { return destination_.get(); } + + // Callback to signal that this buffer is consumed. + void Done() { done_.Notify(); } + + private: + std::unique_ptr destination_; + const int64 length_; + tensorflow::Notification done_; +}; + +// Manages a thread-safe queue of buffers. The buffers are supposed to be +// produced by the transfer manager and consumed by the device. +using OutfeedManager = XfeedQueue>*>; + +// Singleton creator-or-accessor: Returns the GPU outfeed manager. 
+OutfeedManager* GetOrCreateOutfeedManager(); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0f3e84a4cb25792cf10d38fc529f3e638acf8e4 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -0,0 +1,107 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" +#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +OutfeedThunk::OutfeedThunk(ShapeTree outfeed_slices, + const HloInstruction* hlo_instruction) + : Thunk(Kind::kOutfeed, hlo_instruction), + outfeed_slices_(std::move(outfeed_slices)) {} + +Status OutfeedThunk::ExecuteOnStream( + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { + VLOG(2) << "Outfeeding from GPU: " << hlo_instruction()->ToString(); + + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); + OutfeedManager* outfeed_manager = GetOrCreateOutfeedManager(); + ShapeTree>* outfeed_buffers = + outfeed_manager->BlockingGetNextDestination(); + + // Nothing to be done for empty tuples. + if (ShapeUtil::IsEmptyTuple(hlo_instruction()->operand(0)->shape())) { + return Status::OK(); + } + CHECK(ShapeUtil::Compatible(hlo_instruction()->operand(0)->shape(), + outfeed_buffers->shape())); + + TF_RETURN_IF_ERROR(outfeed_buffers->ForEachMutableElementWithStatus( + [&](const ShapeIndex& index, std::unique_ptr* buffer) { + if (!*buffer) { // Tuple pointers. + return Status::OK(); + } + + BufferAllocation::Slice slice = outfeed_slices_.element(index); + se::DeviceMemoryBase data_address; + if (slice.allocation()) { + // If we have a static allocation, read it from there. This avoids + // synchronizing the host and device just to read a pointer. + data_address = buffer_allocations.GetDeviceAddress(slice); + } else { + // Otherwise we have to read the tuple pointer first. 
+ CHECK(!index.empty()); + // Copy the parent buffer to the host. + BufferAllocation::Slice tuple_slice = + outfeed_slices_.element(ShapeIndexView(index).ConsumeFront()); + if (!tuple_slice.allocation()) { + return Unimplemented( + "Nested dynamic tuples are not supported on GPU"); + } + se::DeviceMemoryBase tuple_address = + buffer_allocations.GetDeviceAddress(tuple_slice); + CHECK(tuple_slice.size() % sizeof(void*) == 0) + << "Tuple size must be a multiple of pointer size"; + std::vector tuple_element_buffer_addresses(tuple_slice.size() / + sizeof(void*)); + stream->ThenMemcpy(tuple_element_buffer_addresses.data(), + tuple_address, tuple_slice.size()); + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + // The data address is specified by the element of the tuple pointer + // buffer. + data_address = + se::DeviceMemoryBase(tuple_element_buffer_addresses[index.back()], + (*buffer)->length()); + } + + // TODO(b/111309141): Run this on a separate stream so it doesn't block + // the GPU from doing work during the transfer. This could be handled by + // making StreamAssignment do something intelligent with outfeed thunks. 
+ stream + ->ThenMemcpy((*buffer)->destination()->untyped_data(), data_address, + (*buffer)->length()) + .ThenDoHostCallback([buffer]() { (*buffer)->Done(); }); + return Status::OK(); + })); + + Status block_status = stream->BlockHostUntilDone(); + if (!block_status.ok()) { + return InternalError("Failed to complete data transfer on stream %p: %s", + stream, block_status.error_message()); + } + + VLOG(2) << "Outfeeding from GPU complete"; + return Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h new file mode 100644 index 0000000000000000000000000000000000000000..8ed89f05f0c5bb2e3893e695d413bac3b231112d --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h @@ -0,0 +1,52 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_THUNK_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_THUNK_H_ + +#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" +#include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// A thunk that outfeeds data. Data must be already resident on the host. This +// thunk performs a host to device copy from the buffer allocated for the +// outfeed op to the host location. +class OutfeedThunk : public Thunk { + public: + // Constructs a OutfeedThunk that copies data to the host-side + // outfeed queue from the buffers in the given shape tree. + OutfeedThunk(ShapeTree outfeed_slices, + const HloInstruction* hlo_instruction); + + OutfeedThunk(const OutfeedThunk&) = delete; + OutfeedThunk& operator=(const OutfeedThunk&) = delete; + + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream, + HloExecutionProfiler* profiler) override; + + private: + const ShapeTree outfeed_slices_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_OUTFEED_THUNK_H_ diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa84d7722351b68770b876e3880b472eec3233d7 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc @@ -0,0 +1,232 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" + +namespace xla { +namespace gpu { + + +// We want the input/output feature counts of an f16 conv to be factors of 8, +// because without this cudnn can't use tensor cores on the conv. +static constexpr int64 kDesiredNumFeaturesFactor = 8; + +// We won't pad a conv if doing so increases the total number of bytes in the +// lhs, rhs, or result by more than this amount. +// +// TODO(jlebar): This number was tuned experimentally. It represents a +// compromise on our current benchmarks; it speeds some up significantly, and +// doesn't slow any down. But we can observe by changing this value that +// there's additional room for speedups. Achieving those speedups without also +// slowing other things down will likely require a more sophisticated heuristic, +// possibly some form of auto-tuning. +static constexpr double kMaxBytesTouchedIncrease = 1.2; + +// Pads the given dimensions in the given shape up to a multiple of +// kDesiredNumFeaturesFactor. 
+static Shape PadShape(Shape s, absl::Span dims) { + for (int64 dim : dims) { + int64 dim_to_pad_size = s.dimensions(dim); + int64 new_dim_to_pad_size = + RoundUpToNearest(dim_to_pad_size, kDesiredNumFeaturesFactor); + s.set_dimensions(dim, new_dim_to_pad_size); + } + return s; +} + +// Creates and returns an HLO that zero-pads one or more dimensions in the given +// instruction so that its shape is equal to the given shape. +// +// Padding is added to the end of each relevant dimension. +// +// If the instruction already has the given shape, simply returns it without an +// intervening pad. +static HloInstruction* PadInstruction(HloInstruction* instr, + const Shape& new_shape) { + HloComputation* comp = instr->parent(); + + const Shape& shape = instr->shape(); + auto* zero = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(shape.element_type()).CloneToUnique())); + + PaddingConfig pad_config = MakeNoPaddingConfig(ShapeUtil::Rank(shape)); + + bool added_padding = false; + for (int64 dim = 0; dim < ShapeUtil::Rank(shape); ++dim) { + if (shape.dimensions(dim) == new_shape.dimensions(dim)) { + continue; + } + CHECK_GT(new_shape.dimensions(dim), shape.dimensions(dim)); + pad_config.mutable_dimensions(dim)->set_edge_padding_high( + new_shape.dimensions(dim) - shape.dimensions(dim)); + added_padding = true; + } + + if (!added_padding) { + return instr; + } + return comp->AddInstruction( + HloInstruction::CreatePad(new_shape, instr, zero, pad_config)); +} + +// Pads the input/output feature dimensions of the given cudnn convolution +// custom-call to be multiples of kDesiredNumFeaturesFactor. +static StatusOr PadFeaturesDims(HloInstruction* conv) { + CHECK_EQ(0, conv->shape().tuple_shapes(1).dimensions(0)) + << "conv must use 0 scratch bytes, i.e. 
this pass must be run " + "before CudnnConvolutionAlgorithmPicker."; + + const auto& target = conv->custom_call_target(); + const auto& dnums = conv->convolution_dimension_numbers(); + auto* lhs = conv->mutable_operand(0); + auto* rhs = conv->mutable_operand(1); + const Shape& result_shape = conv->shape().tuple_shapes(0); + + Shape new_lhs_shape = [&] { + if (target == kCudnnConvForwardCallTarget || + target == kCudnnConvBackwardFilterCallTarget) { + // LHS is "input". + return PadShape(lhs->shape(), {dnums.input_feature_dimension()}); + } + CHECK_EQ(target, kCudnnConvBackwardInputCallTarget); + // LHS is "output". + return PadShape(lhs->shape(), {dnums.output_feature_dimension()}); + }(); + + Shape new_rhs_shape = [&] { + if (target == kCudnnConvForwardCallTarget || + target == kCudnnConvBackwardInputCallTarget) { + // RHS is "filter". + return PadShape(rhs->shape(), {dnums.kernel_input_feature_dimension(), + dnums.kernel_output_feature_dimension()}); + } + CHECK_EQ(target, kCudnnConvBackwardFilterCallTarget); + // RHS is "output". + return PadShape(rhs->shape(), {dnums.output_feature_dimension()}); + }(); + + if (ShapeUtil::Equal(lhs->shape(), new_lhs_shape) && + ShapeUtil::Equal(rhs->shape(), new_rhs_shape)) { + VLOG(3) << "No need to pad features of " << conv->ToString(); + return false; + } + + Shape new_result_shape = [&] { + if (target == kCudnnConvForwardCallTarget) { + // Result is "output". + return PadShape(result_shape, {dnums.output_feature_dimension()}); + } + if (target == kCudnnConvBackwardInputCallTarget) { + // Result is "input". + return PadShape(result_shape, {dnums.input_feature_dimension()}); + } + CHECK_EQ(target, kCudnnConvBackwardFilterCallTarget); + // Result is "filter". + return PadShape(result_shape, {dnums.kernel_input_feature_dimension(), + dnums.kernel_output_feature_dimension()}); + }(); + + // Check that padding wouldn't increase the total bytes read/written by this + // operation too much. 
+ auto check_size_increase = [&](const Shape& old_shape, + const Shape& new_shape) { + int64 old_bytes = ShapeUtil::ByteSizeOf(old_shape); + int64 new_bytes = ShapeUtil::ByteSizeOf(new_shape); + if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) { + return true; + } + VLOG(3) << "Not padding convolution; doing so would change input / result " + "shape from " + << ShapeUtil::HumanString(old_shape) << " to " + << ShapeUtil::HumanString(new_shape) << ", a size increase of " + << new_bytes / static_cast(old_bytes) << "x > " + << kMaxBytesTouchedIncrease << "x: " << conv->ToString(); + return false; + }; + if (!check_size_increase(lhs->shape(), new_lhs_shape) || + !check_size_increase(rhs->shape(), new_rhs_shape) || + !check_size_increase(result_shape, new_result_shape)) { + return false; + } + + // OK, let's do the transformation! + + auto* new_lhs = PadInstruction(lhs, new_lhs_shape); + auto* new_rhs = PadInstruction(rhs, new_rhs_shape); + CHECK(new_lhs != lhs || new_rhs != rhs) + << "We should have had to pad either LHS or RHS."; + + auto add = [&](std::unique_ptr new_instr) { + return conv->parent()->AddInstruction(std::move(new_instr)); + }; + + Shape new_conv_shape = ShapeUtil::MakeTupleShape( + {new_result_shape, ShapeUtil::MakeShape(U8, {0})}); + auto* new_conv = + add(conv->CloneWithNewOperands(new_conv_shape, {new_lhs, new_rhs})); + + // Slice the new conv result if necessary, keeping in mind that new_conv has + // tuple shape (new_result_shape, u8[0]). 
+ if (!ShapeUtil::Equal(result_shape, new_result_shape)) { + std::vector start_indices(result_shape.dimensions_size(), 0); + std::vector end_indices(result_shape.dimensions().begin(), + result_shape.dimensions().end()); + std::vector strides(result_shape.dimensions_size(), 1); + + auto* new_conv_result = add( + HloInstruction::CreateGetTupleElement(new_result_shape, new_conv, 0)); + auto* empty_temp_buffer = + add(HloInstruction::CreateConstant(LiteralUtil::CreateR1({}))); + auto* sliced_result = add(HloInstruction::CreateSlice( + result_shape, new_conv_result, start_indices, end_indices, strides)); + new_conv = + add(HloInstruction::CreateTuple({sliced_result, empty_temp_buffer})); + } + + VLOG(2) << "Padded features of " << conv->ToString() << ", replaced with " + << new_conv->ToString(); + TF_RETURN_IF_ERROR(conv->parent()->ReplaceInstruction(conv, new_conv)); + return true; +} + +static std::vector GetRelevantConvs(HloComputation* comp) { + std::vector convs; + for (HloInstruction* instr : comp->instructions()) { + if (IsCustomCallToDnnConvolution(*instr) && + instr->operand(0)->shape().element_type() == F16) { + convs.push_back(instr); + } + } + return convs; +} + +StatusOr PadForTensorCores::Run(HloModule* module) { + bool changed = false; + for (HloComputation* comp : module->MakeNonfusionComputations()) { + for (HloInstruction* conv : GetRelevantConvs(comp)) { + TF_ASSIGN_OR_RETURN(bool result, PadFeaturesDims(conv)); + changed |= result; + } + } + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h new file mode 100644 index 0000000000000000000000000000000000000000..11dc56a64fda74cab12024e5f2c6fa2f63c9167d --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h @@ -0,0 +1,43 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_ + +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Ensures that f16 cudnn convolutions have input/output channel dimensions that +// are multiples of 8, inserting pads/slices as necessary. +// +// This is useful primarily for Volta and newer GPUs, where tensor cores can +// only be used if the channel dims are multiples of 8. It's probably the +// opposite of useful on other GPUs, so you should check what GPU you're +// targeting before running this pass. +// +// TODO(jlebar): Also pad dots. 
+class PadForTensorCores : public HloPassInterface { + public: + absl::string_view name() const override { return "pad for tensor cores"; } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_ diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c92b0dcb873b873074704dca8f27d4067b070df --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc @@ -0,0 +1,164 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h" + +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; +using ::testing::_; + +class PadForTensorCoresTest : public HloVerifiedTestBase {}; + +TEST_F(PadForTensorCoresTest, PadF16ForwardConvInputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = f16[10,20,30,41] parameter(0) + filter = f16[2,2,41,40] parameter(1) + ROOT result = (f16[10,20,30,40], u8[0]) custom-call(input, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convForward" + })"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + + SCOPED_TRACE(module().ToString()); + EXPECT_THAT(root, op::CustomCall(kCudnnConvForwardCallTarget, + op::Pad(op::Parameter(0), _), + op::Pad(op::Parameter(1), _))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(), + ShapeUtil::MakeShape(F16, {10, 20, 30, 48}))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(), + ShapeUtil::MakeShape(F16, {2, 2, 48, 40}))); +} + +TEST_F(PadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + output = f16[10,20,30,41] parameter(0) + filter = f16[2,2,40,41] parameter(1) + ROOT result = (f16[10,20,30,40], u8[0]) custom-call(output, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convBackwardInput" + 
})"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::CustomCall(kCudnnConvBackwardInputCallTarget, + op::Pad(op::Parameter(0), _), + op::Pad(op::Parameter(1), _))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(), + ShapeUtil::MakeShape(F16, {10, 20, 30, 48}))); + EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(), + ShapeUtil::MakeShape(F16, {2, 2, 40, 48}))); +} + +TEST_F(PadForTensorCoresTest, PadF16ForwardConvOutputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = f16[10,20,30,40] parameter(0) + filter = f16[2,2,40,41] parameter(1) + ROOT result = (f16[10,20,30,41], u8[0]) custom-call(input, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convForward" + })"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvForwardCallTarget, op::Parameter(0), + op::Pad(op::Parameter(1), _)))), + _)); +} + +TEST_F(PadForTensorCoresTest, PadF16BackwardInputConvInputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + output = f16[10,20,30,40] parameter(0) + filter = f16[2,2,41,40] parameter(1) + result = (f16[10,20,30,41], u8[0]) custom-call(output, filter), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convBackwardInput" + ROOT gte = f16[10,20,30,41] get-tuple-element(result), index=0 + })"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::GetTupleElement(op::Tuple( + op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvBackwardInputCallTarget, op::Parameter(0), + op::Pad(op::Parameter(1), _)))), + _))); +} + 
+TEST_F(PadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = f16[10,20,30,41] parameter(0) + output = f16[10,20,30,40] parameter(1) + result = (f16[2,2,41,40], u8[0]) custom-call(input, output), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convBackwardFilter" + ROOT gte = f16[2,2,41,40] get-tuple-element(result), index=0 + })"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::GetTupleElement(op::Tuple( + op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvBackwardFilterCallTarget, + op::Pad(op::Parameter(0), _), op::Parameter(1)))), + _))); +} + +TEST_F(PadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) { + ParseAndVerifyModule(R"( + HloModule TestModule + + ENTRY TestComputation { + input = f16[10,20,30,40] parameter(0) + output = f16[10,20,30,41] parameter(1) + result = (f16[2,2,40,41], u8[0]) custom-call(input, output), + window={size=2x2}, dim_labels=b01f_01io->b01f, + custom_call_target="__cudnn$convBackwardFilter" + ROOT gte = f16[2,2,40,41] get-tuple-element(result), index=0 + })"); + EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie()); + auto* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, op::GetTupleElement(op::Tuple( + op::Slice(op::GetTupleElement(op::CustomCall( + kCudnnConvBackwardFilterCallTarget, + op::Parameter(0), op::Pad(op::Parameter(1), _)))), + _))); +} + +} // anonymous namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index c8f0d4185c63c5bafca6f30acab31cbe8e987277..9d85d746d84908eaa8d720bc3cccc475d81710f3 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ 
-15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" @@ -68,7 +70,7 @@ HloInstruction* MaybePaddedAndSlicedInput( PrimitiveType element_type = input->shape().element_type(); HloInstruction* padding = computation->AddInstruction(HloInstruction::CreateConstant( - MakeUnique(Literal::Zero(element_type)))); + absl::make_unique(LiteralUtil::Zero(element_type)))); input = MakePadHlo(input, padding, padding_config).ValueOrDie(); } @@ -125,7 +127,7 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window, PrimitiveType element_type = kernel->shape().element_type(); HloInstruction* padding = computation->AddInstruction(HloInstruction::CreateConstant( - MakeUnique(Literal::Zero(element_type)))); + absl::make_unique(LiteralUtil::Zero(element_type)))); return MakePadHlo(kernel, padding, padding_config).ValueOrDie(); } } // namespace @@ -164,9 +166,9 @@ bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) { Shape old_conv_shape = conv->shape().tuple_shapes(0); VLOG(1) << "Canonicalizing forward conv"; - auto new_conv = CreateCudnnConvForward(old_conv_shape, new_input, new_kernel, - new_conv_window, - conv->convolution_dimension_numbers()); + auto new_conv = CreateCudnnConvForward( + old_conv_shape, new_input, new_kernel, new_conv_window, + conv->convolution_dimension_numbers(), conv->feature_group_count()); VLOG(1) << "Replacing:\n " << conv->ToString() << "\nwith:\n " << new_conv->ToString(); TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv)); @@ -234,9 +236,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // Create a new backward convolution replacing the old one. 
HloComputation* computation = backward_conv->parent(); HloInstruction* output = backward_conv->mutable_operand(1); - HloInstruction* padding = - computation->AddInstruction(HloInstruction::CreateConstant( - MakeUnique(Literal::Zero(input->shape().element_type())))); + HloInstruction* padding = computation->AddInstruction( + HloInstruction::CreateConstant(absl::make_unique( + LiteralUtil::Zero(input->shape().element_type())))); HloInstruction* padded_input = MakePadHlo(input, padding, input_padding_config).ValueOrDie(); @@ -245,7 +247,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0); HloInstruction* new_backward_conv = CreateCudnnConvBackwardFilter( backward_conv_shape, padded_input, output, new_backward_conv_window, - backward_conv_dnums); + backward_conv_dnums, backward_conv->feature_group_count()); VLOG(1) << "Canonicalizing backward filter conv"; VLOG(1) << "Replacing:\n " << backward_conv->ToString() << "\nwith:\n " @@ -310,7 +312,7 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( HloInstruction* new_backward_conv_call = CreateCudnnConvBackwardInput( new_backward_conv_shape, output, filter, new_backward_conv_window, - backward_conv_dnums); + backward_conv_dnums, backward_conv->feature_group_count()); // The CustomCall created above returns a tuple (conv_result, scratch_memory). // Extract out the two elements. diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.h b/tensorflow/compiler/xla/service/gpu/pad_insertion.h index 67e51509e4c717951c83c7e41943af1de762dee0..a622e894ed9c0d1534262e6b72a5f4ea7b7821ad 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.h +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.h @@ -26,7 +26,7 @@ namespace gpu { // padding, so that they can be lowered to cuDNN convolution. 
class PadInsertion : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "pad insertion"; } + absl::string_view name() const override { return "pad insertion"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc index d8c07dc3119fb81a3ef22822acb11b7c4d5bbca5..8154d75d23a6d49153ccb6824402aff73f365617 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc @@ -32,33 +32,33 @@ namespace gpu { ParallelLoopEmitter::ParallelLoopEmitter( BodyEmitter body_emitter, const Shape& shape, - const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder, + const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b, int unroll_factor) - : LoopEmitter(body_emitter, shape, ir_builder), + : LoopEmitter(body_emitter, shape, b), launch_dimensions_(launch_dimensions), unroll_factor_(unroll_factor) {} ParallelLoopEmitter::ParallelLoopEmitter( const llvm_ir::ElementGenerator& target_element_generator, - tensorflow::gtl::ArraySlice target_arrays, - const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder, + absl::Span target_arrays, + const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b, int unroll_factor) - : LoopEmitter(target_element_generator, target_arrays, ir_builder), + : LoopEmitter(target_element_generator, target_arrays, b), launch_dimensions_(launch_dimensions), unroll_factor_(unroll_factor) {} ParallelLoopEmitter::ParallelLoopEmitter( const llvm_ir::ElementGenerator& target_element_generator, const llvm_ir::IrArray& target_array, - const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder, + const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b, int unroll_factor) - : LoopEmitter(target_element_generator, target_array, ir_builder), + : 
LoopEmitter(target_element_generator, target_array, b), launch_dimensions_(launch_dimensions), unroll_factor_(unroll_factor) {} std::vector -ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name) { +ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, + llvm::Type* index_type) { // Emit the following code in LLVM IR: // linear_index = blockIdx.x * blockDim.x + threadIdx.x; // if (linear_index < num_elements) { @@ -71,31 +71,30 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock( // // %nctaid.x is currently specified as 2147483647. VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_; + CHECK_NE(index_type, nullptr); std::vector array_indices; - llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_); + llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(), static_cast(block_id)); - block_id = - ir_builder_->CreateZExt(block_id, ir_builder_->getInt64Ty(), "block_id"); + block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id"); // Per the PTX documentation: // "It is guaranteed that [...] 0 <= %tid.x < %ntid.x" // // %ntid.x is currently specified as 1024. 
llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, ir_builder_); + llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(), static_cast(thread_id)); - thread_id = ir_builder_->CreateZExt(thread_id, ir_builder_->getInt64Ty(), - "thread_id"); - - llvm::Value* linear_index_base = ir_builder_->CreateAdd( - ir_builder_->CreateMul( - block_id, - ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "", - /*HasNUW=*/true, /*HasNSW=*/true), + thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id"); + + llvm::Value* linear_index_base = b_->CreateAdd( + b_->CreateMul(block_id, + llvm::ConstantInt::get( + index_type, launch_dimensions_.threads_per_block()), + "", + /*HasNUW=*/true, /*HasNSW=*/true), thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true); // Add an @llvm.assume(linear_index < threads_per_block * num_blocks). @@ -108,39 +107,41 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock( // conditions in the same basic block as their operands. 
llvm_ir::EmitCallToIntrinsic( llvm::Intrinsic::assume, - {ir_builder_->CreateICmpULT( + {b_->CreateICmpULT( linear_index_base, - ir_builder_->getInt64(launch_dimensions_.threads_per_block() * - launch_dimensions_.block_count()), + llvm::ConstantInt::get(index_type, + launch_dimensions_.threads_per_block() * + launch_dimensions_.block_count()), "linear_index_in_range")}, - {}, ir_builder_); + {}, b_); if (unroll_factor_ > 1) { - linear_index_base = ir_builder_->CreateMul( - linear_index_base, ir_builder_->getInt64(unroll_factor_), + linear_index_base = b_->CreateMul( + linear_index_base, llvm::ConstantInt::get(index_type, unroll_factor_), "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true); } - array_indices.emplace_back(linear_index_base, shape_, ir_builder_); + array_indices.emplace_back(linear_index_base, shape_, b_); for (int i = 1; i < unroll_factor_; ++i) { - llvm::Value* linear_index = ir_builder_->CreateAdd( - linear_index_base, ir_builder_->getInt64(i), "linear_index", - /*HasNUW=*/true, /*HasNSW=*/true); - array_indices.emplace_back(linear_index, shape_, ir_builder_); + llvm::Value* linear_index = + b_->CreateAdd(linear_index_base, llvm::ConstantInt::get(index_type, i), + "linear_index", + /*HasNUW=*/true, /*HasNSW=*/true); + array_indices.emplace_back(linear_index, shape_, b_); } auto if_in_bounds = llvm_ir::EmitIfThenElse( - ir_builder_->CreateICmpULT( + b_->CreateICmpULT( linear_index_base, - ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))), - llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false); + llvm::ConstantInt::get(index_type, ShapeUtil::ElementsIn(shape_))), + llvm_ir::IrName(loop_name, "in_bounds"), b_, false); // Set exit_bb_ to the exit block of the if structure. exit_bb_ = if_in_bounds.after_block; CHECK_NE(nullptr, exit_bb_); // Set IR builder insertion point to the body of the if structure. 
- llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_); + llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, b_); return array_indices; } diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h index 25318b3bed8bf4a2dfe3a4a974269d0405c3bfec..f32ea1ce4c4192f39851a6441c46663df3063724 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h @@ -34,31 +34,30 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { // The meanings of other parameters are the same as LoopEmitter. ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape, const LaunchDimensions& launch_dimensions, - llvm::IRBuilder<>* ir_builder, int unroll_factor = 1); + llvm::IRBuilder<>* b, int unroll_factor = 1); // Constructs a ParallelLoopEmitter from an element generator that generates // each element of the given target array. ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator, const llvm_ir::IrArray& target_array, const LaunchDimensions& launch_dimensions, - llvm::IRBuilder<>* ir_builder, int unroll_factor = 1); + llvm::IRBuilder<>* b, int unroll_factor = 1); // Constructs a loop emitter for a loop that generates on element of each of N // arrays on each iteration. // // This is used in multi-output fusion. target_element_generator should // produce a struct with N elements, one for each of target_arrays. 
- ParallelLoopEmitter( - const llvm_ir::ElementGenerator& target_element_generator, - tensorflow::gtl::ArraySlice target_arrays, - const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder, - int unroll_factor = 1); + ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator, + absl::Span target_arrays, + const LaunchDimensions& launch_dimensions, + llvm::IRBuilder<>* b, int unroll_factor = 1); ParallelLoopEmitter(const ParallelLoopEmitter&) = delete; ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete; ~ParallelLoopEmitter() override = default; std::vector EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name) override; + absl::string_view loop_name, llvm::Type* index_type) override; private: // The thread and block dimension to parallelize the loop on. diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc index d3fd0544fb68809125e9b9f7a5e5b7eff8c6ef43..cf9f102d31305da15dabaf6247f23c5ca9a9e054 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc @@ -18,15 +18,15 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/bits.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -34,9 +34,8 @@ namespace gpu { std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims) { - out << tensorflow::strings::Printf("[block: %lld, thread: %lld]", - launch_dims.block_count(), - launch_dims.threads_per_block()); + out << absl::StrFormat("[block: %d, thread: %d]", launch_dims.block_count(), + launch_dims.threads_per_block()); return out; } @@ -91,9 +90,9 @@ LaunchDimensions CalculateLaunchDimensions( } int64 block_count = CeilOfRatio(num_elements, threads_per_block); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << absl::StrFormat( "Initialized the block count to ceil(# of elements / threads per " - "block) = ceil(%lld/%lld) = %lld", + "block) = ceil(%d/%d) = %d", num_elements, threads_per_block, block_count); return LaunchDimensions(block_count, threads_per_block); diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h index c125474edb1036090a926020f2b1e7fcf64c751a..02471129e004b4876ce20a62cade34060c65b478 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h @@ -47,6 +47,7 @@ class LaunchDimensions { int64 block_count() const { return block_count_; } int64 threads_per_block() const { return threads_per_block_; } + int64 launch_bound() const { return block_count() * 
threads_per_block(); } private: int64 block_count_; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index 88cb10883e97ae663dc492ad088e6daf9133d7f5..84285be70a4ba94101040a639c39b3eaecbb5bb3 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { @@ -33,9 +34,12 @@ Status SequentialThunk::Initialize(const GpuExecutable& executable, } Status SequentialThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (const auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); + TF_RETURN_IF_ERROR( + thunk->ExecuteOnStream(buffer_allocations, stream, profiler)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 135f79e413dfaa27f2f2264e0daa3beb3c305e0f..3c4de1d1a6c912ba31f56c29b10ca004d1e56da6 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -41,7 +42,8 @@ class SequentialThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: // The list of sub-thunks. diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc index e4cfc6999f2da04dd7e7a34d854fdb3d75b8bfc6..5b6cf2c04d05378a363232e33a6df6432cd6848e 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc @@ -15,8 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" @@ -33,13 +33,13 @@ int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const { } void StreamAssignment::AssignStreamToHlo(const HloInstruction* hlo, - int stream_no) { - CHECK_GE(stream_no, 0); - if (stream_no >= stream_count_) { - stream_count_ = stream_no + 1; + int stream_num) { + CHECK_GE(stream_num, 0); + if (stream_num >= stream_count_) { + stream_count_ = stream_num + 1; } - InsertOrDie(&hlo_to_stream_number_, hlo, stream_no); - VLOG(2) << "Assign stream #" << stream_no << " to " << hlo->ToString(); + InsertOrDie(&hlo_to_stream_number_, hlo, stream_num); + VLOG(2) << "Assign stream #" << stream_num << " to " << hlo->ToString(); } namespace { @@ -51,6 +51,12 @@ bool CanRunConcurrently(const HloInstruction& a, const HloInstruction& b, return !reachability.IsConnected(&a, &b); } +constexpr int kInvalidStreamNum = -1; +// Returns true iff `stream_num` is a valid stream number. +inline bool IsStreamNumValid(int stream_num) { + return stream_num != kInvalidStreamNum; +} + // Returns which existing stream to assign to `hlo`, or -1 if a stream is not // needed. `stream_assignment` is the existing stream assignment for all // instructions topologically before `hlo`. `seen_gemms` contains all GEMMs that @@ -62,7 +68,7 @@ int ComputeStreamToAssign( if (hlo.opcode() == HloOpcode::kParameter || hlo.opcode() == HloOpcode::kConstant) { // kParameter and kConstant do not need a thunk. 
- return -1; + return kInvalidStreamNum; } if (hlo.GetModule() @@ -75,17 +81,17 @@ int ComputeStreamToAssign( if (!ImplementedAsGemm(hlo)) { // If `hlo` is not implemented as a GEMM, keep it close to its operands to // avoid excessive synchronization. - int stream_no = -1; + int stream_num = -1; for (const auto* operand : hlo.operands()) { if (stream_assignment.HasStreamAssigned(*operand)) { - stream_no = - std::max(stream_no, stream_assignment.StreamNumberForHlo(*operand)); + stream_num = std::max(stream_num, + stream_assignment.StreamNumberForHlo(*operand)); } } - if (stream_no == -1) { - stream_no = 0; + if (!IsStreamNumValid(stream_num)) { + stream_num = 0; } - return stream_no; + return stream_num; } // Assign different streams to concurrent GEMMs. The code below uses a @@ -94,17 +100,17 @@ int ComputeStreamToAssign( // `hlo` a different stream. std::set forbidden_stream_numbers; for (const auto* seen_gemm : seen_gemms) { - int stream_no = stream_assignment.StreamNumberForHlo(*seen_gemm); - if (!forbidden_stream_numbers.count(stream_no) && + int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm); + if (!forbidden_stream_numbers.count(stream_num) && CanRunConcurrently(*seen_gemm, hlo, reachability)) { - forbidden_stream_numbers.insert(stream_no); + forbidden_stream_numbers.insert(stream_num); } } - for (int stream_no = 0; stream_no < stream_assignment.StreamCount(); - ++stream_no) { - if (!forbidden_stream_numbers.count(stream_no)) { - return stream_no; + for (int stream_num = 0; stream_num < stream_assignment.StreamCount(); + ++stream_num) { + if (!forbidden_stream_numbers.count(stream_num)) { + return stream_num; } } return stream_assignment.StreamCount(); @@ -113,16 +119,32 @@ int ComputeStreamToAssign( } // namespace std::unique_ptr AssignStreams(const HloModule& module) { - auto stream_assignment = MakeUnique(); + auto stream_assignment = absl::make_unique(); const HloComputation& computation = *module.entry_computation(); std::unique_ptr 
reachability = computation.ComputeReachability(); std::vector seen_gemms; + // The execution of different RNG Hlo instructions in the same module updates + // a common global variable. To avoid a race condition, we simply assign all + // RNG kernels to the same stream to make them run sequentially. + // + // TODO(b/111791052): If we remove such a common variable, we will need to + // clean up the code here. + int stream_num_for_rng = kInvalidStreamNum; for (const auto* hlo : computation.MakeInstructionPostOrder()) { - int stream_no = ComputeStreamToAssign(*hlo, *stream_assignment, - *reachability, seen_gemms); - if (stream_no != -1) { - stream_assignment->AssignStreamToHlo(hlo, stream_no); + // If we ever enable fusion of RNG instructions, we will need to extend this + // code to look inside a fused instruction. + int stream_num = (hlo->opcode() == HloOpcode::kRng && + IsStreamNumValid(stream_num_for_rng)) + ? stream_num_for_rng + : ComputeStreamToAssign(*hlo, *stream_assignment, + *reachability, seen_gemms); + if (IsStreamNumValid(stream_num)) { + stream_assignment->AssignStreamToHlo(hlo, stream_num); + if (hlo->opcode() == HloOpcode::kRng && + !IsStreamNumValid(stream_num_for_rng)) { + stream_num_for_rng = stream_num; + } } if (ImplementedAsGemm(*hlo)) { seen_gemms.push_back(hlo); diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index 696fa7e0194032b5c78bf11383c3280a62de07fa..091aca23e54bf0585b91e7a05c0837d8a0a2b764 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -15,13 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { namespace gpu { @@ -33,8 +34,7 @@ class StreamAssignmentTest : public HloTestBase { auto debug_options = GetDebugOptionsForTest(); debug_options.set_xla_gpu_disable_multi_streaming(false); config.set_debug_options(debug_options); - return MakeUnique("test_module", VersionedComputationHandle(), - config); + return absl::make_unique("test_module", config); } // Pre-canned shapes. @@ -98,7 +98,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { params.reserve(6); for (int i = 0; i < 6; ++i) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( - i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); + i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i)))); } HloInstruction* d00 = builder.AddInstruction( HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index a50ddf6ac63c7fa7ccace94bc7f40f438aedccf8..08ff52211af163fec39646ca6bf14da9d1b815e4 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -16,14 +16,23 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { namespace gpu { -using stream_executor::dnn::DataLayout; -using stream_executor::dnn::DataLayoutString; -using stream_executor::dnn::FilterLayout; -using stream_executor::dnn::FilterLayoutString; +using se::dnn::DataLayout; +using se::dnn::DataLayoutString; +using se::dnn::FilterLayout; +using se::dnn::FilterLayoutString; + +bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) { + int major, minor; + CHECK(stream_executor.GetDeviceDescription().cuda_compute_capability(&major, + &minor)); + return major >= 7; +} StatusOr> StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, @@ -46,8 +55,9 @@ StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, input_layout.push_back(dnums.input_feature_dimension()); break; default: - return tensorflow::errors::Internal("Invalid input layout: ", - DataLayoutString(input)); + return InternalError("Invalid input layout %s for conv with dnums %s", + DataLayoutString(input), + ConvolutionDimensionNumbersToString(dnums)); } std::vector filter_layout; @@ -67,8 +77,9 @@ StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, filter_layout.push_back(dnums.kernel_input_feature_dimension()); break; default: - return tensorflow::errors::Internal("Invalid filter layout: ", - FilterLayoutString(filter)); + return InternalError("Invalid filter layout %s for conv with dnums %s", + FilterLayoutString(filter), + ConvolutionDimensionNumbersToString(dnums)); } std::vector output_layout; @@ -88,8 +99,9 @@ StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, output_layout.push_back(dnums.output_feature_dimension()); break; default: - return tensorflow::errors::Internal("Invalid output layout: ", - 
DataLayoutString(output)); + return InternalError("Invalid output layout %s for conv with dnums %s", + DataLayoutString(output), + ConvolutionDimensionNumbersToString(dnums)); } return std::make_tuple(LayoutUtil::MakeLayoutFromMajorToMinor(input_layout), @@ -121,8 +133,9 @@ XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, } else if (LayoutUtil::Equal(input, nhwc_input)) { input_layout = DataLayout::kBatchYXDepth; } else { - return tensorflow::errors::Internal("Invalid input layout: ", - input.ShortDebugString()); + return InternalError("Invalid input layout %s for conv with dnums %s", + LayoutUtil::HumanString(input), + ConvolutionDimensionNumbersToString(dnums)); } FilterLayout filter_layout; @@ -131,8 +144,9 @@ XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, } else if (LayoutUtil::Equal(filter, nhwc_filter)) { filter_layout = FilterLayout::kOutputYXInput; } else { - return tensorflow::errors::Internal("Invalid filter layout: ", - filter.ShortDebugString()); + return InternalError("Invalid filter layout %s for conv with dnums %s", + LayoutUtil::HumanString(filter), + ConvolutionDimensionNumbersToString(dnums)); } DataLayout output_layout; @@ -141,8 +155,9 @@ XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, } else if (LayoutUtil::Equal(output, nhwc_output)) { output_layout = DataLayout::kBatchYXDepth; } else { - return tensorflow::errors::Internal("Invalid output layout: ", - output.ShortDebugString()); + return InternalError("Invalid output layout %s for conv with dnums %s", + LayoutUtil::HumanString(output), + ConvolutionDimensionNumbersToString(dnums)); } return std::make_tuple(input_layout, filter_layout, output_layout); diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h index 8218f4fd11d3978d0ecc53fc15e287aea4b69ec3..1fc46bafa10e7ba6c896f081d5c836bd400886c9 100644 --- 
a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_ +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -24,18 +26,20 @@ limitations under the License. namespace xla { namespace gpu { +// Returns true if the given StreamExecutor is for a Volta or newer nvidia GPU. +bool IsVoltaOrLater(const se::StreamExecutor& stream_exec); + // Returns (input, filter, output) XLA Layout protos given the StreamExecutor // layouts. StatusOr> StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, - stream_executor::dnn::DataLayout input, - stream_executor::dnn::FilterLayout filter, - stream_executor::dnn::DataLayout output); + se::dnn::DataLayout input, + se::dnn::FilterLayout filter, + se::dnn::DataLayout output); // Returns (input, filter, output) StreamExecutor layouts given the XLA layouts. -StatusOr> +StatusOr< + std::tuple> XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, const Layout& input, const Layout& filter, const Layout& output); diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db4a33dc564b62b5fe54b725ea453a6fcbfb3287 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -0,0 +1,227 @@ +# Description: GPU-specific XLA tests. For example, codegen tests that +# verify the IR emitted. +# +# TODO(jlebar): None of these tests actually use the GPU, so they should not +# need to run on machines with GPUs present. 
+ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = [":friends"]) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +# Filegroup used to collect source files for dependency checking. +filegroup( + name = "c_srcs", + data = glob([ + "**/*.cc", + "**/*.h", + ]), +) + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +cc_library( + name = "gpu_codegen_test", + testonly = True, + srcs = ["gpu_codegen_test.cc"], + hdrs = ["gpu_codegen_test.h"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "gpu_copy_test", + srcs = ["gpu_copy_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "gpu_ftz_test", + srcs = ["gpu_ftz_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "gpu_index_test", + srcs = ["gpu_index_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_proto", + "//tensorflow/compiler/xla/service:hlo", + 
"//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "gpu_infeed_test", + srcs = ["infeed_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "gpu_kernel_tiling_test", + srcs = ["gpu_kernel_tiling_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "gpu_ldg_test", + srcs = ["gpu_ldg_test.cc"], + tags = ["requires-gpu-sm35"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "gpu_noalias_test", + srcs = ["gpu_noalias_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + 
"//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "gpu_fusion_test", + srcs = ["gpu_fusion_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "gpu_unrolling_test", + srcs = ["gpu_unrolling_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "gpu_alignment_test", + testonly = True, + srcs = ["gpu_alignment_test.cc"], + tags = [ + "requires-gpu-sm35", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", + "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..672c68e59b59dff19f0c5575db26dea455c45053 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc @@ -0,0 +1,54 @@ +/* Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class GpuAlignmentTest : public GpuCodegenTest {}; + +TEST_F(GpuAlignmentTest, Test) { + const char* hlo_string = R"( +HloModule GpuAlignmentTest + +ENTRY main { + zero = f32[] constant(0) + tok = token[] after-all() + a = f32[100] parameter(0) + b_tup = (f32[200], token[]) infeed(tok) + b = f32[200] get-tuple-element(b_tup), index=0 + a_padded = f32[150] pad(a, zero), padding=0_50 + b_sliced = f32[150] slice(b), slice={[0:150]} + ROOT c = f32[150] add(a_padded, b_sliced) +} +)"; + + CompileAndVerifyIr(hlo_string, R"( +CHECK: @fusion(i8* align 64 dereferenceable(600) %alloc0, i8* align 16 dereferenceable(400) %alloc1, i8* align 64 dereferenceable(864) %temp_buf) +)"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc new file mode 100644 index 
0000000000000000000000000000000000000000..79e77d4c4d649020cf52ac25c220c3f90e8469b9 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { +namespace gpu { + +std::unique_ptr GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) { + HloModuleConfig config; + auto debug_options = legacy_flags::GetDebugOptionsFromFlags(); + debug_options.set_xla_gpu_ftz(ftz); + debug_options.set_xla_gpu_max_kernel_unroll_factor(1); + // TODO(b/38354253): Change tests to use Parameters instead of Constants. 
+ debug_options.add_xla_disable_hlo_passes("constant_folding"); + config.set_debug_options(debug_options); + + return absl::make_unique(TestName(), config); +} + +void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr hlo_module, + const string& pattern) { + std::unique_ptr executable = + std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie()); + string ptx_str(static_cast(executable.get())->ptx()); + StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); + ASSERT_TRUE(filecheck_result.ok()); + EXPECT_TRUE(filecheck_result.ValueOrDie()); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h new file mode 100644 index 0000000000000000000000000000000000000000..e4a3573babb7ed746504c1466f85b582aa4d044f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_ + +#include + +#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h" + +namespace xla { +namespace gpu { + +// Tests that verify IR or PTX emitted by the GPU backend is as expected. 
+class GpuCodegenTest : public LlvmIrGenTestBase { + protected: + // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz + // option. + std::unique_ptr CreateNewModuleWithFTZ(bool ftz); + + // Compiles the given HLO module to PTX and verifies the PTX matches the given + // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). + void CompileAndVerifyPtx(std::unique_ptr hlo_module, + const string& pattern); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_ diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4550f36fdfc097632fed4956fcd3e42ef8a919c5 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +class GpuCopyTest : public GpuCodegenTest {}; + +// The GPU backend should not emit a copy kernel for the kCopy instruction in +// this test. Instead, it should generate a CopyThunk which invokes cuMemcpy at +// runtime. +TEST_F(GpuCopyTest, UseMemcpy) { + HloComputation::Builder builder(TestName()); + + std::unique_ptr literal = + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + HloInstruction* constant = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + builder.AddInstruction(HloInstruction::CreateUnary( + constant->shape(), HloOpcode::kCopy, constant)); + + std::unique_ptr computation = builder.Build(); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + // There should not be any kernel prefixed "copy". 
+ CompileAndVerifyIr(std::move(hlo_module), "; CHECK-NOT: define void @_copy", + /*match_optimized_ir=*/false); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..177b94934c7f519172508b5cc6e088f908401193 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -0,0 +1,119 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" + +// Check that the ftz (flush denormals to zero) flag is reflected in PTX as +// expected. + +namespace xla { +namespace gpu { +namespace { + +class GpuFtzTest : public GpuCodegenTest { + public: + explicit GpuFtzTest(bool ftz) : ftz_(ftz) {} + + // Creates an HLO module that performs the given binary operation on some + // data. 
+ std::unique_ptr CreateBinaryOpModule(HloOpcode op) { + HloComputation::Builder builder(TestName()); + + Shape param_shape = ShapeUtil::MakeShapeWithLayout( + F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0}); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /* parameter_number=*/0, param_shape, "x")); + HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( + /* parameter_number=*/1, param_shape, "y")); + builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y)); + + auto hlo_module = CreateNewModuleWithFTZ(ftz_); + hlo_module->AddEntryComputation(builder.Build()); + return hlo_module; + } + + // Creates an HLO module that performs the given unary operation on some data. + std::unique_ptr CreateUnaryOpModule(HloOpcode op) { + HloComputation::Builder builder(TestName()); + + Shape param_shape = ShapeUtil::MakeShapeWithLayout( + F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0}); + HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( + /* parameter_number=*/0, param_shape, "x")); + builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x)); + + auto hlo_module = CreateNewModuleWithFTZ(ftz_); + hlo_module->AddEntryComputation(builder.Build()); + return hlo_module; + } + + bool ftz_; +}; + +class GpuFtzEnabledTest : public GpuFtzTest { + public: + GpuFtzEnabledTest() : GpuFtzTest(/*ftz=*/true) {} +}; + +class GpuFtzDisabledTest : public GpuFtzTest { + public: + GpuFtzDisabledTest() : GpuFtzTest(/*ftz=*/false) {} +}; + +// Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise. 
+TEST_F(GpuFtzEnabledTest, MultiplyFtz) { + CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CHECK-NOT: mul.f32 + CHECK: mul.ftz.f32 + CHECK-NOT: mul.f32 + )"); +} +TEST_F(GpuFtzDisabledTest, MultiplyFtz) { + CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CHECK-NOT: mul.ftz.f32 + CHECK: mul.f32 + CHECK-NOT: mul.ftz.f32 + )"); +} + +// In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect +// to determine whether or not ftz is enabled. The implementation uses two +// calls to ex2.approx. When ftz is on, we get two calls to the ftz version; +// when ftz is off, we get one call to the ftz version and one call to the +// regular version. +TEST_F(GpuFtzEnabledTest, ExpFtz) { + CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CHECK-NOT: ex2.approx.f32 + CHECK: ex2.approx.ftz.f32 + CHECK-NOT: ex2.approx.f32 + CHECK: ex2.approx.ftz.f32 + CHECK-NOT: ex2.approx.f32 + CHECK-NOT: ex2.approx.ftz.f32 + )"); +} + +TEST_F(GpuFtzDisabledTest, ExpFtz) { + CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CHECK-NOT: ex2.approx.f32 + CHECK-DAG: ex2.approx.ftz.f32 + CHECK-DAG: ex2.approx.f32 + CHECK-NOT: ex2.approx.f32 + CHECK-NOT: ex2.approx.ftz.f32 + )"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..674b436a8e3135a5dfe3731647897696bf1321cd --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class GpuFusionTest : public GpuCodegenTest {}; + +TEST_F(GpuFusionTest, FusedReshape) { + const char* hlo_text = R"( + HloModule test_module + + fused_computation { + p0.param_0 = f32[4,1,1]{2,1,0} parameter(0) + p1.param_1 = f32[4,1]{1,0} parameter(1) + reshape = f32[4,1]{1,0} reshape(p0.param_0) + ROOT add = f32[4,1] add(reshape, p1.param_1) + } + + ENTRY BroadcastIntoAdd { + p0 = f32[4,1,1]{2,1,0} parameter(0) + p1 = f32[4,1]{1,0} parameter(1) + ROOT fusion = f32[4,1]{1,0} fusion(p0, p1), kind=kLoop, + calls=fused_computation + } +)"; + + CompileAndVerifyIr(hlo_text, + R"( +; CHECK-LABEL: @fusion +; CHECK: fadd +; CHECK: } + )"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a06576df7b874745236a8d9075355a01ec42e777 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -0,0 +1,147 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla.pb.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +// This file tests the index expressions used to reference source tensors. When +// the destination tensor and source tensor have compatible shapes, the linear +// index is used to access the source tensor. Otherwise, dimensional indices +// computed from the linear index are used to access the source tensor. 
+ +class GpuIndexTest : public GpuCodegenTest {}; + +TEST_F(GpuIndexTest, CompatibleUseLinearIndex) { + HloComputation::Builder builder(TestName()); + + auto param_shape = ShapeUtil::MakeShape(F32, {5, 7, 2}); + HloInstruction* param_x = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "x")); + HloInstruction* param_y = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "y")); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y)); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(builder.Build()); + + // Check the optimized IR as the unoptimized IR contains dead udiv and urem. + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-NOT: udiv +; CHECK-NOT: urem + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) { + HloModuleConfig config; + config.set_debug_options(HloTestBase::GetDebugOptionsForTest()); + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY CompatibleUseLinearIndexWithReshape { + x = f32[5,7,2]{2,1,0} parameter(0) + y = f32[5,14]{1,0} parameter(1) + reshape = f32[5,7,2]{2,1,0} reshape(y) + ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, reshape) + })", + config) + .ValueOrDie(); + + // Check the optimized IR as the unoptimized IR contains dead udiv and urem. 
+ CompileAndVerifyIr(std::move(module), + R"( +; CHECK-NOT: udiv +; CHECK-NOT: urem + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) { + HloModuleConfig config; + config.set_debug_options(HloTestBase::GetDebugOptionsForTest()); + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY CompatibleUseLinearIndexWithReshape { + x = f32[5,7,2]{2,1,0} parameter(0) + y = f32[14]{0} parameter(1) + reshape = f32[7,2]{1,0} reshape(y) + broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2} + ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, broadcast) + })", + config) + .ValueOrDie(); + + // Check the optimized IR reuses the linear index by calculating modulo 14. + CompileAndVerifyIr(std::move(module), + R"( +; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 +; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)* +; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 +; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]] + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithSizeOneDimensions) { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_max_kernel_unroll_factor(1); + config.set_debug_options(debug_options); + + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY CompatibleUseLinearIndexWithSizeOneDimensions { + x = f32[1,1024,1,256]{3,2,1,0} parameter(0) + ROOT y = f16[1,1024,1,256]{2,3,1,0} convert(x) + })", + config) + .ValueOrDie(); + + // Check that the unoptimized IR reuses the linear index. 
+ CompileAndVerifyIr(std::move(module), + R"( +; CHECK-LABEL: @fusion +; CHECK: udiv i32 %[[linear_index:.*]], 262144 +; CHECK: %[[ld_addr:.*]] = getelementptr inbounds float, float* {{.*}}, i32 %[[linear_index]] +; CHECK: load float, float* %[[ld_addr]] +; CHECK: %[[st_addr:.*]] = getelementptr inbounds half, half* {{.*}}, i32 %[[linear_index]] +; CHECK: store half {{.*}}, half* %[[st_addr]] + )", + /*match_optimized_ir=*/false); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..15d1e269cc22b88f5269175084f20600f165011c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -0,0 +1,198 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class GpuKernelTilingTest : public GpuCodegenTest { + protected: + GpuKernelTilingTest() {} + + // Most tests in this file want to skip layout assignment, but a few need it + // enabled. + HloModuleConfig ConfigWithLayoutAssignment() { + return GetModuleConfigForTest(); + } + + HloModuleConfig ConfigWithoutLayoutAssignment() { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + // Disable layout_assignment to use the preassigned layouts. + debug_options.add_xla_disable_hlo_passes("layout-assignment"); + config.set_debug_options(debug_options); + return config; + } +}; + +TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) { + const char *const kHloString = R"( + HloModule unnested_transpose_1 + + ENTRY unnested_transpose_1 { + para0 = f16[32,3,64]{2,1,0} parameter(0) + ROOT copy1 = f16[32,3,64]{1,0,2} copy(para0) + })"; + + // Check that a call to llvm.nvvm.barrier0 is generated. + // + // We must enable layout assignment in order for this test to work correctly. + // AlgebraicSimplifier removes copy1; it's added back by layout assignment, + // which respects the module's entry computation layout. But if we don't run + // layout assignment...well, nobody else adds the copy back. 
+ auto hlo_module = + ParseHloString(kHloString, ConfigWithLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @copy +; CHECK: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); + + // Check that the kernel runs correctly. + EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); +} + +TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) { + const char *const kHloString = R"( + HloModule unnested_transpose_2 + + ENTRY unnested_transpose_2 { + para0 = f16[2,3,64]{2,1,0} parameter(0) + ROOT copy1 = f16[2,3,64]{1,0,2} copy(para0) + })"; + + // Check that a call to llvm.nvvm.barrier0 is not generated. As in + // UnnestedTransposeWithProperDimensionsTiled, we must run layout assignment + // here. + auto hlo_module = + ParseHloString(kHloString, ConfigWithLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @copy +; CHECK-NOT: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) { + const char *const kHloString = R"( + HloModule multiple_output_fusion_1 + fused_computation.1 { + param0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0) + copy = f32[4,5,6,7,8]{2,1,4,3,0} copy(param0) + ROOT convert = f16[4,5,6,7,8]{2,1,4,3,0} convert(copy) + } + + ENTRY copy_in_fusion_run_without_hlo_passes { + para0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0) + ROOT fusion.1 = f16[4,5,6,7,8]{2,1,4,3,0} fusion(para0), kind=kLoop, + calls=fused_computation.1 + })"; + + // Check that a call to llvm.nvvm.barrier0 is generated. 
+ auto hlo_module = + ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @fusion +; CHECK: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); + + // Check that the kernel runs correctly. + EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); +} + +TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) { + const char *const kHloString = R"( + HloModule multiple_output_fusion_1 + fused_computation.1 { + param0 = f16[8,31,31,65]{3,2,1,0} parameter(0) + param1 = f16[8,31,31,65]{3,2,1,0} parameter(1) + copy0 = f16[8,31,31,65]{2,1,3,0} copy(param0) + copy1 = f16[8,31,31,65]{2,1,3,0} copy(param1) + ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0}) + tuple(copy0, copy1) + } + + ENTRY multiple_output_fusion_1 { + para0 = f16[8,31,31,65]{3,2,1,0} parameter(0) + para1 = f16[8,31,31,65]{3,2,1,0} parameter(1) + ROOT fusion.1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0}) + fusion(para0,para1), kind=kLoop, calls=fused_computation.1 + })"; + + // Check that a call to llvm.nvvm.barrier0 is generated. + auto hlo_module = + ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @fusion +; CHECK: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); + + // Check that the kernel runs correctly. 
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); +} + +TEST_F(GpuKernelTilingTest, + MultipleOutputFusionWithTwoPossibleTransposesNotTiled) { + const char *const kHloString = R"( + HloModule multiple_output_fusion_2 + fused_computation.1 { + param0 = f16[8,31,31,65]{3,2,1,0} parameter(0) + param1 = f16[8,31,31,65]{1,3,2,0} parameter(1) + copy2 = f16[8,31,31,65]{2,1,3,0} copy(param0) + copy3 = f16[8,31,31,65]{2,1,3,0} copy(param1) + ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0}) + tuple(copy2, copy3) + } + + ENTRY multiple_output_fusion_2 { + para0 = f16[8,31,31,65]{3,2,1,0} parameter(0) + para1 = f16[8,31,31,65]{1,3,2,0} parameter(1) + ROOT fusion1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0}) + fusion(para0,para1), kind=kLoop, calls=fused_computation.1 + })"; + + // Check that a call to llvm.nvvm.barrier0 is not generated. + auto hlo_module = + ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie(); + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: define void @fusion +; CHECK-NOT: tail call void @llvm.nvvm.barrier0() +; CHECK: } +)", + /*match_optimized_ir=*/true); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a9ecd9dae7c9ddde0b56d8615e4a39fb3df0af9 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -0,0 +1,141 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's +// __ldg builtin) for reads of buffers that don't change during a kernel's +// execution. + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +class GpuLdgTest : public GpuCodegenTest {}; + +// Parameters are never overwritten, so parameter reads should get ld.global.nc +// reads. 
+TEST_F(GpuLdgTest, LdgForParamRead) { + HloComputation::Builder builder(TestName()); + + auto shape = ShapeUtil::MakeShape(F32, {2, 2}); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param)); + std::unique_ptr computation = builder.Build(); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + CompileAndVerifyPtx(std::move(hlo_module), R"( + CHECK-NOT: ld.global.f32 + CHECK: ld.global.nc.f32 + )"); +} + +// Check that reading a buffer produced by a non-parameter HLO also results in +// ld.global.nc, if that buffer isn't modified within the instruction that reads +// it. +TEST_F(GpuLdgTest, LdgForNonParamRead) { + HloComputation::Builder builder(TestName()); + + auto shape = ShapeUtil::MakeShape(F32, {2, 2}); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param)); + HloInstruction* square = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, add)); + builder.AddInstruction(HloInstruction::CreateTuple({add, square})); + std::unique_ptr computation = builder.Build(); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + CompileAndVerifyPtx(std::move(hlo_module), R"( + CHECK: { + CHECK-NOT: ld.global.f32 + CHECK: ld.global.nc.f32 + CHECK: } + )"); +} + +// Check that reading a buffer that's modified in-place does not produce +// ld.global.nc. +// +// We do this by creating a reduce that feeds into a sin. We don't currently +// fuse sin into reduce, and the sin is elementwise, so it reuses its input +// buffer as its output. 
+// +// It seems like a fair bet that we won't start fusing sin into the output of +// reduce in the foreseeable future. But if that turns out to be wrong, I give +// you, future reader, permission to delete this test. +TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { + auto hlo_module = CreateNewModule(); + HloComputation::Builder builder(TestName()); + + HloComputation* reduce_computation; + { + auto embedded_builder = HloComputation::Builder("add"); + auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "lhs")); + auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "rhs")); + embedded_builder.AddInstruction( + HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs)); + reduce_computation = + hlo_module->AddEmbeddedComputation(embedded_builder.Build()); + } + + auto param_shape = ShapeUtil::MakeShape(F32, {2, 2}); + auto reduce_shape = ShapeUtil::MakeShape(F32, {2}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "x")); + HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, + builder.AddInstruction(HloInstruction::CreateBinary( + param_shape, HloOpcode::kAdd, param, param)), + builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), + {0}, reduce_computation)); + builder.AddInstruction( + HloInstruction::CreateUnary(reduce_shape, HloOpcode::kSin, reduce)); + + std::unique_ptr computation = builder.Build(); + hlo_module->AddEntryComputation(std::move(computation)); + + CompileAndVerifyPtx(std::move(hlo_module), R"( + CHECK-LABEL: .entry sin + CHECK: { + CHECK-NOT: ld.global.nc.f32 + CHECK: ld.global.f32 + CHECK: } + )"); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc new file mode 
100644 index 0000000000000000000000000000000000000000..15198865bda98f9718342d5a444a20305f923b48 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +class GpuNoAliasTest : public GpuCodegenTest {}; + +TEST_F(GpuNoAliasTest, Concat) { + HloComputation::Builder builder(TestName()); + + auto param_shape = ShapeUtil::MakeShape(F32, {2, 2}); + HloInstruction* param_x = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "x")); + HloInstruction* param_y = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "y")); + HloInstruction* concat = + builder.AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(F32, {2, 4}), {param_x, param_y}, 1)); + 
builder.AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(F32, {2, 6}), {concat, param_x}, 1)); + + std::unique_ptr computation = builder.Build(); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK: %[[x_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %x{{.*}}, i32 0 +; CHECK: load float, float* %[[x_gep]], {{.*}}, !noalias ![[param_noalias:.*]] +; CHECK: %[[y_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %y{{.*}}, i32 0 +; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]] +; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float* +; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]] +; CHECK: store float {{.*}}, float* %[[result_gep]], !alias.scope ![[param_noalias]] +; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]} + )", + /*match_optimized_ir=*/false); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f2d5568cafc9db0f5f067437fdd5e2e775ad2c8 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc @@ -0,0 +1,188 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +class GpuUnrollingTest : public GpuCodegenTest {}; + +const char *const kAddModule = R"( + HloModule test_module + + fused_computation { + p0.param_0 = f32[2,2]{1,0} parameter(0) + p1.param_1 = f32[2,2]{1,0} parameter(1) + ROOT add = f32[2,2] add(p0.param_0, p1.param_1) + } + + ENTRY BroadcastIntoAdd { + p0 = f32[2,2]{1,0} parameter(0) + p1 = f32[2,2]{1,0} parameter(1) + ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop, + calls=fused_computation + })"; + +TEST_F(GpuUnrollingTest, DoNotUnroll) { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_max_kernel_unroll_factor(1); + config.set_debug_options(debug_options); + auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie(); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: @fusion +; CHECK: fadd +; CHECK-NOT: fadd +; CHECK: } + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuUnrollingTest, UnrollFourTimes) { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + // We request a factor of 8, but the computation works on 4 elements, limiting + // the maximum unroll factor. 
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(8); + config.set_debug_options(debug_options); + auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie(); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: @fusion +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK-NOT: fadd +; CHECK: } + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuUnrollingTest, UnrollDefaultTimes) { + // The default unrolling factor is 4. + HloModuleConfig config; + config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie(); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: @fusion +; CHECK: load <4 x float> +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK-NOT: fadd +; CHECK: store <4 x float> +; CHECK: } + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_max_kernel_unroll_factor(4); + config.set_debug_options(debug_options); + + const char *const kUnfusedAddModule = R"( + HloModule test_module + + ENTRY AddFunc { + p0 = f32[2,2]{1,0} parameter(0) + p1 = f32[2,2]{1,0} parameter(1) + ROOT add = f32[2,2]{1,0} add(p0, p1) + })"; + auto hlo_module = ParseHloString(kUnfusedAddModule, config).ValueOrDie(); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: @add +; CHECK: load <4 x float> +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK: fadd +; CHECK-NOT: fadd +; CHECK: store <4 x float> +; CHECK: } + )", + /*match_optimized_ir=*/true); +} + +TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + debug_options.set_xla_gpu_max_kernel_unroll_factor(2); + // Disable layout assignment for this test. 
Layout assignment does not expect + // fusions to be present, and so it does the wrong thing. + debug_options.add_xla_disable_hlo_passes("layout-assignment"); + config.set_debug_options(debug_options); + + const char *const kMultiOutputFusionModule = R"( + HloModule test_module + + fused_computation { + p0.param_0 = f32[2,2]{1,0} parameter(0) + p1.param_1 = f32[2,2]{1,0} parameter(1) + add = f32[2,2]{1,0} add(p0.param_0, p1.param_1) + mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1) + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul) + } + + ENTRY BroadcastIntoAdd { + p0 = f32[2,2]{1,0} parameter(0) + p1 = f32[2,2]{1,0} parameter(1) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop, + calls=fused_computation + })"; + auto hlo_module = + ParseHloString(kMultiOutputFusionModule, config).ValueOrDie(); + + CompileAndVerifyIr(std::move(hlo_module), + R"( +; CHECK-LABEL: @fusion +; CHECK: load <2 x float> +; CHECK: load <2 x float> +; CHECK-NOT: load <2 x float> +; CHECK: fadd +; CHECK: fmul +; CHECK: fadd +; CHECK: fmul +; CHECK: store <2 x float> +; CHECK: store <2 x float> +; CHECK-NOT: store <2 x float> +; CHECK-NOT: fadd +; CHECK-NOT: fmul +; CHECK: } + )", + /*match_optimized_ir=*/true); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9072b30317d253fd6d50e9d98949cad4eaebfe7b --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc @@ -0,0 +1,121 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/math/math_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class InfeedTest : public ClientLibraryTestBase { + protected: + // Transfers the given literal to the infeed interface of the device, and + // checks that the data returned by the Infeed HLO is the same as the literal. + void TestInfeedRoundTrip(const Literal& literal) { + // TODO(b/30481585) Explicitly reset the Infeed state so that the + // test is not affected by the state from the previous tests. + ASSERT_IS_OK(client_->TransferToInfeed(literal)); + XlaBuilder builder(TestName()); + Infeed(&builder, literal.shape()); + if (ShapeUtil::IsTuple(literal.shape())) { + // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
+ ComputeAndCompareTuple(&builder, literal, {}); + } else { + ComputeAndCompareLiteral(&builder, literal, {}); + } + } +}; + +TEST_F(InfeedTest, SingleInfeedR0Bool) { + TestInfeedRoundTrip(*LiteralUtil::CreateR0(true)); +} + +TEST_F(InfeedTest, SingleInfeedR1U32) { + TestInfeedRoundTrip(*LiteralUtil::CreateR1({1, 2, 3})); +} + +TEST_F(InfeedTest, SingleInfeedR2F32) { + TestInfeedRoundTrip(*LiteralUtil::CreateR2F32Linspace(0.0, 1.0, 128, 64)); +} + +TEST_F(InfeedTest, SingleInfeedR3F32) { + TestInfeedRoundTrip( + *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); +} + +TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) { + const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2}); + const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0}); + + TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout( + {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0minor)); + + TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout( + {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0major)); +} + +TEST_F(InfeedTest, SingleInfeedR4S32) { + TestInfeedRoundTrip(*LiteralUtil::CreateR4( + {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}}, + {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}})); +} + +// Tests that a large infeed can be handled. +TEST_F(InfeedTest, LargeInfeed) { + Array4D array(80, 100, 8, 128); + array.FillIota(1.0f); + TestInfeedRoundTrip(*LiteralUtil::CreateR4FromArray4D(array)); +} + +TEST_F(InfeedTest, SingleInfeedTuple) { + TestInfeedRoundTrip( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1, 2, 3}).get(), + LiteralUtil::CreateR0(false).get()})); +} + +TEST_F(InfeedTest, SingleInfeedEmptyTuple) { + TestInfeedRoundTrip(*LiteralUtil::MakeTuple({})); +} + +// Tests that a large tuple infeed can be handled. 
+TEST_F(InfeedTest, SingleInfeedLargeTuple) { + Array4D array(40, 100, 8, 128); + array.FillIota(1.0f); + TestInfeedRoundTrip(*LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4FromArray4D(array).get(), + LiteralUtil::CreateR0(5).get()})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc new file mode 100644 index 0000000000000000000000000000000000000000..c78605cebbc671272b8df9faf0e0cc54be2f5b1c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/thunk.cc @@ -0,0 +1,59 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/thunk.h" + +namespace xla { +namespace gpu { + +std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) { + switch (kind) { + case Thunk::kConditional: + return os << "kConditional"; + case Thunk::kConvolution: + return os << "kConvolution"; + case Thunk::kCopy: + return os << "kCopy"; + case Thunk::kCudnnBatchNormBackward: + return os << "kCudnnBatchNormBackward"; + case Thunk::kCudnnBatchNormForwardInference: + return os << "kCudnnBatchNormForwardInference"; + case Thunk::kCudnnBatchNormForwardTraining: + return os << "kCudnnBatchNormForwardTraining"; + case Thunk::kFft: + return os << "kFft"; + case Thunk::kGemm: + return os << "kGemm"; + case Thunk::kInfeed: + return os << "kInfeed"; + case Thunk::kKernel: + return os << "kKernel"; + case Thunk::kMemset32BitValue: + return os << "kMemset32BitValue"; + case Thunk::kMemzero: + return os << "kMemzero"; + case Thunk::kOutfeed: + return os << "kOutfeed"; + case Thunk::kSequential: + return os << "kSequential"; + case Thunk::kTuple: + return os << "kTuple"; + case Thunk::kWhile: + return os << "kWhile"; + } +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 931c0bffab850362dbd2df975657dd47d9cbd3ae..e68bee035a029178844282995429eaa960cc4817 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -40,7 +41,7 @@ class GpuExecutable; // This is thread-compatible. 
class Thunk { public: - enum class Kind { + enum Kind { kConditional, kConvolution, kCopy, @@ -53,6 +54,7 @@ class Thunk { kKernel, kMemset32BitValue, kMemzero, + kOutfeed, kSequential, kTuple, kWhile, @@ -80,25 +82,18 @@ class Thunk { return Status::OK(); } - // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream) - // before calling ExecuteOnStream(stream). If it returns true, it's the - // user's responsibility to wait for all activity on the GPU to finish before - // calling ExecuteOnStream. - // - // This value is not required to be constant for a given Thunk. For example, - // a Thunk that performs autotuning may return true for its first run and - // false thereafter. - virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) { - return false; - } + // Returns true if this kernel will autotune for the stream device the next + // time it is run. + virtual bool WillAutotuneKernel(se::Stream* /*stream*/) { return false; } // Execute the kernel for the thunk on the given stream. This method must be // called after Initialize and can be called multiple times over Thunk's - // lifetime. Stream argument must be non-null. + // lifetime. 'stream' and 'profiler' must be non-null. // // Precondition: Initialize(stream->parent()) has been called. virtual Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) = 0; + se::Stream* stream, + HloExecutionProfiler* profiler) = 0; private: Kind kind_; @@ -108,6 +103,8 @@ class Thunk { // A sequence of thunks. 
using ThunkSequence = std::vector>; +std::ostream& operator<<(std::ostream& os, Thunk::Kind kind); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc index bdb062837c5ba4b588ea0d535a786f33fe4f4015..141f3219387940a08ef22cbcc0be0971a14c2cd6 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc @@ -144,16 +144,15 @@ const std::list& ThunkSchedule::DependsOn( string ThunkSchedule::ToString() const { string result = "Total order:\n"; for (Thunk* thunk : thunk_total_order_) { - tensorflow::strings::StrAppend(&result, "\t", - thunk->hlo_instruction()->ToString(), "\n"); + absl::StrAppend(&result, "\t", thunk->hlo_instruction()->ToString(), "\n"); } - tensorflow::strings::StrAppend(&result, "Dependencies:\n"); + absl::StrAppend(&result, "Dependencies:\n"); for (const auto& entry : depends_on_) { const Thunk* dependent = entry.first; for (const Thunk* dependency : entry.second) { - tensorflow::strings::StrAppend( - &result, "\t", dependent->hlo_instruction()->name(), " depends on ", - dependency->hlo_instruction()->name(), "\n"); + absl::StrAppend(&result, "\t", dependent->hlo_instruction()->name(), + " depends on ", dependency->hlo_instruction()->name(), + "\n"); } } return result; diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index 97cb04c38fbf18e516857f5269c984696ca204c3..989b542ff4503600b2e3c751a23345959fab6fd6 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -15,30 +15,42 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace gpu { Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { - std::vector tuple_element_buffer_addresses; - for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { - tuple_element_buffer_addresses.push_back( - buffer_allocations.GetDeviceAddress(tuple_element_buffer).opaque()); + se::Stream* stream, + HloExecutionProfiler* profiler) { + auto size = tuple_element_buffers_.size(); + auto tuple_element_buffer_addresses = absl::make_unique(size); + for (int i = 0; i != size; ++i) { + tuple_element_buffer_addresses[i] = + buffer_allocations.GetDeviceAddress(tuple_element_buffers_[i]).opaque(); } se::DeviceMemory dest_buffer_address( buffer_allocations.GetDeviceAddress(dest_buffer_)); - auto host_size = tuple_element_buffer_addresses.size() * sizeof(void*); + auto host_size = size * sizeof(void*); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); if (!stream ->ThenMemcpy(&dest_buffer_address, - tuple_element_buffer_addresses.data(), host_size) + tuple_element_buffer_addresses.get(), host_size) .ok()) { return InternalError( "Unable to launch MemcpyH2D from %p to %p with size %lu", - tuple_element_buffer_addresses.data(), dest_buffer_address.opaque(), - sizeof(void*) * tuple_element_buffer_addresses.size()); + tuple_element_buffer_addresses.get(), dest_buffer_address.opaque(), + host_size); + } + // Free the tuple address buffer when memcpy is done. 
+ auto* buffers_raw = tuple_element_buffer_addresses.release(); + if (!stream->ThenDoHostCallback([buffers_raw] { delete[] buffers_raw; }) + .ok()) { + delete[] buffers_raw; + return InternalError("Unable to enqueue host callback!"); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index 951f809b51937c97a6e7de0345ec58a8b66a4242..dcdbf2cf3c2aa87cc11a3473a765cb405b50e2a6 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -18,11 +18,12 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -33,8 +34,7 @@ namespace gpu { // issue (b/31336476). 
class TupleThunk : public Thunk { public: - TupleThunk(tensorflow::gtl::ArraySlice - tuple_element_buffers, + TupleThunk(absl::Span tuple_element_buffers, const BufferAllocation::Slice& dest_buffer, const HloInstruction* hlo_instruction) : Thunk(Kind::kTuple, hlo_instruction), @@ -46,7 +46,8 @@ class TupleThunk : public Thunk { TupleThunk& operator=(const TupleThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 30b9640c4c75dae61e9a90da5fb10e9d4a90cd26..c4754fe378960834e1157b0ff25c03c0fc4754c7 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -15,7 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/while_thunk.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" @@ -29,10 +30,14 @@ WhileThunk::WhileThunk( const HloInstruction* hlo) : Thunk(Kind::kWhile, hlo), condition_result_buffer_index_(condition_result_buffer_index), - condition_thunk_sequence_(MakeUnique( - std::move(*condition_thunk_sequence), hlo)), - body_thunk_sequence_( - MakeUnique(std::move(*body_thunk_sequence), hlo)) {} + // Pass nullptr as the HloInstruction* to the condition_thunk_sequence_ + // and body_thunk_sequence_ constructors because these SequentialThunks + // are logically "part of" this WhileThunk, and shouldn't be profiled + // separately from it. 
+ condition_thunk_sequence_(absl::make_unique( + std::move(*condition_thunk_sequence), nullptr)), + body_thunk_sequence_(absl::make_unique( + std::move(*body_thunk_sequence), nullptr)) {} Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { @@ -43,32 +48,44 @@ Status WhileThunk::Initialize(const GpuExecutable& executable, } Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) { + se::Stream* stream, + HloExecutionProfiler* profiler) { se::DeviceMemoryBase condition_result_data = buffer_allocations.GetDeviceAddress(condition_result_buffer_index_); + auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); while (true) { // Invoke thunk sequence for while 'condition' computation. - TF_RETURN_IF_ERROR( - condition_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream)); + profiler->StartHloComputation(); + VLOG(3) << "Executing condition computation"; + TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream( + buffer_allocations, stream, profiler)); + profiler->FinishHloComputation(hlo_instruction()->while_condition()); // Copy the result of condition computation and break the loop if 'false'. bool condition_result; stream->ThenMemcpy(&condition_result, condition_result_data, sizeof(bool)); + VLOG(3) << "condition_result = " << condition_result; Status block_status = stream->BlockHostUntilDone(); if (!block_status.ok()) { return InternalError( "Failed to complete all kernels launched on stream %p: %s", stream, - block_status.error_message().c_str()); + block_status.error_message()); } if (!condition_result) { break; } - // Invoke thunk sequence for while 'body' computation. - TF_RETURN_IF_ERROR( - body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream)); + // We measure the time of one execution of the while body computation. The + // while body may be executed more than once, the last measurement "wins". 
+ profiler->StartHloComputation(); + VLOG(3) << "Executing body computation"; + // Invoke thunk sequence for while 'body' computation, and pass on + // 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'. + TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations, + stream, profiler)); + profiler->FinishHloComputation(hlo_instruction()->while_body()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 22176685a92df9c95b10f755b209309843c0fa3a..9270f95ee67cf0bd3ab8082452a9d8703cb4304e 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -48,7 +49,8 @@ class WhileThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream) override; + se::Stream* stream, + HloExecutionProfiler* profiler) override; private: const BufferAllocation::Slice condition_result_buffer_index_; diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc deleted file mode 100644 index ad55728c45599c801aad7e12fac95ae9f0c4fc3b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc +++ /dev/null @@ -1,521 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" - -#include -#include - -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace xla { -namespace gpu { - -namespace { - -// TODO(b/33483676) Use an expression tree to specify computations to pattern -// match for while transformations. - -// ExprTree is a simple recursive data structure used to express computation -// patterns to match. -// -// Each ExprTree node is comprised of an HloOpcode, and a set of operands (each -// of type ExprTree). Operands can be added by specifying the index and -// HloOpcode of the operand. 
-// -// For example, the following computation: -// -// Parameter -// | -// Const GetTupleElement -// \ / -// Add (root) -// -// Can be matched with the following expression tree: -// -// ExprTree add(HloOpcode::kAdd, -// ExprTree(HloOpcode::kConstant), -// ExprTree(HloOpcode::kGetTupleElement, -// tuple_index, ExprTree(HloOpcode::kParameter))); -// -// Match the ExprTree root against an Hlo graph: -// -// ExprTree::TaggedInstructionMap tagged_instructions; -// TF_RETURN_IF_ERROR(add.Match(computation_->root_instruction(), -// &tagged_instructions)); -// -// Instructions that are "tagged" with a context-specific string will -// be returned in 'tagged_instructions' for further processing (i.e. parsing -// constants or recording the tuple_index). -// -class ExprTree { - public: - explicit ExprTree(HloOpcode opcode) : opcode_(opcode) {} - ExprTree(HloOpcode opcode, const string& tag) : opcode_(opcode), tag_(tag) {} - ExprTree(HloOpcode opcode, const ExprTree& operand0) : opcode_(opcode) { - SetOperand(0, operand0); - } - ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0) - : opcode_(opcode) { - SetOperand(index0, operand0); - } - ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0, - int64 index1, const ExprTree& operand1) - : opcode_(opcode) { - SetOperand(index0, operand0); - SetOperand(index1, operand1); - } - ExprTree(HloOpcode opcode, const string& tag, const ExprTree& operand0) - : opcode_(opcode), tag_(tag) { - SetOperand(0, operand0); - } - ExprTree(HloOpcode opcode, const ExprTree& operand0, const ExprTree& operand1) - : opcode_(opcode) { - SetOperand(0, operand0); - SetOperand(1, operand1); - } - - ExprTree(const ExprTree& to_copy) { - opcode_ = to_copy.opcode_; - tag_ = to_copy.tag_; - if (to_copy.fused_root_tree_ != nullptr) { - fused_root_tree_.reset(new ExprTree(*to_copy.fused_root_tree_)); - } - for (auto& pair : to_copy.operands_) { - CHECK(operands_.find(pair.first) == operands_.end()); - operands_.insert(std::make_pair( 
- pair.first, std::unique_ptr(new ExprTree(*pair.second)))); - } - } - - void SetFusedRoot(const ExprTree& fused_root) { - fused_root_tree_.reset(new ExprTree(fused_root)); - } - - typedef std::unordered_map - TaggedInstructionMap; - - // Matches 'instruction' HloOpcode against 'opcode_'. - // Recursively matches each operand in 'operands_'. - // Recursively matches fused instructions starting at 'fused_root_tree_' - // if 'opcode_ == kFusion'. - // Returns OK status, and instructions in 'tagged_instructions' for each - // matched ExprTree node with a non-empty 'tag_'. - // Returns error message on failure. - Status Match(const HloInstruction* instruction, - TaggedInstructionMap* tagged_instructions) const { - if (opcode_ != instruction->opcode()) { - return InvalidArgument("got opcode %s, want %s", - HloOpcodeString(instruction->opcode()).c_str(), - HloOpcodeString(opcode_).c_str()); - } - - VLOG(2) << "Matched " << HloOpcodeString(opcode_) << ": " << tag_; - if (!tag_.empty()) { - tagged_instructions->insert({tag_, instruction}); - } - - if (instruction->opcode() == HloOpcode::kFusion) { - CHECK(fused_root_tree_ != nullptr); - // Match fused instructions for this node starting a 'fused_root_tree'. - TF_RETURN_IF_ERROR(fused_root_tree_->Match( - instruction->fused_expression_root(), tagged_instructions)); - } - - // Match each operand in 'operands_'. - for (auto& pair : operands_) { - TF_RETURN_IF_ERROR(pair.second->Match(instruction->operand(pair.first), - tagged_instructions)); - } - return Status::OK(); - } - - private: - void SetOperand(int64 index, const ExprTree& operand) { - CHECK_EQ(0, operands_.count(index)); - operands_.insert(std::make_pair(index, MakeUnique(operand))); - } - - HloOpcode opcode_; - std::unordered_map> operands_; - std::unique_ptr fused_root_tree_; - string tag_; -}; - -// MatcherBase is a base class that provides common functionality for -// sub-classes which match specific target sub-computations (i.e. 
loop -// induction variable initialization, comparison and update). -class MatcherBase { - public: - MatcherBase() {} - virtual ~MatcherBase() {} - - // Attempts to match each ExprTree in 'expr_trees_'. - // Returns OK on the first successful match, error status otherwise. - virtual Status Run() { - Status status; - for (const ExprTree& expr_tree : expr_trees_) { - status = MatchExprTree(expr_tree); - if (status.ok()) { - return status; - } - } - return status; - } - - virtual Status MatchExprTree(const ExprTree& expr_tree) = 0; - - // Returns the constant value parsed form kConstant 'instruction'. - // Returns error status otherwise. - Status ParseConstInteger(const HloInstruction* instruction, - int64* const_value) const { - CHECK_EQ(HloOpcode::kConstant, instruction->opcode()); - PrimitiveType element_type = instruction->shape().element_type(); - if (element_type != S32 && element_type != S64) { - return InvalidArgument("Expected constant of integral type."); - } - const Literal& literal = instruction->literal(); - PrimitiveType type = literal.shape().element_type(); - if (type != S32 && type != S64) { - return InvalidArgument("Must use S32 or S64 integral types."); - } - if (type == S32) { - *const_value = static_cast(literal.GetFirstElement()); - } else if (type == S64) { - *const_value = literal.GetFirstElement(); - } - return Status::OK(); - } - - StatusOr GetTaggedInstruction( - const string& tag, - const ExprTree::TaggedInstructionMap& tagged_instructions) { - auto it = tagged_instructions.find(tag); - if (it == tagged_instructions.end()) { - return InvalidArgument("Cound not find instruction for tag: %s", - tag.c_str()); - } - return it->second; - } - - protected: - std::vector expr_trees_; - - private: - TF_DISALLOW_COPY_AND_ASSIGN(MatcherBase); -}; - -// WhileConditionComputationMatcher attempts to match a target computation -// pattern in the while condition sub-computation. 
-// If the target pattern is matched, two pieces of information are extracted -// from 'tagged' instructions returned by the matcher: -// -// *) 'tuple_index': -// *) The loop induction variable tuple_index from the GetTupleElement -// instruction of the matched computation. -// *) Used in subsequent matching passes of while init operand and body -// computations to select loop induction variable tuple element. -// -// *) 'loop_limit': -// *) The integral value from Constant root operand in matched computation. -// *) Used as the constant for the loop limit. -// -class WhileConditionComputationMatcher : public MatcherBase { - public: - explicit WhileConditionComputationMatcher(const HloComputation* computation) - : computation_(computation) { - expr_trees_.emplace_back(BuildCondExprTree()); - } - - int64 loop_limit() const { return loop_limit_; } - int64 tuple_index() const { return tuple_index_; } - - private: - // Builds expression tree for the following condition computation: - // - // Const Parameter - // \ / - // Fusion ------------> FusionParam FusionParam - // \ / - // GTE / - // \ / - // LessThan (fused root) - // - ExprTree BuildCondExprTree() { - // Build ExprTree for fused instructions. - ExprTree fused_root( - HloOpcode::kLt, - ExprTree(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0")), - ExprTree(HloOpcode::kParameter)); - - // Build top-level computation. - ExprTree root(HloOpcode::kFusion, - ExprTree(HloOpcode::kConstant, "loop_limit"), - ExprTree(HloOpcode::kParameter, "param0")); - - root.SetFusedRoot(fused_root); - return root; - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while condition"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), - &tagged_instructions)); - - // Get tagged GTE instruction and set 'tuple_index_'. 
- TF_ASSIGN_OR_RETURN(const HloInstruction* gte, - GetTaggedInstruction("gte", tagged_instructions)); - tuple_index_ = gte->tuple_index(); - - // Get tagged Constant instruction and parse 'loop_limit_'. - TF_ASSIGN_OR_RETURN( - const HloInstruction* const_hlo, - GetTaggedInstruction("loop_limit", tagged_instructions)); - TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_limit_)); - - // Get tagged "param0" instruction, and check that it matches - // 'computation_' parameter 0. - TF_ASSIGN_OR_RETURN(const HloInstruction* param0, - GetTaggedInstruction("param0", tagged_instructions)); - if (param0 != computation_->parameter_instruction(0)) { - return InvalidArgument("Unexpected Parameter0 instruction : %s", - param0->name().c_str()); - } - - // Get tagged 'gte.fusion_param.param0', find its associated fusion operand, - // and compare it to 'computation_' parameter0. - TF_ASSIGN_OR_RETURN( - const HloInstruction* gte_fusion_param0, - GetTaggedInstruction("gte.fusion_param.param0", tagged_instructions)); - CHECK_EQ(HloOpcode::kParameter, gte_fusion_param0->opcode()); - CHECK(gte_fusion_param0->IsFused()); - if (gte_fusion_param0->parent()->FusionInstruction()->operand( - gte_fusion_param0->parameter_number()) != - computation_->parameter_instruction(0)) { - return InvalidArgument("Could not match fusion param: %s", - gte_fusion_param0->name().c_str()); - } - - return Status::OK(); - } - - const HloComputation* computation_; - - int64 loop_limit_ = -1; - int64 tuple_index_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileConditionComputationMatcher); -}; - -// WhileInitOperandMatcher matches a target computation pattern of the -// while instructions 'init' operand, indexing the tuple at 'tuple_index'. -// On success, parses constant 'loop_start' which represents the loop induction -// variable start values, then returns OK. -// Returns error status otherwise. 
-class WhileInitOperandMatcher : public MatcherBase { - public: - WhileInitOperandMatcher(const HloInstruction* while_hlo, - const int64 tuple_index) - : while_hlo_(while_hlo), tuple_index_(tuple_index) { - expr_trees_.emplace_back(BuildInitExprTree()); - } - - int64 loop_start() const { return loop_start_; } - - private: - // Builds expression tree for the following while init operand subcomputation: - // - // Const - // | - // Copy - // | - // Tuple0 - // | - // While - // - ExprTree BuildInitExprTree() { - return ExprTree( - HloOpcode::kWhile, "while", - ExprTree(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, - ExprTree(HloOpcode::kConstant, "loop_start")))); - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while init"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(while_hlo_, &tagged_instructions)); - - // Get tagged while instruction check against 'while_hlo_'. - TF_ASSIGN_OR_RETURN(const HloInstruction* while_hlo, - GetTaggedInstruction("while", tagged_instructions)); - if (while_hlo != while_hlo_) { - return InvalidArgument("Expected While for instruction : %s", - while_hlo->name().c_str()); - } - - // Get tagged Constant instruction and parse 'loop_start_'. - TF_ASSIGN_OR_RETURN( - const HloInstruction* const_hlo, - GetTaggedInstruction("loop_start", tagged_instructions)); - TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_start_)); - - return Status::OK(); - } - - const HloInstruction* while_hlo_; - const int64 tuple_index_; - - int64 loop_start_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileInitOperandMatcher); -}; - -// WhileBodyComputationMatcher matches a target computation pattern for -// the loop induction variable update. 
Matching proceeds from the while body -// computation root[tuple_index] to param[tuple_index], where 'tuple_index' -// If the target pattern is matched, parses a constant which represents the -// loop induction variable increment value, then returns status OK. -// Returns error status otherwise. -class WhileBodyComputationMatcher : public MatcherBase { - public: - WhileBodyComputationMatcher(const HloComputation* computation, - const int64 tuple_index) - : computation_(computation), tuple_index_(tuple_index) { - expr_trees_.emplace_back(BuildBodyExprTree(0, 1)); - expr_trees_.emplace_back(BuildBodyExprTree(1, 0)); - } - - int64 loop_increment() const { return loop_increment_; } - - private: - // Builds expression tree for the following while body computation: - // - // - // FusionParam FusionParam - // \ / - // Const Param \ GTE1 - // \ / \ / - // Fusion -----------> Add - // | - // Copy - // | - // Tuple0 - // - ExprTree BuildBodyExprTree(const int64 const_index, const int64 gte_index) { - // Build ExprTree for fused instructions. - ExprTree gte1 = - ExprTree(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0")); - ExprTree fused_root(HloOpcode::kAdd, const_index, - ExprTree(HloOpcode::kParameter), gte_index, gte1); - - // Build fusion instruction (and set fused root). - ExprTree fusion(HloOpcode::kFusion, 0, - ExprTree(HloOpcode::kConstant, "loop_increment"), 1, - ExprTree(HloOpcode::kParameter, "param0")); - fusion.SetFusedRoot(fused_root); - - // Build top-level computation. 
- ExprTree tuple0(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, fusion)); - return tuple0; - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while body"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), - &tagged_instructions)); - - for (const auto& pair : tagged_instructions) { - const auto& tag = pair.first; - const auto& inst = pair.second; - - if (tag == "gte" && inst->tuple_index() != tuple_index_) { - // Check that the matched GTE instruction is at the 'tuple_index' we - // matched in the while condition computation. - return InvalidArgument("Unexpected tuple index instruction : %s", - inst->name().c_str()); - } else if (tag == "loop_increment") { - // Parse the constant which represents the loop induction variable - // increment value. - TF_RETURN_IF_ERROR(ParseConstInteger(inst, &loop_increment_)); - } else if (tag == "param0" && - inst != computation_->parameter_instruction(0)) { - // Check that the matched parameter == parameter 0 from 'computation_'. - return InvalidArgument("Unexpected Parameter0 instruction : %s", - inst->name().c_str()); - } else if (tag == "gte.fusion_param.param0") { - // Fusion parameter: lookup and compare with associated fusion operand. 
- CHECK_EQ(HloOpcode::kParameter, inst->opcode()); - CHECK(inst->IsFused()); - if (inst->parent()->FusionInstruction()->operand( - inst->parameter_number()) != - computation_->parameter_instruction(0)) { - return InvalidArgument("Could not match fusion param: %s", - inst->name().c_str()); - } - } - } - return Status::OK(); - } - - const HloComputation* computation_; - const int64 tuple_index_; - - int64 loop_increment_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileBodyComputationMatcher); -}; - -} // namespace - -StatusOr> CanTransformWhileToFor( - const HloInstruction* while_hlo) { - if (while_hlo->opcode() != HloOpcode::kWhile) { - return InvalidArgument("Expected While instruction."); - } - - WhileConditionComputationMatcher cond_matcher(while_hlo->while_condition()); - TF_RETURN_IF_ERROR(cond_matcher.Run()); - - WhileInitOperandMatcher init_matcher(while_hlo, cond_matcher.tuple_index()); - TF_RETURN_IF_ERROR(init_matcher.Run()); - - WhileBodyComputationMatcher body_matcher(while_hlo->while_body(), - cond_matcher.tuple_index()); - TF_RETURN_IF_ERROR(body_matcher.Run()); - - // Check for valid For loop parameters. - if (init_matcher.loop_start() >= cond_matcher.loop_limit()) { - return InvalidArgument("Loop start must be less than loop limit."); - } - if (body_matcher.loop_increment() <= 0) { - return InvalidArgument("Loop increment must greater than zero."); - } - return std::make_tuple(init_matcher.loop_start(), cond_matcher.loop_limit(), - body_matcher.loop_increment()); -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.h b/tensorflow/compiler/xla/service/gpu/while_transformer.h deleted file mode 100644 index fe3a954e1828ee4a323872eea81f64c7e780ad24..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ - -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/statusor.h" - -namespace xla { -namespace gpu { - -// Runs an analysis of the while loop instruction 'while_hlo' (and its -// associated sub-computations) to determine if it can be transformed into an -// equivalent "for" loop with the following "for" loop parameters: -// -// *) 'loop_start': loop induction variable starting value. -// *) 'loop_limit': loop induction variable limit value. -// *) 'loop_increment': loop induction variable per-iteration increment value. -// -// Returns an std::tuple = (loop_start, loop_limit, loop_increment) on success. -// The values in the returned tuple are values extracted from the 'while_hlo' -// operand (and its sub-computations) during analysis. -// Returns an error status on failure. 
-StatusOr> CanTransformWhileToFor( - const HloInstruction* while_hlo); - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc index 2f290f61bd527e9827472a78256f015e066e44be..40183de96ee363996e6b0b883a78e7a8b5d13ab2 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" - #include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -42,7 +41,7 @@ class WhileTransformerTest : public HloTestBase { const int64 tuple_index, const int64 limit) { auto builder = HloComputation::Builder(TestName() + ".Condition"); auto limit_const = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(limit))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(limit))); auto loop_state = builder.AddInstruction(HloInstruction::CreateParameter( 0, GetLoopStateShape(tuple_index), "loop_state")); auto induction_variable = @@ -65,8 +64,8 @@ class WhileTransformerTest : public HloTestBase { auto induction_variable = builder.AddInstruction(HloInstruction::CreateGetTupleElement( induction_variable_shape_, loop_state, ind_var_tuple_index)); - auto inc = builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(increment))); + auto inc = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(increment))); auto add0 = builder.AddInstruction(HloInstruction::CreateBinary( induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc)); // Update data GTE(data_tuple_index). @@ -89,10 +88,12 @@ class WhileTransformerTest : public HloTestBase { const int64 ind_var_tuple_index, const int64 ind_var_init) { auto builder = HloComputation::Builder(TestName() + ".While"); - auto induction_var_init = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(ind_var_init))); - auto data_init = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); + auto induction_var_init = + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(ind_var_init))); + auto data_init = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1( + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}))); auto loop_state_init = ind_var_tuple_index == 0 ? builder.AddInstruction( @@ -108,16 +109,17 @@ class WhileTransformerTest : public HloTestBase { void RunFusionPasses() { // Run standard fusion passes. 
- EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/false) - .Run(module_.get()) - .ValueOrDie()); - EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/true) - .Run(module_.get()) - .ValueOrDie()); + TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .status()); + TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .status()); } void RunCopyInsertionPass() { - HloVerifier verifier; + HloVerifier verifier(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); TF_ASSERT_OK(verifier.Run(module_.get()).status()); CopyInsertion copy_insertion; TF_ASSERT_OK(copy_insertion.Run(module_.get()).status()); @@ -139,10 +141,7 @@ class WhileTransformerTest : public HloTestBase { Shape condition_result_shape_; }; -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) { +TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) { // Build computation with induction variable at tuple element 0. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); @@ -151,18 +150,13 @@ TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - TF_ASSERT_OK(result.status()); - // Check results. - EXPECT_THAT(result.ConsumeValueOrDie(), - Eq(std::tuple(0, 10, 1))); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(10, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. 
Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) { +TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) { // Build computation with induction variable at tuple element 1. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(1, 10)); @@ -171,19 +165,14 @@ TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - TF_ASSERT_OK(result.status()); - // Check results. - EXPECT_THAT(result.ConsumeValueOrDie(), - Eq(std::tuple(0, 10, 1))); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(10, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) { - // Build computation with invalid loop limit. +TEST_F(WhileTransformerTest, ImpossibleLoopLimit) { + // Build computation with an impossible loop limit. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 5)); auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); @@ -191,17 +180,13 @@ TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - ASSERT_FALSE(result.ok()); - EXPECT_THAT(result.status().error_message(), - HasSubstr("Loop start must be less than loop limit.")); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(0, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. 
Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) { +TEST_F(WhileTransformerTest, InvalidLoopIncrement) { // Build computation with invalid loop increment. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); @@ -210,11 +195,9 @@ TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - ASSERT_FALSE(result.ok()); - EXPECT_THAT(result.status().error_message(), - HasSubstr("Loop increment must greater than zero.")); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_FALSE(result); } } // namespace diff --git a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..dd46ff433ba0ad6bfa3999b96845fdaebe148aca --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h @@ -0,0 +1,90 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XFEED_QUEUE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XFEED_QUEUE_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace xla { +namespace gpu { + +// TODO(b/30467474) Once GPU outfeed implementation settles, consider +// folding back the cpu and gpu outfeed implementations into a generic +// one if possible. + +// Manages a thread-safe queue of buffers. +template +class XfeedQueue { + public: + // Adds a tree of buffers to the queue. The individual buffers correspond to + // the elements of a tuple and may be nullptr if the buffer is a tuple index + // buffer. + void EnqueueDestination(BufferType buffers) { + tensorflow::mutex_lock l(mu_); + enqueued_buffers_.push_back(std::move(buffers)); + cv_.notify_one(); + } + + // Blocks until the queue is non-empty, then returns the buffer at the head of + // the queue. + BufferType BlockingGetNextDestination() { + bool became_empty; + BufferType current_buffer; + { + tensorflow::mutex_lock l(mu_); + while (enqueued_buffers_.empty()) { + cv_.wait(l); + } + current_buffer = std::move(enqueued_buffers_.front()); + enqueued_buffers_.pop_front(); + became_empty = enqueued_buffers_.empty(); + } + if (became_empty) { + for (const auto& callback : on_empty_callbacks_) { + callback(); + } + } + return current_buffer; + } + + void RegisterOnEmptyCallback(std::function callback) { + on_empty_callbacks_.push_back(std::move(callback)); + } + + private: + tensorflow::mutex mu_; + + // Condition variable that is signaled every time a buffer is enqueued. + tensorflow::condition_variable cv_; + + // The queue of trees of buffers. Buffer* queue contents are not owned. 
+ std::deque enqueued_buffers_ GUARDED_BY(mu_); + + // List of callbacks which will be called when 'enqueued_buffers_' becomes + // empty. + std::vector> on_empty_callbacks_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XFEED_QUEUE_H_ diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc index acf661148699dab18916e3065ee647d37fda6208..a2be89511babc23ebcd5cb40abee2a95d16dc451 100644 --- a/tensorflow/compiler/xla/service/graphviz_example.cc +++ b/tensorflow/compiler/xla/service/graphviz_example.cc @@ -22,8 +22,10 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -32,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/types.h" @@ -42,12 +43,11 @@ namespace { // Adds a computation to the given HLO module which adds a scalar constant to // its parameter and returns the result. 
HloComputation* AddScalarConstantComputation(int64 addend, HloModule* module) { - auto builder = - HloComputation::Builder(tensorflow::strings::StrCat("add_", addend)); + auto builder = HloComputation::Builder(absl::StrCat("add_", addend)); auto x_value = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {}), "x_value")); auto half = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.5))); builder.AddInstruction(HloInstruction::CreateBinary( half->shape(), HloOpcode::kAdd, x_value, half)); return module->AddEmbeddedComputation(builder.Build()); @@ -83,7 +83,7 @@ HloComputation* CallForwardingComputation(HloComputation* computation, // the module. std::unique_ptr MakeBigGraph() { HloModuleConfig config; - auto module = MakeUnique("BigGraph", config); + auto module = absl::make_unique("BigGraph", config); auto builder = HloComputation::Builder("TestBigGraphvizGraph"); @@ -122,7 +122,7 @@ std::unique_ptr MakeBigGraph() { auto rng = builder.AddInstruction( HloInstruction::CreateRng(vshape, RNG_UNIFORM, {param_m, param_m})); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto add_computation = ScalarSumComputation(module.get()); builder.AddInstruction( HloInstruction::CreateReduce(vshape, rng, one, {1}, add_computation)); diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 06a5e0351b63270b61b998ca2211f480f256f759..38c3982ebf170d5733d56a05106835d1eaa4f2e1 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/util.h" @@ -26,6 +27,47 @@ namespace xla { using tensorflow::gtl::FlatMap; using tensorflow::gtl::FlatSet; +/*static*/ +StatusOr HeapSimulator::MinimumMemoryForModule( + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const LogicalBuffer::SizeFunction& size_function) { + if (module_sequence.empty()) { + return 0; + } + + const HloModule* module = module_sequence.begin()->first->parent(); + TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, + TuplePointsToAnalysis::Run(module)); + + // The absolute minimum memory required for a given sequence of instructions + // is determined by the sequence of Alloc and Free calls on a simulated heap, + // ignoring fragmentation. We run the heap simulation on the whole module, + // rather than summing each computation, since it gives us a better lower + // bound, by minimizing the liveness of sub-computations. 
+ TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(absl::make_unique(), *module, + module_sequence, *points_to_analysis, size_function)); + return result.heap_size; +} + +/*static*/ +StatusOr HeapSimulator::MinimumMemoryForComputation( + const HloComputation& computation, + const std::vector& sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap* + memory_by_computation) { + TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(absl::make_unique(), + computation, sequence, points_to_analysis, + size_function, HeapSimulator::Options(), + memory_by_computation)); + return result.heap_size; +} + /*static*/ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloModule& module, @@ -46,9 +88,11 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloComputation& computation, const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, - const BufferValue::SizeFunction& size_fn, const Options& options) { + const BufferValue::SizeFunction& size_fn, const Options& options, + const tensorflow::gtl::FlatMap* + memory_by_computation) { HeapSimulator heap(std::move(algorithm), size_fn, options, - /*module_sequence=*/nullptr); + /*module_sequence=*/nullptr, memory_by_computation); TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, points_to_analysis)); return heap.Finish(); @@ -100,7 +144,7 @@ Status HeapSimulator::RunComputation( } } else { // A GetTupleElement doesn't need to keep all of its operand's buffers - // alive. It only needs the buffers that relate to the element its + // alive. It only needs the buffers that relate to the element it's // extracting, and the tuple it's extracting from, but not the buffers // for the other elements. 
for (const BufferValue* buffer : points_to.element({})) { @@ -188,6 +232,9 @@ Status HeapSimulator::RunComputation( // // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer // that we should assign. + + // Make sure each buffer get reused at most once. + FlatSet reused_buffers; for (const BufferValue* buffer : buffers_defined_by_instruction) { if (IgnoreBuffer(buffer)) { continue; @@ -200,6 +247,9 @@ Status HeapSimulator::RunComputation( bool shared = false; if (options_.may_reuse_operand_buffers) { for (const BufferValue* operand_buffer : operand_buffers_to_free) { + if (reused_buffers.count(operand_buffer) != 0) { + continue; + } if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && buffer->instruction()->opcode() != HloOpcode::kCopy && points_to_analysis.CanShareOperandBufferWithUser( @@ -209,6 +259,7 @@ Status HeapSimulator::RunComputation( << operand_buffer->ToString(); ShareBuffer(buffer, operand_buffer, instruction); shared = true; + reused_buffers.insert(operand_buffer); break; } } @@ -219,14 +270,20 @@ Status HeapSimulator::RunComputation( Alloc(buffer, instruction); } } + // Account for the memory used by subcomputations when estimating the + // current heap size. + if (memory_by_computation_ != nullptr) { + algorithm_->AccountForSubcomputationMemory(instruction, + *memory_by_computation_); + } - // If the whole module is sequential, we can save memory by running the - // heap-simulation for sub-computations inline. E.g. the buffers for the - // condition and body of a kWhile instruction are only live for the duration - // of the instruction itself. + // If all computations in the module have been scheduled, we can save memory + // by running the heap-simulation for sub-computations inline. E.g. the + // buffers for the condition and body of a kWhile instruction are only live + // for the duration of the instruction itself. 
// // The order that the sub-computations are simulated does not affect - // correctness; since the whole module is sequential, we know that the + // correctness; since the whole module has been scheduled, we know that the // sub-computations will never be run concurrently. if (module_sequence_ != nullptr) { if (instruction->opcode() == HloOpcode::kCall || @@ -286,12 +343,15 @@ Status HeapSimulator::RunComputation( HeapSimulator::HeapSimulator( std::unique_ptr algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, - const SequentialHloOrdering::HloModuleSequence* module_sequence) - : no_fragmentation_stats_(MakeUnique()), + const SequentialHloOrdering::HloModuleSequence* module_sequence, + const tensorflow::gtl::FlatMap* + memory_by_computation) + : no_fragmentation_stats_(absl::make_unique()), algorithm_(std::move(algorithm)), size_fn_(size_fn), options_(options), - module_sequence_(module_sequence) { + module_sequence_(module_sequence), + memory_by_computation_(memory_by_computation) { debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr); } @@ -320,9 +380,10 @@ void HeapSimulator::Alloc(const BufferValue* buffer, allocated_buffers_.insert(buffer); const int64 size = size_fn_(*buffer); - algorithm_->Alloc(buffer, size); - no_fragmentation_stats_->Alloc(buffer, size); - + const HloInstruction* instruction_to_calc_aliasing = + memory_by_computation_ == nullptr ? 
nullptr : instruction; + algorithm_->Alloc(buffer, size, instruction_to_calc_aliasing); + no_fragmentation_stats_->Alloc(buffer, size, instruction_to_calc_aliasing); FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction, nullptr); } @@ -460,6 +521,38 @@ void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) { } } +void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size, + const HloInstruction* instruction) { + // The output buffer of while/call/conditional is always aliased with the + // output buffer of the root instruction in the body. Don't double count. + if (instruction == nullptr || + (instruction->opcode() != HloOpcode::kWhile && + instruction->opcode() != HloOpcode::kCall && + instruction->opcode() != HloOpcode::kConditional)) { + Alloc(buffer, size); + } +} + +void NoFragmentationStatsHeap::AccountForSubcomputationMemory( + const HloInstruction* instruction, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + // We only count the memory usage of the largest subcomputation, instead of + // adding them all, because subcomputations won't execute in parallel. 
+ int64 max_subcomputation_bytes = 0; + for (const auto* c : instruction->called_computations()) { + auto it = memory_by_computation.find(c); + if (it != memory_by_computation.end()) { + int64 subcomputation_bytes = it->second; + if (subcomputation_bytes > max_subcomputation_bytes) { + max_subcomputation_bytes = subcomputation_bytes; + } + } + } + max_heap_size_ = + std::max(max_heap_size_, current_heap_size_ + max_subcomputation_bytes); +} + void NoFragmentationStatsHeap::Free(const BufferValue* buffer, int64 size) { current_heap_size_ -= size; } diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 8b2b43a37a5c41d334e5338c6a6fad160f03a51e..af05bedee72d4878f83765e5a5c5baf61bd71ba2 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -36,6 +36,7 @@ namespace xla { // Forward declare classes defined below. class HeapAlgorithm; +class NoFragmentationStatsHeap; // HeapSimulator assigns buffer offsets by running a simulation of a regular // memory heap with Alloc and Free calls. It only works for completely @@ -85,6 +86,23 @@ class HeapSimulator { const BufferValueFlatSet* buffers_to_assign; }; + // Returns the minimum memory required to compute an HLO module where all + // computations have been scheduled (represented by the given + // module_sequence), assuming no fragmentation. + static StatusOr MinimumMemoryForModule( + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const LogicalBuffer::SizeFunction& size_function); + + // Returns the minimum memory required to compute the given computation, + // assuming no fragmentation. 
+ static StatusOr MinimumMemoryForComputation( + const HloComputation& computation, + const std::vector& sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap* + memory_by_computation = nullptr); + // Run the heap simulation with the given algorithm, assuming the given // module_sequence, which must contain a topologically-consistent total // ordering of all instructions within each computation. The result is invalid @@ -111,7 +129,9 @@ class HeapSimulator { const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, const BufferValue::SizeFunction& size_fn, - const Options& options = Options()); + const Options& options = Options(), + const tensorflow::gtl::FlatMap* + memory_by_computation = nullptr); private: // If 'module_sequence' is non-null, it is used to find kCall and kWhile @@ -120,7 +140,9 @@ class HeapSimulator { HeapSimulator( std::unique_ptr algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, - const SequentialHloOrdering::HloModuleSequence* module_sequence); + const SequentialHloOrdering::HloModuleSequence* module_sequence = nullptr, + const tensorflow::gtl::FlatMap* + memory_by_computation = nullptr); ~HeapSimulator(); Status RunComputation( @@ -140,11 +162,20 @@ class HeapSimulator { const HloInstruction* instruction, const BufferValue* shared_with_canonical); - const std::unique_ptr no_fragmentation_stats_; + // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap, + // in which case we are calculating the same allocs/frees twice in the + // simulation. + const std::unique_ptr no_fragmentation_stats_; const std::unique_ptr algorithm_; const BufferValue::SizeFunction size_fn_; const Options options_; + // module_sequence_ is set by buffer assignment, and memory_by_computation_ is + // set by hlo scheduling. 
Then, in RunComputation, we check both in order to + // handle subcomputations. It would be good to unify the handling of + // subcomputations, but it's not clear how. const SequentialHloOrdering::HloModuleSequence* module_sequence_; + const tensorflow::gtl::FlatMap* + memory_by_computation_; // In addition to Alloc and Free, the heap simulator exposes a concept of // buffer sharing. When ShareBuffer is called, instead of allocating new @@ -189,6 +220,26 @@ class HeapAlgorithm { // Alloc allocates a buffer of 'size' bytes. virtual void Alloc(const BufferValue* buffer, int64 size) = 0; + // NoFragmentationStatsHeap overrides this method. + virtual void Alloc(const BufferValue* buffer, int64 size, + const HloInstruction* instruction) { + Alloc(buffer, size); + } + + // Takes memory usage of subcomputations into account when calculating the + // memory usage of a computation. Currently, we don't handle buffer aliasing + // between computations entirely correctly. We are careful to not double count + // for the output buffers of whiles/conds/calls. But we don't take into + // account other aliases, such as for the while init. A more thorough solution + // would require something like BufferAssignment::BuildColocatedBufferSets. + // TODO(b/65835246): + // Since TuplePointsToAnalysis is being replaced with a module-aware alias + // analysis, it's not worth making major changes to HeapSimulator now. + virtual void AccountForSubcomputationMemory( + const HloInstruction* instruction, + const tensorflow::gtl::FlatMap& + memory_by_computation) {} + // Free de-allocates a previously allocated buffer. 
virtual void Free(const BufferValue* buffer, int64 size) = 0; @@ -207,7 +258,17 @@ class NoFragmentationStatsHeap : public HeapAlgorithm { ~NoFragmentationStatsHeap() override = default; void Alloc(const BufferValue* buffer, int64 size) override; + + void Alloc(const BufferValue* buffer, int64 size, + const HloInstruction* instruction) override; + + void AccountForSubcomputationMemory( + const HloInstruction* instruction, + const tensorflow::gtl::FlatMap& + memory_by_computation) override; + void Free(const BufferValue* buffer, int64 size) override; + Result Finish() override; private: diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 6271652412c2979ff926702f12722102344b0dfb..5f85f145657b67634844c849447ef545a6dea468 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -19,7 +19,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -34,6 +35,65 @@ limitations under the License. 
namespace xla { namespace { +class MinimumMemoryForSequenceTest : public HloTestBase {}; + +TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { + auto module = CreateNewModule(); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0)); + HloInstruction* cond_data = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) + HloInstruction* cond_lt = cond_builder.AddInstruction( + HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kLt, cond_iter, cond_data)); + HloComputation* cond_computation = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloComputation* body_computation = + module->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + // Entry params: 8 bytes (4 bytes per param), TOTAL=8 + HloInstruction* iter = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "param_iter")); + HloInstruction* data = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "param_data")); + // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24 + HloInstruction* tuple = + 
builder.AddInstruction(HloInstruction::CreateTuple({iter, data})); + // While: 8 bytes (4 bytes per element), TOTAL=32 + // Both cond and body use a max of 24 bytes, TOTAL=56 + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, tuple)); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + + auto size_fn = [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); + }; + + SequentialHloOrdering::HloModuleSequence module_sequence; + module_sequence[cond_computation] = {cond_param, cond_iter, cond_data, + cond_lt}; + module_sequence[body_computation] = {body_param}; + module_sequence[entry_computation] = {iter, data, tuple, while_op}; + EXPECT_EQ(56, HeapSimulator::MinimumMemoryForModule(module_sequence, size_fn) + .ValueOrDie()); +} + const char kAlloc[] = "Alloc"; const char kFree[] = "Free"; const char kFinish[] = "Finish"; @@ -78,7 +138,7 @@ class HeapSimulatorTracker { const string& name, std::unique_ptr computation, const std::vector& instruction_sequence) { HloModuleConfig config; - module_ = MakeUnique(name, config); + module_ = absl::make_unique(name, config); module_->AddEntryComputation(std::move(computation)); points_to_analysis_ = TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); @@ -87,8 +147,8 @@ class HeapSimulatorTracker { // the secondary sorting criteria of DecreasingSizeRunsHeap to sort calls by // buffer id, for determinism in the tests. 
auto zero_size = [](const BufferValue& buffer) { return 0; }; - auto algorithm = MakeUnique( - MakeUnique(&actual_calls_)); + auto algorithm = absl::make_unique( + absl::make_unique(&actual_calls_)); result_ = HeapSimulator::Run( std::move(algorithm), *module_->entry_computation(), instruction_sequence, *points_to_analysis_, zero_size) @@ -97,7 +157,7 @@ class HeapSimulatorTracker { explicit HeapSimulatorTracker(const string& name) { HloModuleConfig config; - module_ = MakeUnique(name, config); + module_ = absl::make_unique(name, config); } // Similar to the single entry computation constructor above, but runs the @@ -123,8 +183,8 @@ class HeapSimulatorTracker { auto size_fn = [&reverse_position](const BufferValue& buffer) { return reverse_position[buffer.instruction()]; }; - auto algorithm = MakeUnique( - MakeUnique(&actual_calls_)); + auto algorithm = absl::make_unique( + absl::make_unique(&actual_calls_)); result_ = HeapSimulator::Run(std::move(algorithm), *module_, module_sequence, *points_to_analysis_, size_fn) .ConsumeValueOrDie(); @@ -139,6 +199,11 @@ class HeapSimulatorTracker { .ConsumeValueOrDie(); } + int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) { + const BufferValue* buffer = BufferAt(instruction, index); + return result_.chunk_map.at(buffer).offset; + } + // Ensures the expected sequence of Alloc/Free/Finish calls was performed. 
void ExpectCallSequence(const CallSequence& expected) const { EXPECT_EQ(expected, actual_calls_); @@ -150,10 +215,9 @@ class HeapSimulatorTracker { const ShapeIndex& index_a, const HloInstruction* instruction_b, const ShapeIndex& index_b) { - const BufferValue* a = BufferAt(instruction_a, index_a); - const BufferValue* b = BufferAt(instruction_b, index_b); - EXPECT_EQ(result_.chunk_map[a].offset, result_.chunk_map[b].offset) - << *a << ", " << *b; + int64 offset_a = OffsetAt(instruction_a, index_a); + int64 offset_b = OffsetAt(instruction_b, index_b); + EXPECT_EQ(offset_a, offset_b); } private: @@ -176,7 +240,7 @@ class HeapSimulatorTest : public HloTestBase { TEST_F(HeapSimulatorTest, ScalarConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); // Constants aren't assigned. See b/32248867 HeapSimulatorTracker tracker(TestName(), builder.Build(), {const0}); @@ -252,6 +316,43 @@ TEST_F(HeapSimulatorTest, MultiplyAdd) { tracker.ExpectSharedBuffers(add, {}, mul, {}); } +TEST_F(HeapSimulatorTest, BufferReusedOnce) { + HeapSimulatorTracker tracker(TestName()); + auto builder = HloComputation::Builder(TestName()); + + HloComputation::Builder fusion_builder("fusion"); + { + HloComputation::Builder& builder = fusion_builder; + auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, f32vec4_, "A")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(f32vec4_, HloOpcode::kExp, a_param)); + auto neg = builder.AddInstruction( + HloInstruction::CreateUnary(f32vec4_, HloOpcode::kNegate, a_param)); + + builder.AddInstruction(HloInstruction::CreateTuple({exp, neg})); + } + auto fusion_computation = + tracker.module()->AddEmbeddedComputation(fusion_builder.Build()); + auto a_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, f32vec4_, 
"paramA")); + auto neg = builder.AddInstruction( + HloInstruction::CreateUnary(f32vec4_, HloOpcode::kNegate, a_param)); + auto fusion = builder.AddInstruction(HloInstruction::CreateFusion( + ShapeUtil::MakeTupleShape({f32vec4_, f32vec4_}), + HloInstruction::FusionKind::kLoop, {neg}, fusion_computation)); + tracker.module()->AddEntryComputation(builder.Build()); + + tracker.RunWholeModule({a_param, neg, fusion}); + + auto neg_buffer = tracker.OffsetAt(neg, {}); + int64 output_buffer_0 = tracker.OffsetAt(fusion, {0}); + int64 output_buffer_1 = tracker.OffsetAt(fusion, {1}); + // Only one buffer should be shared. + EXPECT_TRUE((neg_buffer == output_buffer_0) ^ + (neg_buffer == output_buffer_1)); +} + TEST_F(HeapSimulatorTest, MultiplyDot) { auto builder = HloComputation::Builder(TestName()); auto paramA = builder.AddInstruction( @@ -574,8 +675,9 @@ class HeapAlgorithmTestBase : public ::testing::Test { const BufferValue* DummyBufferValue() { const BufferValue::Id id = buffers_.size(); auto const0 = builder_.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - buffers_.emplace_back(MakeUnique(id, const0, ShapeIndex{})); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + buffers_.emplace_back( + absl::make_unique(id, const0, ShapeIndex{})); return buffers_.back().get(); } @@ -624,7 +726,8 @@ class DecreasingSizeRunsHeapTest : public HeapAlgorithmTestBase {}; TEST_F(DecreasingSizeRunsHeapTest, Empty) { CallSequence call_sequence; - DecreasingSizeRunsHeap heap(MakeUnique(&call_sequence)); + DecreasingSizeRunsHeap heap( + absl::make_unique(&call_sequence)); heap.Finish(); EXPECT_EQ(call_sequence, CallSequence({ {kFinish, nullptr}, @@ -633,7 +736,8 @@ TEST_F(DecreasingSizeRunsHeapTest, Empty) { TEST_F(DecreasingSizeRunsHeapTest, Simple) { CallSequence call_sequence; - DecreasingSizeRunsHeap heap(MakeUnique(&call_sequence)); + DecreasingSizeRunsHeap heap( + absl::make_unique(&call_sequence)); heap.Alloc(buffer_a_, 10); 
heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 30); @@ -660,7 +764,8 @@ TEST_F(DecreasingSizeRunsHeapTest, Simple) { TEST_F(DecreasingSizeRunsHeapTest, Mixed) { CallSequence call_sequence; - DecreasingSizeRunsHeap heap(MakeUnique(&call_sequence)); + DecreasingSizeRunsHeap heap( + absl::make_unique(&call_sequence)); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Free(buffer_b_, 20); diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 1f7c1cffd324ad2f4e4cdf11046c8459b8ceb6d5..58b7af93ebfce74951c0f2d65ab226fc94d62e4b 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -34,6 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. +// Next ID: 53 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -45,6 +46,8 @@ message HloInstructionProto { reserved "control_predecessor_names"; reserved 6; reserved "called_computation_names"; + reserved 44; + reserved "replica_group_ids"; string name = 1; string opcode = 2; @@ -74,6 +77,11 @@ message HloInstructionProto { // Describes the dimension numbers used for a convolution. xla.ConvolutionDimensionNumbers convolution_dimension_numbers = 16; + // The number of feature groups. Used for a convolution. Must be a divisor of + // the input feature dimension and output feature dimension. If not specified, + // it will use a default value of 1. + int64 feature_group_count = 50; + // Describes the [begin, end) index range and stride for slices. message SliceDimensions { int64 start = 1; @@ -133,7 +141,7 @@ message HloInstructionProto { // Gather dimension numbers. xla.GatherDimensionNumbers gather_dimension_numbers = 33; - repeated int64 gather_window_bounds = 34; + repeated int64 gather_slice_sizes = 34; // Compute Host. 
string channel_name = 41; @@ -150,6 +158,24 @@ message HloInstructionProto { // Backend configuration for the instruction. Has backend-specific meaning. string backend_config = 43; + + // Cross replica op fields. + repeated ReplicaGroup replica_groups = 49; + int64 all_reduce_id = 45; + string cross_replica_sum_barrier = 46; + + // Whether this Send/Recv instruction transfers data to/from the host. Only + // present for Send and Recv instructions and their SendDone and RecvDone + // partners. + bool is_host_transfer = 47; + + xla.ScatterDimensionNumbers scatter_dimension_numbers = 48; + + // Precision configuration for the instruction. Has backend-specific meaning. + xla.PrecisionConfigProto precision_config = 51; + + // Collective permute field. + repeated SourceTarget source_target_pairs = 52; } // Serialization of HloComputation. @@ -234,8 +260,9 @@ message BufferAllocationProto { int64 index = 1; int64 size = 2; bool is_thread_local = 3; - bool is_reusable = 4; + bool is_tuple = 11; bool is_entry_computation_parameter = 5; + bool is_constant = 12; int64 parameter_number = 6; repeated int64 parameter_shape_index = 10; bool maybe_live_out = 7; diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc index a88283ed9a6459b4fa9310e160b59c77d51f1027..0986da65cbd3d550ecfa01212364518aba651d86 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_buffer.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -28,15 +30,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; +using absl::StrAppend; // Data structure used to construct the alias analysis. Thrown away after alias // analysis is complete. This data structure keeps track of which sets of @@ -414,7 +412,7 @@ Status HloAliasAnalysis::Verify() const { } string HloAliasAnalysis::ToString() const { - string out = StrCat("HloAliasAnalysis, module ", module_->name(), "\n"); + string out = absl::StrCat("HloAliasAnalysis, module ", module_->name(), "\n"); StrAppend(&out, " Buffers at each position:\n"); for (const HloComputation* computation : module_->computations()) { for (const HloInstruction* instruction : computation->instructions()) { @@ -452,15 +450,16 @@ string HloAliasAnalysis::ToString() const { /* static */ StatusOr> HloAliasAnalysis::Run( - HloModule* module) { + HloModule* module, const HloDataflowAnalysis::FusionCanShareBufferFunction& + fusion_can_share_buffer) { VLOG(2) << "HloAliasAnalysis::Run on module " << module->name(); XLA_VLOG_LINES(2, module->ToString()); - auto alias_analysis = WrapUnique(new HloAliasAnalysis(module)); - TF_ASSIGN_OR_RETURN( - alias_analysis->dataflow_analysis_, - HloDataflowAnalysis::Run(*module, /*ssa_form=*/true, - /*bitcast_defines_value=*/false)); + auto alias_analysis = absl::WrapUnique(new HloAliasAnalysis(module)); + TF_ASSIGN_OR_RETURN(alias_analysis->dataflow_analysis_, + HloDataflowAnalysis::Run(*module, /*ssa_form=*/true, + /*bitcast_defines_value=*/false, + fusion_can_share_buffer)); BufferValueMap buffer_map(alias_analysis->dataflow_analysis()); buffer_map.MergeAliasedBuffers(); @@ -493,6 +492,16 @@ StatusOr> 
HloAliasAnalysis::Run( bool HloAliasAnalysis::HasLiveRangeInterference( const HloOrdering& ordering) const { for (const HloBuffer& buffer : buffers()) { + CHECK(!buffer.values().empty()); + if (ShapeUtil::IsToken(buffer.values().front()->shape())) { + // Tokens have no on-device representation and cannot interfere. + for (const HloValue* value : buffer.values()) { + // If one of the values is a token, all values must be a token. + DCHECK(ShapeUtil::IsToken(value->shape())); + } + continue; + } + // Check that the values in the buffer are totally ordered with respect to // 'ordering'. Begin by sorting the values with respect to 'ordering' with a // tie-break using value ID. The tie-break is necessary because we need a @@ -517,7 +526,6 @@ bool HloAliasAnalysis::HasLiveRangeInterference( // a buffer and A interferes with C, then necessarily A also interferes // with B. So to check interference you only need to check interference // between A and B, and between B and C. - CHECK(!values.empty()); for (int i = 1; i < values.size(); ++i) { if (!ordering.IsDefinedBefore(*values[i - 1], *values[i])) { VLOG(1) << values[i - 1]->ToShortString() << " and " @@ -527,10 +535,10 @@ bool HloAliasAnalysis::HasLiveRangeInterference( if (ordering.MayInterfere(*values[i - 1], *values[i], dataflow_analysis())) { VLOG(1) << "In buffer " << buffer.id() << " containing values:\n " - << Join(values, ", ", - [](string* out, const HloValue* value) { - StrAppend(out, value->ToShortString()); - }) + << absl::StrJoin(values, ", ", + [](string* out, const HloValue* value) { + StrAppend(out, value->ToShortString()); + }) << "\nValue " << values[i - 1]->ToShortString() << " may interfere with value " << values[i]->ToShortString(); diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h index 67dfd4301b3a027a496911ecf6f06841dfd6423a..e345804537723f01e9ccb63e7d6ded1bd68f4196 100644 --- 
a/tensorflow/compiler/xla/service/hlo_alias_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_buffer.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -29,7 +30,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -39,7 +39,10 @@ class HloAliasAnalysis { public: // The callgraph of the given HloModule must be flattened // (xla::FlattenCallGraph) prior to running the analysis. - static StatusOr> Run(HloModule* module); + static StatusOr> Run( + HloModule* module, + const HloDataflowAnalysis::FusionCanShareBufferFunction& + fusion_can_share_buffer); string ToString() const; diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 8f18d50f6e033fab1c01f42017b951c224c22799..54abe3345d25a8cc1fdd66bd6ee75157fe9b7f77 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -28,7 +28,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/logging.h" @@ -39,15 +39,19 @@ namespace { using ::testing::UnorderedElementsAre; -class HloAliasAnalysisTest : public HloTestBase { +class HloAliasAnalysisTest : public HloVerifiedTestBase { protected: - HloAliasAnalysisTest() : module_(CreateNewModule()) {} + HloAliasAnalysisTest() : HloVerifiedTestBase() { + module_ = CreateNewModule(); + } // Run alias analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. HloAliasAnalysis& RunAnalysis() { hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis"); - analysis_ = HloAliasAnalysis::Run(module_.get()).ConsumeValueOrDie(); + analysis_ = HloAliasAnalysis::Run(module_, + /*fusion_can_share_buffer=*/nullptr) + .ConsumeValueOrDie(); return *analysis_; } @@ -89,7 +93,7 @@ class HloAliasAnalysisTest : public HloTestBase { // never occurs, but HLO graphs with interference can be explicitly // constructed. bool AnyValuesInSameBufferInterfere() { - DependencyHloOrdering ordering(module_.get()); + DependencyHloOrdering ordering(module_); for (const HloBuffer& buffer : analysis_->buffers()) { for (const HloValue* value_a : buffer.values()) { for (const HloValue* value_b : buffer.values()) { @@ -106,7 +110,7 @@ class HloAliasAnalysisTest : public HloTestBase { return false; } - std::unique_ptr module_; + HloModule* module_; std::unique_ptr analysis_; const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); @@ -116,9 +120,9 @@ TEST_F(HloAliasAnalysisTest, BinaryOperation) { // Test the analysis on a single binary operation (Add). 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape_, HloOpcode::kAdd, constant1, constant2)); module_->AddEntryComputation(builder.Build()); @@ -228,9 +232,9 @@ TEST_F(HloAliasAnalysisTest, SingleCall) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, called_computation)); module_->AddEntryComputation(builder.Build()); @@ -267,9 +271,9 @@ TEST_F(HloAliasAnalysisTest, ComputationCalledTwice) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call1 = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, called_computation)); auto call2 = builder.AddInstruction(HloInstruction::CreateCall( @@ -346,15 +350,15 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) { auto cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( 
- HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while = builder.AddInstruction( @@ -439,15 +443,15 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while0 = builder.AddInstruction( @@ -459,7 +463,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) { module_->AddEntryComputation(builder.Build()); FlattenCallGraph flattener; - TF_ASSERT_OK(flattener.Run(module_.get()).status()); + TF_ASSERT_OK(flattener.Run(module_).status()); const HloAliasAnalysis& analysis = RunAnalysis(); @@ -498,7 +502,7 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) { 
cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); return cond_builder.Build(); }; // Build separate condition computations so the call graph is flat. The @@ -543,9 +547,9 @@ TEST_F(HloAliasAnalysisTest, NestedWhiles) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto entry_while = builder.AddInstruction( @@ -608,17 +612,17 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2, constant3})); auto xla_while = 
builder.AddInstruction( @@ -654,19 +658,18 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) { } TEST_F(HloAliasAnalysisTest, TupleSelect) { - // Test a kSelect of a tuple value. Non-top-level element flow through the - // instruction. + // Test a kTupleSelect. Non-top-level element flow through the instruction. auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto constant4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4.0))); auto tuple1 = builder.AddInstruction(HloInstruction::CreateTuple({constant1})); auto tuple2 = @@ -677,13 +680,13 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) { builder.AddInstruction(HloInstruction::CreateTuple({constant4})); const Shape tuple_shape = tuple1->shape(); auto select11 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple1)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple1, tuple1)); auto select12 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple1, tuple2)); auto select34 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple3, tuple4)); + tuple_shape, 
HloOpcode::kTupleSelect, pred, tuple3, tuple4)); auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, select12, select34)); + tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34)); module_->AddEntryComputation(builder.Build()); @@ -718,7 +721,7 @@ TEST_F(HloAliasAnalysisTest, TupleSelect) { } TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) { - // Test a tuple-shaped kSelect feeding a kWhile instruction. HLO: + // Test a tuple-shaped kTupleSelect feeding a kWhile instruction. HLO: // // body((F32[], F32[]) %tuple_param): // %negate = Negate(%tuple_param{0}) @@ -754,22 +757,22 @@ TEST_F(HloAliasAnalysisTest, TupleSelectToWhile) { auto cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple1 = builder.AddInstruction(HloInstruction::CreateTuple({constant1})); auto tuple2 = builder.AddInstruction(HloInstruction::CreateTuple({constant2})); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple1, tuple2)); auto xla_while = builder.AddInstruction( HloInstruction::CreateWhile(tuple_shape, condition, body, select)); 
@@ -806,7 +809,7 @@ TEST_F(HloAliasAnalysisTest, Bitcast) { // Bitcasting a value should not produce a new buffer. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape_, HloOpcode::kBitcast, constant)); @@ -825,7 +828,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) { // interference. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape_, HloOpcode::kBitcast, constant)); builder.AddInstruction(HloInstruction::CreateTuple({constant, bitcast})); @@ -834,7 +837,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) { const HloAliasAnalysis& analysis = RunAnalysis(); - DependencyHloOrdering ordering(module_.get()); + DependencyHloOrdering ordering(module_); EXPECT_FALSE(analysis.HasLiveRangeInterference(ordering)); } @@ -844,13 +847,13 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) { // the other use of the init. 
auto builder = HloComputation::Builder(TestName()); auto init = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto cond_builder = HloComputation::Builder("condition"); auto cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, init->shape(), "param")); auto cond_root = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); @@ -876,7 +879,7 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) { { // Dependency ordering should interfere because the negate and while are // unordered. - DependencyHloOrdering ordering(module_.get()); + DependencyHloOrdering ordering(module_); EXPECT_TRUE(analysis.HasLiveRangeInterference(ordering)); } @@ -887,13 +890,13 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) { sequence[condition] = {cond_param, cond_root}; { sequence[entry] = {init, xla_while, negate, entry_root}; - SequentialHloOrdering ordering(module_.get(), sequence); + SequentialHloOrdering ordering(module_, sequence); EXPECT_TRUE(analysis.HasLiveRangeInterference(ordering)); } { sequence[entry] = {init, negate, xla_while, entry_root}; - SequentialHloOrdering ordering(module_.get(), sequence); + SequentialHloOrdering ordering(module_, sequence); EXPECT_FALSE(analysis.HasLiveRangeInterference(ordering)); } } diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc index e16413f361fb0216792b47c3c67ef3c1357c2221..6c11a073b74c61e44dfe81a32261ae78ae7b46fb 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.cc +++ b/tensorflow/compiler/xla/service/hlo_buffer.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -27,15 +29,10 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrCat; - bool HloBuffer::operator==(const HloBuffer& other) const { bool equal = id() == other.id(); if (equal) { @@ -59,10 +56,11 @@ std::vector HloBuffer::ComputePositions() const { } string HloBuffer::ToString() const { - return StrCat("HloBuffer ", id_, ", values: ", - Join(values_, ", ", [](string* result, const HloValue* value) { - result->append(value->ToShortString()); - })); + return absl::StrCat( + "HloBuffer ", id_, ", values: ", + absl::StrJoin(values_, ", ", [](string* result, const HloValue* value) { + result->append(value->ToShortString()); + })); } std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer) { diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h index 4873463b2ea4fee3ee39dff31fc3429a4998142f..a88c87e46c8100571aff24f70a2a19fe8ce71ebc 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.h +++ b/tensorflow/compiler/xla/service/hlo_buffer.h @@ -84,7 +84,7 @@ class HloBuffer { return a->id() == b->id(); } - HloBuffer(Id id, tensorflow::gtl::ArraySlice values) + HloBuffer(Id id, absl::Span values) : id_(id), values_(values.begin(), values.end()) {} // Return the unique identifier for this HloBuffer. 
diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils.h b/tensorflow/compiler/xla/service/hlo_casting_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7f73bba036534a62a70a80431236cffa766c9b38 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_casting_utils.h @@ -0,0 +1,104 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Casting utility functions for HLO instructions. + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ + +#include +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +class HloInstruction; + +template +using EnableIfDerivedFromHlo = + typename std::enable_if::value>::type; + +// TODO(b/93238915): Switch implementation from C++'s dynamic_cast to LLVM-like +// RTTI if it turns out to be a performance issue. +// Casts an HloInstruction pointer to one of its subclasses, dies if argument is +// nullptr or runtime information does not match. +// +// Similar to LLVM's cast. +template * = nullptr> +const T* Cast(const HloInstruction* instruction) { + CHECK(instruction != nullptr); + const T* casted = dynamic_cast(instruction); + CHECK(casted != nullptr); + return casted; +} + +// Non-const overload of Cast. 
+template * = nullptr> +T* Cast(HloInstruction* instruction) { + return const_cast( + Cast(const_cast(instruction))); +} + +// Works just like the Cast, except that it allows for a null pointer as an +// argument which it then propagates. +// +// Similar to LLVM's cast_or_null. +template * = nullptr> +const T* CastOrNull(const HloInstruction* instruction) { + return instruction != nullptr ? Cast(instruction) : nullptr; +} + +// Non-const overload of CastOrNull. +template * = nullptr> +T* CastOrNull(HloInstruction* instruction) { + return const_cast( + CastOrNull(const_cast(instruction))); +} + +// Casts an HloInstruction pointer to one of its subclasses, dies if argument is +// nullptr, returns nullptr if runtime information does not match. +// +// Similar to LLVM's dyn_cast. +template * = nullptr> +const T* DynCast(const HloInstruction* instruction) { + CHECK(instruction != nullptr); + return dynamic_cast(instruction); +} + +// Non-const overload of DynCast. +template * = nullptr> +T* DynCast(HloInstruction* instruction) { + return const_cast( + DynCast(const_cast(instruction))); +} + +// Works just like the DynCast, except that it allows for a null pointer as an +// argument which it then propagates. +// +// Similar to LLVM's dyn_cast_or_null. +template * = nullptr> +const T* DynCastOrNull(const HloInstruction* instruction) { + return instruction != nullptr ? DynCast(instruction) : nullptr; +} + +// Non-const overload of DynCastOrNull. 
+template * = nullptr> +T* DynCastOrNull(HloInstruction* instruction) { + return const_cast( + DynCastOrNull(const_cast(instruction))); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3364275409122254bf99b40a7d2fcbb2d7564cc --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc @@ -0,0 +1,113 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +class DummyInstruction : public HloInstruction { + public: + DummyInstruction() + : HloInstruction(HloOpcode::kConstant, ShapeUtil::MakeShape(F32, {})) {} +}; + +class AnotherDummyInstruction : public HloInstruction { + public: + AnotherDummyInstruction() + : HloInstruction(HloOpcode::kParameter, ShapeUtil::MakeShape(F32, {})) {} +}; + +TEST(HloCastingUtilsTest, CastSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + Cast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, CastDiesForWrongType) { + AnotherDummyInstruction instruction; + ASSERT_DEATH( + Cast(static_cast(&instruction)), ""); +} + +TEST(HloCastingUtilsTest, CastDiesForNullptr) { + HloInstruction* null = nullptr; + ASSERT_DEATH(Cast(null), ""); +} + +TEST(HloCastingUtilsTest, CastOrNullSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + Cast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, CastOrNullDiesForWrongType) { + AnotherDummyInstruction instruction; + ASSERT_DEATH( + Cast(static_cast(&instruction)), ""); +} + +TEST(HloCastingUtilsTest, CastOrNullReturnsNullptrForNullptr) { + HloInstruction* null = nullptr; + DummyInstruction* casted = CastOrNull(null); + ASSERT_EQ(casted, nullptr); +} + +TEST(HloCastingUtilsTest, DynCastSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + DynCast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, DynCastReturnsNullptrForWrongType) { + AnotherDummyInstruction instruction; + DummyInstruction* casted = + DynCast(static_cast(&instruction)); + ASSERT_EQ(casted, nullptr); +} + 
+TEST(HloCastingUtilsTest, DynCastDiesForNullptr) { + HloInstruction* null = nullptr; + ASSERT_DEATH(DynCast(null), ""); +} + +TEST(HloCastingUtilsTest, DynCastOrNullSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = DynCastOrNull( + static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForWrongType) { + AnotherDummyInstruction instruction; + DummyInstruction* casted = DynCastOrNull( + static_cast(&instruction)); + ASSERT_EQ(casted, nullptr); +} + +TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForNullptr) { + HloInstruction* null = nullptr; + DummyInstruction* casted = DynCastOrNull(null); + ASSERT_EQ(casted, nullptr); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index b61eabbbf526249710ee434565bb68a493a089d5..fe7f2be888d2037e4f6d3879bcc716de4eee07f9 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -23,9 +23,13 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -36,13 +40,11 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using ::tensorflow::strings::StrCat; +using absl::StrCat; std::unique_ptr HloComputation::Builder::Build( HloInstruction* root_instruction) { @@ -56,15 +58,15 @@ std::unique_ptr HloComputation::Builder::Build( HloInstruction* root = root_instruction ? root_instruction : last_added_instruction_; CHECK_NE(nullptr, root); - return WrapUnique(new HloComputation(name_, parameter_count, &instructions_, - root, fusion_instruction_)); + return absl::WrapUnique(new HloComputation( + name_, parameter_count, &instructions_, root, fusion_instruction_)); } HloComputation::HloComputation( const string& name, int parameter_count, std::vector>* instructions, HloInstruction* root_instruction, HloInstruction* fusion_instruction) - : name_(name), + : name_(NameUniquer::GetSanitizedName(name)), unique_id_(-1), root_instruction_(root_instruction), fusion_instruction_(fusion_instruction) { @@ -120,6 +122,30 @@ HloInstruction* HloComputation::AddParameter( return instructions_.back().get(); } +namespace { + +// Returns the new name for a fusion parameter when we change its number. +// +// Fusion parameters are named foo.param_1, bar.param_2, etc. We are +// renumbering the parameters, so replace the final number in the name with +// the updated value. 
+string RenameFusionParameter(const string& original_name, int64 new_param_no) { + const string param_underscore = ".param_"; + size_t index = original_name.rfind(param_underscore); + if (index == string::npos) { + return original_name; + } + string after_param = original_name.substr(index + param_underscore.size()); + int64 numeric_suffix; + if (absl::SimpleAtoi(after_param, &numeric_suffix)) { + return StrCat(original_name.substr(0, index + param_underscore.size()), + new_param_no); + } + return original_name; +} + +} // namespace + Status HloComputation::RemoveParameter(int64 param_no) { CHECK_GE(param_no, 0); CHECK_LT(param_no, param_instructions_.size()); @@ -132,21 +158,8 @@ Status HloComputation::RemoveParameter(int64 param_no) { while (param_no < param_instructions_.size()) { param_instruction = param_instructions_[param_no]; - string param_name = param_instruction->name(); - // Fusion parameters are named foo.param_1, bar.param_2, etc. We are - // renumbering the parameters, so replace the final number in the name with - // the updated value. 
- const string param_underscore = ".param_"; - size_t index = param_name.rfind(param_underscore); - if (index == string::npos) { - string after_param = name().substr(index + param_underscore.size()); - int64 numeric_suffix; - if (tensorflow::strings::safe_strto64(after_param, &numeric_suffix)) { - param_name = - StrCat(param_name.substr(0, index), param_underscore, param_no); - } - } - + string param_name = + RenameFusionParameter(param_instruction->name(), param_no); HloInstruction* new_instr = AddInstructionInternal(HloInstruction::CreateParameter( param_no, param_instruction->shape(), param_name)); @@ -159,6 +172,34 @@ Status HloComputation::RemoveParameter(int64 param_no) { return Status::OK(); } +Status HloComputation::RemoveUnusedParameters() { + CHECK(IsFusionComputation()); + int64 removed = 0; + for (int64 i = 0; i < param_instructions_.size(); ++i) { + HloInstruction* param_instruction = param_instructions_[i]; + if (param_instruction->user_count() == 0 && + param_instruction != root_instruction()) { + TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction)); + ++removed; + continue; + } + + if (removed > 0) { + const int64 param_no = i - removed; + string param_name = + RenameFusionParameter(param_instruction->name(), param_no); + HloInstruction* new_instr = + AddInstructionInternal(HloInstruction::CreateParameter( + param_no, param_instruction->shape(), param_name)); + TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr)); + param_instructions_[param_no] = new_instr; + TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction)); + } + } + param_instructions_.resize(param_instructions_.size() - removed); + return Status::OK(); +} + bool HloComputation::IsRemovable(const HloInstruction* instruction) { // If the instruction has control predecessors or successors then we cannot // remove the instruction without violating ordering constraints (added, for @@ -234,7 +275,6 @@ Status HloComputation::RemoveInstruction(HloInstruction* instruction) { 
TF_RET_CHECK(instruction_iterators_.count(instruction) != 0); auto inst_it = instruction_iterators_.at(instruction); (*inst_it)->set_parent(nullptr); - instruction->DetachFromOperands(); instructions_.erase(inst_it); return Status::OK(); } @@ -246,9 +286,8 @@ void HloComputation::set_root_instruction( if (!IsFusionComputation()) { CHECK(ShapeUtil::Compatible(new_root_instruction->shape(), root_instruction_->shape())) - << new_root_instruction->shape().ShortDebugString() - << " is incompatible with " - << root_instruction_->shape().ShortDebugString(); + << new_root_instruction->shape() << " is incompatible with " + << root_instruction_->shape(); } bool root_found = false; for (auto& instruction : instructions_) { @@ -264,46 +303,11 @@ void HloComputation::set_root_instruction( namespace { -// Helper class which computes the post order of an expression rooted at a -// particular instruction. -class InstructionPostOrderer : public DfsHloVisitorWithDefault { - public: - // added_instructions is the set of instructions which have already been - // accounted for in the post order in previous invocations of - // GetOrder. Without this mechanism, instructions which are predecessors of - // multiple root instructions of the computation can be added to the post - // order more than once. 
- static std::list GetOrder( - HloInstruction* root, - tensorflow::gtl::FlatSet* added_instructions) { - InstructionPostOrderer orderer(added_instructions); - TF_CHECK_OK(root->Accept(&orderer)); - return std::move(orderer.post_order_); - } - - private: - explicit InstructionPostOrderer( - tensorflow::gtl::FlatSet* added_instructions) - : added_instructions_(added_instructions) {} - ~InstructionPostOrderer() override {} - - Status DefaultAction(HloInstruction* hlo_instruction) override { - if (added_instructions_->count(hlo_instruction) == 0) { - post_order_.push_back(hlo_instruction); - added_instructions_->insert(hlo_instruction); - } - return Status::OK(); - } - - std::list post_order_; - tensorflow::gtl::FlatSet* added_instructions_; -}; - // Helper which builds a post order of the HLO call graph. void ComputeComputationPostOrder( HloComputation* computation, tensorflow::gtl::FlatSet* visited, - std::list* post_order) { + std::vector* post_order) { if (visited->insert(computation).second) { for (auto* instruction : computation->instructions()) { for (HloComputation* called_computation : @@ -317,10 +321,107 @@ void ComputeComputationPostOrder( } // namespace -std::list HloComputation::MakeInstructionPostOrder() const { - std::list post_order; - std::list trace_instructions; - tensorflow::gtl::FlatSet added_instructions; +void HloComputation::ComputeInstructionPostOrder( + const HloComputation::ChannelDependencyMap& channel_dependency_map, + std::vector* post_order, HloInstruction* root, + tensorflow::gtl::FlatMap* visited) const { + std::vector dfs_stack; + dfs_stack.push_back(root); + while (!dfs_stack.empty()) { + const auto current = dfs_stack.back(); + auto it = visited->find(current); + if (it != visited->end()) { + if (it->second == kVisited) { + // Already visited. + dfs_stack.pop_back(); + continue; + } + // Visit this node. 
+ CHECK_EQ(kVisiting, it->second); + dfs_stack.pop_back(); + post_order->push_back(current); + it->second = kVisited; + continue; + } + + visited->insert({current, kVisiting}); + + // Add the operands to the stack in reverse order so the first operand is + // processed first. This will produce a more natural ordering and a nicer + // result for things like HLO stringification. + const auto& operands = current->operands(); + for (int64 i = operands.size() - 1; i >= 0; --i) { + dfs_stack.emplace_back(operands[i]); + } + + for (HloInstruction* op : current->control_predecessors()) { + dfs_stack.emplace_back(op); + } + + // Add inputs for send->recv_done dependencies and cross-replica-sum + // dependencies. + switch (current->opcode()) { + case HloOpcode::kRecvDone: { + auto it = channel_dependency_map.find(current->channel_id()); + if (it != channel_dependency_map.end()) { + for (HloInstruction* op : it->second) { + dfs_stack.emplace_back(op); + } + } + break; + } + case HloOpcode::kCrossReplicaSum: { + auto all_reduce_id = current->all_reduce_id(); + if (all_reduce_id) { + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + for (HloInstruction* op : it->second) { + dfs_stack.emplace_back(op); + } + } + } + break; + } + default: + break; + } + } +} + +HloComputation::ChannelDependencyMap +HloComputation::ComputeChannelDependencies() const { + ChannelDependencyMap channel_dependency_map; + for (const auto& instruction : instructions_) { + switch (instruction->opcode()) { + case HloOpcode::kSend: { + channel_dependency_map[instruction->channel_id()].push_back( + instruction.get()); + break; + } + case HloOpcode::kCrossReplicaSum: { + auto all_reduce_id = instruction->all_reduce_id(); + if (all_reduce_id) { + auto& dependencies = channel_dependency_map[all_reduce_id.value()]; + absl::c_copy(instruction->operands(), + std::back_inserter(dependencies)); + absl::c_copy(instruction->control_predecessors(), + 
std::back_inserter(dependencies)); + } + break; + } + default: + break; + } + } + return channel_dependency_map; +} + +std::vector HloComputation::MakeInstructionPostOrder() const { + auto channel_dependency_map = ComputeChannelDependencies(); + std::vector post_order; + post_order.reserve(instruction_count()); + std::vector trace_instructions; + tensorflow::gtl::FlatMap visited; for (auto& instruction : instructions_) { if (instruction->opcode() == HloOpcode::kTrace) { // Trace instructions aren't handled by the DFS visitor. Add trace @@ -328,21 +429,21 @@ std::list HloComputation::MakeInstructionPostOrder() const { // users). trace_instructions.push_back(instruction.get()); } else if (instruction->users().empty()) { - post_order.splice(post_order.end(), - InstructionPostOrderer::GetOrder(instruction.get(), - &added_instructions)); + ComputeInstructionPostOrder(channel_dependency_map, &post_order, + instruction.get(), &visited); } } - post_order.splice(post_order.end(), trace_instructions); + post_order.insert(post_order.end(), trace_instructions.begin(), + trace_instructions.end()); CHECK_EQ(instructions_.size(), post_order.size()) << "number of instructions does not match post order size"; return post_order; } -std::list HloComputation::MakeEmbeddedComputationsList() +std::vector HloComputation::MakeEmbeddedComputationsList() const { tensorflow::gtl::FlatSet visited; - std::list post_order; + std::vector post_order; // To avoid special handling of this computation, cast away const of // 'this'. 
'this' is immediately removed from the post order after @@ -451,13 +552,13 @@ HloComputation::CreateFromProto( return to_proto_id[a.get()] < to_proto_id[b.get()]; }); - return WrapUnique(new HloComputation(proto.name(), parameter_count, - &instructions, root, - /*fusion_instruction=*/nullptr)); + return absl::WrapUnique(new HloComputation(proto.name(), parameter_count, + &instructions, root, + /*fusion_instruction=*/nullptr)); } void HloComputation::FuseInstructionsInto( - tensorflow::gtl::ArraySlice instructions_to_fuse, + absl::Span instructions_to_fuse, HloInstruction* fusion_instruction) { CHECK_EQ(HloOpcode::kFusion, fusion_instruction->opcode()); HloInstruction* root = instructions_to_fuse.front(); @@ -476,7 +577,7 @@ void HloComputation::FuseInstructionsInto( } HloInstruction* HloComputation::CreateFusionInstruction( - tensorflow::gtl::ArraySlice instructions_to_fuse, + absl::Span instructions_to_fuse, HloInstruction::FusionKind fusion_kind) { HloInstruction* root = instructions_to_fuse.front(); HloInstruction* fusion_instruction = AddInstruction( @@ -486,23 +587,11 @@ HloInstruction* HloComputation::CreateFusionInstruction( } StatusOr HloComputation::DeepCopyHelper( - HloInstruction* instruction, const ShapeTree* indices_to_copy, - ShapeTree* copies_added, ShapeIndex* index) { - if (ShapeUtil::IsArray(instruction->shape())) { - if (indices_to_copy == nullptr || indices_to_copy->element(*index)) { - // Use kCopy to copy array elements - HloInstruction* copy = AddInstruction(HloInstruction::CreateUnary( - instruction->shape(), HloOpcode::kCopy, instruction)); - if (copies_added != nullptr) { - *copies_added->mutable_element(*index) = copy; - } - return copy; - } else { - // Array elements which are not to be copied are passed through - // transparently. 
- return instruction; - } - } else if (ShapeUtil::IsTuple(instruction->shape())) { + HloInstruction* instruction, ShapeIndex* index, + const std::function< + HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* computation)>& copy_leaf) { + if (ShapeUtil::IsTuple(instruction->shape())) { std::vector elements; for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape()); i++) { @@ -512,17 +601,22 @@ StatusOr HloComputation::DeepCopyHelper( instruction, i)); index->push_back(i); - TF_ASSIGN_OR_RETURN( - HloInstruction * element, - DeepCopyHelper(gte, indices_to_copy, copies_added, index)); + TF_ASSIGN_OR_RETURN(HloInstruction * element, + DeepCopyHelper(gte, index, copy_leaf)); elements.push_back(element); index->pop_back(); } return AddInstruction(HloInstruction::CreateTuple(elements)); - } else { - return FailedPrecondition( - "Can only copy array and tuple shaped instructions"); } + if (ShapeUtil::IsToken(instruction->shape())) { + // Tokens have no on-device representation and cannot be copied. Pass + // through transparently. + return instruction; + } + + // Array shape. + TF_RET_CHECK(ShapeUtil::IsArray(instruction->shape())); + return copy_leaf(instruction, *index, this); } StatusOr HloComputation::DeepCopyInstruction( @@ -531,20 +625,48 @@ StatusOr HloComputation::DeepCopyInstruction( if (instruction->parent() != this) { return FailedPrecondition( "Can't deep copy instruction %s: instruction is not in computation %s", - instruction->name().c_str(), name().c_str()); + instruction->name(), name()); } if (indices_to_copy != nullptr && !ShapeUtil::Compatible(instruction->shape(), indices_to_copy->shape())) { return FailedPrecondition( "Can't deep copy instruction %s: given shape tree of indices to copy " "has incompatible shapes: %s vs. 
%s", - instruction->name().c_str(), - ShapeUtil::HumanString(instruction->shape()).c_str(), - ShapeUtil::HumanString(indices_to_copy->shape()).c_str()); + instruction->name(), ShapeUtil::HumanString(instruction->shape()), + ShapeUtil::HumanString(indices_to_copy->shape())); } ShapeIndex index; - return DeepCopyHelper(instruction, indices_to_copy, copies_added, &index); + auto copy_leaf = [indices_to_copy, copies_added]( + HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* computation) { + if (indices_to_copy == nullptr || indices_to_copy->element(leaf_index)) { + HloInstruction* copy = computation->AddInstruction( + HloInstruction::CreateUnary(leaf->shape(), HloOpcode::kCopy, leaf)); + if (copies_added != nullptr) { + *copies_added->mutable_element(leaf_index) = copy; + } + return copy; + } + // Elements which are not to be copied are passed through + // transparently. + return leaf; + }; + return DeepCopyHelper(instruction, &index, copy_leaf); +} + +StatusOr HloComputation::DeepCopyInstructionWithCustomCopier( + HloInstruction* instruction, + const std::function< + HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* computation)>& copy_leaf) { + if (instruction->parent() != this) { + return FailedPrecondition( + "Can't deep copy instruction %s: instruction is not in computation %s", + instruction->name(), name()); + } + ShapeIndex index; + return DeepCopyHelper(instruction, &index, copy_leaf); } ProgramShape HloComputation::ComputeProgramShape() const { @@ -560,6 +682,9 @@ ProgramShape HloComputation::ComputeProgramShape() const { } bool HloComputation::operator==(const HloComputation& other) const { + if (this == &other) { + return true; + } std::set> visited; std::function eq = [&visited, &eq](const HloInstruction* a, const HloInstruction* b) { @@ -609,15 +734,39 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, std::unique_ptr HloComputation::ComputeReachability() const { - const 
std::list all = MakeInstructionPostOrder(); - auto result = MakeUnique(all); + const auto& all = MakeInstructionPostOrder(); + auto result = absl::make_unique(all); + auto channel_dependency_map = ComputeChannelDependencies(); std::vector inputs; for (const HloInstruction* hlo : all) { inputs.assign(hlo->operands().begin(), hlo->operands().end()); inputs.insert(inputs.end(), hlo->control_predecessors().begin(), hlo->control_predecessors().end()); - result->SetReachabilityToUnion(inputs, hlo); + + switch (hlo->opcode()) { + case HloOpcode::kRecvDone: { + auto it = channel_dependency_map.find(hlo->channel_id()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + break; + } + case HloOpcode::kCrossReplicaSum: { + auto all_reduce_id = hlo->all_reduce_id(); + if (all_reduce_id) { + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + } + break; + } + default: + break; + } + + result->FastSetReachabilityToUnion(inputs, hlo); } return result; } @@ -659,11 +808,10 @@ std::vector HloComputation::CollectUnreachableRoots() const { } } VLOG(3) << "Unreachable roots:" - << tensorflow::str_util::Join( - unreachable_roots, "\n\t", - [](string* out, const HloInstruction* hlo) { - tensorflow::strings::StrAppend(out, hlo->ToString()); - }); + << absl::StrJoin(unreachable_roots, "\n\t", + [](string* out, const HloInstruction* hlo) { + absl::StrAppend(out, hlo->ToString()); + }); return unreachable_roots; } @@ -765,7 +913,7 @@ std::unique_ptr HloComputation::CloneWithReplacements( HloCloneContext* context, const string& suffix) { std::unique_ptr context_ptr; if (context == nullptr) { - context_ptr = MakeUnique(parent(), suffix); + context_ptr = absl::make_unique(parent(), suffix); context = context_ptr.get(); } @@ -827,15 +975,6 @@ std::unique_ptr HloComputation::CloneWithReplacements( } } 
context->MapComputation(this, result.get()); - // We cloned the elements of 'replacements', so they're all going to be - // destroyed. HloInstructions need to be detached from their operands before - // they're destroyed, otherwise they stick around in the operands' users lists - // and cause use-after-frees. - for (auto& kv : replacements) { - if (std::unique_ptr& new_instr = kv.second) { - new_instr->DetachFromOperands(); - } - } return result; } @@ -843,4 +982,12 @@ void HloComputation::UniquifyName(NameUniquer* name_uniquer) { name_ = name_uniquer->GetUniqueName(name_); } +HloInstruction* HloComputation::GetInstructionWithName(absl::string_view name) { + auto instructions_in_computation = instructions(); + auto it = absl::c_find_if( + instructions_in_computation, + [&](HloInstruction* instr) { return instr->name() == name; }); + return it == instructions_in_computation.end() ? nullptr : *it; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 0da4a305f3d5d694a1918fed294337100b0a27fd..fe2d3bbbe53bdcb7b2ea8a35f35e50fb3e8823b4 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COMPUTATION_H_ +#include #include #include #include @@ -24,6 +25,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/iterator_util.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" @@ -38,7 +40,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/macros.h" @@ -113,6 +114,11 @@ class HloComputation { // instruction. Status RemoveParameter(int64 param_no); + // Remove unused parameters from the computation. + // Note this is only applicatable to the computation for the fusion + // instruction. + Status RemoveUnusedParameters(); + // Add new parameter instruction to the computation. // This should be a new parameter. Instruction will be appended to parameters // and inserted to the instruction list. @@ -199,7 +205,7 @@ class HloComputation { // Compute and return a post-order of the instructions in the computation. In // this order, definitions of values always appear before their uses. - std::list MakeInstructionPostOrder() const; + std::vector MakeInstructionPostOrder() const; // Computes and returns the reachability between HLO instructions in the // computation. The returned HloReachabilityMap is constructed such that @@ -221,7 +227,7 @@ class HloComputation { // transitively. The embedded computations are sorted such that if computation // A calls computation B (eg, via a map instruction) then A will appear after // B in the list. - std::list MakeEmbeddedComputationsList() const; + std::vector MakeEmbeddedComputationsList() const; // Creates a fusion instruction containing the given instructions. // `fusion_kind` indicates the type of the fusion, e.g., loop fusion or fusion @@ -231,7 +237,7 @@ class HloComputation { // removed if they have no uses after fusion (this is necessarily true for at // least the root). 
HloInstruction* CreateFusionInstruction( - tensorflow::gtl::ArraySlice instructions_to_fuse, + absl::Span instructions_to_fuse, HloInstruction::FusionKind fusion_kind); // Create a deep copy of the given instruction and return the instruction @@ -249,6 +255,14 @@ class HloComputation { const ShapeTree* indices_to_copy = nullptr, ShapeTree* copies_added = nullptr); + // As above, but uses a custom function to copy the leaf nodes, which could + // create alternative HLOs other than kCopy, or even pass-throughs. + StatusOr DeepCopyInstructionWithCustomCopier( + HloInstruction* instruction, + const std::function< + HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* computation)>& copy_leaf); + // Computes and returns the ProgramShape of this computation (shape of // parameters and result with layout). ProgramShape ComputeProgramShape() const; @@ -351,6 +365,10 @@ class HloComputation { unique_id_ = id; } + // Returns the instruction in this computation that has name `name`. Returns + // null if there is no such computation. + HloInstruction* GetInstructionWithName(absl::string_view name); + int64 unique_id() const { return unique_id_; } private: @@ -367,18 +385,34 @@ class HloComputation { // // Pre-condition: fusion_instruction's opcode is kFusion. void FuseInstructionsInto( - tensorflow::gtl::ArraySlice instructions_to_fuse, + absl::Span instructions_to_fuse, HloInstruction* fusion_instruction); // Internal helper for recursive copying of an instruction. Creates and // returns a deep copy of the given instruction. StatusOr DeepCopyHelper( - HloInstruction* instruction, const ShapeTree* indices_to_copy, - ShapeTree* copies_added, ShapeIndex* index); + HloInstruction* instruction, ShapeIndex* index, + const std::function< + HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* computation)>& copy_leaf); // Internal helper to collect unreachable roots. 
std::vector CollectUnreachableRoots() const; + // Returns a map from channel-id to directed dependencies of the channel + // instructions. For send&recv pairs it means the send instruction and for + // cross-replica-sum the union of the dependencies for all participating + // instructions. + using ChannelDependencyMap = + tensorflow::gtl::FlatMap>; + ChannelDependencyMap ComputeChannelDependencies() const; + + enum VisitState { kVisiting, kVisited }; + void ComputeInstructionPostOrder( + const HloComputation::ChannelDependencyMap& channel_dependency_map, + std::vector* post_order, HloInstruction* root, + tensorflow::gtl::FlatMap* visited) const; + string name_; int64 unique_id_; HloInstruction* root_instruction_; diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 25469a54c48f4f5cab478aba929f1cc18de8b81f..f7ed1b0316b213a0f34b1d690229f0173dbd5250 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -118,7 +118,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) { // Test GetInstructionPostOrder for a computation with one instruction. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant)); @@ -129,7 +129,7 @@ TEST_F(HloComputationTest, PostOrderSimple) { // instructions. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto negate1 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto negate2 = builder.AddInstruction( @@ -144,7 +144,7 @@ TEST_F(HloComputationTest, PostOrderTrace) { // Test GetInstructionPostOrder for a computation with a trace instruction. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto negate1 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto trace = @@ -163,13 +163,13 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) { // which are not connected. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->MakeInstructionPostOrder(), @@ -181,11 +181,11 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) { // which are not connected. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); auto add2 = builder.AddInstruction(HloInstruction::CreateBinary( @@ -205,11 +205,11 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) { // computation has multiple roots (dead code). 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); // Add three disconnected add expressions. builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, constant1, constant2)); @@ -256,7 +256,7 @@ TEST_F(HloComputationTest, DeepCopyArray) { // Test that DeepCopyInstruction properly copies an array. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); auto copy = computation->DeepCopyInstruction(constant).ValueOrDie(); @@ -268,9 +268,9 @@ TEST_F(HloComputationTest, DeepCopyTuple) { // Test that DeepCopyInstruction properly copies a tuple. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); @@ -289,7 +289,7 @@ TEST_F(HloComputationTest, DeepCopyArrayAtIndices) { // copy are specified. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto computation = builder.Build(); { @@ -314,9 +314,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) { // specified by the given indices. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto computation = builder.Build(); @@ -371,11 +371,43 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) { } } +TEST_F(HloComputationTest, DeepCopyToken) { + // Test that DeepCopyInstruction properly handles tokens which should not be + // copied. + auto builder = HloComputation::Builder(TestName()); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + auto copy = computation->DeepCopyInstruction(token).ValueOrDie(); + + // No copy should be added. + EXPECT_THAT(copy, op::AfterAll()); +} + +TEST_F(HloComputationTest, DeepCopyTokenTuple) { + // Test that DeepCopyInstruction properly handles tokens which should not be + // copied. 
+ auto builder = HloComputation::Builder(TestName()); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + auto tuple = + builder.AddInstruction(HloInstruction::CreateTuple({token, constant})); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie(); + + // Only the array (second tuple element) should be copied. The token is passed + // through transparently. + EXPECT_THAT(copy, op::Tuple(op::GetTupleElement(tuple), + op::Copy(op::GetTupleElement(tuple)))); +} + TEST_F(HloComputationTest, CycleDetection) { // Test whether the visitor can detect cycles in the graph. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto add = builder.AddInstruction( @@ -385,6 +417,9 @@ TEST_F(HloComputationTest, CycleDetection) { // Add a control dependency to create a cycle. ASSERT_IS_OK(add->AddControlDependencyTo(negate)); + auto instructions = computation->MakeInstructionPostOrder(); + EXPECT_EQ(3, instructions.size()); + const auto visitor = [](HloInstruction* instruction) { return Status::OK(); }; auto visit_status = computation->Accept(visitor); ASSERT_FALSE(visit_status.ok()); @@ -398,7 +433,7 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) { // twice. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto dead_negate = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant)); auto dead_add = builder.AddInstruction(HloInstruction::CreateBinary( @@ -421,9 +456,9 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) { TEST_F(HloComputationTest, CloneWithControlDependency) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); @@ -467,9 +502,9 @@ TEST_F(HloComputationTest, Reachability) { // There is a control dependency from 'add' to 'exp'. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); auto negate = builder.AddInstruction( @@ -572,13 +607,14 @@ TEST_F(HloComputationTest, Stringification) { auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); - EXPECT_EQ(computation->ToString(options), - R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + const string expected_computation = + R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { %x = f32[5,10]{1,0} parameter(0) %y = f32[20,10]{1,0} parameter(1) %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} -})"); +})"; + EXPECT_EQ(computation->ToString(options), expected_computation); } TEST_F(HloComputationTest, StringificationIndent) { @@ -604,13 +640,14 @@ TEST_F(HloComputationTest, StringificationIndent) { auto options = HloPrintOptions().set_print_metadata(false).set_indent_amount(2); - EXPECT_EQ(computation->ToString(options), - R"( %TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + const string expected_computation = + R"( %TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { %x = f32[5,10]{1,0} parameter(0) %y = f32[20,10]{1,0} parameter(1) %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} - })"); + 
})"; + EXPECT_EQ(computation->ToString(options), expected_computation); } TEST_F(HloComputationTest, StringificationCanonical) { @@ -635,23 +672,46 @@ TEST_F(HloComputationTest, StringificationCanonical) { auto* computation = module->AddEntryComputation(builder.Build()); auto options = HloPrintOptions().set_print_metadata(false); - EXPECT_EQ(computation->ToString(options), - R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + const string expected_computation1 = + R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { %x = f32[5,10]{1,0} parameter(0) %y = f32[20,10]{1,0} parameter(1) %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} -})"); +})"; + EXPECT_EQ(computation->ToString(options), expected_computation1); options = HloPrintOptions().Canonical(); - EXPECT_EQ(computation->ToString(options), R"(TransposeDot { + const string expected_computation2 = R"(TransposeDot { tmp_0 = f32[5,10]{1,0} parameter(0) tmp_1 = f32[20,10]{1,0} parameter(1) tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} -})"); +})"; + EXPECT_EQ(computation->ToString(options), expected_computation2); } -} // namespace +TEST_F(HloComputationTest, ChannelReachability) { + const Shape shape = ShapeUtil::MakeShape(F32, {5, 7}); + HloComputation::Builder builder("ChannelReachability"); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); + auto send = + builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1)); + auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); + auto token1 = 
builder.AddInstruction(HloInstruction::CreateToken()); + auto recv = + builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1)); + auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build(recv_done)); + auto reachability = computation->ComputeReachability(); + EXPECT_TRUE(reachability->IsReachable(param, recv_done)); + EXPECT_FALSE(reachability->IsReachable(send, recv)); + EXPECT_FALSE(reachability->IsReachable(send_done, recv)); +} + +} // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc index 35ecd4428d0dfde2de445ea34472d2c78148c6c9..8a45939c61755876555bc35c49d7d6c781f8b4fe 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -20,8 +20,9 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" @@ -38,7 +39,7 @@ StatusOr HloConstantFolding::Run(HloModule* module) { // Limit the constant folding to 0 iterations to skip folding loops. This // retains the behavior from before while loop support in HloEvaluator and may // be revised. 
- auto evaluator = MakeUnique(/*max_loop_iterations=*/0); + auto evaluator = absl::make_unique(/*max_loop_iterations=*/0); XLA_VLOG_LINES(2, "HloConstantFolding::Run(), before:\n" + module->ToString()); @@ -51,14 +52,15 @@ StatusOr HloConstantFolding::Run(HloModule* module) { computation->root_instruction() != instruction) { continue; } - // Skip Constant, Parameter, Reduce operation. - // TODO(b/35975797): Enable Reduce operation once arbitrary computation - // are supported by the evaluator. + // Skip Constant, Parameter, and AfterAll operation. // TODO(b/64407269): Enable Tuple once the timeout issue is resolved. + // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one + // operand in which case constant folding will be impossible and this + // special case is not necessary. if (instruction->opcode() == HloOpcode::kParameter || instruction->opcode() == HloOpcode::kConstant || instruction->opcode() == HloOpcode::kTuple || - instruction->opcode() == HloOpcode::kReduce) { + instruction->opcode() == HloOpcode::kAfterAll) { continue; } // Skip instructions with non-constant operands. @@ -69,7 +71,8 @@ StatusOr HloConstantFolding::Run(HloModule* module) { // Broadcasts dramatically increase the size of constants, which is often // detrimental to performance and memory capacity, so do not fold // broadcasts. - if (instruction->opcode() == HloOpcode::kBroadcast) { + if (instruction->opcode() == HloOpcode::kBroadcast || + instruction->opcode() == HloOpcode::kIota) { continue; } diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h index 331480bd029727fa15476cb9ced2e7b7afd170f3..4557983a9c0b0006cc2189c96a88478d469475c1 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.h +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h @@ -25,7 +25,7 @@ namespace xla { // computation on constants. 
class HloConstantFolding : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "constant_folding"; } + absl::string_view name() const override { return "constant_folding"; } // Run constant folding operations on the given module. Returns whether the // module was changed (constant expressions folded). diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index 5d05ccfc0b223d8749a2577ba1bf96b1ab3e761b..07cd1efc1208309770478885532e0284bdb1fbcc 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -19,11 +19,12 @@ limitations under the License. #include #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" @@ -41,7 +42,7 @@ using HloConstantFoldingTest = HloTestBase; TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { HloComputation::Builder builder(TestName()); HloInstruction* input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input)); @@ -62,7 +63,7 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { HloComputation::Builder builder(TestName()); HloInstruction* input = builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(42))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); @@ -82,8 +83,8 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { HloComputation::Builder builder(TestName()); - HloInstruction* input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({42.0f, 19.0f}))); + HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({42.0f, 19.0f}))); builder.AddInstruction( HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input)); @@ -104,8 +105,8 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { TEST_F(HloConstantFoldingTest, Concatenate) { const struct TestConfig { int concat_dimension; - tensorflow::gtl::ArraySlice dimensions; - tensorflow::gtl::ArraySlice concat_sizes; + absl::Span dimensions; + absl::Span concat_sizes; } test_configs[] = { {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}}, {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}}, @@ -120,7 +121,7 @@ TEST_F(HloConstantFoldingTest, Concatenate) { for (auto csize : test_config.concat_sizes) { dimensions[test_config.concat_dimension] = csize; concat_size += csize; - auto literal = Literal::CreateFromDimensions(F32, dimensions); + auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); HloInstruction* insn = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); operands.push_back(insn); @@ -149,7 +150,7 @@ TEST_F(HloConstantFoldingTest, Slice) { const int64 slice_limits[] = {10, 8, 6, 5, 9}; const int64 slice_strides[] = {1, 1, 1, 1, 1}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - Literal::CreateRandomLiteral( + LiteralUtil::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); HloInstruction* literal_instruction = builder.AddInstruction( 
HloInstruction::CreateConstant(std::move(literal))); @@ -172,7 +173,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) { HloComputation::Builder builder(TestName()); const int64 dimensions[] = {11, 8, 7, 5, 9}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - Literal::CreateRandomLiteral( + LiteralUtil::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); auto literal_clone = literal->Literal::CloneToUnique(); HloInstruction* literal_instruction = builder.AddInstruction( @@ -195,12 +196,52 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) { using NativeT = typename primitive_util::PrimitiveTypeToNative::type; bool matched = true; root->literal().EachCell( - [&](tensorflow::gtl::ArraySlice indices, NativeT value) { + [&](absl::Span indices, NativeT value) { std::vector rindexes = Permute(permutation, indices); matched = matched && (value == literal_clone->Get(rindexes)); }); EXPECT_TRUE(matched); } +const char* const kConstantFoldReduce = R"( + HloModule ConstantFoldReduce + + add { + a = s32[] parameter(0) + b = s32[] parameter(1) + ROOT add = s32[] add(a, b) + } + + ENTRY r { + x = s32[3] constant({1, 2, 3}) + init = s32[] constant(0) + ROOT reduce = s32[] reduce(x, init), dimensions={0}, to_apply=add + })"; + +TEST_F(HloConstantFoldingTest, ConstantFoldReduce) { + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(kConstantFoldReduce)); + HloConstantFolding const_folder; + TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + EXPECT_EQ(6, module->entry_computation() + ->root_instruction() + ->literal() + .GetFirstElement()); +} + +TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) { + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(kConstantFoldReduce)); + HloInstruction* add = module->computations().begin()->root_instruction(); + LayoutUtil::ClearLayout(add->mutable_shape()); + HloConstantFolding const_folder; + TF_ASSERT_OK_AND_ASSIGN(bool result, 
const_folder.Run(module.get())); + EXPECT_FALSE(result); + + EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 94c9c7eabcc99d4cf61f535925c068a9b55ed136..939b5114c3f8f93ad2d768e77db302ae83e44d17 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -49,9 +49,9 @@ Status HloCostAnalysis::Preprocess(const HloInstruction* hlo) { // The default number of bytes accessed for an instruction is the sum of the // sizes of the inputs and outputs. The default ShapeUtil::ByteSizeOf does not // handle opaque types. - float bytes_accessed = shape_size_(hlo->shape()); + float bytes_accessed = GetShapeSize(hlo->shape()); for (const HloInstruction* operand : hlo->operands()) { - bytes_accessed += shape_size_(operand->shape()); + bytes_accessed += GetShapeSize(operand->shape()); } current_properties_[kBytesAccessedKey] = bytes_accessed; @@ -121,6 +121,13 @@ Status HloCostAnalysis::HandleElementwiseOp( } } +int64 HloCostAnalysis::GetShapeSize(const Shape& shape) const { + if (!LayoutUtil::HasLayout(shape)) { + return 0; + } + return shape_size_(shape); +} + Status HloCostAnalysis::HandleElementwiseUnary(const HloInstruction* hlo) { return HandleElementwiseOp(hlo); } @@ -155,6 +162,10 @@ Status HloCostAnalysis::HandleConstant(const HloInstruction*) { return Status::OK(); } +Status HloCostAnalysis::HandleIota(const HloInstruction*) { + return Status::OK(); +} + Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) { // GetTupleElement forwards a pointer and does not touch each element in the // output. 
@@ -164,7 +175,11 @@ Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) { return Status::OK(); } -Status HloCostAnalysis::HandleSelect(const HloInstruction*) { +Status HloCostAnalysis::HandleSelect(const HloInstruction* hlo) { + return HandleElementwiseOp(hlo); +} + +Status HloCostAnalysis::HandleTupleSelect(const HloInstruction*) { return Status::OK(); } @@ -172,15 +187,22 @@ Status HloCostAnalysis::HandleReverse(const HloInstruction*) { return Status::OK(); } -Status HloCostAnalysis::HandleSlice(const HloInstruction*) { +Status HloCostAnalysis::HandleSlice(const HloInstruction* slice) { + current_properties_[kBytesAccessedKey] = GetShapeSize(slice->shape()) * 2; return Status::OK(); } -Status HloCostAnalysis::HandleDynamicSlice(const HloInstruction*) { +Status HloCostAnalysis::HandleDynamicSlice( + const HloInstruction* dynamic_slice) { + current_properties_[kBytesAccessedKey] = + GetShapeSize(dynamic_slice->shape()) * 2; return Status::OK(); } -Status HloCostAnalysis::HandleDynamicUpdateSlice(const HloInstruction*) { +Status HloCostAnalysis::HandleDynamicUpdateSlice( + const HloInstruction* dynamic_update_slice) { + current_properties_[kBytesAccessedKey] = + GetShapeSize(dynamic_update_slice->operand(1)->shape()) * 2; return Status::OK(); } @@ -189,7 +211,7 @@ Status HloCostAnalysis::HandleTuple(const HloInstruction* tuple) { // through them). The memory touched is then only the size of the output // index table of the tuple. - current_properties_[kBytesAccessedKey] = shape_size_(tuple->shape()); + current_properties_[kBytesAccessedKey] = GetShapeSize(tuple->shape()); return Status::OK(); } @@ -236,10 +258,6 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) { return Status::OK(); } -Status HloCostAnalysis::HandleHostCompute(const HloInstruction*) { - return Status::OK(); -} - Status HloCostAnalysis::HandleMap(const HloInstruction* map) { // Compute properties of the mapped function. 
TF_ASSIGN_OR_RETURN(const Properties sub_properties, @@ -256,15 +274,21 @@ Status HloCostAnalysis::HandleMap(const HloInstruction* map) { } Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) { - auto arg = reduce->operand(0); HloComputation* function = reduce->to_apply(); // Compute the cost of the user function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, ProcessSubcomputation(function)); // Compute the cost of all elements for this Reduce operation. - int64 reduction_count = ShapeUtil::ElementsIn(arg->shape()) - - ShapeUtil::ElementsIn(reduce->shape()); + // This counts the number of times the reduction function is applied, so it + // does not need to be multiplied by the number of input tensors - that's + // already "priced in" by the sub-computation doing more work. + auto arg = reduce->operand(0); + auto output_shape = ShapeUtil::IsArray(reduce->shape()) + ? reduce->shape() + : reduce->shape().tuple_shapes(0); + int64 reduction_count = + ShapeUtil::ElementsIn(arg->shape()) - ShapeUtil::ElementsIn(output_shape); for (const auto& property : sub_properties) { if (property.first != kBytesAccessedKey) { current_properties_[property.first] = property.second * reduction_count; @@ -386,6 +410,10 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) { return Status::OK(); } +Status HloCostAnalysis::HandleAfterAll(const HloInstruction*) { + return Status::OK(); +} + Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) { auto lhs = convolution->operand(0); auto rhs = convolution->operand(1); @@ -507,16 +535,24 @@ Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) { // TODO(b/33004697): Compute correct cost here, taking the actual number of // replicas into account. 
double flops = 0.0; - ShapeUtil::ForEachSubshape( - crs->shape(), [&, this](const Shape& subshape, const ShapeIndex&) { - if (ShapeUtil::IsArray(subshape)) { - flops += ShapeUtil::ElementsIn(subshape); - } - }); + ShapeUtil::ForEachSubshape(crs->shape(), + [&](const Shape& subshape, const ShapeIndex&) { + if (ShapeUtil::IsArray(subshape)) { + flops += ShapeUtil::ElementsIn(subshape); + } + }); current_properties_[kFlopsKey] = flops; return Status::OK(); } +Status HloCostAnalysis::HandleAllToAll(const HloInstruction* hlo) { + return Status::OK(); +} + +Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) { + return Status::OK(); +} + Status HloCostAnalysis::HandleRng(const HloInstruction* random) { // TODO(b/26346211): Implement better estimates for the RNG cost, since the // cost changes with the implementation and the distribution. For now, assume @@ -527,15 +563,9 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) { } Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { - // Compute the properties of the fused expression and attribute them to the - // fusion node. Use a dummy shape_size to avoid any errors from trying to - // calculate the size of a shape that does not have a layout, since nodes - // inside fusion nodes do not necessarily have a layout assigned. - ShapeSizeFunction shape_size = [](const Shape& shape) { return 0; }; TF_ASSIGN_OR_RETURN( current_properties_, - ProcessSubcomputation(fusion->fused_instructions_computation(), - &shape_size)); + ProcessSubcomputation(fusion->fused_instructions_computation())); // Fusion nodes that produce a tuple also produce the entries in the tuple. 
// Ignore the memory accessed inside fused ops, since fusion is supposed to @@ -544,11 +574,11 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { ShapeUtil::ForEachSubshape( fusion->shape(), [this](const Shape& subshape, const ShapeIndex& /*shape_index*/) { - current_properties_[kBytesAccessedKey] += shape_size_(subshape); + current_properties_[kBytesAccessedKey] += GetShapeSize(subshape); }); for (const HloInstruction* operand : fusion->operands()) { - current_properties_[kBytesAccessedKey] += shape_size_(operand->shape()); + current_properties_[kBytesAccessedKey] += GetShapeSize(operand->shape()); } return Status::OK(); @@ -629,6 +659,11 @@ Status HloCostAnalysis::HandleGather(const HloInstruction* gather) { return Status::OK(); } +Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) { + // TODO(b/32945756): Compute the properties of the sub-computation. + return Status::OK(); +} + Status HloCostAnalysis::FinishVisit(const HloInstruction*) { return Status::OK(); } @@ -666,11 +701,8 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { } StatusOr HloCostAnalysis::ProcessSubcomputation( - HloComputation* computation, const ShapeSizeFunction* shape_size) { - if (shape_size == nullptr) { - shape_size = &shape_size_; - } - HloCostAnalysis visitor(*shape_size, per_second_rates_); + HloComputation* computation) { + HloCostAnalysis visitor(shape_size_, per_second_rates_); TF_RETURN_IF_ERROR(computation->Accept(&visitor)); return visitor.properties(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index d17678d20f2a23fd98d18b77d5fb25853901a789..9bb3f12ee2c7867d71de61c5077f129fdf59ef75 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COST_ANALYSIS_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_COST_ANALYSIS_H_ +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -23,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -52,9 +52,11 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleElementwiseUnary(const HloInstruction* hlo) override; Status HandleElementwiseBinary(const HloInstruction* hlo) override; Status HandleConstant(const HloInstruction* constant) override; + Status HandleIota(const HloInstruction* iota) override; Status HandleGetTupleElement( const HloInstruction* get_tuple_element) override; - Status HandleSelect(const HloInstruction* select) override; + Status HandleSelect(const HloInstruction* hlo) override; + Status HandleTupleSelect(const HloInstruction* hlo) override; Status HandleCompare(const HloInstruction* compare) override; Status HandleClamp(const HloInstruction* clamp) override; Status HandleReducePrecision(const HloInstruction* hlo) override; @@ -69,9 +71,10 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleConvolution(const HloInstruction* convolution) override; Status HandleFft(const HloInstruction* fft) override; Status HandleCrossReplicaSum(const HloInstruction* crs) override; + Status HandleAllToAll(const HloInstruction* hlo) override; + Status HandleCollectivePermute(const HloInstruction* hlo) override; Status HandleInfeed(const HloInstruction* infeed) override; Status HandleOutfeed(const HloInstruction* outfeed) override; - Status HandleHostCompute(const 
HloInstruction* host_compute) override; Status HandleRng(const HloInstruction* random) override; Status HandleReverse(const HloInstruction* reverse) override; Status HandleSort(const HloInstruction* sort) override; @@ -97,10 +100,12 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleBroadcast(const HloInstruction* broadcast) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; + Status HandleAfterAll(const HloInstruction* token) override; Status HandleTranspose(const HloInstruction* transpose) override; Status HandleWhile(const HloInstruction* xla_while) override; Status HandleConditional(const HloInstruction* conditional) override; Status HandleGather(const HloInstruction* gather) override; + Status HandleScatter(const HloInstruction* scatter) override; Status FinishVisit(const HloInstruction* root) override; Status Preprocess(const HloInstruction* hlo) override; @@ -146,11 +151,8 @@ class HloCostAnalysis : public ConstDfsHloVisitor { const Properties& per_second_rates); // Returns the properties computed from visiting the computation rooted at the - // given hlo. Uses shape_size_ to calculate shape sizes if shape_size is null, - // otherwise uses shape_size_. - StatusOr ProcessSubcomputation( - HloComputation* computation, - const ShapeSizeFunction* shape_size = nullptr); + // given hlo. + StatusOr ProcessSubcomputation(HloComputation* computation); // Utility function to handle all element-wise operations. Status HandleElementwiseOp(const HloInstruction* hlo_instruction); @@ -167,6 +169,10 @@ class HloCostAnalysis : public ConstDfsHloVisitor { static float GetPropertyForHlo(const HloInstruction& hlo, const string& key, const HloToProperties& hlo_to_properties); + // Decorates shape_size_ by returning 0 immediately if the shape does not have + // a layout. 
+ int64 GetShapeSize(const Shape& shape) const; + // Function which computes the size of the top-level of a given shape (not // including nested elements, if any). If null then bytes_accessed methods // return an error. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc index 16fdda8a8b9ade09ea31cda1f4cf5e8ff2c0a081..2c854eea18642eb7cb081b4fdfe3bc83627e41ae 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc @@ -22,8 +22,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/service.h" @@ -59,9 +59,9 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a unary user function: x => exp(x + 0.5) { XlaBuilder builder("add_and_exp"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto half = builder.ConstantR0(0.5); - builder.Exp(builder.Add(x, half)); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto half = ConstantR0(&builder, 0.5); + Exp(Add(x, half)); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); add_and_exp_ = computation_status.ConsumeValueOrDie(); @@ -70,9 +70,9 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary user function: (x, y) => x + y { XlaBuilder builder("add"); - auto x = 
builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Add(x, y); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); add_ = computation_status.ConsumeValueOrDie(); @@ -81,9 +81,9 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a sigmoid function: x => 1 / (1 + exp(-x)) { XlaBuilder builder("sigmoid"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto one = builder.ConstantR0(1.0); - builder.Div(one, builder.Add(one, builder.Exp(builder.Neg(x)))); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto one = ConstantR0(&builder, 1.0); + Div(one, Add(one, Exp(Neg(x)))); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); sigmoid_ = computation_status.ConsumeValueOrDie(); @@ -92,9 +92,9 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary max function: (x, y) => max (x, y) { XlaBuilder builder("max"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder.Max(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Max(x, y); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); max_ = computation_status.ConsumeValueOrDie(); @@ -103,9 +103,9 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary GT function: (x, y) => x > y { XlaBuilder builder("gt"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - 
builder.Gt(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Gt(x, y); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); gt_ = computation_status.ConsumeValueOrDie(); @@ -137,9 +137,9 @@ class HloCostAnalysisTest : public ::testing::Test { TEST_F(HloCostAnalysisTest, MatrixMultiply) { XlaBuilder builder("matrix_multiply"); - auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs"); - auto rhs = builder.Parameter(1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs"); - auto result = builder.Dot(lhs, rhs); + auto lhs = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs"); + auto rhs = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs"); + Dot(lhs, rhs); // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); @@ -159,8 +159,8 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) { TEST_F(HloCostAnalysisTest, Map) { XlaBuilder builder("map"); - auto input = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10}), "in"); - auto result = builder.Map({input}, add_and_exp_, {0}); + auto input = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10}), "in"); + Map(&builder, {input}, add_and_exp_, {0}); // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); @@ -176,17 +176,17 @@ TEST_F(HloCostAnalysisTest, Map) { TEST_F(HloCostAnalysisTest, Convolution) { XlaBuilder builder("convolution"); - auto input = builder.Parameter( - 0, + auto input = Parameter( + &builder, 0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10, /*x_dim=*/20}), "input"); - auto kernel = builder.Parameter( - 1, + auto kernel = Parameter( + &builder, 1, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3, /*x_dim=*/3}), "kernel"); - auto result = builder.Conv(input, kernel, {1, 1}, Padding::kValid); + Conv(input, kernel, {1, 1}, Padding::kValid); // Run HLO cost analysis. 
auto hlo_module = BuildHloGraph(&builder); @@ -206,9 +206,8 @@ TEST_F(HloCostAnalysisTest, Convolution) { TEST_F(HloCostAnalysisTest, Reduce) { XlaBuilder builder("reduce"); auto input = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); - auto result = - builder.Reduce(input, builder.ConstantR0(0.0f), add_, {1}); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); + Reduce(input, ConstantR0(&builder, 0.0f), add_, {1}); // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); @@ -224,9 +223,9 @@ TEST_F(HloCostAnalysisTest, Reduce) { TEST_F(HloCostAnalysisTest, ReduceWindow) { XlaBuilder builder("reduce_window"); auto input = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); - auto result = builder.ReduceWindow(input, builder.ConstantR0(0), add_, - {4, 5}, {4, 5}, Padding::kValid); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); + ReduceWindow(input, ConstantR0(&builder, 0), add_, {4, 5}, {4, 5}, + Padding::kValid); // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); @@ -241,12 +240,11 @@ TEST_F(HloCostAnalysisTest, ReduceWindow) { TEST_F(HloCostAnalysisTest, SelectAndScatter) { XlaBuilder builder("select_and_scatter"); auto operand = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); auto source = - builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 4}), "source"); - auto result = - builder.SelectAndScatter(operand, gt_, {4, 5}, {4, 5}, Padding::kValid, - source, builder.ConstantR0(0), add_); + Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 4}), "source"); + SelectAndScatter(operand, gt_, {4, 5}, {4, 5}, Padding::kValid, source, + ConstantR0(&builder, 0), add_); // Run HLO cost analysis. 
auto hlo_module = BuildHloGraph(&builder); @@ -261,7 +259,7 @@ TEST_F(HloCostAnalysisTest, SelectAndScatter) { TEST_F(HloCostAnalysisTest, Broadcast) { XlaBuilder b("broadcast"); - b.Broadcast(b.ConstantR0(42), {10, 7}); + Broadcast(ConstantR0(&b, 42), {10, 7}); auto hlo_module = BuildHloGraph(&b); HloCostAnalysis analysis(ShapeSize); ASSERT_IS_OK( @@ -273,13 +271,12 @@ TEST_F(HloCostAnalysisTest, Broadcast) { TEST_F(HloCostAnalysisTest, FullyConnectedForward) { XlaBuilder builder("fully_connected_forward"); auto input = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 5}), "input"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5}), "input"); auto weight = - builder.Parameter(1, ShapeUtil::MakeShape(F32, {5, 20}), "weight"); - auto bias = builder.Parameter(2, ShapeUtil::MakeShape(F32, {20}), "bias"); + Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 20}), "weight"); + auto bias = Parameter(&builder, 2, ShapeUtil::MakeShape(F32, {20}), "bias"); // sigmoid(input * weight + bias) - auto result = builder.Map( - {builder.Add(builder.Dot(input, weight), bias, {1})}, sigmoid_, {0, 1}); + Map(&builder, {Add(Dot(input, weight), bias, {1})}, sigmoid_, {0, 1}); // Run HLO cost analysis. 
auto hlo_module = BuildHloGraph(&builder); @@ -297,11 +294,11 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { HloCostAnalysis conv_analysis(ShapeSize); { XlaBuilder builder("conv_looking_matmul"); - auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), - "input"); - auto rhs = builder.Parameter(1, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), - "weights"); - builder.Conv(lhs, rhs, {1, 1}, Padding::kSame); + auto lhs = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), + "input"); + auto rhs = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), + "weights"); + Conv(lhs, rhs, {1, 1}, Padding::kSame); auto hlo_module = BuildHloGraph(&builder); ASSERT_IS_OK(hlo_module->entry_computation()->root_instruction()->Accept( &conv_analysis)); @@ -311,10 +308,10 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { { XlaBuilder builder("matmul"); auto lhs = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {64, 64}), "input"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {64, 64}), "input"); auto rhs = - builder.Parameter(1, ShapeUtil::MakeShape(F32, {64, 64}), "weights"); - builder.Dot(lhs, rhs); + Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {64, 64}), "weights"); + Dot(lhs, rhs); auto hlo_module = BuildHloGraph(&builder); ASSERT_IS_OK(hlo_module->entry_computation()->root_instruction()->Accept( &matmul_analysis)); @@ -341,13 +338,13 @@ TEST_F(FusionCostAnalysis, LoopFusion) { // tuple = Tuple({sub, sub, mul, C1}) HloComputation::Builder builder(TestName()); auto c1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/0.0f, /*to=*/1.0f, /*rows=*/2, /*cols=*/2))); auto c2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/1.0f, /*to=*/2.0f, 
/*rows=*/2, /*cols=*/2))); auto c3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + HloInstruction::CreateConstant(LiteralUtil::CreateR2F32Linspace( /*from=*/2.0f, /*to=*/3.0f, /*rows=*/2, /*cols=*/2))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, c1, c2)); @@ -394,9 +391,9 @@ TEST_F(FusionCostAnalysis, NoLayout) { HloComputation::Builder builder(TestName()); auto c1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D(2, 3, 4, 5)))); + LiteralUtil::CreateR4FromArray4D(Array4D(2, 3, 4, 5)))); auto c2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3}))); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(shape_without_layout, c2, {1})); @@ -419,9 +416,9 @@ TEST_F(HloCostAnalysisTest, TupleCost) { HloCostAnalysis analysis(ShapeSize); { XlaBuilder builder("matmul"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {123}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {42}), "y"); - auto tuple = builder.Tuple({x, y}); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {123}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {42}), "y"); + Tuple(&builder, {x, y}); auto hlo_module = BuildHloGraph(&builder); ASSERT_IS_OK( @@ -435,21 +432,21 @@ TEST_F(HloCostAnalysisTest, TupleCost) { TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) { XlaBuilder builder("BaseDilatedConvolution"); - auto input = builder.Parameter( - 0, + auto input = Parameter( + &builder, 0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10, /*x_dim=*/20}), "input"); - auto kernel = builder.Parameter( - 1, + auto kernel = Parameter( + &builder, 1, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3, /*x_dim=*/3}), "kernel"); - auto result = builder.ConvGeneralDilated( - 
input, kernel, /*window_strides=*/{1, 1}, /*padding=*/{{1, 1}, {1, 1}}, - /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11}, - XlaBuilder::CreateDefaultConvDimensionNumbers(2)); + ConvGeneralDilated(input, kernel, /*window_strides=*/{1, 1}, + /*padding=*/{{1, 1}, {1, 1}}, + /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11}, + XlaBuilder::CreateDefaultConvDimensionNumbers(2)); // Run HLO cost analysis. auto hlo_module = BuildHloGraph(&builder); @@ -460,5 +457,51 @@ TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) { EXPECT_EQ(analysis.flop_count(), 1472); } +TEST_F(HloCostAnalysisTest, Slice) { + // Test the analysis on a slice. + XlaBuilder builder("slice"); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x"); + Slice(x, {0}, {1}, {1}); + auto hlo_module = BuildHloGraph(&builder); + + // Run HLO cost analysis. + HloCostAnalysis analysis(ShapeSize); + ASSERT_IS_OK( + hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); + + EXPECT_EQ(analysis.bytes_accessed(), 8); +} + +TEST_F(HloCostAnalysisTest, DynamicSlice) { + // Test the analysis on a slice. + XlaBuilder builder("dynamic-slice"); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x"); + DynamicSlice(x, ConstantR1(&builder, {1}), {1}); + auto hlo_module = BuildHloGraph(&builder); + + // Run HLO cost analysis. + HloCostAnalysis analysis(ShapeSize); + ASSERT_IS_OK( + hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); + + EXPECT_EQ(analysis.bytes_accessed(), 8); +} + +TEST_F(HloCostAnalysisTest, DynamicUpdateSlice) { + // Test the analysis on a slice. + XlaBuilder builder("dynamic-update-slice"); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x"); + DynamicUpdateSlice(x, ConstantR1(&builder, {1.0}), + ConstantR1(&builder, {1})); + auto hlo_module = BuildHloGraph(&builder); + + // Run HLO cost analysis. 
+ HloCostAnalysis analysis(ShapeSize); + ASSERT_IS_OK( + hlo_module->entry_computation()->root_instruction()->Accept(&analysis)); + + EXPECT_EQ(analysis.bytes_accessed(), 8); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 0fb65c845a6d4407c81171f6c1569fee98b1d16d..19ffb465c04ccc720ba6a8a14b187691a62b5c24 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/util.h" namespace xla { -using tensorflow::gtl::ArraySlice; -using tensorflow::strings::StrCat; +using absl::StrCat; StatusOr MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs, HloInstruction* rhs) { @@ -47,9 +49,9 @@ StatusOr MakePadHlo(HloInstruction* operand, } StatusOr MakeSliceHlo(HloInstruction* operand, - ArraySlice start_indices, - ArraySlice limit_indices, - ArraySlice strides) { + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) { HloComputation* computation = operand->parent(); TF_ASSIGN_OR_RETURN(Shape slice_shape, ShapeInference::InferSliceShape( operand->shape(), start_indices, @@ -71,7 +73,7 @@ StatusOr MakeConvolveHlo( } StatusOr MakeTransposeHlo(HloInstruction* operand, - ArraySlice dimensions) { + absl::Span dimensions) { HloComputation* computation = operand->parent(); TF_ASSIGN_OR_RETURN( Shape transpose_shape, @@ -88,15 +90,15 @@ StatusOr 
MakeReshapeHlo(const Shape& result_shape, } StatusOr MakeReshapeHlo( - ArraySlice result_shape_dim_bounds, HloInstruction* operand) { + absl::Span result_shape_dim_bounds, HloInstruction* operand) { Shape new_shape = ShapeUtil::MakeShape(operand->shape().element_type(), result_shape_dim_bounds); return MakeReshapeHlo(new_shape, operand); } -StatusOr MakeDynamicSliceHlo(HloInstruction* operand, - HloInstruction* start_indices, - ArraySlice slice_sizes) { +StatusOr MakeDynamicSliceHlo( + HloInstruction* operand, HloInstruction* start_indices, + absl::Span slice_sizes) { HloComputation* computation = operand->parent(); CHECK_EQ(computation, start_indices->parent()); TF_ASSIGN_OR_RETURN( @@ -122,8 +124,8 @@ StatusOr MakeDynamicUpdateSliceHlo( } StatusOr MakeBroadcastHlo( - HloInstruction* operand, ArraySlice broadcast_dimensions, - ArraySlice result_shape_bounds) { + HloInstruction* operand, absl::Span broadcast_dimensions, + absl::Span result_shape_bounds) { HloComputation* computation = operand->parent(); Shape broadcast_shape = ShapeUtil::MakeShape(operand->shape().element_type(), result_shape_bounds); @@ -143,18 +145,18 @@ StatusOr MakeGetTupleElementHlo(HloInstruction* operand, HloInstruction::CreateGetTupleElement(gte_shape, operand, index)); } -StatusOr MakeConcatHlo(ArraySlice operands, - int64 dimension) { +StatusOr MakeConcatHlo( + absl::Span operands, int64 dimension) { CHECK_GT(operands.size(), 0); HloComputation* computation = operands[0]->parent(); - CHECK(c_all_of(operands, [&](HloInstruction* instr) { + CHECK(absl::c_all_of(operands, [&](HloInstruction* instr) { return instr->parent() == computation; })); std::vector operand_shapes; - c_transform(operands, std::back_inserter(operand_shapes), - [](HloInstruction* instr) { return &instr->shape(); }); + absl::c_transform(operands, std::back_inserter(operand_shapes), + [](HloInstruction* instr) { return &instr->shape(); }); TF_ASSIGN_OR_RETURN(Shape concat_shape, ShapeInference::InferConcatOpShape( 
operand_shapes, dimension)); @@ -173,6 +175,28 @@ StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, HloInstruction::CreateDot(dot_shape, lhs, rhs, dim_numbers)); } +StatusOr MakeMapHlo(absl::Span operands, + HloComputation* map_computation) { + CHECK(!operands.empty()) << "Map Hlo requires at least one operand."; + HloComputation* computation = operands.front()->parent(); + std::vector operand_shapes; + int64 max_operand_rank = 0; + for (const HloInstruction* operand : operands) { + CHECK_EQ(computation, operand->parent()); + operand_shapes.push_back(&operand->shape()); + max_operand_rank = + std::max(max_operand_rank, ShapeUtil::Rank(operand->shape())); + } + std::vector map_dims(max_operand_rank); + std::iota(map_dims.begin(), map_dims.end(), 0); + TF_ASSIGN_OR_RETURN( + Shape map_shape, + ShapeInference::InferMapShape( + operand_shapes, map_computation->ComputeProgramShape(), map_dims)); + return computation->AddInstruction( + HloInstruction::CreateMap(map_shape, operands, map_computation)); +} + StatusOr CollapseFirstNDims(HloInstruction* operand, int64 n) { CHECK_GT(n, 0); @@ -204,19 +228,19 @@ StatusOr PrependDegenerateDims(HloInstruction* operand, const Shape& operand_shape = operand->shape(); new_shape_dims.reserve(n + operand_shape.dimensions_size()); new_shape_dims.insert(new_shape_dims.begin(), n, 1); - c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims)); + absl::c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims)); return MakeReshapeHlo(new_shape_dims, operand); } StatusOr ExpandFirstDimIntoNDims( - HloInstruction* operand, ArraySlice expanded_dims) { + HloInstruction* operand, absl::Span expanded_dims) { CHECK_GT(operand->shape().dimensions_size(), 0); CHECK_EQ(operand->shape().dimensions(0), Product(expanded_dims)); std::vector expanded_shape_dim_bounds; expanded_shape_dim_bounds.reserve(expanded_dims.size() + operand->shape().dimensions_size() - 1); - c_copy(expanded_dims, 
std::back_inserter(expanded_shape_dim_bounds)); + absl::c_copy(expanded_dims, std::back_inserter(expanded_shape_dim_bounds)); std::copy(operand->shape().dimensions().begin() + 1, operand->shape().dimensions().end(), std::back_inserter(expanded_shape_dim_bounds)); @@ -225,9 +249,9 @@ StatusOr ExpandFirstDimIntoNDims( return MakeReshapeHlo(new_shape, operand); } -StatusOr ElideDegenerateDims(HloInstruction* operand, - ArraySlice dims_to_elide) { - CHECK(c_is_sorted(dims_to_elide)); +StatusOr ElideDegenerateDims( + HloInstruction* operand, absl::Span dims_to_elide) { + CHECK(absl::c_is_sorted(dims_to_elide)); const Shape& input_shape = operand->shape(); // First accumulate in reverse @@ -244,12 +268,44 @@ StatusOr ElideDegenerateDims(HloInstruction* operand, } } - c_reverse(new_shape_dim_bounds); + absl::c_reverse(new_shape_dim_bounds); Shape output_shape = ShapeUtil::MakeShape(input_shape.element_type(), new_shape_dim_bounds); return MakeReshapeHlo(output_shape, operand); } +StatusOr InsertDegenerateDims( + HloInstruction* operand, absl::Span dims_to_insert) { + CHECK(absl::c_is_sorted(dims_to_insert)); + + const Shape& operand_shape = operand->shape(); + int64 output_shape_rank = + operand_shape.dimensions_size() + dims_to_insert.size(); + for (auto dim_to_insert : dims_to_insert) { + CHECK_LT(dim_to_insert, output_shape_rank); + } + + std::vector output_shape_dim_bounds; + output_shape_dim_bounds.reserve(output_shape_rank); + int64 operand_dims_idx = 0; + int64 dims_to_insert_idx = 0; + for (int64 i = 0; i < output_shape_rank; ++i) { + if (dims_to_insert_idx < dims_to_insert.size() && + i == dims_to_insert[dims_to_insert_idx]) { + output_shape_dim_bounds.push_back(1); + ++dims_to_insert_idx; + } else { + output_shape_dim_bounds.push_back( + operand_shape.dimensions(operand_dims_idx)); + ++operand_dims_idx; + } + } + + Shape output_shape = ShapeUtil::MakeShape(operand_shape.element_type(), + output_shape_dim_bounds); + return MakeReshapeHlo(output_shape, operand); 
+} + StatusOr PadVectorWithZeros(HloInstruction* operand, int64 zeros_to_prepend, int64 zeros_to_append) { @@ -261,26 +317,26 @@ StatusOr PadVectorWithZeros(HloInstruction* operand, padding_config_dim.set_edge_padding_high(zeros_to_append); *padding_config.add_dimensions() = padding_config_dim; - HloInstruction* zero = - computation->AddInstruction(HloInstruction::CreateConstant( - MakeUnique(Literal::Zero(operand->shape().element_type())))); + HloInstruction* zero = computation->AddInstruction( + HloInstruction::CreateConstant(absl::make_unique( + LiteralUtil::Zero(operand->shape().element_type())))); return MakePadHlo(operand, zero, padding_config); } StatusOr BroadcastZeros( HloComputation* computation, PrimitiveType element_type, - ArraySlice broadcast_dimensions) { + absl::Span broadcast_dimensions) { HloInstruction* zero = computation->AddInstruction(HloInstruction::CreateConstant( - MakeUnique(Literal::Zero(element_type)))); + absl::make_unique(LiteralUtil::Zero(element_type)))); return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{}, /*result_shape_bounds=*/broadcast_dimensions); } StatusOr> CreateComputationWithSignature( - ArraySlice domain, const Shape& range, - tensorflow::StringPiece name) { - HloComputation::Builder b{std::string(name)}; + absl::Span domain, const Shape& range, + absl::string_view name) { + HloComputation::Builder b{string(name)}; int64 param_idx = 0; for (const Shape* param_shape : domain) { b.AddInstruction(HloInstruction::CreateParameter( diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index 49b1402d689a74874e34423a1832a0b6aa15f469..a1c4b374d1121bbf94f5940b52859682808119c4 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -40,10 +40,10 @@ StatusOr MakePadHlo(HloInstruction* operand, // Creates a slice HLO instruction and adds it to the computation containing // `operand`. 
-StatusOr MakeSliceHlo( - HloInstruction* operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); +StatusOr MakeSliceHlo(HloInstruction* operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); // Creates a convolution HLO instruction and adds it to the computation // containing `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation). @@ -53,8 +53,8 @@ StatusOr MakeConvolveHlo( // Creates a transpose HLO instruction and adds it to the computation containing // `operand`. -StatusOr MakeTransposeHlo( - HloInstruction* operand, tensorflow::gtl::ArraySlice dimensions); +StatusOr MakeTransposeHlo(HloInstruction* operand, + absl::Span dimensions); // Creates a reshape HLO instruction and adds it to the computation containing // `operand`. @@ -62,15 +62,14 @@ StatusOr MakeReshapeHlo(const Shape& result_shape, HloInstruction* operand); StatusOr MakeReshapeHlo( - tensorflow::gtl::ArraySlice result_shape_dim_bounds, - HloInstruction* operand); + absl::Span result_shape_dim_bounds, HloInstruction* operand); // Creates a dynamic-slice HLO instruction and adds it to the computation // containing `operand` and `start_indices` (`operand` and `start_indices` must // be in the same computation). StatusOr MakeDynamicSliceHlo( HloInstruction* operand, HloInstruction* start_indices, - tensorflow::gtl::ArraySlice slice_sizes); + absl::Span slice_sizes); // Creates a dynamic-update-slice HLO instruction and adds it to the computation // containing `operand`, `update` and `start_indices` (`operand`, `update` and @@ -82,9 +81,8 @@ StatusOr MakeDynamicUpdateSliceHlo( // Creates a broadcast HLO instruction and adds it to the computation containing // `operand`. 
StatusOr MakeBroadcastHlo( - HloInstruction* operand, - tensorflow::gtl::ArraySlice broadcast_dimensions, - tensorflow::gtl::ArraySlice result_shape_bounds); + HloInstruction* operand, absl::Span broadcast_dimensions, + absl::Span result_shape_bounds); // Creates a GetTupleElement HLO instruction and adds it to the computation // containing `operand`. @@ -95,13 +93,18 @@ StatusOr MakeGetTupleElementHlo(HloInstruction* operand, // containing `operands` (`operands` must be non-empty and every element must be // contained in the same computation). StatusOr MakeConcatHlo( - tensorflow::gtl::ArraySlice operands, int64 dimension); + absl::Span operands, int64 dimension); // Creates a Dot HLO instruction and adds it to the computation containing `lhs` // and `rhs` (both must be in the same computation). StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, const DotDimensionNumbers& dim_numbers); +// Creates a Map HLO instruction and adds it to the computation containing the +// operands. All operands must be in the same computation. +StatusOr MakeMapHlo(absl::Span operands, + HloComputation* map_computation); + // ----------------------------------------------------------------------------- // Some other miscellaneous helpers to generate common HLO patterns. All of // these add all the instructions they generate into the computation containing @@ -132,7 +135,7 @@ StatusOr PrependDegenerateDims(HloInstruction* operand, // For instance if `operand` has shape f32[200,9,7] and expanded_dims is // {2,5,20} the result is `operand` reshaped to [2,5,20,9,7]. StatusOr ExpandFirstDimIntoNDims( - HloInstruction* operand, tensorflow::gtl::ArraySlice expanded_dims); + HloInstruction* operand, absl::Span expanded_dims); // Elides (via reshape) a set of degenerate dimensions (dimensions containing // exactly one element), `dims_to_elide` from `operand`. 
Every dimension in @@ -142,7 +145,17 @@ StatusOr ExpandFirstDimIntoNDims( // For example if `operand` is of shape f32[19,1,20,1,7,1,9] and dims_to_elide // is {1,5} then the result is `operand` reshaped to [19,20,1,7,9]. StatusOr ElideDegenerateDims( - HloInstruction* operand, tensorflow::gtl::ArraySlice dims_to_elide); + HloInstruction* operand, absl::Span dims_to_elide); + +// Inserts (via reshape) a set of degenerate dimensions (dimensions containing +// exactly one element), `dims_to_insert` into `operand`. The dimensions in +// `dims_to_insert` refer to the dimensions in the result, and hence should be +// less than the rank of the result. Also, `dims_to_insert` must be sorted. +// +// For example, if `operand` is of shape f32[12,21,8,34] and dims_to_insert is +// {0, 2}, then the result is `operand` reshaped to [1,12,1,21,8,34]. +StatusOr InsertDegenerateDims( + HloInstruction* operand, absl::Span dims_to_insert); // Pads `operand` (which must have rank 1) with `zeros_to_prepend` zeros in the // front and `zeros_to_append` zeros in the back. @@ -155,13 +168,13 @@ StatusOr PadVectorWithZeros(HloInstruction* operand, // broadcast instruction is emitted into `computation`. StatusOr BroadcastZeros( HloComputation* computation, PrimitiveType element_type, - tensorflow::gtl::ArraySlice broadcast_dimensions); + absl::Span broadcast_dimensions); // Creates a HLO computation that takes arguments of type `domain` and produces // a value of type `range`. 
StatusOr> CreateComputationWithSignature( - tensorflow::gtl::ArraySlice domain, const Shape& range, - tensorflow::StringPiece name); + absl::Span domain, const Shape& range, + absl::string_view name); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc index 7e7c4f95fed737f40064224717f409b934e4ff27..eb6affadc800d9d5cf7b143386b46f3e8c608e63 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc @@ -14,23 +14,22 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace { -using tensorflow::gtl::ArraySlice; -class HloCreationUtilsTest : public HloTestBase { +class HloCreationUtilsTest : public HloVerifiedTestBase { protected: - static std::unique_ptr CreateModuleWithProgramShape( - PrimitiveType primitive_type, ArraySlice input_shape_dims, - ArraySlice output_shape_dims, HloInstruction** param, + HloModule* CreateModuleWithProgramShape( + PrimitiveType primitive_type, absl::Span input_shape_dims, + absl::Span output_shape_dims, HloInstruction** param, HloComputation** entry_computation) { Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); Shape output_shape = @@ -48,10 +47,10 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) { HloInstruction* param; HloComputation* 
entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{2}, /*output_shape_dims=*/{2}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(S32, + /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed, CollapseFirstNDims(param, 1)); @@ -60,15 +59,15 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) { HloEvaluator evaluator; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR1({3, 4})})); - CHECK_EQ(*result_literal, *Literal::CreateR1({3, 4})); + *module, {LiteralUtil::CreateR1({3, 4})})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR1({3, 4})); } TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( + HloModule* module = CreateModuleWithProgramShape( S32, /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, ¶m, &entry_computation); @@ -82,10 +81,10 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) { std::unique_ptr result_literal, evaluator.Evaluate>( *module, - {Literal::CreateR3( + {LiteralUtil::CreateR3( {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})})); CHECK_EQ(*result_literal, - *Literal::CreateR2( + *LiteralUtil::CreateR2( {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}})); } @@ -93,27 +92,28 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 2}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(S32, + /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{1, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended, PrependDegenerateDims(param, 
1)); entry_computation->set_root_instruction(with_1_degenerate_dim_prepended); HloEvaluator evaluator; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, - evaluator.Evaluate>( - *module, {Literal::CreateR1({9, 10})})); - CHECK_EQ(*result_literal, *Literal::CreateR2({{9, 10}})); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {LiteralUtil::CreateR1({9, 10})})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR2({{9, 10}})); } TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( + HloModule* module = CreateModuleWithProgramShape( S32, /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, ¶m, &entry_computation); @@ -123,20 +123,21 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) { entry_computation->set_root_instruction(with_2_degenerate_dims_prepended); HloEvaluator evaluator; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, - evaluator.Evaluate>( - *module, {Literal::CreateR1({9, 10})})); - CHECK_EQ(*result_literal, *Literal::CreateR3({{{9, 10}}})); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {LiteralUtil::CreateR1({9, 10})})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR3({{{9, 10}}})); } TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{}, /*output_shape_dims=*/{1, 1}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(S32, + /*input_shape_dims=*/{}, + /*output_shape_dims=*/{1, 1}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended, PrependDegenerateDims(param, 2)); @@ -145,15 +146,15 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { HloEvaluator evaluator; 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR0(9)})); - CHECK_EQ(*result_literal, *Literal::CreateR2({{9}})); + *module, {LiteralUtil::CreateR0(9)})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR2({{9}})); } TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( + HloModule* module = CreateModuleWithProgramShape( S32, /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, ¶m, &entry_computation); @@ -166,19 +167,19 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR1({1, 2, 3, 4, 5, 6})})); + *module, {LiteralUtil::CreateR1({1, 2, 3, 4, 5, 6})})); CHECK_EQ(*result_literal, - *Literal::CreateR3({{{1, 2}}, {{3, 4}}, {{5, 6}}})); + *LiteralUtil::CreateR3({{{1, 2}}, {{3, 4}}, {{5, 6}}})); } TEST_F(HloCreationUtilsTest, PadVectorWithZeros) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{2}, /*output_shape_dims=*/{6}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(S32, + /*input_shape_dims=*/{2}, + /*output_shape_dims=*/{6}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zero_padded_param, @@ -188,18 +189,18 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) { HloEvaluator evaluator; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR1({3, 4})})); - CHECK_EQ(*result_literal, *Literal::CreateR1({0, 0, 0, 3, 4, 0})); + *module, {LiteralUtil::CreateR1({3, 4})})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR1({0, 0, 0, 3, 4, 0})); } TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = 
CreateModuleWithProgramShape( - S32, - /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(S32, + /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zeros, @@ -209,18 +210,18 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) { HloEvaluator evaluator; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR0(0)})); - CHECK_EQ(*result_literal, *Literal::CreateR2({{0, 0}, {0, 0}})); + *module, {LiteralUtil::CreateR0(0)})); + CHECK_EQ(*result_literal, *LiteralUtil::CreateR2({{0, 0}, {0, 0}})); } TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) { HloInstruction* param; HloComputation* entry_computation; - std::unique_ptr module = CreateModuleWithProgramShape( - F32, - /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, ¶m, - &entry_computation); + HloModule* module = CreateModuleWithProgramShape(F32, + /*input_shape_dims=*/{}, + /*output_shape_dims=*/{2, 2}, + ¶m, &entry_computation); TF_ASSERT_OK_AND_ASSIGN( HloInstruction * zeros, @@ -230,9 +231,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) { HloEvaluator evaluator; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, evaluator.Evaluate>( - *module, {Literal::CreateR0(0.0f)})); + *module, {LiteralUtil::CreateR0(0.0f)})); CHECK_EQ(*result_literal, - *Literal::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); + *LiteralUtil::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); } } // namespace diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index dab946a099fa0066a4a0d42ce29077b9de6a486e..cb367adf5ef29111838dd6ee1b770394eef1301c 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_domain_map.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/hash/hash.h" namespace xla { @@ -103,6 +104,9 @@ int64 CseHash(const HloInstruction* instruction) { for (auto operand : instruction->operands()) { hash = tensorflow::Hash64Combine(hash, operand->unique_id()); } + if (instruction->opcode() == HloOpcode::kConstant) { + hash = tensorflow::Hash64Combine(hash, instruction->literal().Hash()); + } return hash; } @@ -135,15 +139,14 @@ StatusOr HloCSE::Run(HloModule* module) { // instruction for each class. tensorflow::gtl::FlatSet - representatives(/*N=*/1024, &CseHash, cse_equal); - + representatives(/*N=*/computation->instruction_count() + 1, &CseHash, + cse_equal); for (auto instruction : computation->MakeInstructionPostOrder()) { // If the instruction has zero operands (constants, parameters, etc.) skip // over it. if (instruction->operand_count() == 0) { continue; } - // Skip instructions which have side effects. 
if (instruction->HasSideEffect()) { continue; diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h index 5e2b348bdda2b31556fb692e24d2bad2e4173ef5..a28c03599a8765da708f37b986010713654647cb 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.h +++ b/tensorflow/compiler/xla/service/hlo_cse.h @@ -34,7 +34,7 @@ class HloCSE : public HloPassInterface { : is_layout_sensitive_(is_layout_sensitive), only_fusion_computations_(only_fusion_computations) {} ~HloCSE() override = default; - tensorflow::StringPiece name() const override { return "cse"; } + absl::string_view name() const override { return "cse"; } // Run CSE on the given module. Returns whether the module was changed (common // subexpressions were found and eliminated). diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc index e8c5ca347bb33bb12e30ef4cc7a43107b16de894..406d712ec6783a310aabc6600b8b70e1a1ae30a9 100644 --- a/tensorflow/compiler/xla/service/hlo_cse_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc @@ -20,9 +20,9 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -32,10 +32,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/types.h" @@ -53,9 +53,9 @@ TEST_F(HloCseTest, CombineTwoConstants) { // Test that two identical constants are commoned. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); @@ -72,7 +72,7 @@ TEST_F(HloCseTest, CombineTwoConstants) { EXPECT_EQ(42.0f, constant->literal().Get({})); auto result = ExecuteAndTransfer(std::move(module), {}); - auto expected = Literal::CreateR0(84.0); + auto expected = LiteralUtil::CreateR0(84.0); EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } @@ -81,10 +81,10 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { // the pass is not layout sensitive. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0})))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); @@ -104,7 +104,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { EXPECT_THAT(add, op::Add(first_operand, first_operand)); auto result = ExecuteAndTransfer(std::move(module), {}); - auto expected = Literal::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); + auto expected = LiteralUtil::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } @@ -113,10 +113,10 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { // if the pass is layout sensitive. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0})))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); @@ -134,7 +134,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { EXPECT_THAT(add, op::Add(constant1, constant2)); auto result = ExecuteAndTransfer(std::move(module), {}); - auto expected = Literal::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); + auto expected = LiteralUtil::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } @@ -144,20 +144,20 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) { auto builder = HloComputation::Builder(TestName()); std::vector constants; constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42)))); constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42)))); constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0)))); constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0)))); constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0)))); constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f)))); // Duplicate the float constant to verify something happens. constants.push_back(builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f)))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f)))); const Shape shape_r0 = ShapeUtil::MakeShape(F32, {}); for (int64 i = 0; i < constants.size(); ++i) { @@ -188,13 +188,13 @@ TEST_F(HloCseTest, NonscalarConstants) { // Test that identical nonscalar constants are merged. auto builder = HloComputation::Builder(TestName()); auto common_constant1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto common_constant2 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); // Create a constant which has the same shape but a different value. auto uncommon_constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 4.0}, {6.0, 8.0}}))); + LiteralUtil::CreateR2({{2.0, 4.0}, {6.0, 8.0}}))); // Tie the constants together with a tuple. This makes it easier to refer to // the constant instructions via their use. @@ -223,7 +223,7 @@ TEST_F(HloCseTest, IdenticalInstructions) { // Test that three identical instructions are commoned. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kExp, constant)); auto exp2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -239,7 +239,7 @@ TEST_F(HloCseTest, IdenticalInstructions) { EXPECT_EQ(5, computation->instruction_count()); EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3)); - HloCSE cse(/*is_layout_sensitive=*/false); + HloCSE cse(/*is_layout_sensitive=*/true); EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); EXPECT_EQ(3, computation->instruction_count()); @@ -248,12 +248,189 @@ TEST_F(HloCseTest, IdenticalInstructions) { EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand)); } +// Test two identical while loops with same inputs +TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) { + auto module = ParseHloString(R"( + HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput + + %body (param: (f32[], f32[])) -> (f32[], f32[]) { + %param = (f32[], f32[]) parameter(0) + %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), +index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), +index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1) + ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add) + } + + %condition (param.1: (f32[], f32[])) -> pred[] { + %param.1 = (f32[], f32[]) parameter(0) + ROOT %constant = pred[] constant(false) + } + + %condition.1 (param.2: (f32[], f32[])) -> pred[] { + %param.2 = (f32[], f32[]) parameter(0) + ROOT %constant.1 = pred[] constant(false) + } + + ENTRY %WhileLoopsIdenticalConditionsAndBodiesSameInput () -> (f32[], f32[]) +{ %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) %tuple.1 = +(f32[], f32[]) tuple(f32[] %constant.2, f32[] 
%constant.3) %while = (f32[], +f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT +%while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), +condition=%condition.1, body=%body + } + )") + .ValueOrDie(); + + auto computation = module->entry_computation(); + + EXPECT_EQ(5, computation->instruction_count()); + HloCSE cse(true); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); + EXPECT_EQ(4, computation->instruction_count()); +} + +// Test two while loops with same conditions, same inputs, but different +// bodies +TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) { + auto module = ParseHloString(R"( + HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies + + %body (param: (f32[], f32[])) -> (f32[], f32[]) { + %param = (f32[], f32[]) parameter(0) + %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), +index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), +index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1) + ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add) + } + + %body2 (param.1: (f32[], f32[])) -> (f32[], f32[]) { + %param.1 = (f32[], f32[]) parameter(0) + %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %param.1), +index=0 %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %param.1), +index=1 %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[] +%get-tuple-element.3) ROOT %tuple.2 = (f32[], f32[]) tuple(f32[] +%get-tuple-element.2, f32[] %sub) + } + + %condition (param.2: (f32[], f32[])) -> pred[] { + %param.2 = (f32[], f32[]) parameter(0) + ROOT %constant = pred[] constant(false) + } + + %condition.1 (param.3: (f32[], f32[])) -> pred[] { + %param.3 = (f32[], f32[]) parameter(0) + ROOT %constant.1 = pred[] constant(false) + } + + ENTRY %WhileLoopsIdenticalConditionsSameInputAndDifferentBodies () -> +(f32[], f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) + 
%tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3) + %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), +condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[], +f32[]) %tuple.1), condition=%condition.1, body=%body2 + } + )") + .ValueOrDie(); + + auto computation = module->entry_computation(); + + EXPECT_EQ(5, computation->instruction_count()); + HloCSE cse(true); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); + EXPECT_EQ(5, computation->instruction_count()); +} + +// Test two identical while loops with different inputs +TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) { + auto module = ParseHloString(R"( + HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput + + %body (param: (f32[], f32[])) -> (f32[], f32[]) { + %param = (f32[], f32[]) parameter(0) + %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), +index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), +index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1) + ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add) + } + + %condition (param.1: (f32[], f32[])) -> pred[] { + %param.1 = (f32[], f32[]) parameter(0) + ROOT %constant = pred[] constant(false) + } + + %condition.1 (param.2: (f32[], f32[])) -> pred[] { + %param.2 = (f32[], f32[]) parameter(0) + ROOT %constant.1 = pred[] constant(false) + } + + ENTRY %WhileLoopsIdenticalConditionsAndBodiesDifferentInput () -> (f32[], +f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) + %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3) + %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), +condition=%condition, body=%body %constant.4 = f32[] constant(1) %constant.5 = +f32[] constant(2) %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[] +%constant.5) ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2), +condition=%condition.1, body=%body + } + + 
)") + .ValueOrDie(); + + auto computation = module->entry_computation(); + + EXPECT_EQ(8, computation->instruction_count()); + HloCSE cse(true); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); + EXPECT_EQ(8, computation->instruction_count()); +} + +// Test two while loops with identical bodies and same inputs, but different +// conditions +TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferntConditions) { + auto module = ParseHloString(R"( + HloModule WhileLoopsIdenticalBodiesAndInputDifferntConditions + + %body (param: (f32[], f32[])) -> (f32[], f32[]) { + %param = (f32[], f32[]) parameter(0) + %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), +index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), +index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1) + ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add) + } + + %condition (param.1: (f32[], f32[])) -> pred[] { + %param.1 = (f32[], f32[]) parameter(0) + ROOT %constant = pred[] constant(false) + } + + %condition.1 (param.2: (f32[], f32[])) -> pred[] { + %param.2 = (f32[], f32[]) parameter(0) + ROOT %constant.1 = pred[] constant(true) + } + + ENTRY %WhileLoopsIdenticalBodiesAndInputDifferntConditions () -> (f32[], +f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) + %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3) + %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), +condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[], +f32[]) %tuple.1), condition=%condition.1, body=%body + })") + .ValueOrDie(); + + auto computation = module->entry_computation(); + + EXPECT_EQ(5, computation->instruction_count()); + HloCSE cse(true); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); + EXPECT_EQ(5, computation->instruction_count()); +} + TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) { // Test that two identical instructions with different 
layouts are *not* // commoned if the pass is layout sensitive. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kExp, constant)); @@ -284,7 +461,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) { // the pass is layout insensitive. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto exp1 = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kExp, constant)); @@ -362,7 +539,7 @@ TEST_F(HloCseTest, IdenticalExpressions) { // The *1 instructions should be merged with the *2 instructions. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto negate1 = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kNegate, constant)); @@ -400,9 +577,9 @@ TEST_F(HloCseTest, DoNotCombineRng) { // Test that two RNG ops are not commoned. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); auto rng1 = builder.AddInstruction(HloInstruction::CreateRng( ShapeUtil::MakeShape(F32, {}), RandomDistribution::RNG_UNIFORM, {constant1, constant2})); @@ -442,9 +619,9 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); auto builder = HloComputation::Builder(TestName() + "_rng_fun"); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); auto rng = builder.AddInstruction(HloInstruction::CreateRng( scalar_shape, RandomDistribution::RNG_UNIFORM, {constant1, constant2})); auto param = builder.AddInstruction(HloInstruction::CreateParameter( @@ -459,7 +636,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({5.0f}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({5.0f}))); auto rng1 = builder.AddInstruction( HloInstruction::CreateMap(constant->shape(), {constant}, rng_function)); auto rng2 = builder.AddInstruction( @@ -486,7 +663,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { } TEST_F(HloCseTest, CompareComputations) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule m add_computation { @@ -521,9 +698,9 @@ TEST_F(HloCseTest, 
ConstantsSameValueInDifferentDomains) { // in this case) are not collapsed. auto builder = HloComputation::Builder(TestName()); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); @@ -536,5 +713,40 @@ TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) { EXPECT_EQ(2, computation->instruction_count()); } +TEST_F(HloCseTest, Domain) { + auto module = ParseHloString(R"( +HloModule module +ENTRY %entry { + %param = f32[] parameter(0), sharding={maximal device=0} + %domain.0 = f32[] domain(%param), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} + %domain.1 = f32[] domain(%param), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} + %domain.2 = f32[] domain(%param), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=2}} + %negate.0 = f32[] negate(%domain.0) + %negate.1 = f32[] negate(%domain.1) + %negate.2 = f32[] negate(%domain.2) + %domain.3 = f32[] domain(%negate.0), + domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}} + %domain.4 = f32[] domain(%negate.1), + domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}} + %domain.5 = f32[] domain(%negate.2), + domain={kind="sharding", entry={maximal device=2}, exit={maximal device=0}} + %add = f32[] add(%domain.3, %domain.4) + ROOT %sub = f32[] subtract(%add, %domain.5) +})") + .ValueOrDie(); + + HloCSE cse(/*is_layout_sensitive=*/false); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); + LOG(INFO) << "AAAAA " << module->ToString(); + const HloInstruction* sub = module->entry_computation()->root_instruction(); + const HloInstruction* add = sub->operand(0); + 
EXPECT_EQ(add->operand(0), add->operand(1)); + EXPECT_NE(add->operand(0), sub->operand(1)); + EXPECT_NE(add->operand(1), sub->operand(1)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index cc130a4900dc162d4b416116fbe879fec37136a2..6a63681996bc57f4ef16b2405ffc8ce4f003e783 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -19,8 +19,10 @@ limitations under the License. #include #include +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -29,21 +31,88 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { +namespace { + +// We have this pattern in dynamic update slice fusion, which should be +// supported: +// +// Parameters: p0, p1 +// Fusion +// ds = DynamicSlice(p0, p1) +// ROOT DynamicUpdateSlice(p0, ds, p1) +// +// In this case, we should be able to reuse p0 and output, although p0 has +// multiple uses.
+bool MultiDynamicSliceUseShareSameIndices(absl::Span uses) { + if (uses.empty()) { + return false; + } + const HloInstruction* indices = nullptr; + for (HloUse use : uses) { + auto user = use.instruction; + if (user->opcode() == HloOpcode::kDynamicUpdateSlice) { + if (indices == nullptr) { + indices = user->operand(2); + } else if (indices != user->operand(2)) { + return false; + } + if (use.operand_number != 0) { + return false; + } + } else if (user->opcode() == HloOpcode::kDynamicSlice) { + if (indices == nullptr) { + indices = user->operand(1); + } else if (indices != user->operand(1)) { + return false; + } + } else { + return false; + } + } + return true; +} + +} // namespace -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; +using absl::StrAppend; +using absl::StrCat; -HloDataflowAnalysis::HloDataflowAnalysis(const HloModule& module, bool ssa_form, - bool bitcast_defines_value) +HloDataflowAnalysis::HloDataflowAnalysis( + const HloModule& module, bool ssa_form, bool bitcast_defines_value, + const FusionCanShareBufferFunction& fusion_can_share_buffer) : module_(module), ssa_form_(ssa_form), bitcast_defines_value_(bitcast_defines_value), - call_graph_(CallGraph::Build(&module)) {} + call_graph_(CallGraph::Build(&module)), + fusion_can_share_buffer_(fusion_can_share_buffer) {} + +bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple( + const HloInstruction* inst) { + tensorflow::gtl::FlatSet visited; + absl::InlinedVector stack; + stack.push_back(inst); + while (!stack.empty()) { + const HloInstruction* current = stack.back(); + stack.pop_back(); + visited.insert(current); + for (const HloInstruction* user : current->users()) { + // Found a user that is non-elementwise on current instruction. 
+ for (const int64 use_index : user->OperandIndices(current)) { + if (!user->IsElementwiseOnOperand(use_index) && + user->opcode() != HloOpcode::kTuple) { + return false; + } + } + if (!visited.count(user)) { + stack.push_back(user); + } + } + } + return true; +} bool HloDataflowAnalysis::ValueIsDefinedAt(const HloInstruction* instruction, const ShapeIndex& index) const { @@ -151,7 +220,7 @@ string HloDataflowAnalysis::ToString() const { bool HloDataflowAnalysis::Phi( HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs) { + absl::Span inputs) { CHECK(ssa_form_); VLOG(4) << "Phi(" << instruction->name() << ")"; VLOG(5) << "instruction value set = " @@ -328,18 +397,17 @@ bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) { bool HloDataflowAnalysis::UpdateRecvDoneValueSet(HloInstruction* recv_done) { CHECK_EQ(recv_done->opcode(), HloOpcode::kRecvDone); bool changed = false; - // RecvDone forwards the operand value at {0} to the output. + // RecvDone forwards the operand value at {0} to element {0} of its output. for (auto& pair : GetInstructionValueSet(recv_done)) { ShapeIndex& index = pair.first; HloValueSet& value_set = pair.second; - ShapeIndex operand_index = {0}; - for (int64 i : index) { - operand_index.push_back(i); + if (index.empty() || index[0] != 0) { + continue; } const HloValueSet& operand_value_set = - GetValueSet(recv_done->operand(0), operand_index); + GetValueSet(recv_done->operand(0), index); if (value_set != operand_value_set) { value_set = operand_value_set; changed = true; @@ -396,6 +464,24 @@ bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) { return changed; } +bool HloDataflowAnalysis::UpdateDomainValueSet(HloInstruction* domain) { + // Domain instructions just forward their operand. Given that domains can have + // a tuple operand, we iterate through its indexes, like for copies. + // Unlike copies though we also propagate the top-level value. 
+ CHECK_EQ(domain->opcode(), HloOpcode::kDomain); + bool changed = false; + for (auto& pair : GetInstructionValueSet(domain)) { + const ShapeIndex& index = pair.first; + HloValueSet& value_set = pair.second; + HloValueSet& operand_value_set = GetValueSet(domain->operand(0), index); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + } + return changed; +} + bool HloDataflowAnalysis::UpdateGetTupleElementValueSet(HloInstruction* gte) { CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement); bool changed = false; @@ -490,17 +576,17 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) { } } -bool HloDataflowAnalysis::UpdateSelectValueSet(HloInstruction* select) { - CHECK_EQ(select->opcode(), HloOpcode::kSelect); - // A phi value is not defined at a kSelect instruction because kSelect does - // not create a new value. Rather it forwards a value from its operands. This - // contrasts with kWhile instruction (which does define a phi value) which has - // in-place update semantics. +bool HloDataflowAnalysis::UpdateTupleSelectValueSet(HloInstruction* select) { + CHECK_EQ(select->opcode(), HloOpcode::kTupleSelect); + // A phi value is not defined at a kTupleSelect instruction because + // kTupleSelect does not create a new value. Rather it forwards a value from + // its operands. This contrasts with kWhile instruction (which does define a + // phi value) which has in-place update semantics. bool changed = false; for (auto& pair : GetInstructionValueSet(select)) { const ShapeIndex& index = pair.first; if (index.empty()) { - // kSelect copies (not forwards) the top-level value. + // kTupleSelect copies (not forwards) the top-level value. 
continue; } HloValueSet& value_set = pair.second; @@ -556,12 +642,14 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( return UpdateBitcastValueSet(instruction); case HloOpcode::kSlice: return UpdateSliceValueSet(instruction); + case HloOpcode::kDomain: + return UpdateDomainValueSet(instruction); case HloOpcode::kCopy: return UpdateCopyValueSet(instruction); case HloOpcode::kGetTupleElement: return UpdateGetTupleElementValueSet(instruction); - case HloOpcode::kSelect: - return UpdateSelectValueSet(instruction); + case HloOpcode::kTupleSelect: + return UpdateTupleSelectValueSet(instruction); case HloOpcode::kTuple: return UpdateTupleValueSet(instruction); case HloOpcode::kParameter: @@ -734,6 +822,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kGetTupleElement: + case HloOpcode::kDomain: // These instructions define no values. The values in their output // flow from their operands or from cross computation dataflow. break; @@ -747,7 +836,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { return Unimplemented( "Computation %s is called in both a parallel (eg, kMap) and " "sequential (eg, kCall) context", - computation->name().c_str()); + computation->name()); } if (call_graph_node.caller_callsites().empty() || call_graph_node.context() == CallContext::kParallel) { @@ -759,21 +848,25 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } break; case HloOpcode::kCopy: - case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: case HloOpcode::kTuple: // These instructions only define their top-level values. Any other // values flow from their operands. define_top_level_only(); break; case HloOpcode::kRecvDone: - // RecvDone aliases its input tuple element {0}, therefore does not - // define any values. + // RecvDone produces a two-element tuple. Element zero aliases its + // input tuple element {0}; element one is a token. 
+ define_value_at(/*index=*/{}); + define_value_at(/*index=*/{1}); break; case HloOpcode::kSend: - // Send produces a tuple of {aliased operand, U32 context}, therefore - // only defines the top-level tuple and the tuple element at {1}. + // Send produces a tuple of {aliased operand, U32 context, token}, + // therefore only defines the top-level tuple and the tuple elements + // at {1} and {2}. define_value_at(/*index=*/{}); define_value_at(/*index=*/{1}); + define_value_at(/*index=*/{2}); break; default: define_all_values(); @@ -787,12 +880,13 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { /* static */ StatusOr> HloDataflowAnalysis::Run( - const HloModule& module, bool ssa_form, bool bitcast_defines_value) { + const HloModule& module, bool ssa_form, bool bitcast_defines_value, + const FusionCanShareBufferFunction& fusion_can_share_buffer) { VLOG(1) << "HloDataflowAnalysis::Run on module " << module.name(); XLA_VLOG_LINES(2, module.ToString()); - auto dataflow_analysis = WrapUnique( - new HloDataflowAnalysis(module, ssa_form, bitcast_defines_value)); + auto dataflow_analysis = absl::WrapUnique(new HloDataflowAnalysis( + module, ssa_form, bitcast_defines_value, fusion_can_share_buffer)); TF_RETURN_IF_ERROR(dataflow_analysis->InitializeInstructionValueSets()); dataflow_analysis->Propagate(); @@ -881,28 +975,22 @@ Status HloDataflowAnalysis::Verify() const { bool HloDataflowAnalysis::DoesNotUseOperandBuffer( const HloInstruction* operand, const ShapeIndex& index, const HloInstruction* user) const { - CHECK(user->IsUserOf(operand)) - << "user: " << user->ToString() << " operand: " << operand->ToString(); - if (user->opcode() == HloOpcode::kFusion && - user->fusion_kind() == HloInstruction::FusionKind::kLoop) { - // Find fusion parameter associated with 'operand'. - HloInstruction* fusion_param = - user->fused_parameter(user->operand_index(operand)); - // Iterate through all users of all uses of the fusion parameter value. 
- // Return false if any uses are detected, returns true otherwise. - const HloValue& value = GetValueDefinedAt(fusion_param, index); - return value.uses().empty(); - } else { - // Return false if no value at 'operand' and 'index' is used at 'user'. - for (const HloValue* value : GetValueSet(operand, index).values()) { - for (const HloUse& use : value->uses()) { - if (use.instruction == user) { - return false; + // Return false if no value at 'operand' and 'index' is used at 'user'. + for (const HloValue* value : GetValueSet(operand, index).values()) { + for (const HloUse& use : value->uses()) { + if (use.instruction == user) { + if (user->opcode() == HloOpcode::kFusion && + user->fusion_kind() == HloInstruction::FusionKind::kLoop) { + HloInstruction* fusion_param = + user->fused_parameter(use.operand_number); + const HloValue& value = + GetValueDefinedAt(fusion_param, use.operand_index); + return value.uses().empty(); } + return false; } } } - return true; } @@ -915,34 +1003,44 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( ShapeUtil::GetSubshape(operand->shape(), operand_index); const Shape& user_subshape = ShapeUtil::GetSubshape(user->shape(), user_index); + // Check that operand and user emit the same shape and layout. 
if (!ShapeUtil::Equal(operand_subshape, user_subshape)) { return false; } if (user->opcode() == HloOpcode::kFusion) { + if (fusion_can_share_buffer_ != nullptr) { + return fusion_can_share_buffer_(user, operand); + } // Get the parameter associated with 'operand'; HloInstruction* fusion_param = user->fused_parameter(user->operand_index(operand)); const HloValue& value = GetValueDefinedAt(fusion_param, operand_index); - if (value.uses().size() != 1) { - return false; + if (MultiDynamicSliceUseShareSameIndices(value.uses())) { + return true; } - const HloUse& use = value.uses()[0]; - - if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && - user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - // Loop fusion with kDynamicUpdateSlice fused root. - // - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root at operand - // index 0. - return use.instruction == user->fused_expression_root() && - use.operand_number == 0; - } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && - user->fused_expression_root()->opcode() == HloOpcode::kAdd) { + if (user->fusion_kind() == HloInstruction::FusionKind::kLoop || + user->fusion_kind() == HloInstruction::FusionKind::kInput) { + if (user->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice) { + // Loop fusion with kDynamicUpdateSlice fused root. + // + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root at operand + // index 0. 
+ if (value.uses().size() == 1) { + const HloUse& use = value.uses()[0]; + return use.instruction == user->fused_expression_root() && + use.operand_number == 0; + } + return false; + } + return AreTransitiveUsesElementwiseOrTuple(fusion_param); + } + if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && + user->fused_expression_root()->opcode() == HloOpcode::kAdd) { // Output fusion with kAdd fused root. // Check if one operand of kAdd fused root is kDot or kConvolution. @@ -963,10 +1061,15 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( // Returns true iff there is exactly one use of 'operand' at shape index // 'operand_index', and this singleton use is the fused root (at operand // index 'other_add_operand_index'). - return use.instruction == user->fused_expression_root() && - use.operand_number == other_add_operand_index; + if (value.uses().size() == 1) { + const HloUse& use = value.uses()[0]; + return use.instruction == user->fused_expression_root() && + use.operand_number == other_add_operand_index; + } + return false; } } + if (user->opcode() == HloOpcode::kDynamicUpdateSlice || user->opcode() == HloOpcode::kWhile) { // We eliminated other users in BufferLiveness::live_range_strictly_before, @@ -974,6 +1077,21 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( std::vector operand_indices = user->OperandIndices(operand); return operand_indices.size() == 1 && operand_indices[0] == 0; } + if (user->opcode() == HloOpcode::kSort) { + // Only valid if there are no other users. + if (operand->users().size() != 1) { + return false; + } + // If we only sort keys, the output of sort is not a tuple, so we can always + // share the buffer. + if (user->operand_count() == 1) { + return true; + } + CHECK(!user_index.empty()); + // Only share with the right tuple element buffer. 
+ std::vector operand_indices = user->OperandIndices(operand); + return operand_indices.size() == 1 && user_index[0] == operand_indices[0]; + } if (user->opcode() == HloOpcode::kCall) { // Get all uses of value defined by 'operand' at 'operand_index'. const auto& uses = GetValueDefinedAt(operand, operand_index).uses(); @@ -998,8 +1116,10 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( }) != uses.end(); return uses.size() == 2 && found_caller_use && found_elementwise_callee_use; } - // Check if 'user' is element-wise. - return user->IsElementwise(); + + // Loop fusions that contain transposing copies won't reach here as they have + // different layouts, which fails the check in the beginning of this function. + return user->IsElementwiseOnOperand(user->operand_index(operand)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 9868746b6113881949e388cd2a4aa9f610b1fdb7..e62c1c2ac81981e1f44f4c7e1479107979576e32 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -25,6 +25,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -34,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -42,6 +42,20 @@ namespace xla { // Analysis which identifies all HLO values and their uses in an HLO module. 
class HloDataflowAnalysis { public: + // Different backends can have very different ways to do fusion, so we give + // backends the flexibility to decide whether a fusion instruction can share + // buffer with its operands. If this is not specified, a default strategy + // will be used; if this is specified, it will be applied *in addition* to the + // default strategy. + // + // The first parameter of the function should be the fusion instruction, the + // second parameter should be an operand of the fusion instruction. + // + // TODO(b/80315712): Find a better way to tell whether a fusion can share + // buffer. + using FusionCanShareBufferFunction = std::function; + // Run dataflow analysis on the given module. Parameters: // // ssa_form : If true then new values are defined at the merge points of @@ -61,7 +75,10 @@ class HloDataflowAnalysis { // value of its operand. static StatusOr> Run( const HloModule& module, bool ssa_form = false, - bool bitcast_defines_value = false); + bool bitcast_defines_value = false, + const FusionCanShareBufferFunction& fusion_can_share_buffer = nullptr); + + static bool AreTransitiveUsesElementwiseOrTuple(const HloInstruction* inst); // Returns true if 'instruction' defines an HLO value at the given shape index // of its output. @@ -121,7 +138,8 @@ class HloDataflowAnalysis { // Returns true if 'user' cannot possibly use the buffer at 'index' in // 'operand'. Returns false otherwise. // - // REQUIRES: 'operand' is an operand of 'user'. + // 'operand' does not have to be an operand of 'user'. This can be the case + // with indirect uses.
bool DoesNotUseOperandBuffer(const HloInstruction* operand, const ShapeIndex& index, const HloInstruction* user) const; @@ -136,8 +154,10 @@ class HloDataflowAnalysis { const ShapeIndex& user_index) const; protected: - HloDataflowAnalysis(const HloModule& module, bool ssa_form, - bool bitcast_defines_value = false); + HloDataflowAnalysis( + const HloModule& module, bool ssa_form, + bool bitcast_defines_value = false, + const FusionCanShareBufferFunction& fusion_can_share_buffer = nullptr); // Returns a new HloValue defined at the given instruction and shape index. HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index, @@ -166,10 +186,11 @@ class HloDataflowAnalysis { bool UpdateCallValueSet(HloInstruction* call); bool UpdateConditionalValueSet(HloInstruction* conditional); bool UpdateCopyValueSet(HloInstruction* copy); + bool UpdateDomainValueSet(HloInstruction* domain); bool UpdateGetTupleElementValueSet(HloInstruction* gte); bool UpdateParameterValueSet(HloInstruction* parameter); bool UpdateRecvDoneValueSet(HloInstruction* recv_done); - bool UpdateSelectValueSet(HloInstruction* select); + bool UpdateTupleSelectValueSet(HloInstruction* select); bool UpdateSendValueSet(HloInstruction* send); bool UpdateTupleValueSet(HloInstruction* tuple); bool UpdateWhileValueSet(HloInstruction* xla_while); @@ -181,7 +202,7 @@ class HloDataflowAnalysis { // the given instruction. If skip_top_level is true, then the top level of the // value set of 'instruction' is not modified. bool Phi(HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs); + absl::Span inputs); // Updates the positions of the HloValues in the output of the given // instruction. This should be called after the instruction value set of @@ -221,6 +242,10 @@ class HloDataflowAnalysis { // The Id to use for the next HloValue. HloValue::Id next_value_id_ = 0; + + // Backend specific function that decides whether a fusion can share buffer + // with its operand. 
+ FusionCanShareBufferFunction fusion_can_share_buffer_ = nullptr; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 5798326dcbf65c3c34748afb02afab1dc7af9147..d1a96c10f88e3c05e21a6db4eccb46683cd64c4a 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -101,9 +101,9 @@ TEST_P(HloDataflowAnalysisTest, BinaryOperation) { // Test the dataflow for a simple binary operation (Add). auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape_, HloOpcode::kAdd, constant1, constant2)); module_->AddEntryComputation(builder.Build()); @@ -198,9 +198,9 @@ TEST_P(HloDataflowAnalysisTest, NestedTuple) { // Verify the dataflow through a nested tuple. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto nested_tuple = builder.AddInstruction( @@ -259,9 +259,9 @@ TEST_P(HloDataflowAnalysisTest, SingleCall) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, called_computation)); module_->AddEntryComputation(builder.Build()); @@ -308,9 +308,9 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call1 = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, called_computation)); auto call2 = builder.AddInstruction(HloInstruction::CreateCall( @@ -362,9 +362,9 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call1 = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, called_computation)); auto call2 = builder.AddInstruction(HloInstruction::CreateCall( @@ -426,9 +426,9 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto call = builder.AddInstruction(HloInstruction::CreateCall( scalar_shape_, {constant1, constant2}, outer_computation)); module_->AddEntryComputation(builder.Build()); @@ -493,15 +493,15 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) { auto cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto 
xla_while = builder.AddInstruction( @@ -594,15 +594,15 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while0 = builder.AddInstruction( @@ -653,7 +653,7 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); @@ -691,9 +691,9 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto entry_while = builder.AddInstruction( @@ -780,15 +780,15 @@ TEST_P(HloDataflowAnalysisTest, SwizzlingWhile) { auto 
cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto xla_while = builder.AddInstruction( @@ -840,11 +840,11 @@ TEST_P(HloDataflowAnalysisTest, ArraySelect) { // Test a kSelect of an array value. auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( scalar_shape_, HloOpcode::kSelect, pred, constant1, constant2)); @@ -860,19 +860,18 @@ TEST_P(HloDataflowAnalysisTest, ArraySelect) { } TEST_P(HloDataflowAnalysisTest, TupleSelect) { - // Test a kSelect of a tuple value. Non-top-level element flow through the - // instruction. + // Test a kTupleSelect. Non-top-level element flow through the instruction. 
auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto constant4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4.0))); auto tuple1 = builder.AddInstruction(HloInstruction::CreateTuple({constant1})); auto tuple2 = @@ -883,20 +882,20 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) { builder.AddInstruction(HloInstruction::CreateTuple({constant4})); const Shape tuple_shape = tuple1->shape(); auto select11 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple1)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple1, tuple1)); auto select12 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple1, tuple2)); auto select34 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple3, tuple4)); + tuple_shape, HloOpcode::kTupleSelect, pred, tuple3, tuple4)); auto select1234 = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, select12, select34)); + tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34)); module_->AddEntryComputation(builder.Build()); bool ssa_form = GetParam(); 
const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - // Top-level value is always defined by a kSelect. + // Top-level value is always defined by a kTupleSelect. EXPECT_TRUE(analysis.ValueIsDefinedAt(select11)); EXPECT_TRUE(analysis.ValueIsDefinedAt(select12)); EXPECT_TRUE(analysis.ValueIsDefinedAt(select34)); @@ -937,20 +936,20 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) { } TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) { - // Test kSelect of a nested tuple. + // Test kTupleSelect of a nested tuple. auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto constant4 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4.0))); auto constant5 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5.0))); auto inner_tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant2, constant3})); auto tuple1 = builder.AddInstruction( @@ -960,7 +959,7 @@ TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) { auto tuple2 = builder.AddInstruction( HloInstruction::CreateTuple({constant4, inner_tuple2})); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + 
tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); module_->AddEntryComputation(builder.Build()); @@ -983,7 +982,7 @@ TEST_P(HloDataflowAnalysisTest, NestedTupleSelect) { } TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) { - // Test a tuple-shaped kSelect feeding a kWhile instruction. HLO: + // Test a tuple-shaped kTupleSelect feeding a kWhile instruction. HLO: // // body((F32[], F32[]) %tuple_param): // %add = Add(%tuple_param{0}, %tuple_param{1}) @@ -1026,24 +1025,24 @@ TEST_P(HloDataflowAnalysisTest, TupleSelectToWhile) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, tuple_shape, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto tuple1 = builder.AddInstruction(HloInstruction::CreateTuple({constant1})); auto tuple2 = builder.AddInstruction(HloInstruction::CreateTuple({constant2})); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); auto gte = builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_shape_, select, 0)); auto tuple = 
@@ -1089,7 +1088,7 @@ TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) { // Test the bitcast_defines_value flag to the dataflow analysis. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape_, HloOpcode::kBitcast, constant)); @@ -1158,44 +1157,50 @@ TEST_P(HloDataflowAnalysisTest, SendAndSendDone) { auto builder = HloComputation::Builder(TestName()); auto param = builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "param0")); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto send = builder.AddInstruction( - HloInstruction::CreateSend(param, /*channel_id=*/0)); + HloInstruction::CreateSend(param, token, /*channel_id=*/0)); auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); module_->AddEntryComputation(builder.Build()); bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - EXPECT_EQ(analysis.values().size(), 4); + EXPECT_EQ(analysis.values().size(), 6); EXPECT_TRUE(analysis.ValueIsDefinedAt(param)); EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{})); EXPECT_FALSE(analysis.ValueIsDefinedAt(send, /*index=*/{0})); EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{1})); + EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{2})); EXPECT_TRUE(analysis.ValueIsDefinedAt(send_done)); EXPECT_THAT(HloValuesAt(send, /*index=*/{0}), UnorderedElementsAre(analysis.GetValueDefinedAt(param))); } TEST_P(HloDataflowAnalysisTest, RecvAndRecvDone) { - // Test that a RecvDone forwards its operand tuple element at {0} to the - // output. + // Test that a RecvDone forwards its operand tuple element at {0} to element + // {0} of the output. 
auto builder = HloComputation::Builder(TestName()); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto recv = builder.AddInstruction( - HloInstruction::CreateRecv(scalar_shape_, /*channel_id=*/0)); + HloInstruction::CreateRecv(scalar_shape_, token, /*channel_id=*/0)); auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); module_->AddEntryComputation(builder.Build()); bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - EXPECT_EQ(analysis.values().size(), 3); + EXPECT_EQ(analysis.values().size(), 7); EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{})); EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{0})); EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{1})); - EXPECT_FALSE(analysis.ValueIsDefinedAt(recv_done)); - EXPECT_THAT(HloValuesAt(recv_done), + EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{2})); + EXPECT_TRUE(analysis.ValueIsDefinedAt(recv_done, /*index=*/{})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(recv_done, /*index=*/{0})); + EXPECT_TRUE(analysis.ValueIsDefinedAt(recv_done, /*index=*/{1})); + EXPECT_THAT(HloValuesAt(recv_done, /*index=*/{0}), UnorderedElementsAre(analysis.GetValueDefinedAt(recv, {0}))); EXPECT_TRUE( analysis.GetValueDefinedAt(recv, /*index=*/{0}).live_out_of_module()); @@ -1304,13 +1309,13 @@ TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) { auto body_param = body_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "body_param")); auto constant = body_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto exp = body_builder.AddInstruction( HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant)); auto add = body_builder.AddInstruction(HloInstruction::CreateBinary( scalar_shape_, HloOpcode::kAdd, exp, body_param)); auto dead_constant = body_builder.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto dead_negate = body_builder.AddInstruction(HloInstruction::CreateUnary( scalar_shape_, HloOpcode::kNegate, dead_constant)); HloComputation* body = module_->AddEmbeddedComputation( @@ -1320,7 +1325,7 @@ TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) { auto cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape_, "cond_param")); auto cond_constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); HloComputation* condition = module_->AddEmbeddedComputation(cond_builder.Build()); @@ -1571,11 +1576,11 @@ TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) { auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(56.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(56.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(12.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(12.0f))); auto conditional = builder.AddInstruction(HloInstruction::CreateConditional( scalar_shape_, pred, constant1, true_computation, constant2, false_computation)); @@ -1662,11 +1667,11 @@ TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) { auto builder = HloComputation::Builder(TestName()); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(56.0f))); + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(56.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(12.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(12.0f))); auto tuple_operand = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto conditional = builder.AddInstruction(HloInstruction::CreateConditional( @@ -1792,15 +1797,15 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) { // Build entry computation. auto builder = HloComputation::Builder(TestName()); auto pred1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); auto pred2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.2f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.2f))); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.3f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.3f))); auto tuple_operand = builder.AddInstruction( HloInstruction::CreateTuple({pred2, constant1, constant2})); auto conditional = builder.AddInstruction(HloInstruction::CreateConditional( @@ -1880,9 +1885,14 @@ class HloDataflowAnalysisTestBase : public HloTestBase { computation_ = module_->AddEntryComputation(std::move(computation)); } - void RunAnalysis() { + void RunAnalysis(const HloDataflowAnalysis::FusionCanShareBufferFunction& + fusion_can_share_buffer = nullptr) { CHECK_NOTNULL(module_.get()); - dataflow_analysis_ = HloDataflowAnalysis::Run(*module_).ConsumeValueOrDie(); + dataflow_analysis_ = + 
HloDataflowAnalysis::Run(*module_, /*ssa_form=*/false, + /*bitcast_defines_value=*/false, + fusion_can_share_buffer) + .ConsumeValueOrDie(); } void BuildModuleAndRunAnalysis(std::unique_ptr computation) { @@ -1933,9 +1943,9 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { // Create a DynamicUpdateSlice instruction of tuple element 1. auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); @@ -1953,6 +1963,54 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion)); } +// Similar to FusedDynamicUpdateSlice above, but tests indirect uses of the +// parameter tuple. +TEST_F(DoesNotUseOperandBufferTest, IndirectUses) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple_param = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto t0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple_param, 0)); + auto t1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple_param, 1)); + // Swap the tuple elements. + auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({t1, t0})); + + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. 
+ auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction never uses tuple element 0, but does use element 1. + EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion)); + EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion)); + // The same holds for the parameter tuple, except that the tuple elements are + // swapped in 'tuple'. + EXPECT_TRUE( + dataflow_analysis_->DoesNotUseOperandBuffer(tuple_param, {1}, fusion)); + EXPECT_FALSE( + dataflow_analysis_->DoesNotUseOperandBuffer(tuple_param, {0}, fusion)); +} + class CanShareOperandBufferWithUserTest : public HloDataflowAnalysisTestBase {}; TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { @@ -1974,6 +2032,114 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {})); } +TEST_F(CanShareOperandBufferWithUserTest, + NonElementwiseLoopFusionCantAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "param0")); + + auto neg = builder.AddInstruction( + HloInstruction::CreateUnary(data_shape, HloOpcode::kNegate, param0)); + + auto reverse = builder.AddInstruction( + HloInstruction::CreateReverse(data_shape, neg, {0, 1})); + + 
BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {reverse, neg}, HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, + MultiOutputFusionCanAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + Shape in_shape = ShapeUtil::MakeShape(F32, {8}); + Shape out_shape = ShapeUtil::MakeShape(PRED, {8}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, in_shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, in_shape, "param1")); + + auto copy0 = builder.AddInstruction( + HloInstruction::CreateUnary(in_shape, HloOpcode::kCopy, param0)); + auto copy1 = builder.AddInstruction( + HloInstruction::CreateUnary(in_shape, HloOpcode::kCopy, param1)); + + auto tuple = + builder.AddInstruction(HloInstruction::CreateTuple({copy1, copy0})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {tuple, copy1, copy0}, HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {}, + fusion, {0})); + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {}, + fusion, {1})); + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {}, + fusion, {0})); + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {}, + fusion, {1})); +} + +TEST_F(CanShareOperandBufferWithUserTest, + ElementwiseLoopFusionCantAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, 
one, {1})); + + auto neg = builder.AddInstruction( + HloInstruction::CreateUnary(data_shape, HloOpcode::kNegate, operand)); + + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(data_shape, HloOpcode::kExp, neg)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {exp, neg}, HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(operand, {}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, + CanShareOperandWhenDynamicUpdateSliceIsFedByDynamicSliceWithSameIndex) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + Shape slice_shape = ShapeUtil::MakeShape(F32, {1, 2}); + + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "param0")); + auto index = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 0}))); + auto ds = builder.AddInstruction( + HloInstruction::CreateDynamicSlice(slice_shape, param, index, {1, 2, 2})); + + auto dus = builder.AddInstruction( + HloInstruction::CreateDynamicUpdateSlice(data_shape, param, ds, index)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dus, ds, index}, HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {})); +} + TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) { auto builder = HloComputation::Builder(TestName()); @@ -2026,9 +2192,9 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { // Create a DynamicUpdateSlice instruction of tuple element 1. 
auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); @@ -2048,6 +2214,45 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { fusion, {})); } +TEST_F(CanShareOperandBufferWithUserTest, + FusedDynamicUpdateSliceWithConvertCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + Shape data_shape_bf16 = ShapeUtil::MakeShape(BF16, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + auto convert1 = builder.AddInstruction( + HloInstruction::CreateConvert(data_shape_bf16, gte1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. 
+ auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape_bf16, convert1, update, starts)); + + auto convert2 = builder.AddInstruction( + HloInstruction::CreateConvert(data_shape, dynamic_update_slice)); + builder.AddInstruction(HloInstruction::CreateTuple({gte0, convert2})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {convert2, dynamic_update_slice, starts, update, convert1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(gte1, {}, fusion, {})); +} + TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { auto builder = HloComputation::Builder(TestName()); @@ -2075,14 +2280,56 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {})); } +TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {8}); + auto keys = builder.AddInstruction( + HloInstruction::CreateParameter(0, keys_shape, "keys")); + auto sort = + builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {8}); + Shape values_shape = ShapeUtil::MakeShape(F32, {8}); + auto keys = builder.AddInstruction( + HloInstruction::CreateParameter(0, keys_shape, "keys")); + auto values = 
builder.AddInstruction( + HloInstruction::CreateParameter(1, values_shape, "values")); + auto sort = builder.AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // The buffer for the keys can be shared with the first tuple entry. + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {0})); + // The buffer for the values can be shared with the second tuple entry. + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(values, {}, sort, {1})); + // Verify that the buffers are not shared with the "wrong" tuple entry. + EXPECT_FALSE( + dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {1})); + EXPECT_FALSE( + dataflow_analysis_->CanShareOperandBufferWithUser(values, {}, sort, {0})); +} + TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { auto builder = HloComputation::Builder(TestName()); Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); auto a = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); + LiteralUtil::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); auto b = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); @@ -2091,7 +2338,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { HloInstruction::CreateDot(data_shape, a, b, dot_dnums)); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto add_operand = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape, one, {1})); @@ -2113,7 +2360,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); auto one = 
builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto operand = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape, one, {1})); @@ -2121,7 +2368,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { HloInstruction::CreateReverse(data_shape, operand, {0, 1})); auto two = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); @@ -2136,10 +2383,37 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { fusion, {})); } +TEST_F(CanShareOperandBufferWithUserTest, FusionCanShareBufferCustomized) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kMultiply, operand, operand)); + auto two = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, mul, two)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, two, mul}, HloInstruction::FusionKind::kInput); + RunAnalysis(/*fusion_can_share_buffer=*/[](const HloInstruction* fusion, + const HloInstruction*) { + return fusion->fusion_kind() == HloInstruction::FusionKind::kLoop; + }); + + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(operand, {}, + fusion, {})); +} + 
TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto make_cond = [this, &data_shape]() { + auto make_cond = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Cond"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -2148,7 +2422,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - auto make_body = [this, &data_shape]() { + auto make_body = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Body"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -2186,7 +2460,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { auto sub_param = sub_builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "sub_param")); auto one = sub_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto ones = sub_builder.AddInstruction( HloInstruction::CreateBroadcast(shape, one, {1})); auto add = sub_builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index fcd723af146e2227b8661b1a4993f1338f7de389..7d35e251ca21951036336ff1a1eb4aabc87bc5ca 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -41,20 +41,13 @@ StatusOr HloDCE::Run(HloModule* module) { XLA_VLOG_LINES(2, module->ToString()); for (auto* computation : module->MakeComputationPostOrder()) { - std::unordered_set live_instructions; - TF_RETURN_IF_ERROR(computation->root_instruction()->Accept( - [&live_instructions](HloInstruction* instruction) { - live_instructions.insert(instruction); - return Status::OK(); - })); - // Remove any dead roots and their dead transitive operands. 
Collect them // into a separate list first to avoid problems with iterating through the // computation's instruction while simultaneously removing instructions. std::vector dead_roots; for (auto* instruction : computation->instructions()) { - if (instruction->user_count() == 0 && - live_instructions.count(instruction) == 0 && + if (instruction != computation->root_instruction() && + instruction->user_count() == 0 && computation->IsRemovable(instruction) && !instruction->HasSideEffect()) { dead_roots.push_back(instruction); @@ -85,8 +78,7 @@ StatusOr HloDCE::Run(HloModule* module) { } // Remove dead computations. - std::list computations = module->MakeComputationPostOrder(); - for (auto* computation : computations) { + for (auto* computation : module->MakeComputationPostOrder()) { if (live_computations.count(computation) == 0) { TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(computation)); changed = true; diff --git a/tensorflow/compiler/xla/service/hlo_dce.h b/tensorflow/compiler/xla/service/hlo_dce.h index 4e244494d6f98c48f4376bd762f116b9a9c2084d..1fe69b1395753a612499e6e87bfc22f8ac8e767b 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.h +++ b/tensorflow/compiler/xla/service/hlo_dce.h @@ -36,7 +36,7 @@ namespace xla { class HloDCE : public HloPassInterface { public: ~HloDCE() override {} - tensorflow::StringPiece name() const override { return "dce"; } + absl::string_view name() const override { return "dce"; } // Run the pass on the given module. Returns whether the module was changed // (instructions were removed). diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc index 5a56607a665c4cbeb7b2572f182b88e890602968..3b5cde2996c4195ef458662cd21de85a832d8d55 100644 --- a/tensorflow/compiler/xla/service/hlo_dce_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc @@ -17,9 +17,9 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -53,9 +53,9 @@ TEST_F(HloDceTest, NoDeadCode) { // Verify that no dead code is removed from a computation with no dead code. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0f))); builder.AddInstruction(HloInstruction::CreateBinary( constant1->shape(), HloOpcode::kAdd, constant1, constant2)); @@ -74,20 +74,21 @@ TEST_F(HloDceTest, InstructionsWithSideEffect) { // Verify that side-effect instructions (Send in this test) are not removed. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); builder.AddInstruction( - HloInstruction::CreateSend(constant, /*channel_id=*/0)); + HloInstruction::CreateSend(constant, token, /*channel_id=*/0)); builder.AddInstruction(HloInstruction::CreateTuple({})); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_EQ(3, computation->instruction_count()); + EXPECT_EQ(4, computation->instruction_count()); HloDCE dce; EXPECT_FALSE(dce.Run(module.get()).ValueOrDie()); - EXPECT_EQ(3, computation->instruction_count()); + EXPECT_EQ(4, computation->instruction_count()); } TEST_F(HloDceTest, DeadParameters) { @@ -126,9 +127,9 @@ TEST_F(HloDceTest, ControlDependencies) { // Verify that instructions with control dependencies are not removed. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123.0f))); // Create two dead instructions: a negate and an add. 
auto dead_negate = builder.AddInstruction(HloInstruction::CreateUnary( @@ -223,7 +224,7 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) { auto param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "cond_param")); auto constant = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); cond_builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param, constant)); } @@ -234,9 +235,9 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) { { auto param = body_builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "param")); - - auto infeed = - body_builder.AddInstruction(HloInstruction::CreateInfeed(shape, "")); + auto token = body_builder.AddInstruction(HloInstruction::CreateToken()); + auto infeed = body_builder.AddInstruction( + HloInstruction::CreateInfeed(shape, token, "")); body_builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, infeed)); } @@ -278,8 +279,10 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) { { auto param = nested_callee_builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "param")); + auto token = + nested_callee_builder.AddInstruction(HloInstruction::CreateToken()); nested_callee_builder.AddInstruction( - HloInstruction::CreateOutfeed(shape, param, "")); + HloInstruction::CreateOutfeed(shape, param, token, "")); } auto nested_called_computation = module->AddEmbeddedComputation(nested_callee_builder.Build()); @@ -342,12 +345,12 @@ TEST_F(HloDceTest, RemoveDeadSubcomputation) { builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {100}), "param0")), builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), /*dimensions_to_reduce=*/{0}, reduce_subcomp)); // Add another 
instruction as the root of the computation. builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); module->AddEntryComputation(builder.Build()); EXPECT_EQ(module->MakeComputationPostOrder().size(), 2); @@ -383,7 +386,7 @@ TEST_F(HloDceTest, KeepUsedSubcomputation) { builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {100}), "param0")), builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), /*dimensions_to_reduce=*/{0}, reduce_subcomp)); // Add another instruction as the root of the computation that also uses @@ -393,7 +396,7 @@ TEST_F(HloDceTest, KeepUsedSubcomputation) { builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {100}), "param1")), builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))), /*dimensions_to_reduce=*/{0}, reduce_subcomp)); module->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc index 78955db0da02f16eb93689db947dc1190ab7049a..72185698c9bdcbf2bebed7ee82bc4ed082ce6a14 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc @@ -31,31 +31,10 @@ class HloDomainIsolator::RunContext { StatusOr Run(); private: - // Inserts a kDomain instruction between parent and operand, in case - // the attribute (ie, sharding) values change between instruction and operand. - // Returns the newly inserted kDomain instruction, or nullptr if no kDomain - // instruction was necessary. 
- StatusOr CreateDomain(HloInstruction* instruction, - HloInstruction* parent, - HloInstruction* operand); - HloModule* module_; HloDomainIsolator* isolator_; }; -StatusOr HloDomainIsolator::RunContext::CreateDomain( - HloInstruction* instruction, HloInstruction* parent, - HloInstruction* operand) { - HloInstruction* domain = nullptr; - std::unique_ptr domain_instruction = - isolator_->creator_(instruction, operand); - if (domain_instruction != nullptr) { - domain = operand->parent()->AddInstruction(std::move(domain_instruction)); - TF_RETURN_IF_ERROR(operand->ReplaceUseWith(parent, domain)); - } - return domain; -} - StatusOr HloDomainIsolator::RunContext::Run() { hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Isolator"); @@ -71,16 +50,16 @@ StatusOr HloDomainIsolator::RunContext::Run() { // When applying multiple domains, we could end up stacking more than // one in one edge, so here we want to build the effective // (kDomain-less) instruction->operand edge. - HloInstruction* parent = instruction; - while (operand->opcode() == HloOpcode::kDomain) { - parent = operand; - operand = operand->mutable_operand(0); + HloInstruction* root = operand; + while (root->opcode() == HloOpcode::kDomain) { + root = root->mutable_operand(0); } // Check whether a kDomain is necessary between instruction and operand. 
- TF_ASSIGN_OR_RETURN(HloInstruction * domain, - CreateDomain(instruction, parent, operand)); + HloInstruction* domain = + isolator_->creator_(instruction, root, operand); if (domain != nullptr) { VLOG(4) << "New domain: " << domain->ToString(); + TF_RETURN_IF_ERROR(operand->ReplaceUseWith(instruction, domain)); ++added_domains; } } diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.h b/tensorflow/compiler/xla/service/hlo_domain_isolator.h index e0c5718509dabebb7b9307bf764b0ea1ce7369a0..d36631fc2f16902ed8f1f89f903027081f9b3801 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_isolator.h +++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.h @@ -26,22 +26,24 @@ limitations under the License. namespace xla { // Domain isolation is the task of placing kDomain instructions between HLO -// instructions having different shrading. A kDomain instruction is essentially +// instructions having different sharding. A kDomain instruction is essentially // used to break an HLO graph edge connecting two instructions with different // sharding. If a set of connected instructions have all the same sharding, no -// kDomain instruciton will be placed. +// kDomain instruction will be placed. class HloDomainIsolator : public HloPassInterface { public: // Creates a new kDomain instruction for the edge between the use instruction // (the first HloInstruction argument), and the operand instruction (the - // second HloInstruction argument). + // third HloInstruction argument) if the interesting attribute of the + // instruction differs from the attribute of the root (the second + // HloInstruction argument). // Returns nullptr in case no domain separation is necessary.
- using DomainCreator = std::function( - HloInstruction*, HloInstruction*)>; + using DomainCreator = std::function; explicit HloDomainIsolator(DomainCreator creator); - tensorflow::StringPiece name() const override { return "domain_isolator"; } + absl::string_view name() const override { return "domain_isolator"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc index ebd5adb5d573ce4b556046f85eb26a6ad59efcb9..8b2846e0c277b3e7cffd578d988d0a09c13833ed 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_map.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/types.h" @@ -25,14 +26,14 @@ namespace xla { /* static */ StatusOr> HloDomainMap::Create( HloComputation* computation, string domain_kind) { - auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind))); + auto domain_map = absl::WrapUnique(new HloDomainMap(std::move(domain_kind))); TF_RETURN_IF_ERROR(domain_map->Populate(computation)); return std::move(domain_map); } /* static */ StatusOr> HloDomainMap::Create( HloModule* module, string domain_kind) { - auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind))); + auto domain_map = absl::WrapUnique(new HloDomainMap(std::move(domain_kind))); for (HloComputation* computation : module->computations()) { TF_RETURN_IF_ERROR(domain_map->Populate(computation)); } @@ -41,27 +42,41 @@ namespace xla { bool HloDomainMap::InSameDomain(HloInstruction* instruction1, HloInstruction* instruction2) const { - int64 domain_id1 = FindOrDefault(instruction_to_domain_, instruction1, -1); - int64 domain_id2 = FindOrDefault(instruction_to_domain_, instruction2, -1); + int64 domain_id1 = GetDomainId(instruction1); + int64 
domain_id2 = GetDomainId(instruction2); return domain_id1 >= 0 && domain_id1 == domain_id2; } +int64 HloDomainMap::GetDomainId(HloInstruction* instruction) const { + return FindOrDefault(instruction_to_domain_, instruction, -1); +} + Status HloDomainMap::TryProcessEmptyDomain(HloInstruction* instruction) { TF_RET_CHECK(instruction->opcode() == HloOpcode::kDomain); // We only check operands, so we are sure to not process the empty domain from // both sides. for (HloInstruction* operand : instruction->unique_operands()) { if (IsDomainInstruction(operand)) { - auto domain = MakeUnique(); + auto domain = absl::make_unique(); domain->enter_domains.insert(operand); domain->exit_domains.insert(instruction); TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); } } + if (instruction == instruction->parent()->root_instruction()) { + auto domain = absl::make_unique(); + domain->enter_domains.insert(instruction); + TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); + } return Status::OK(); } Status HloDomainMap::Populate(HloComputation* computation) { + InstructionOrderMap instructions_post_order; + int64 count = 0; + for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { + instructions_post_order.insert(std::make_pair(instruction, count++)); + } for (HloInstruction* instruction : computation->instructions()) { if (IsDomainInstruction(instruction)) { // If this is a kDomain of the kind we are currently processing, check @@ -75,7 +90,7 @@ Status HloDomainMap::Populate(HloComputation* computation) { continue; } TF_ASSIGN_OR_RETURN(std::unique_ptr domain, - CreateDomain(instruction)); + CreateDomain(instruction, instructions_post_order)); TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); } return Status::OK(); @@ -133,10 +148,12 @@ Status HloDomainMap::ExpandDomain(HloInstruction* instruction, } StatusOr> HloDomainMap::CreateDomain( - HloInstruction* instruction) const { - auto domain = MakeUnique(); + HloInstruction* instruction, + const 
InstructionOrderMap& instructions_order) const { + auto domain = absl::make_unique(); TF_RETURN_IF_ERROR(ExpandDomain(instruction, domain.get())); - domain->instructions = MakeNonDomainInstructions(domain->reach_set); + domain->instructions = + MakeNonDomainInstructions(domain->reach_set, instructions_order); return std::move(domain); } @@ -158,7 +175,8 @@ bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const { /* static */ std::vector HloDomainMap::MakeNonDomainInstructions( - const tensorflow::gtl::FlatSet& instruction_set) { + const tensorflow::gtl::FlatSet& instruction_set, + const InstructionOrderMap& instructions_order) { std::vector instructions; instructions.reserve(instruction_set.size()); for (HloInstruction* instruction : instruction_set) { @@ -166,9 +184,10 @@ HloDomainMap::MakeNonDomainInstructions( instructions.push_back(instruction); } } + // sort instructions according to instructions_order std::sort(instructions.begin(), instructions.end(), - [](HloInstruction* a, HloInstruction* b) { - return a->unique_id() < b->unique_id(); + [&instructions_order](HloInstruction* a, HloInstruction* b) { + return instructions_order.at(a) < instructions_order.at(b); }); return instructions; } diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h index e62ef763fb3881ab6030b1f6a66266ac80a3d84d..633109249a91eec3d7b4cbe5b423b73f980217c9 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_map.h +++ b/tensorflow/compiler/xla/service/hlo_domain_map.h @@ -65,7 +65,16 @@ class HloDomainMap { // currently processing. bool IsDomainInstruction(HloInstruction* instruction) const; + // Retrieves the domain identifier of the instruction, or -1 in case + // instruction is not found within any domain. + int64 GetDomainId(HloInstruction* instruction) const; + private: + // Map used for representing instruction ordering, i.e. + // order_map[a] < order_map[b] means a must be ordered before b. 
+ using InstructionOrderMap = + tensorflow::gtl::FlatMap; + HloDomainMap(string domain_kind) : domain_kind_(std::move(domain_kind)) {} // Check if the kDomain instruction is facing (via its operand link) another @@ -91,12 +100,14 @@ class HloDomainMap { // Creates a domain data structure using the ExpandDomain() API. StatusOr> CreateDomain( - HloInstruction* instruction) const; + HloInstruction* instruction, + const InstructionOrderMap& instructions_order) const; // Out of an instruction set, returns a vector of all the ones which are not // a kDomain kind. static std::vector MakeNonDomainInstructions( - const tensorflow::gtl::FlatSet& instruction_set); + const tensorflow::gtl::FlatSet& instruction_set, + const InstructionOrderMap& instructions_order); string domain_kind_; std::vector> instruction_domains_; diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h index aa0308100a21f109579de75788fce7d242d6a6b0..6c142ee47421049e8a25dfb80a6297e02fe782f1 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h @@ -20,10 +20,10 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/flatset.h" namespace xla { @@ -44,7 +44,10 @@ class DomainMetadata { // two domains of different kind intersect each other. tensorflow::gtl::FlatSet reach_set; - // The same instructions in reach_set, but purged from kDomain instructions. + // The same instructions in reach_set, but purged from kDomain instructions + // and ordered according to their computation graph post-order, i.e. + // if instructions[pos_a] depends on instructions[pos_b], then pos_a > + // pos_b. 
std::vector instructions; // If we consider a graph edge as an arrow oriented from the operand to the @@ -63,7 +66,7 @@ class DomainMetadata { // Returns the metadata type. A unique identifier which describes the real // metadata type. - virtual tensorflow::StringPiece Kind() const = 0; + virtual absl::string_view Kind() const = 0; // Compares the metadata object with another one and returns true if the // two matches. @@ -71,12 +74,6 @@ class DomainMetadata { // Returns a string representation of the metadata. virtual string ToString() const = 0; - - // Given a reachable set (the set of instructions which are reachable from - // each other via user/operand pathways, without crossing a kDomain - // instruciton), makes sure that all of them have metadata attributes which - // are coherent with this metadata object. - virtual Status NormalizeInstructions(const Domain& domain) const = 0; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.cc b/tensorflow/compiler/xla/service/hlo_domain_remover.cc index 1d06040b0e7c92b03f4cb5481bdee73a0f74f939..67fad0769f5eb5ceca64ebd2aa78c6469f2c813d 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_remover.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_remover.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_domain_remover.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" #include "tensorflow/compiler/xla/service/hlo_domain_map.h" +#include "tensorflow/compiler/xla/service/hlo_domain_verifier.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -43,54 +43,16 @@ class HloDomainRemover::RunContext { Status HloDomainRemover::RunContext::VerifyAndNormalizeDomain( const DomainMetadata::Domain& domain) { - // Verify that the whole kDomain frontier bounding the instruction reach set, - // has matching metadata. - // A kDomain instruction has two sides of metadata, a user facing and an - // operand facing. - // A reachable instruction set can make contact with a kDomain instruction on - // a user facing side (the kDomain is operand of the instruction), or on a - // operand facing side (the kDomain is user of the instruction). - // And depending on the contact side, the proper metadata object - // (user_side_metadata() vs. operand_side_metadata()) needs to be used for - // consistency checks. 
- const DomainMetadata* ref_metadata = nullptr; - VLOG(4) << "Reach set:"; - for (HloInstruction* instruction : domain.instructions) { - VLOG(4) << " " << instruction->name(); - } - VLOG(4) << " Domains:"; - for (HloInstruction* instruction : domain.enter_domains) { - const DomainMetadata& meta = instruction->user_side_metadata(); - VLOG(4) << " User side: " << instruction->name(); - VLOG(4) << " " << meta.ToString(); - if (ref_metadata == nullptr) { - ref_metadata = &meta; - } else { - TF_RET_CHECK(meta.Matches(*ref_metadata)) - << "Metadata mismatch at instruction " << instruction->name() << " : " - << meta.ToString() << " vs " << ref_metadata->ToString(); - } - } - for (HloInstruction* instruction : domain.exit_domains) { - const DomainMetadata& meta = instruction->operand_side_metadata(); - VLOG(4) << " Operand side: " << instruction->name(); - VLOG(4) << " " << meta.ToString(); - if (ref_metadata == nullptr) { - ref_metadata = &meta; - } else { - TF_RET_CHECK(meta.Matches(*ref_metadata)) - << "Metadata mismatch at instruction " << instruction->name() << " : " - << meta.ToString() << " vs " << ref_metadata->ToString(); - } - } + TF_ASSIGN_OR_RETURN(const DomainMetadata* ref_metadata, + HloDomainVerifier::VerifyDomain(domain)); if (ref_metadata != nullptr) { VLOG(4) << "Applying domain normalization: " << ref_metadata->ToString(); - TF_RETURN_IF_ERROR(ref_metadata->NormalizeInstructions(domain)); + TF_RETURN_IF_ERROR(remover_->normalizer_(domain, ref_metadata)); } else { // No kDomain instruction was present within this domain, so call the // generic normalization functions and have them apply their heuristic. 
VLOG(2) << "Applying domain-less normalization"; - TF_RETURN_IF_ERROR(remover_->normalizer_(domain)); + TF_RETURN_IF_ERROR(remover_->normalizer_(domain, nullptr)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.h b/tensorflow/compiler/xla/service/hlo_domain_remover.h index 0c71dd34fd4d2944037dc965a2c9ad2c592d6e3e..97bc8ef604092acc849b55b09af8a24bf775529e 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_remover.h +++ b/tensorflow/compiler/xla/service/hlo_domain_remover.h @@ -35,12 +35,13 @@ class HloDomainRemover : public HloPassInterface { // instructions in it with the same attributes (ie, sharding), a normalizer // function is tasked at applying attribute normalization on the instructions // within such domain. - HloDomainRemover( - tensorflow::StringPiece kind, - std::function normalizer) - : kind_(kind.ToString()), normalizer_(std::move(normalizer)) {} + HloDomainRemover(absl::string_view kind, + std::function + normalizer) + : kind_(kind), normalizer_(std::move(normalizer)) {} - tensorflow::StringPiece name() const override { return "domain_remover"; } + absl::string_view name() const override { return "domain_remover"; } StatusOr Run(HloModule* module) override; @@ -48,7 +49,9 @@ class HloDomainRemover : public HloPassInterface { class RunContext; string kind_; - std::function normalizer_; + std::function + normalizer_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc index f29aac29c0586931f79633a6748cf5d06ad8ff31..974ab94467dfb63325698b4590dac1abd1ed9f89 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_test.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc @@ -13,20 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" #include "tensorflow/compiler/xla/service/hlo_domain_remover.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class HloDomainTest : public HloTestBase { +class HloDomainTest : public HloVerifiedTestBase { protected: bool FindUserViaDomainPath(HloInstruction* instruction, HloInstruction* operand) const { @@ -44,9 +46,8 @@ class HloDomainTest : public HloTestBase { // Checks whether there is a kDomain instruction in the edge between the // instruction and the operand. 
- bool HasDomainEdge(HloModule* module, - tensorflow::StringPiece instruction_name, - tensorflow::StringPiece operand_name) { + bool HasDomainEdge(HloModule* module, absl::string_view instruction_name, + absl::string_view operand_name) { HloInstruction* instruction = FindInstruction(module, instruction_name); HloInstruction* operand = FindInstruction(module, operand_name); CHECK_NE(instruction, nullptr); @@ -64,11 +65,11 @@ class HloDomainTest : public HloTestBase { return false; } - StatusOr> ParseModule( - tensorflow::StringPiece hlo_string) { + StatusOr ParseModule(absl::string_view hlo_string) { HloModuleConfig config; config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - return tools::Parse(hlo_string, config); + ParseAndVerifyModule(hlo_string, config); + return &module(); } }; @@ -79,10 +80,10 @@ class OpNameMetadata : public DomainMetadata { explicit OpNameMetadata(string opname) : opname_(std::move(opname)) {} std::unique_ptr Clone() const override { - return MakeUnique(opname_); + return absl::make_unique(opname_); } - tensorflow::StringPiece Kind() const override { return KindName(); } + absl::string_view Kind() const override { return KindName(); } bool Matches(const DomainMetadata& other) const override { const OpNameMetadata* other_ptr = @@ -96,34 +97,30 @@ class OpNameMetadata : public DomainMetadata { string ToString() const override { return opname_; } - Status NormalizeInstructions( - const DomainMetadata::Domain& domain) const override { - // For the purposes of this test, nothing to do. - return Status::OK(); - } - - static tensorflow::StringPiece KindName() { return "opname"; } + static absl::string_view KindName() { return "opname"; } private: string opname_; }; // Creator function for OpNameMetadata domains. 
-std::unique_ptr OpNameDomainCreator(HloInstruction* instruction, - HloInstruction* operand) { - if (instruction->metadata().op_name() == operand->metadata().op_name()) { +HloInstruction* OpNameDomainCreator(HloInstruction* instruction, + HloInstruction* root, + HloInstruction* operand) { + if (instruction->metadata().op_name() == root->metadata().op_name()) { return nullptr; } std::unique_ptr operand_side_metadata = - MakeUnique(operand->metadata().op_name()); + absl::make_unique(root->metadata().op_name()); std::unique_ptr user_side_metadata = - MakeUnique(instruction->metadata().op_name()); - return HloInstruction::CreateDomain(operand->shape(), operand, - std::move(operand_side_metadata), - std::move(user_side_metadata)); + absl::make_unique(instruction->metadata().op_name()); + return operand->parent()->AddInstruction(HloInstruction::CreateDomain( + operand->shape(), operand, std::move(operand_side_metadata), + std::move(user_side_metadata))); } -Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain) { +Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain, + const DomainMetadata* metadata) { // Nothing to do for the particular use this test make of the OpName domains. 
return Status::OK(); } @@ -143,32 +140,31 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator isolator(CreateShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); - EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); - EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); - EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + EXPECT_TRUE(HasDomainEdge(module, "c", "a")); + EXPECT_TRUE(HasDomainEdge(module, "c", "b")); + EXPECT_TRUE(HasDomainEdge(module, "d", "a")); + EXPECT_TRUE(HasDomainEdge(module, "d", "b")); + EXPECT_FALSE(HasDomainEdge(module, "e", "c")); + EXPECT_FALSE(HasDomainEdge(module, "e", "d")); HloDomainRemover remover(ShardingMetadata::KindName(), - NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); EXPECT_TRUE(remover_changed); - EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); - EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); - EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); - EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + EXPECT_FALSE(HasDomainEdge(module, "c", "a")); + EXPECT_FALSE(HasDomainEdge(module, "c", "b")); + EXPECT_FALSE(HasDomainEdge(module, "d", "a")); + EXPECT_FALSE(HasDomainEdge(module, "d", 
"b")); + EXPECT_FALSE(HasDomainEdge(module, "e", "c")); + EXPECT_FALSE(HasDomainEdge(module, "e", "d")); } TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) { @@ -186,12 +182,11 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator isolator(CreateShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); EXPECT_TRUE(!isolator_changed); } @@ -202,37 +197,38 @@ HloModule Module ENTRY entry { p0 = (f32[4]) parameter(0) a = f32[4] get-tuple-element(p0), index=0 - b = (f32[4], u32[]) send(a), channel_id=1, sharding={maximal device=0} - c = () send-done(b), channel_id=1, sharding={maximal device=0} - d = (f32[4], u32[]) recv(), channel_id=2, sharding={maximal device=0} - e = f32[4] recv-done(d), channel_id=2, sharding={maximal device=0} - f = f32[4] add(a, e) - g = f32[4] subtract(a, e) + token = token[] after-all() + b = (f32[4], u32[], token[]) send(a, token), channel_id=1, sharding={maximal device=0} + c = token[] send-done(b), channel_id=1, sharding={maximal device=0} + d = (f32[4], u32[], token[]) recv(token), channel_id=2, sharding={maximal device=0} + e = (f32[4], token[]) recv-done(d), channel_id=2, sharding={maximal device=0} + e_element = f32[4] get-tuple-element(e), index=0, sharding={maximal device=0} + f = f32[4] add(a, e_element) + g = f32[4] subtract(a, e_element) ROOT h = (f32[4], f32[4]) tuple(f, g) } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator isolator(CreateShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool 
isolator_changed, isolator.Run(module.get())); + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a")); - EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e")); - EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0")); - EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + EXPECT_TRUE(HasDomainEdge(module, "b", "a")); + EXPECT_TRUE(HasDomainEdge(module, "f", "e_element")); + EXPECT_FALSE(HasDomainEdge(module, "a", "p0")); + EXPECT_FALSE(HasDomainEdge(module, "c", "b")); + EXPECT_FALSE(HasDomainEdge(module, "e", "d")); HloDomainRemover remover(ShardingMetadata::KindName(), - NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); EXPECT_TRUE(remover_changed); - EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a")); - EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e")); + EXPECT_FALSE(HasDomainEdge(module, "b", "a")); + EXPECT_FALSE(HasDomainEdge(module, "f", "e_element")); } TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) { @@ -240,20 +236,21 @@ TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) { HloModule Module ENTRY entry { - a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=-1} - b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=-1} - c = f32[4] add(b, b), sharding={maximal device=-1} - d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=-1} - ROOT e = () send-done(d), channel_id=2, sharding={maximal device=-1} + token = token[] after-all(), sharding={maximal device=-1} + a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=-1} + b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal 
device=-1} + b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=-1} + c = f32[4] add(b_element, b_element), sharding={maximal device=-1} + d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=-1} + ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=-1} } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator isolator(CreateShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); EXPECT_FALSE(isolator_changed); } @@ -262,24 +259,25 @@ TEST_F(HloDomainTest, CheckNormalizationOnPureIOComputation) { HloModule Module ENTRY entry { - a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=0} - b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=0} - c = f32[4] add(b, b) - d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=0} - ROOT e = () send-done(d), channel_id=2, sharding={maximal device=0} + token = token[] after-all(), sharding={maximal device=0} + a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=0} + b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal device=0} + b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=0} + c = f32[4] add(b_element, b_element) + d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=0} + ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=0} } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); HloDomainRemover 
remover(ShardingMetadata::KindName(), - NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); EXPECT_FALSE(remover_changed); - HloInstruction* add = FindInstruction(module.get(), "c"); + HloInstruction* add = FindInstruction(module, "c"); ASSERT_NE(add, nullptr); auto device = add->sharding_unique_device(); EXPECT_TRUE(device.has_value()); @@ -302,42 +300,41 @@ ENTRY entry { } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator sharding_isolator(CreateShardingDomain); + HloDomainIsolator sharding_isolator(ShardingDomainCreator{}); TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed, - sharding_isolator.Run(module.get())); + sharding_isolator.Run(module)); EXPECT_TRUE(sharding_isolator_changed); HloDomainIsolator opname_isolator(OpNameDomainCreator); TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed, - opname_isolator.Run(module.get())); + opname_isolator.Run(module)); EXPECT_TRUE(opname_isolator_changed); - EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); - EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); - EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); - EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c")); - EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + EXPECT_TRUE(HasDomainEdge(module, "c", "a")); + EXPECT_TRUE(HasDomainEdge(module, "c", "b")); + EXPECT_TRUE(HasDomainEdge(module, "d", "a")); + EXPECT_TRUE(HasDomainEdge(module, "d", "c")); + EXPECT_FALSE(HasDomainEdge(module, "e", "d")); HloDomainRemover sharding_remover(ShardingMetadata::KindName(), - NormalizeShardingDomain); + ShardingMetadata::NormalizeShardingDomain); TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed, - 
sharding_remover.Run(module.get())); + sharding_remover.Run(module)); EXPECT_TRUE(sharding_remover_changed); HloDomainRemover opname_remover(OpNameMetadata::KindName(), OpNameDomainNormalizer); TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed, - opname_remover.Run(module.get())); + opname_remover.Run(module)); EXPECT_TRUE(opname_remover_changed); - EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); - EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); - EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); - EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c")); + EXPECT_FALSE(HasDomainEdge(module, "c", "a")); + EXPECT_FALSE(HasDomainEdge(module, "c", "b")); + EXPECT_FALSE(HasDomainEdge(module, "d", "a")); + EXPECT_FALSE(HasDomainEdge(module, "d", "c")); } TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) { @@ -345,33 +342,36 @@ TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) { HloModule Module ENTRY entry { - infeed = (f32[4], f32[4]) infeed(), + token = token[] after-all() + infeed = ((f32[4], f32[4]), token[]) infeed(token), + sharding={{maximal device=1}, {maximal device=0}, {maximal device=0}} + infeed.data = (f32[4], f32[4]) get-tuple-element(infeed), index=0, sharding={{maximal device=1}, {maximal device=0}} - gte0 = f32[4] get-tuple-element(infeed), index=0 - gte1 = f32[4] get-tuple-element(infeed), index=1 + gte0 = f32[4] get-tuple-element(infeed.data), index=0 + gte1 = f32[4] get-tuple-element(infeed.data), index=1 copy0 = f32[4] copy(gte0) copy1 = f32[4] copy(gte1) ROOT add = f32[4] add(copy0, copy1) } )"; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); LOG(INFO) << "Original module:\n" << module->ToString(); - HloDomainIsolator isolator(CreateShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, 
isolator.Run(module)); EXPECT_TRUE(isolator_changed); - EXPECT_TRUE(HasDomainEdge(module.get(), "gte0", "infeed")); - EXPECT_TRUE(HasDomainEdge(module.get(), "gte1", "infeed")); - EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0")); - EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1")); + EXPECT_TRUE(HasDomainEdge(module, "infeed.data", "infeed")); + EXPECT_FALSE(HasDomainEdge(module, "copy0", "gte0")); + EXPECT_FALSE(HasDomainEdge(module, "copy1", "gte1")); // Inject unassigned tuple/gte within the infeed domain, to simulate the // HLO passes adding unexpected instructions. // // infeed + // | + // infeed.data (tuple element 0 of infeed) // / \ // GTE0 GTE1 // / \ @@ -380,31 +380,33 @@ ENTRY entry { // \ / // TUPLE // | - // DOMAIN - HloInstruction* infeed = FindInstruction(module.get(), "infeed"); - ASSERT_NE(infeed, nullptr); - auto infeed_users = infeed->users(); - HloInstruction* new_gte0 = - infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement( - ShapeUtil::GetTupleElementShape(infeed->shape(), 0), infeed, 0)); + HloInstruction* infeed_data = FindInstruction(module, "infeed.data"); + ASSERT_NE(infeed_data, nullptr); + + auto infeed_data_users = infeed_data->users(); + HloInstruction* new_gte0 = infeed_data->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(infeed_data->shape(), 0), infeed_data, + 0)); HloInstruction* new_copy0 = - infeed->parent()->AddInstruction(HloInstruction::CreateUnary( + infeed_data->parent()->AddInstruction(HloInstruction::CreateUnary( new_gte0->shape(), HloOpcode::kCopy, new_gte0)); - HloInstruction* new_gte1 = - infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement( - ShapeUtil::GetTupleElementShape(infeed->shape(), 1), infeed, 1)); + HloInstruction* new_gte1 = infeed_data->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(infeed_data->shape(), 1), infeed_data, + 1)); HloInstruction* 
new_copy1 = - infeed->parent()->AddInstruction(HloInstruction::CreateUnary( + infeed_data->parent()->AddInstruction(HloInstruction::CreateUnary( new_gte1->shape(), HloOpcode::kCopy, new_gte1)); - HloInstruction* new_tuple = infeed->parent()->AddInstruction( + HloInstruction* new_tuple = infeed_data->parent()->AddInstruction( HloInstruction::CreateTuple({new_copy0, new_copy1})); - for (HloInstruction* user : infeed_users) { - TF_EXPECT_OK(infeed->ReplaceUseWith(user, new_tuple)); + for (HloInstruction* user : infeed_data_users) { + TF_EXPECT_OK(infeed_data->ReplaceUseWith(user, new_tuple)); } HloDomainRemover remover(ShardingMetadata::KindName(), - NormalizeShardingDomain); - TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); EXPECT_TRUE(remover_changed); struct Assignment { @@ -418,7 +420,7 @@ ENTRY entry { }; for (auto& assignment : assignments) { auto device = assignment.instruction->sharding_unique_device(); - EXPECT_TRUE(device.has_value()); + ASSERT_TRUE(device.has_value()); EXPECT_EQ(*device, assignment.device); } EXPECT_TRUE(new_tuple->has_sharding()); @@ -428,5 +430,262 @@ ENTRY entry { HloSharding::AssignDevice(0)})); } +TEST_F(HloDomainTest, EmptyRootDomain) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + %param = f32[1] parameter(0), sharding={maximal device=0} + %tuple = (f32[1]) tuple(%param), + sharding={maximal device=1} + ROOT %gte = f32[1] get-tuple-element(%tuple), index=0, + sharding={maximal device=1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + EXPECT_TRUE(isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module, "tuple", "param")); + EXPECT_FALSE(HasDomainEdge(module, "gte", "tuple")); + + // Remove %tuple and %gte 
(tuple simplification) + HloInstruction* gte = FindInstruction(module, "gte"); + HloInstruction* tuple = FindInstruction(module, "tuple"); + module->entry_computation()->set_root_instruction(tuple->mutable_operand(0)); + TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(gte)); + TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(tuple)); + + HloDomainRemover remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + EXPECT_TRUE(remover_changed); + + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_TRUE(root->has_sharding()); + EXPECT_EQ(root->sharding(), HloSharding::AssignDevice(1)); +} + +// Tests that text dumps of domain instructions can be parsed back, in the +// specific case of null shardings. +TEST_F(HloDomainTest, DumpParseNullSharding) { + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {}); + auto sharding_md_0 = absl::make_unique(nullptr); + auto sharding_md_1 = absl::make_unique(nullptr); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p")); + HloInstruction* domain = builder.AddInstruction(HloInstruction::CreateDomain( + shape, param, std::move(sharding_md_0), std::move(sharding_md_1))); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, domain, domain)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto hlo_string = module->ToString(); + ASSERT_TRUE(ParseModule(hlo_string).status().ok()); +} + +// Tuple inputs are domain instructions. 
+TEST_F(HloDomainTest, DomainTuple) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = f32[4] parameter(0), sharding={maximal device=0} + cst = u32[] constant(0), sharding={maximal device=1} + tpl = (u32[], f32[4]) tuple(cst, p0), + sharding={{maximal device=1}, {maximal device=0}} + ROOT gte = f32[4] get-tuple-element(tpl), index=1, sharding={maximal device=0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + EXPECT_TRUE(isolator_changed); + + // Clear sharding of tpl instruction, in order to test domain sharding + // application. + auto tpl = FindInstruction(module, "tpl"); + tpl->clear_sharding(); + + HloDomainRemover remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + EXPECT_TRUE(remover_changed); + + EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + tpl->sharding()); +} + +TEST_F(HloDomainTest, MultiDomainMultiUser) { + const char* const hlo_string = R"( + HloModule Module + +ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) { + %p0 = (f32[4], f32[4]) parameter(0) + %a = f32[4]{0} get-tuple-element(%p0), index=0 + %domain = f32[4] domain(%a), + domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}} + %b = f32[4] get-tuple-element(%p0), index=1 + %domain.1 = f32[4] domain(%b), + domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}} + %c = f32[4] add(%domain, %domain.1), sharding={maximal device=1} + %domain.2 = f32[4] domain(%c), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} + %d = f32[4] subtract(%domain, %c), + sharding={maximal device=1}, metadata={op_name="D"} + %domain.3 = f32[4] domain(%d), + domain={kind="sharding", 
entry={maximal device=0}, exit={maximal device=1}} + %e = f32[4] multiply(%c, %d), + sharding={maximal device=1}, metadata={op_name="D"} + %f = f32[4] add(f32[4]{0} %e, f32[4]{0} %c), sharding={maximal device=1} + %domain.4 = f32[4]{0} domain(%f), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} + ROOT %g = (f32[4], f32[4], f32[4]) tuple(%domain.2, %domain.3, %domain.4) +})"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator opname_isolator(OpNameDomainCreator); + TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed, + opname_isolator.Run(module)); + EXPECT_TRUE(opname_isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module, "c", "a")); + EXPECT_TRUE(HasDomainEdge(module, "c", "b")); + EXPECT_TRUE(HasDomainEdge(module, "d", "a")); + EXPECT_TRUE(HasDomainEdge(module, "d", "c")); + EXPECT_FALSE(HasDomainEdge(module, "e", "d")); + + HloDomainRemover sharding_remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed, + sharding_remover.Run(module)); + EXPECT_TRUE(sharding_remover_changed); + + HloDomainRemover opname_remover(OpNameMetadata::KindName(), + OpNameDomainNormalizer); + TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed, + opname_remover.Run(module)); + EXPECT_TRUE(opname_remover_changed); + + EXPECT_FALSE(HasDomainEdge(module, "c", "a")); + EXPECT_FALSE(HasDomainEdge(module, "c", "b")); + EXPECT_FALSE(HasDomainEdge(module, "d", "a")); + EXPECT_FALSE(HasDomainEdge(module, "d", "c")); +} + +// Emulate instructions inserted at top and bottom within nested tuple domain. 
+TEST_F(HloDomainTest, DomainTupleTopBottomInsert) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = f32[4] parameter(0), sharding={maximal device=1} + p1 = (f32[5], f32[6]) parameter(1), + sharding={{maximal device=1}, {maximal device=0}} + tuple.0 = (f32[4], (f32[5], f32[6])) tuple(p0, p1), + sharding={{maximal device=1}, {maximal device=1}, {maximal device=0}} + ROOT res = (f32[5], f32[6]) get-tuple-element(tuple.0), index=1, + sharding={{maximal device=1}, {maximal device=0}} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + + HloDomainIsolator isolator(ShardingDomainCreator{}); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + EXPECT_TRUE(isolator_changed); + + // Clear sharding of tuple.0 instruction, in order to test domain sharding + // application. + auto tuple0 = FindInstruction(module, "tuple.0"); + tuple0->clear_sharding(); + + // Insert the following instructons above and below tuple.0, to emulate other + // passes effects: + // COPY.0 + // \ / + // TUPLE.0 + // / \ + // COPY.1 \ + // / \ + // GTE.0 GTE.1 + // | | + // | COPY.2 + // \ / + // \ / + // TUPLE.1 + // | + auto tuple0_users = tuple0->users(); + auto computation = tuple0->parent(); + HloInstruction* copy0 = computation->AddInstruction( + HloInstruction::CreateUnary(tuple0->operand(1)->shape(), HloOpcode::kCopy, + tuple0->mutable_operand(1))); + TF_EXPECT_OK(tuple0->ReplaceOperandWith(1, copy0)); + + HloInstruction* copy1 = computation->AddInstruction( + HloInstruction::CreateUnary(tuple0->shape(), HloOpcode::kCopy, tuple0)); + HloInstruction* gte0 = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(copy1->shape(), 0), copy1, 0)); + HloInstruction* gte1 = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple0->shape(), 1), tuple0, 1)); + HloInstruction* copy2 = computation->AddInstruction( + 
HloInstruction::CreateUnary(gte1->shape(), HloOpcode::kCopy, gte1)); + HloInstruction* tuple1 = + computation->AddInstruction(HloInstruction::CreateTuple({gte0, copy2})); + + for (HloInstruction* user : tuple0_users) { + TF_EXPECT_OK(tuple0->ReplaceUseWith(user, tuple1)); + } + + HloDomainRemover remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + EXPECT_TRUE(remover_changed); + + EXPECT_TRUE(tuple0->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(tuple0->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + tuple0->sharding()); + + EXPECT_TRUE(copy0->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(copy0->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + copy0->sharding()); + + // copy1 has partial information only from gte.0, so in the end it gets no + // sharding at all. During propagation it does propagate the information from + // gte.0 though, enabling Tuple.0 to be fully sharded. 
+ EXPECT_FALSE(copy1->has_sharding()); + + EXPECT_TRUE(gte0->has_sharding()); + EXPECT_EQ(HloSharding::AssignDevice(1), gte0->sharding()); + + EXPECT_TRUE(gte1->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(gte1->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + gte1->sharding()); + + EXPECT_TRUE(copy2->has_sharding()); + EXPECT_EQ(HloSharding::Tuple(copy2->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + copy2->sharding()); + + EXPECT_TRUE(tuple1->has_sharding()); + EXPECT_EQ(tuple0->sharding(), tuple1->sharding()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_verifier.cc b/tensorflow/compiler/xla/service/hlo_domain_verifier.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc514ae3e5c6907f6398805d171e69ee8635d08e --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_verifier.cc @@ -0,0 +1,124 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_domain_verifier.h" + +#include <set> + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +class HloDomainVerifier::RunContext { + public: + RunContext(HloModule* module, HloDomainVerifier* verifier) + : module_(module), verifier_(verifier) {} + + Status Run(); + + private: + // If the verifier caller passed an empty vector for kinds, we collect all the + // available domain types. + Status PopulateDomainKinds(); + + HloModule* module_; + HloDomainVerifier* verifier_; +}; + +Status HloDomainVerifier::RunContext::PopulateDomainKinds() { + if (verifier_->kinds_.empty()) { + // The caller specified no domain kinds, collect all the ones available. + std::set<string> kinds; + for (HloComputation* computation : module_->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kDomain) { + TF_RET_CHECK(instruction->user_side_metadata().Kind() == + instruction->operand_side_metadata().Kind()) + << instruction->ToString(); + kinds.insert(string(instruction->user_side_metadata().Kind())); + } + } + } + verifier_->kinds_.insert(verifier_->kinds_.end(), kinds.begin(), + kinds.end()); + } + return Status::OK(); +} + +Status HloDomainVerifier::RunContext::Run() { + VLOG(4) << "Running HLO Domain Verifier"; + TF_RETURN_IF_ERROR(PopulateDomainKinds()); + for (HloComputation* computation : module_->computations()) { + for (auto& kind : verifier_->kinds_) { + // First create the domain instruction sets.
A domain instruction set is + // the set of instructions whose edges never cross a kDomain instruction. + TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDomainMap> domain_map, + HloDomainMap::Create(computation, kind)); + // Verify every domain populated within the map. + for (auto& domain : domain_map->GetDomains()) { + TF_RETURN_IF_ERROR(VerifyDomain(*domain).status()); + } + } + } + return Status::OK(); +} + +StatusOr<bool> HloDomainVerifier::Run(HloModule* module) { + RunContext run_context(module, this); + TF_RETURN_IF_ERROR(run_context.Run()); + return false; +} + +StatusOr<const DomainMetadata*> HloDomainVerifier::VerifyDomain( + const DomainMetadata::Domain& domain) { + const DomainMetadata* ref_metadata = nullptr; + VLOG(4) << "Reach set:"; + for (HloInstruction* instruction : domain.instructions) { + VLOG(4) << " " << instruction->name(); + } + VLOG(4) << " Domains:"; + for (HloInstruction* instruction : domain.enter_domains) { + const DomainMetadata& meta = instruction->user_side_metadata(); + VLOG(4) << " User side: " << instruction->name(); + VLOG(4) << " " << meta.ToString(); + if (ref_metadata == nullptr) { + ref_metadata = &meta; + } else { + TF_RET_CHECK(meta.Matches(*ref_metadata)) + << "Metadata mismatch at instruction " << instruction->name() << " : " + << meta.ToString() << " vs " << ref_metadata->ToString(); + } + } + for (HloInstruction* instruction : domain.exit_domains) { + const DomainMetadata& meta = instruction->operand_side_metadata(); + VLOG(4) << " Operand side: " << instruction->name(); + VLOG(4) << " " << meta.ToString(); + if (ref_metadata == nullptr) { + ref_metadata = &meta; + } else { + TF_RET_CHECK(meta.Matches(*ref_metadata)) + << "Metadata mismatch at instruction " << instruction->name() << " : " + << meta.ToString() << " vs " << ref_metadata->ToString(); + } + } + return ref_metadata; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_verifier.h b/tensorflow/compiler/xla/service/hlo_domain_verifier.h new file mode 100644 index
0000000000000000000000000000000000000000..81d6d69a8c59da2fc77cb2bab808602cd964fdaf --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_verifier.h @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_VERIFIER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_VERIFIER_H_ + +#include <string> +#include <vector> + +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/core/status.h" + +namespace xla { + +// Verifies that the domain instructions are consistent, and that each domain is +// surrounded by the same metadata. +class HloDomainVerifier : public HloPassInterface { + public: + HloDomainVerifier(std::vector<string> kinds) : kinds_(std::move(kinds)) {} + + absl::string_view name() const override { return "domain_verifier"; } + + StatusOr<bool> Run(HloModule* module) override; + + // Verify that the whole kDomain frontier bounding the instruction reach set, + // has matching metadata. + // A kDomain instruction has two sides of metadata, a user facing and an + // operand facing.
+ // A reachable instruction set can make contact with a kDomain instruction on + // a user facing side (the kDomain is operand of the instruction), or on an + // operand facing side (the kDomain is user of the instruction). + // And depending on the contact side, the proper metadata object + // (user_side_metadata() vs. operand_side_metadata()) needs to be used for + // consistency checks. + // Returns the DomainMetadata pointer which surrounds the domain, and + // represents the common metadata within such domain. If the returned + // DomainMetadata pointer is nullptr, the input domain had no kDomain + // boundary. + static StatusOr<const DomainMetadata*> VerifyDomain( + const DomainMetadata::Domain& domain); + + private: + class RunContext; + + std::vector<string> kinds_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_VERIFIER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc index abec29df433c521c3480b9297000085b1b1104e3..72006e17e7e7ec09b62e88d05b695ec9f4c49647 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc @@ -21,7 +21,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" @@ -141,15 +141,21 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { // These are ops with embedded computations where it suffices to convert // the embedded computations instead of converting the ops themselves.
if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall || + opcode == HloOpcode::kCrossReplicaSum || opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow || + opcode == HloOpcode::kScatter || opcode == HloOpcode::kSelectAndScatter || opcode == HloOpcode::kConditional) { continue; } TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString(); - if (!HasOperandType(hlo, eliminate_type_)) { + bool nullary = hlo->operands().empty(); + bool wrong_element_type = hlo->shape().element_type() == eliminate_type_; + bool should_eliminate_type = (nullary && wrong_element_type) || + HasOperandType(hlo, eliminate_type_); + if (!should_eliminate_type) { // If this CHECK fires, then this was an instruction that does not take // the elimination type as an operand but it does return it. This pass // does not have a feature to change the output type in that case, so diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.h b/tensorflow/compiler/xla/service/hlo_element_type_converter.h index 2b109225d0b192e5c9e4f6d841377ffad8078dc2..44ded2c2faf7c38d1e2f2aae577ddc07089bbb6a 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter.h +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.h @@ -32,9 +32,7 @@ class HloElementTypeConverter : public HloPassInterface { HloElementTypeConverter(PrimitiveType eliminate_type, PrimitiveType replace_with_type); - tensorflow::StringPiece name() const override { - return "element_type_converter"; - } + absl::string_view name() const override { return "element_type_converter"; } // Returns the pass on the module and returns whether the module was modified. 
StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc index 5c5a059e0fd895f03bc26a975609b57333237faf..c170e36c73ad2bef830e528de3ec72d38683d888 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc @@ -57,8 +57,10 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) { const string& hlo_string = R"( HloModule InfeedOutfeed ENTRY RoundTrip16MiBR1.v2 { - ROOT infeed = bf16[4]{0} infeed() - outfeed = () outfeed(infeed) + token = token[] after-all() + infeed = (bf16[4]{0}, token[]) infeed(token) + ROOT infeed.data = bf16[4]{0} get-tuple-element(infeed), index=0 + outfeed = token[] outfeed(infeed.data, token) } )"; auto module = CreateModuleFromHloString(hlo_string); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 1e78d775c8e172a272a03fbd1101cef365e6dc2d..441dcad00047311d682c0623964ee63aab341904 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -23,12 +23,15 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -42,7 +45,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -51,7 +53,6 @@ namespace xla { namespace { -using tensorflow::gtl::ArraySlice; template StatusOr> Compare(const Shape& shape, HloOpcode opcode, @@ -94,11 +95,12 @@ StatusOr> Compare(const Shape& shape, HloOpcode opcode, << HloOpcodeString(opcode); } - auto result = MakeUnique(shape); - TF_RETURN_IF_ERROR(result->Populate([&](ArraySlice multi_index) { - return compare_op(lhs_literal.Get(multi_index), - rhs_literal.Get(multi_index)); - })); + auto result = absl::make_unique(shape); + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { + return compare_op(lhs_literal.Get(multi_index), + rhs_literal.Get(multi_index)); + })); return std::move(result); } @@ -124,63 +126,76 @@ StatusOr> Compare( << HloOpcodeString(opcode); } - auto result = MakeUnique(shape); - TF_RETURN_IF_ERROR(result->Populate([&](ArraySlice multi_index) { - return compare_op(lhs_literal.Get(multi_index), - rhs_literal.Get(multi_index)); - })); + auto result = absl::make_unique(shape); + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { + return compare_op(lhs_literal.Get(multi_index), + rhs_literal.Get(multi_index)); + })); return std::move(result); } } // namespace - HloEvaluator::HloEvaluator(int64 max_loop_iterations) : max_loop_iterations_(max_loop_iterations) { - typed_visitors_[PRED] = MakeUnique>(this); - typed_visitors_[U8] = MakeUnique>(this); - typed_visitors_[U16] = MakeUnique([](HloInstruction*) { - return Unimplemented( - "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " - "U16."); - }); - typed_visitors_[U32] = MakeUnique>(this); - typed_visitors_[U64] = MakeUnique>(this); - typed_visitors_[S8] = MakeUnique>(this); 
- typed_visitors_[S16] = MakeUnique([](HloInstruction*) { - return Unimplemented( - "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " - "S16."); - }); - typed_visitors_[S32] = MakeUnique>(this); - typed_visitors_[S64] = MakeUnique>(this); + typed_visitors_[PRED] = + absl::make_unique>(this); + typed_visitors_[U8] = + absl::make_unique>(this); + typed_visitors_[U16] = + absl::make_unique([](HloInstruction*) { + return Unimplemented( + "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " + "U16."); + }); + typed_visitors_[U32] = + absl::make_unique>(this); + typed_visitors_[U64] = + absl::make_unique>(this); + typed_visitors_[S8] = absl::make_unique>(this); + typed_visitors_[S16] = + absl::make_unique([](HloInstruction*) { + return Unimplemented( + "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " + "S16."); + }); + typed_visitors_[S32] = + absl::make_unique>(this); + typed_visitors_[S64] = + absl::make_unique>(this); typed_visitors_[F16] = - MakeUnique>(this); - typed_visitors_[F32] = MakeUnique>(this); - typed_visitors_[F64] = MakeUnique>(this); - typed_visitors_[C64] = MakeUnique>(this); + absl::make_unique>(this); + typed_visitors_[F32] = + absl::make_unique>(this); + typed_visitors_[F64] = + absl::make_unique>(this); + typed_visitors_[C64] = + absl::make_unique>(this); // Most of the evaluator computations we use don't support BF16 (e.g., // std::ceil, std::tanh). To make evaluator work with BF16, we set all // elementwise computations to be done in F32 and do BF16<->F32 conversion // around the input and the output of the computations. 
typed_visitors_[BF16] = - MakeUnique>(this); - - typed_visitors_[TUPLE] = MakeUnique([](HloInstruction*) { - return Unimplemented( - "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE."); - }); - typed_visitors_[OPAQUE] = MakeUnique([](HloInstruction*) { - return Unimplemented( - "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE."); - }); + absl::make_unique>(this); + + typed_visitors_[TUPLE] = + absl::make_unique([](HloInstruction*) { + return Unimplemented( + "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE."); + }); + typed_visitors_[OPAQUE] = + absl::make_unique([](HloInstruction*) { + return Unimplemented( + "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE."); + }); } template StatusOr> HloEvaluator::Evaluate( - const HloModule& module, ArraySlice arg_literals) { + const HloModule& module, absl::Span arg_literals) { XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString()); evaluated_.clear(); @@ -197,7 +212,8 @@ StatusOr> HloEvaluator::Evaluate( template StatusOr> HloEvaluator::Evaluate( - const HloComputation& computation, ArraySlice arg_literals) { + const HloComputation& computation, + absl::Span arg_literals) { CHECK(computation.parent() != nullptr); XLA_VLOG_LINES( 2, "HloEvaluator::Evaluate computation:\n" + computation.ToString()); @@ -214,9 +230,8 @@ StatusOr> HloEvaluator::Evaluate( template StatusOr> HloEvaluator::Evaluate( - HloInstruction* instruction, ArraySlice arg_literals) { + HloInstruction* instruction, absl::Span arg_literals) { TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction)); - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape())); evaluated_.clear(); arg_literals_.clear(); @@ -253,7 +268,6 @@ StatusOr> HloEvaluator::Evaluate( return tensorflow::errors::FailedPrecondition( "Not all operands are constants."); } - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape())); arg_literals_.clear(); evaluated_.clear(); @@ -300,12 +314,6 @@ 
StatusOr> HloEvaluator::EvaluateWithSubstitutions( instruction->CloneWithNewOperands(instruction->shape(), operands); auto result = Evaluate(cloned_instruction.get()); - // Clean up our cloned instructions before returning. - cloned_instruction->DetachFromOperands(); - for (auto& operand : owned_operands) { - operand->DetachFromOperands(); - } - return result; } @@ -321,7 +329,6 @@ StatusOr> HloEvaluator::EvaluateElementwiseBinaryOp( rhs_instr.get()); auto result = Evaluate(cloned_instruction.get()); - cloned_instruction->DetachFromOperands(); return result; } @@ -334,10 +341,27 @@ StatusOr> HloEvaluator::EvaluateElementwiseUnaryOp( HloInstruction::CreateUnary(operand.shape(), opcode, operand_instr.get()); auto result = Evaluate(cloned_instruction.get()); - cloned_instruction->DetachFromOperands(); return result; } +StatusOr> HloEvaluator::EvaluateDotOp( + const DotDimensionNumbers& dim_numbers, const Literal& lhs, + const Literal& rhs) { + std::unique_ptr lhs_instr = + HloInstruction::CreateConstant(lhs.CloneToUnique()); + std::unique_ptr rhs_instr = + HloInstruction::CreateConstant(rhs.CloneToUnique()); + + TF_ASSIGN_OR_RETURN( + Shape dot_shape, + ShapeInference::InferDotOpShape(lhs.shape(), rhs.shape(), dim_numbers)); + + std::unique_ptr cloned_instruction = + HloInstruction::CreateDot(dot_shape, lhs_instr.get(), rhs_instr.get(), + dim_numbers); + return Evaluate(cloned_instruction.get()); +} + Status HloEvaluator::HandleParameter(HloInstruction* parameter) { CHECK_LT(parameter->parameter_number(), arg_literals_.size()); const Literal* input_literal = arg_literals_[parameter->parameter_number()]; @@ -368,11 +392,11 @@ Status HloEvaluator::HandleTranspose(HloInstruction* transpose) { } Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) { - ArraySlice operands(concatenate->operands()); + absl::Span operands(concatenate->operands()); // The result concatenate dimension is going to be the sum of all // concatenate dimensions of the operands taking 
part of the operation. const Shape& reference_shape = operands[0]->shape(); - CHECK(!ShapeUtil::IsTuple(reference_shape)); + CHECK(ShapeUtil::IsArray(reference_shape)); const int64 rank = ShapeUtil::Rank(reference_shape); const int64 concat_dim = concatenate->dimensions()[0]; CHECK_GE(concat_dim, 0); @@ -383,14 +407,14 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) { for (int64 i = 1; i < operands.size(); ++i) { const Shape& operand_shape = operands[i]->shape(); - CHECK(!ShapeUtil::IsTuple(operand_shape)); + CHECK(ShapeUtil::IsArray(operand_shape)); // Accumulate the concat dimension from all tensors taking part to the // operation. concat_dimensions[concat_dim] += ShapeUtil::GetDimension(operand_shape, concat_dim); } - auto result_literal = Literal::CreateFromDimensions( + auto result_literal = LiteralUtil::CreateFromDimensions( reference_shape.element_type(), concat_dimensions); DimensionVector source_indices(rank, 0); DimensionVector dest_indices(concat_dimensions.size(), 0); @@ -413,7 +437,7 @@ Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite) { if (!ShapeUtil::ElementIsFloating(operand->shape())) { return InvalidArgument( "expected element type in shape to be float for IsFinite op, got: %s", - PrimitiveType_Name(operand->shape().element_type()).c_str()); + PrimitiveType_Name(operand->shape().element_type())); } switch (operand->shape().element_type()) { @@ -454,9 +478,9 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s", - ShapeUtil::HumanString(compare->shape()).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str()); + ShapeUtil::HumanString(compare->shape()), + ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape())); } TF_RET_CHECK(lhs->shape().element_type() == rhs->shape().element_type()); @@ -541,47 +565,45 @@ 
Status HloEvaluator::HandleTuple(HloInstruction* tuple) { operand_literals.push_back(&GetEvaluatedLiteralFor(operand)); } - evaluated_[tuple] = Literal::MakeTuple(operand_literals); + evaluated_[tuple] = LiteralUtil::MakeTuple(operand_literals); return Status::OK(); } -// Returns an ShapeUtil::IndexIterationSpace that iterates over the output -// gather dimensions while keeping the rest of the output dimensions clamped to -// 0. -ShapeUtil::IndexIterationSpace IterationSpaceForOutputGatherIndices( +// Returns an ShapeUtil::IndexIterationSpace that iterates over the output batch +// dimensions while keeping the rest of the output dimensions clamped to 0. +ShapeUtil::IndexIterationSpace IterationSpaceForOutputBatchIndices( const Shape& output_shape, const GatherDimensionNumbers& dim_numbers) { int64 output_rank = output_shape.dimensions_size(); std::vector index_base(output_rank, 0); std::vector index_count; index_count.reserve(output_rank); for (int64 i = 0; i < output_rank; i++) { - bool is_output_gather_dim = - !c_binary_search(dim_numbers.output_window_dims(), i); - index_count.push_back(is_output_gather_dim ? output_shape.dimensions(i) - : 1); + bool is_output_batch_dim = + !absl::c_binary_search(dim_numbers.offset_dims(), i); + index_count.push_back(is_output_batch_dim ? output_shape.dimensions(i) : 1); } return {std::move(index_base), std::move(index_count), std::vector(output_rank, 1)}; } -// Return an ShapeUtil::IndexIterationSpace that iterates over the output window +// Return an ShapeUtil::IndexIterationSpace that iterates over the output slice // dimensions while keeping the rest of the output dimensions clamped to 0. 
-ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices( - int64 output_rank, ArraySlice window_bounds, +ShapeUtil::IndexIterationSpace IterationSpaceForOutputOffsetIndices( + int64 output_rank, absl::Span slice_sizes, const GatherDimensionNumbers& dim_numbers) { std::vector index_base(output_rank, 0); std::vector index_count(output_rank, 1); - int64 window_bounds_idx = 0; + int64 slice_sizes_idx = 0; for (int64 i = 0; i < output_rank; i++) { bool is_output_window_dim = - c_binary_search(dim_numbers.output_window_dims(), i); + absl::c_binary_search(dim_numbers.offset_dims(), i); if (is_output_window_dim) { - while (c_binary_search(dim_numbers.elided_window_dims(), - window_bounds_idx)) { - window_bounds_idx++; + while (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), + slice_sizes_idx)) { + slice_sizes_idx++; } - index_count[i] = window_bounds[window_bounds_idx++]; + index_count[i] = slice_sizes[slice_sizes_idx++]; } } @@ -589,30 +611,30 @@ ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices( std::vector(output_rank, 1)}; } -// This functor computes the contribution of gather_indices to an input index +// This functor computes the contribution of start_indices to an input index // corresponding to an output index. That is, given an output index I, it picks -// out the gather output indices in I and uses them to look up a gather index, -// G, from the gather indices tensor, and expands G into the input space -// according to gather_dims_to_operand_dims. -class OutputGatherIndexToInputIndex { +// out the batch indices in I and uses them to look up a starting index, G, from +// the start indices tensor, and expands G into the input space according to +// start_index_map. +class OutputBatchIndexToInputIndex { public: // The constructor does some setup work that is amortized across all // iterations. 
- explicit OutputGatherIndexToInputIndex( + explicit OutputBatchIndexToInputIndex( const GatherDimensionNumbers* dim_numbers, const Shape& input_shape, - const Shape& output_shape, const Literal* gather_indices) - : dim_numbers_(*dim_numbers), gather_indices_(*gather_indices) { + const Shape& output_shape, const Literal* start_indices) + : dim_numbers_(*dim_numbers), start_indices_(*start_indices) { for (int64 i = 0; i < output_shape.dimensions_size(); i++) { - output_dim_is_gather_dims_.push_back( - !c_binary_search(dim_numbers_.output_window_dims(), i)); + output_dim_is_batch_dims_.push_back( + !absl::c_binary_search(dim_numbers_.offset_dims(), i)); } for (int64 i = 0; i < input_shape.dimensions_size(); i++) { int64 index_of_input_dim_in_index_vector = - std::distance(dim_numbers_.gather_dims_to_operand_dims().begin(), - c_find(dim_numbers_.gather_dims_to_operand_dims(), i)); + std::distance(dim_numbers_.start_index_map().begin(), + absl::c_find(dim_numbers_.start_index_map(), i)); if (index_of_input_dim_in_index_vector == - dim_numbers_.gather_dims_to_operand_dims_size()) { + dim_numbers_.start_index_map_size()) { input_dim_value_to_index_vector_.push_back(-1); } else { input_dim_value_to_index_vector_.push_back( @@ -620,14 +642,14 @@ class OutputGatherIndexToInputIndex { } } - index_vector_index_.resize(gather_indices_.shape().dimensions_size()); + index_vector_index_.resize(start_indices_.shape().dimensions_size()); input_index_.resize(input_shape.dimensions_size()); int64 index_vector_size = - gather_indices_.shape().dimensions(dim_numbers_.index_vector_dim()); + start_indices_.shape().dimensions(dim_numbers_.index_vector_dim()); index_vector_.resize(index_vector_size); } - // Returns the contribution of gather_indices to the input index corresponding + // Returns the contribution of start_indices to the input index corresponding // to output_index. See gather_inner_loop_body. 
// // This is conceptually a stateless transformation from output_index to the @@ -640,24 +662,25 @@ class OutputGatherIndexToInputIndex { // index_vector_index_ and index_vector on every invocation, we reuse the // same storage for all invocations. // - // This returns an arrayslice into memory owned by the class. - StatusOr> operator()(ArraySlice output_index) { + // This returns a Span into memory owned by the class. + StatusOr> operator()( + absl::Span output_index) { PropagateOutputIndexGatherDimsToIndexVectorIndex(output_index); TF_RETURN_IF_ERROR(FetchIndexVector()); PropagateIndexVectorToInputIndex(); - return ArraySlice(input_index_); + return absl::Span(input_index_); } private: - // Propagates the gather index dimensions from the output index into + // Propagates the batch dimensions from the output index into // index_vector_index_ by mutating index_vector_index_ in place. Does not // update the dim_numbers.index_vector_dim() dimension -- that's the dimension // we iterate over in FetchIndexVector. void PropagateOutputIndexGatherDimsToIndexVectorIndex( - ArraySlice output_index) { + absl::Span output_index) { int64 index_vector_index_i = 0; for (int64 i = 0, e = output_index.size(); i < e; i++) { - if (!output_dim_is_gather_dims_[i]) { + if (!output_dim_is_batch_dims_[i]) { continue; } @@ -669,14 +692,14 @@ class OutputGatherIndexToInputIndex { } } - // Populates index_vector_ by iterating over gather_indices_ according to + // Populates index_vector_ by iterating over start_indices_ according to // index_vector_index_. 
Status FetchIndexVector() { int64 index_vector_dim = dim_numbers_.index_vector_dim(); for (int64 i = 0, e = index_vector_.size(); i < e; i++) { index_vector_index_[index_vector_dim] = i; - TF_ASSIGN_OR_RETURN(index_vector_[i], gather_indices_.GetIntegralAsS64( - index_vector_index_)); + TF_ASSIGN_OR_RETURN(index_vector_[i], + start_indices_.GetIntegralAsS64(index_vector_index_)); } return Status::OK(); } @@ -698,40 +721,39 @@ class OutputGatherIndexToInputIndex { // PropagateIndexVectorToInputIndex. std::vector input_dim_value_to_index_vector_; - // output_dim_is_gather_dims_[i] is true iff the output index i is a gather + // output_dim_is_batch_dims_[i] is true iff the output index i is a gather // dimension. - std::vector output_dim_is_gather_dims_; + std::vector output_dim_is_batch_dims_; - // The buffer into which we construct an index into gather_indices_ to fetch + // The buffer into which we construct an index into start_indices_ to fetch // the index vector. std::vector index_vector_index_; - // The index vector fetched from gather_indices_. + // The index vector fetched from start_indices_. std::vector index_vector_; - // The result computed by this functor. operator() returns an ArraySlice into + // The result computed by this functor. operator() returns a Span into // this vector. std::vector input_index_; const GatherDimensionNumbers& dim_numbers_; - const Literal& gather_indices_; + const Literal& start_indices_; }; -// This functor computes the contribution of the window indices in an output +// This functor computes the contribution of the offset indices in an output // index to an input index. That is, given an output index I it picks out the -// output window indices in I and expands it into a window index into the input -// shape. -class OutputWindowIndexToInputIndex { +// output offset indices in I and expands it into an index into the input shape. 
+class OutputOffsetIndexToInputIndex { public: // The constructor does some setup work that is amortized across all // iterations. - explicit OutputWindowIndexToInputIndex( + explicit OutputOffsetIndexToInputIndex( const GatherDimensionNumbers& dim_numbers, const Shape& input_shape, const Shape& output_shape) { std::vector window_index_to_output_index; int64 output_index_count = 0; for (int64 i = 0; i < output_shape.dimensions_size(); i++) { - if (c_binary_search(dim_numbers.output_window_dims(), i)) { + if (absl::c_binary_search(dim_numbers.offset_dims(), i)) { window_index_to_output_index.push_back(output_index_count++); } else { output_index_count++; @@ -740,7 +762,7 @@ class OutputWindowIndexToInputIndex { int64 window_dim_count = 0; for (int64 i = 0; i < input_shape.dimensions_size(); i++) { - if (c_binary_search(dim_numbers.elided_window_dims(), i)) { + if (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) { input_dim_value_to_output_index_.push_back(-1); } else { input_dim_value_to_output_index_.push_back( @@ -759,17 +781,24 @@ class OutputWindowIndexToInputIndex { // gather input index on every invocation we reuse the same storage for the // result (input_index_), mutating it in place. // - // This returns an arrayslice into memory owned by the class. - StatusOr> operator()(ArraySlice output_index) { + // This returns a Span into memory owned by the class. + StatusOr> operator()( + absl::Span output_index) { PropagateOutputIndexWindowDimsToInputIndex(output_index); - return ArraySlice(input_index_); + return absl::Span(input_index_); + } + + // Returns for a given 'input_dim' the corresponding output dimension index, + // or -1 if 'input_dim' is an elided window dimension. + int64 input_dim_value_to_output_index(int64 input_dim) { + return input_dim_value_to_output_index_[input_dim]; } private: // Propagates window dimensions from the output index to input_index_ by // mutating input_index_ in place. 
void PropagateOutputIndexWindowDimsToInputIndex( - ArraySlice output_index) { + absl::Span output_index) { for (int64 i = 0, e = input_index_.size(); i < e; i++) { if (input_dim_value_to_output_index_[i] != -1) { input_index_[i] = output_index[input_dim_value_to_output_index_[i]]; @@ -782,30 +811,30 @@ class OutputWindowIndexToInputIndex { // input_dim_value_to_index_vector_[i] tells us how to compute dimension i of // the input index from the output index. See - // PropagateOutputIndexToInputIndex. + // PropagateOutputIndexWindowDimsToInputIndex. std::vector input_dim_value_to_output_index_; - // The result computed by this functor. operator() returns an ArraySlice into + // The result computed by this functor. operator() returns a Span into // this vector. std::vector input_index_; }; // Rehapes the gather indices input to have a trailing degenerate `1` dimension // if necessary. Hands over the ownership of the newly created literal (if -// there is one) to `reshaped_gather_indices`. +// there is one) to `reshaped_start_indices`. 
static StatusOr> ReshapedGatherIndices( - int64 index_vector_dim, const Literal& gather_indices, - std::unique_ptr* reshaped_gather_indices) { - if (gather_indices.shape().dimensions_size() != index_vector_dim) { - return std::cref(gather_indices); + int64 index_vector_dim, const Literal& start_indices, + std::unique_ptr* reshaped_start_indices) { + if (start_indices.shape().dimensions_size() != index_vector_dim) { + return std::cref(start_indices); } - std::vector new_shape(gather_indices.shape().dimensions().begin(), - gather_indices.shape().dimensions().end()); + std::vector new_shape(start_indices.shape().dimensions().begin(), + start_indices.shape().dimensions().end()); new_shape.push_back(1); - TF_ASSIGN_OR_RETURN(*reshaped_gather_indices, - gather_indices.Reshape(new_shape)); - return std::cref(**reshaped_gather_indices); + TF_ASSIGN_OR_RETURN(*reshaped_start_indices, + start_indices.Reshape(new_shape)); + return std::cref(**reshaped_start_indices); } Status HloEvaluator::HandleGather(HloInstruction* gather) { @@ -814,56 +843,69 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) { const GatherDimensionNumbers& dim_numbers = gather->gather_dimension_numbers(); const Literal& operand = GetEvaluatedLiteralFor(gather->operand(0)); - std::unique_ptr reshaped_gather_indices; + std::unique_ptr reshaped_start_indices; TF_ASSIGN_OR_RETURN( - const Literal& gather_indices, + const Literal& start_indices, ReshapedGatherIndices(dim_numbers.index_vector_dim(), GetEvaluatedLiteralFor(gather->operand(1)), - &reshaped_gather_indices)); + &reshaped_start_indices)); // We iterate over the gather dimensions in the output shape in an outer loop // nest, and iterate over the window dimensions in the output shape in an // inner loop nest. 
- ShapeUtil::IndexIterationSpace gather_indices_iteration_space = - IterationSpaceForOutputGatherIndices(shape, dim_numbers); - ShapeUtil::IndexIterationSpace window_indices_iteration_space = - IterationSpaceForOutputWindowIndices( - shape.dimensions_size(), gather->gather_window_bounds(), dim_numbers); + ShapeUtil::IndexIterationSpace start_indices_iteration_space = + IterationSpaceForOutputBatchIndices(shape, dim_numbers); + ShapeUtil::IndexIterationSpace offset_indices_iteration_space = + IterationSpaceForOutputOffsetIndices( + shape.dimensions_size(), gather->gather_slice_sizes(), dim_numbers); // Scratch buffers that hold an index in the output shape and the // corresponding index in the input shape. std::vector input_index(operand.shape().dimensions_size()); std::vector output_index(gather->shape().dimensions_size()); + std::vector input_index_clamped(operand.shape().dimensions_size()); - OutputGatherIndexToInputIndex output_gather_index_to_input_index( + OutputBatchIndexToInputIndex output_batch_index_to_input_index( &gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(), - /*output_shape=*/shape, &gather_indices); - OutputWindowIndexToInputIndex output_window_index_to_input_index( + /*output_shape=*/shape, &start_indices); + OutputOffsetIndexToInputIndex output_offset_index_to_input_index( gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(), /*output_shape=*/shape); const Shape& operand_shape = operand.shape(); auto gather_inner_loop_body = - [&](ArraySlice output_window_index, - ArraySlice input_gather_index, - ArraySlice output_gather_index) -> StatusOr { + [&](absl::Span output_window_index, + absl::Span input_gather_index, + absl::Span output_gather_index) -> StatusOr { TF_ASSIGN_OR_RETURN( - ArraySlice input_window_index, - output_window_index_to_input_index(output_window_index)); + absl::Span input_window_index, + output_offset_index_to_input_index(output_window_index)); for (int i = 0, e = output_index.size(); i < e; 
i++) { output_index[i] = output_gather_index[i] + output_window_index[i]; DCHECK_LT(output_index[i], shape.dimensions(i)); } + for (int i = 0, e = input_gather_index.size(); i < e; i++) { + int64 output_dim = + output_offset_index_to_input_index.input_dim_value_to_output_index(i); + // If 'output_dim' is -1, it means 'i' is an elided window dim. This means + // we set the iteration index to 0, so for the purpose of the following + // calculations we can consider the output dimension size to be 1. + int64 output_dim_size = + output_dim == -1 ? 1 : shape.dimensions(output_dim); + // Clamp the gather index so that the gather region fits in the operand. + // input_index_clamped[i] = clamp(input_gather_index[i], 0, + // operand_shape.dimensions(i) - + // output_dim_size); + input_index_clamped[i] = + std::min(operand_shape.dimensions(i) - output_dim_size, + std::max(0LL, input_gather_index[i])); + } for (int i = 0, e = input_index.size(); i < e; i++) { - // TODO(b/74360564): We should implement whatever out of bounds behavior - // we decide for dynamic-slice here as well. 
- input_index[i] = (input_gather_index[i] + input_window_index[i]) % - operand_shape.dimensions(i); - if (input_index[i] < 0) { - input_index[i] += operand_shape.dimensions(i); - } + input_index[i] = input_index_clamped[i] + input_window_index[i]; + DCHECK_GE(input_index[i], 0); + DCHECK_LT(input_index[i], operand_shape.dimensions(i)); } TF_RETURN_IF_ERROR( result->CopyElementFrom(operand, input_index, output_index)); @@ -871,19 +913,18 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) { }; auto gather_outer_loop_body = - [&](ArraySlice output_gather_index) -> StatusOr { - TF_ASSIGN_OR_RETURN( - ArraySlice input_gather_index, - output_gather_index_to_input_index(output_gather_index)); + [&](absl::Span output_gather_index) -> StatusOr { + TF_ASSIGN_OR_RETURN(absl::Span input_gather_index, + output_batch_index_to_input_index(output_gather_index)); TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( - shape, window_indices_iteration_space, + shape, offset_indices_iteration_space, std::bind(gather_inner_loop_body, std::placeholders::_1, input_gather_index, output_gather_index))); return true; }; TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( - shape, gather_indices_iteration_space, gather_outer_loop_body)); + shape, start_indices_iteration_space, gather_outer_loop_body)); evaluated_[gather] = std::move(result); return Status::OK(); } @@ -910,6 +951,11 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } +Status HloEvaluator::HandleAfterAll(HloInstruction* token) { + evaluated_[token] = LiteralUtil::CreateToken(); + return Status::OK(); +} + Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) { const auto result_shape = get_tuple_element->shape(); const int64 index = get_tuple_element->tuple_index(); @@ -925,7 +971,7 @@ Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) { const Literal& operand_tuple_literal = GetEvaluatedLiteralFor(operand); - 
evaluated_[get_tuple_element] = MakeUnique( + evaluated_[get_tuple_element] = absl::make_unique( ShapeUtil::GetTupleElementShape(operand->shape(), index)); return evaluated_[get_tuple_element]->CopyFrom(operand_tuple_literal, /*dest_shape_index=*/{}, @@ -1027,8 +1073,6 @@ Status HloEvaluator::HandleSelect(HloInstruction* select) { const auto& on_false = GetEvaluatedLiteralFor(select->operand(2)); // If predicate is of scalar type, no element-wise selection would be needed. - // This would also handle output array of tuple types as the DefaultAction - // would go through the HloEvaluatorTypedVisitor which doesn't handle tuples. if (ShapeUtil::IsScalar(pred.shape())) { if (pred.Get({})) { evaluated_[select] = on_true.CloneToUnique(); @@ -1041,6 +1085,19 @@ Status HloEvaluator::HandleSelect(HloInstruction* select) { return DefaultAction(select); } +Status HloEvaluator::HandleTupleSelect(HloInstruction* tuple_select) { + const auto& pred = GetEvaluatedLiteralFor(tuple_select->operand(0)); + const auto& on_true = GetEvaluatedLiteralFor(tuple_select->operand(1)); + const auto& on_false = GetEvaluatedLiteralFor(tuple_select->operand(2)); + + if (pred.Get({})) { + evaluated_[tuple_select] = on_true.CloneToUnique(); + } else { + evaluated_[tuple_select] = on_false.CloneToUnique(); + } + return Status::OK(); +} + Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) { HloComputation* cond_comp = while_hlo->while_condition(); HloComputation* body_comp = while_hlo->while_body(); @@ -1052,8 +1109,8 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) { HloEvaluator loop_body_evaluator(max_loop_iterations_); while (keep_going) { if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) { - return InvalidArgument("Loop %s exceeded loop iteration limit (%lld).", - while_hlo->name().c_str(), max_loop_iterations_); + return InvalidArgument("Loop %s exceeded loop iteration limit (%d).", + while_hlo->name(), max_loop_iterations_); } 
TF_ASSIGN_OR_RETURN(auto cond_val, cond_evaluator.Evaluate( *cond_comp, {lcv.get()})); @@ -1071,9 +1128,181 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) { return Status::OK(); } +// Key-value sort is a special snowflake: it's templated on two different +// element types, one for the keys, and one for the values. Jump through some +// hoops to make this work. +namespace { +template +StatusOr> EvaluateSortInternal( + HloInstruction* sort, const Literal& keys_literal, + const Literal& values_literal) { + auto rank = ShapeUtil::Rank(keys_literal.shape()); + TF_RET_CHECK( + ShapeUtil::SameDimensions(keys_literal.shape(), values_literal.shape())) + << "Sort keys and values must have the same dimensions"; + TF_RET_CHECK(rank > 0 && rank <= 2) + << "Sort is only supported for rank-1 and rank-2 shapes, rank is: " + << rank; + TF_RET_CHECK(sort->operand_count() == 2) << "Expected key-value sort"; + // We need to sort and array of keys and an array of values, where the + // sorted order of the values is determined by the keys. The simplest(?) + // way to do this is to go to an array-of-pairs representation, sort the + // array using the keys, and then go back to pair-of-arrays. 
+ VLOG(3) << "HandleSort keys_literal: " << keys_literal.ToString(); + VLOG(3) << "HandleSort values_literal: " << values_literal.ToString(); + + auto sort_r1 = [](const Literal& keys_literal, + const Literal& values_literal) { + const auto& keys_data = keys_literal.data(); + const auto& values_data = values_literal.data(); + + using kv_pair = std::pair; + std::vector key_value_vector; + CHECK_EQ(keys_data.size(), values_data.size()); + key_value_vector.reserve(keys_data.size()); + for (int i = 0; i < keys_data.size(); ++i) { + key_value_vector.push_back(std::make_pair(keys_data[i], values_data[i])); + } + std::sort(key_value_vector.begin(), key_value_vector.end(), + [](const kv_pair& a, const kv_pair& b) { + return SafeLess(a.first, b.first); + }); + std::vector result_keys; + std::vector result_values; + for (const auto& key_value : key_value_vector) { + result_keys.push_back(key_value.first); + result_values.push_back(key_value.second); + } + auto result_keys_literal = absl::make_unique(keys_literal.shape()); + result_keys_literal->PopulateR1(absl::Span(result_keys)); + auto result_values_literal = + absl::make_unique(values_literal.shape()); + result_values_literal->PopulateR1( + absl::Span(result_values)); + return std::make_pair(std::move(result_keys_literal), + std::move(result_values_literal)); + }; + + std::unique_ptr result_tuple; + if (rank == 1) { + auto result_pair = sort_r1(keys_literal, values_literal); + result_tuple = LiteralUtil::MakeTuple( + {result_pair.first.get(), result_pair.second.get()}); + } else { + // For R2 sort, the desired semantics are to sort each matrix row + // independently. 
+ auto keys_result_literal = absl::make_unique(keys_literal.shape()); + auto values_result_literal = + absl::make_unique(values_literal.shape()); + int64 r1_length = keys_literal.shape().dimensions(1); + for (int64 row = 0; row < keys_literal.shape().dimensions(0); ++row) { + TF_ASSIGN_OR_RETURN(auto keys_r1_slice, + keys_literal.Slice({row, 0}, {row + 1, r1_length}) + ->Reshape({r1_length})); + TF_ASSIGN_OR_RETURN(auto values_r1_slice, + values_literal.Slice({row, 0}, {row + 1, r1_length}) + ->Reshape({r1_length})); + auto r1_result_pair = sort_r1(*keys_r1_slice, *values_r1_slice); + TF_ASSIGN_OR_RETURN(auto sorted_keys, + r1_result_pair.first->Reshape({1, r1_length})); + TF_ASSIGN_OR_RETURN(auto sorted_values, + r1_result_pair.second->Reshape({1, r1_length})); + TF_RETURN_IF_ERROR(keys_result_literal->CopySliceFrom( + *sorted_keys, {0, 0}, {row, 0}, {1, r1_length})); + TF_RETURN_IF_ERROR(values_result_literal->CopySliceFrom( + *sorted_values, {0, 0}, {row, 0}, {1, r1_length})); + } + result_tuple = LiteralUtil::MakeTuple( + {keys_result_literal.get(), values_result_literal.get()}); + } + + VLOG(3) << "HandleSort result_tuple: " << result_tuple->ToString(); + return std::move(result_tuple); +} + +template +StatusOr> EvaluateSortCurried( + HloInstruction* sort, const Literal& keys_literal, + const Literal& values_literal) { + switch (sort->operand(1)->shape().element_type()) { + case F32: + return EvaluateSortInternal(sort, keys_literal, + values_literal); + case U32: + return EvaluateSortInternal(sort, keys_literal, + values_literal); + case S32: + return EvaluateSortInternal(sort, keys_literal, + values_literal); + case BF16: + return EvaluateSortInternal(sort, keys_literal, + values_literal); + default: + return InvalidArgument("Unsupported type for Sort"); + } +} + +StatusOr> EvaluateSort(HloInstruction* sort, + const Literal& keys_literal, + const Literal& values_literal) { + switch (sort->operand(0)->shape().element_type()) { + case F32: + return 
EvaluateSortCurried(sort, keys_literal, values_literal); + case U32: + return EvaluateSortCurried(sort, keys_literal, values_literal); + case S32: + return EvaluateSortCurried(sort, keys_literal, values_literal); + case BF16: + return EvaluateSortCurried(sort, keys_literal, values_literal); + default: + return InvalidArgument("Unsupported type for Sort"); + } +} +} // namespace + +Status HloEvaluator::HandleSort(HloInstruction* sort) { + const int64 sort_dim = sort->dimensions(0); + const int64 rank = ShapeUtil::Rank(sort->operand(0)->shape()); + if (sort_dim != rank - 1) { + return Unimplemented( + "Trying to sort along dimension %d, which is not the last " + "dimension", + sort_dim); + } + + if (!ShapeUtil::IsTuple(sort->shape())) { + return DefaultAction(sort); + } else { + auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)), + GetEvaluatedLiteralFor(sort->operand(1))); + if (result.ok()) { + evaluated_[sort] = std::move(result.ValueOrDie()); + return Status::OK(); + } else { + return result.status(); + } + } +} + +Status HloEvaluator::HandleReduce(HloInstruction* reduce) { + if (!ShapeUtil::IsTuple(reduce->shape())) { + return DefaultAction(reduce); + } else { + auto first_element_type = reduce->shape().tuple_shapes(0).element_type(); + for (const auto& tuple_shape : reduce->shape().tuple_shapes()) { + if (tuple_shape.element_type() != first_element_type) { + return Unimplemented( + "Reduce with several outputs that have mixed element types is " + "unsupported"); + } + } + return reduce->Visit(typed_visitors_.at(first_element_type).get()); + } +} + Status HloEvaluator::Preprocess(HloInstruction* hlo) { VLOG(2) << "About to visit HLO: " << hlo->ToString(); - return Status::OK(); + return ShapeUtil::ValidateShape(hlo->shape()); } Status HloEvaluator::Postprocess(HloInstruction* hlo) { @@ -1085,26 +1314,27 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) { // Explicit instantiation of templatized Evaluate* methods. 
// template StatusOr> -HloEvaluator::Evaluate(const HloModule& module, - ArraySlice arg_literals); +HloEvaluator::Evaluate( + const HloModule& module, absl::Span arg_literals); template StatusOr> HloEvaluator::Evaluate>( - const HloModule& module, ArraySlice> arg_literals); + const HloModule& module, + absl::Span> arg_literals); -template StatusOr> -HloEvaluator::Evaluate(const HloComputation& computation, - ArraySlice arg_literals); +template StatusOr> HloEvaluator::Evaluate< + const Literal*>(const HloComputation& computation, + absl::Span arg_literals); template StatusOr> HloEvaluator::Evaluate>( const HloComputation& computation, - ArraySlice> arg_literals); + absl::Span> arg_literals); template StatusOr> -HloEvaluator::Evaluate(HloInstruction* instruction, - ArraySlice arg_literals); +HloEvaluator::Evaluate( + HloInstruction* instruction, absl::Span arg_literals); template StatusOr> HloEvaluator::Evaluate>( HloInstruction* instruction, - ArraySlice> arg_literals); + absl::Span> arg_literals); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index b53d5644de5a17c52bdbf2593ce52f0227008a00..c2d49e56ac487ee8a5cb3d26aee497ade63aa844 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -18,15 +18,16 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/macros.h" @@ -50,8 +51,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault { // type. template StatusOr> Evaluate( - const HloModule& module, - tensorflow::gtl::ArraySlice arg_literals); + const HloModule& module, absl::Span arg_literals); // Evaluates an HLO computation and an array of pointers to literals. // Returns the evaluated result as a literal if successful. @@ -74,7 +74,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault { template StatusOr> Evaluate( const HloComputation& computation, - tensorflow::gtl::ArraySlice arg_literals); + absl::Span arg_literals); // Evaluates a single HLO instruction and an array of pointers to literals. // Return the evaluated result as literal if successful. @@ -86,8 +86,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault { // type. template StatusOr> Evaluate( - HloInstruction* instruction, - tensorflow::gtl::ArraySlice arg_literals); + HloInstruction* instruction, absl::Span arg_literals); // Evaluates a single HLO instruction with constant operands. // Returns the evaluated result as literal if successful. 
@@ -115,6 +114,10 @@ class HloEvaluator : public DfsHloVisitorWithDefault { StatusOr> EvaluateElementwiseUnaryOp( HloOpcode opcode, const Literal& operand); + StatusOr> EvaluateDotOp( + const DotDimensionNumbers& dim_numbers, const Literal& lhs, + const Literal& rhs); + protected: // Make HloEvaluatorTypedVisitor a friend because it is logically part of this // class. @@ -172,8 +175,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault { Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* tuple_select) override; + Status HandleBroadcast(HloInstruction* broadcast) override; + Status HandleAfterAll(HloInstruction* token) override; + + Status HandleSort(HloInstruction* sort) override; + + Status HandleReduce(HloInstruction* reduce) override; + // Returns the already-evaluated literal result for the instruction. // A Constant instruction is considered evaluated and its literal will be // returned directly without looking up the cache. @@ -211,13 +222,13 @@ class HloEvaluator : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(operand->shape()).c_str()); + ShapeUtil::HumanString(shape), + ShapeUtil::HumanString(operand->shape())); } - auto result = MakeUnique(shape); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { + auto result = absl::make_unique(shape); + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { return unary_op(operand_literal.Get(multi_index)); })); return std::move(result); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index 84b4ead2dd28caa40b6d7830a1e1401be88b6b36..7e490d7f324022fdf02c569fc1986d0b6f5823ba 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ 
b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -21,8 +21,9 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" @@ -51,12 +52,15 @@ static std::array use_bf16_params{true, false}; class HloEvaluatorTest : public ::testing::WithParamInterface, public HloVerifiedTestBase { protected: - HloEvaluatorTest() : use_bfloat16_(GetParam()) { - evaluator_ = MakeUnique(); + HloEvaluatorTest() + : HloVerifiedTestBase(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false), + use_bfloat16_(GetParam()) { + evaluator_ = absl::make_unique(); } std::unique_ptr Evaluate( - tensorflow::gtl::ArraySlice arg_literals = {}) { + absl::Span arg_literals = {}) { if (use_bfloat16_) { // In BF16 mode, we convert all F32 type to BF16 and evaluate the module. auto type_converter = HloElementTypeConverter(F32, BF16); @@ -112,9 +116,9 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp // with 3 operands. 
TEST_P(HloEvaluatorTest, DoesClamp) { - auto low = Literal::CreateR2({{0.f, 2.f}, {2.f, 4.f}}); - auto value = Literal::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); - auto high = Literal::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); + auto low = LiteralUtil::CreateR2({{0.f, 2.f}, {2.f, 4.f}}); + auto value = LiteralUtil::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); + auto high = LiteralUtil::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); Shape shape = low->shape(); HloComputation::Builder b(TestName()); @@ -127,15 +131,15 @@ TEST_P(HloEvaluatorTest, DoesClamp) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({{0, 4}, {2, 4}}); + auto expected = LiteralUtil::CreateR2({{0, 4}, {2, 4}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { - auto low = Literal::CreateR0(0.f); - auto value = Literal::CreateR2({{-1.f, 0.f}, {1.f, 2.f}}); - auto high = Literal::CreateR0(1.f); + auto low = LiteralUtil::CreateR0(0.f); + auto value = LiteralUtil::CreateR2({{-1.f, 0.f}, {1.f, 2.f}}); + auto high = LiteralUtil::CreateR0(1.f); Shape shape = value->shape(); HloComputation::Builder b(TestName()); @@ -148,7 +152,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({{0, 0}, {1, 1}}); + auto expected = LiteralUtil::CreateR2({{0, 0}, {1, 1}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -156,9 +160,9 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { // Verifies that HloEvaluator evaluates a HLO instruction that performs select // with 3 operands. 
TEST_P(HloEvaluatorTest, DoesSelect) { - auto pred = Literal::CreateR2({{true, false}, {false, true}}); - auto on_true = Literal::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); - auto on_false = Literal::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); + auto pred = LiteralUtil::CreateR2({{true, false}, {false, true}}); + auto on_true = LiteralUtil::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); + auto on_false = LiteralUtil::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); Shape shape = on_true->shape(); HloComputation::Builder b(TestName()); @@ -173,7 +177,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) { std::unique_ptr result = Evaluate({}); - auto expected = Literal::CreateR2({{2, 5}, {0, 4}}); + auto expected = LiteralUtil::CreateR2({{2, 5}, {0, 4}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -181,37 +185,46 @@ TEST_P(HloEvaluatorTest, DoesSelect) { // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise addition with 2 operands. TEST_P(HloEvaluatorTest, DoesAdd) { - auto lhs = Literal::CreateR2({{1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2({{2, 4}, {4, 4}}); - auto expected = Literal::CreateR2({{3, 4}, {-96, 8}}); + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto expected = LiteralUtil::CreateR2({{3, 4}, {-96, 8}}); TestBinaryOp(HloOpcode::kAdd, std::move(expected), std::move(lhs), std::move(rhs)); } // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise and with 2 operands. 
TEST_P(HloEvaluatorTest, DoesAnd) { - auto lhs = Literal::CreateR2({{1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2({{2, 4}, {4, 4}}); - auto expected = Literal::CreateR2({{0, 0}, {4, 4}}); + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto expected = LiteralUtil::CreateR2({{0, 0}, {4, 4}}); TestBinaryOp(HloOpcode::kAnd, std::move(expected), std::move(lhs), std::move(rhs)); } // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise or with 2 operands. TEST_P(HloEvaluatorTest, DoesOr) { - auto lhs = Literal::CreateR2({{1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2({{2, 4}, {4, 4}}); - auto expected = Literal::CreateR2({{3, 4}, {-100, 4}}); + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto expected = LiteralUtil::CreateR2({{3, 4}, {-100, 4}}); TestBinaryOp(HloOpcode::kOr, std::move(expected), std::move(lhs), std::move(rhs)); } // Verifies that HloEvaluator evaluates a HLO instruction that performs +// element-wise or with 2 operands. +TEST_P(HloEvaluatorTest, DoesXor) { + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto expected = LiteralUtil::CreateR2({{3, 4}, {-104, 0}}); + TestBinaryOp(HloOpcode::kXor, std::move(expected), std::move(lhs), + std::move(rhs)); +} +// Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise multiply with 2 operands. 
TEST_P(HloEvaluatorTest, DoesMultiply) { - auto lhs = Literal::CreateR2({{-1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2( + auto lhs = LiteralUtil::CreateR2({{-1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2( {{std::numeric_limits::min(), 4}, {4, 4}}); - auto expected = Literal::CreateR2( + auto expected = LiteralUtil::CreateR2( {{std::numeric_limits::min(), 0}, {-400, 16}}); TestBinaryOp(HloOpcode::kMultiply, std::move(expected), std::move(lhs), std::move(rhs)); @@ -219,17 +232,17 @@ TEST_P(HloEvaluatorTest, DoesMultiply) { // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise divide with 2 operands. TEST_P(HloEvaluatorTest, DoesDivideInt64) { - auto lhs = Literal::CreateR2({{1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2({{2, 4}, {4, 4}}); - auto expected = Literal::CreateR2({{0, 0}, {-25, 1}}); + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto expected = LiteralUtil::CreateR2({{0, 0}, {-25, 1}}); TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs), std::move(rhs)); } TEST_P(HloEvaluatorTest, DoesDivideDouble) { - auto lhs = Literal::CreateR2({{1.0, 0.0}, {-100.0, 4.0}}); - auto rhs = Literal::CreateR2({{2.2, 4.0}, {4.0, 4.0}}); + auto lhs = LiteralUtil::CreateR2({{1.0, 0.0}, {-100.0, 4.0}}); + auto rhs = LiteralUtil::CreateR2({{2.2, 4.0}, {4.0, 4.0}}); auto expected = - Literal::CreateR2({{0.45454545454545453, 0}, {-25, 1}}); + LiteralUtil::CreateR2({{0.45454545454545453, 0}, {-25, 1}}); TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs), std::move(rhs)); } @@ -237,54 +250,54 @@ TEST_P(HloEvaluatorTest, DoesDivideDouble) { // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise abs op with 1 operand. 
TEST_P(HloEvaluatorTest, DoesAbsR2) { - auto operand = Literal::CreateR2({{1, -20}, {-100, 4}}); - auto expected = Literal::CreateR2({{1, 20}, {100, 4}}); + auto operand = LiteralUtil::CreateR2({{1, -20}, {-100, 4}}); + auto expected = LiteralUtil::CreateR2({{1, 20}, {100, 4}}); TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand)); } TEST_P(HloEvaluatorTest, DoesAbsR0) { - auto operand = Literal::CreateR0(-1.0f); - auto expected = Literal::CreateR0(1.0f); + auto operand = LiteralUtil::CreateR0(-1.0f); + auto expected = LiteralUtil::CreateR0(1.0f); TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand)); } TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) { - auto operand = Literal::CreateR1({}); - auto expected = Literal::CreateR1({}); + auto operand = LiteralUtil::CreateR1({}); + auto expected = LiteralUtil::CreateR1({}); TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand)); } TEST_P(HloEvaluatorTest, DoesNegateR2) { - auto operand = Literal::CreateR2( + auto operand = LiteralUtil::CreateR2( {{0, std::numeric_limits::min()}, {-1, 4}}); - auto expected = - Literal::CreateR2({{0, std::numeric_limits::min()}, {1, -4}}); + auto expected = LiteralUtil::CreateR2( + {{0, std::numeric_limits::min()}, {1, -4}}); TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand)); } TEST_P(HloEvaluatorTest, DoesCosR2) { - auto operand = Literal::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); - auto expected = Literal::CreateR2({{1, -1}, {-1, 1}}); + auto operand = LiteralUtil::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); + auto expected = LiteralUtil::CreateR2({{1, -1}, {-1, 1}}); TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand), use_bfloat16_ ? 
0.031250 : 9.5367431640625E-7); } TEST_P(HloEvaluatorTest, DoesSinR2) { - auto operand = Literal::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); - auto expected = Literal::CreateR2({{0, 0}, {0, 0}}); + auto operand = LiteralUtil::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); + auto expected = LiteralUtil::CreateR2({{0, 0}, {0, 0}}); TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand), use_bfloat16_ ? 0.031250 : 9.5367431640625E-7); } TEST_P(HloEvaluatorTest, DoesNotR2) { auto operand = - Literal::CreateR2({{0, std::numeric_limits::min()}, - {-1, std::numeric_limits::max()}}); + LiteralUtil::CreateR2({{0, std::numeric_limits::min()}, + {-1, std::numeric_limits::max()}}); auto expected = - Literal::CreateR2({{-1, std::numeric_limits::max()}, - {0, std::numeric_limits::min()}}); + LiteralUtil::CreateR2({{-1, std::numeric_limits::max()}, + {0, std::numeric_limits::min()}}); TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand)); } // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor // constant operands. 
TEST_P(HloEvaluatorTest, DoesTraverseInstructions) { - auto lhs = Literal::CreateR2({{1, 0}, {-100, 4}}); - auto rhs = Literal::CreateR2({{2, 4}, {4, 4}}); - auto rhs2 = Literal::CreateR2({{1, -20}, {-100, 4}}); + auto lhs = LiteralUtil::CreateR2({{1, 0}, {-100, 4}}); + auto rhs = LiteralUtil::CreateR2({{2, 4}, {4, 4}}); + auto rhs2 = LiteralUtil::CreateR2({{1, -20}, {-100, 4}}); std::vector args = {lhs.get(), rhs.get(), rhs2.get()}; Shape shape = ShapeUtil::MakeShape(S64, {2, 2}); @@ -305,7 +318,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) { std::unique_ptr result = Evaluate(args); - auto expected = Literal::CreateR2({{4, -16}, {-196, 12}}); + auto expected = LiteralUtil::CreateR2({{4, -16}, {-196, 12}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -315,7 +328,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) { HloComputation::Builder b(TestName()); const int64 dimensions[] = {11, 8, 7, 5, 9}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - Literal::CreateRandomLiteral( + LiteralUtil::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); auto literal_clone = literal->CloneToUnique(); HloInstruction* literal_instruction = @@ -331,7 +344,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) { using NativeT = typename primitive_util::PrimitiveTypeToNative::type; result->EachCell( - [&](tensorflow::gtl::ArraySlice indices, NativeT value) { + [&](absl::Span indices, NativeT value) { std::vector rindexes = Permute(permutation, indices); EXPECT_NEAR(value, literal_clone->Get(rindexes), 0.031250); }); @@ -340,8 +353,8 @@ TEST_P(HloEvaluatorTest, DoesReshape) { // Verifies Broadcast operation is correctly evaluated. 
TEST_P(HloEvaluatorTest, DoesBroadcast) { HloComputation::Builder b(TestName()); - auto input_literal = Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}); - auto output_literal = Literal::CreateR3( + auto input_literal = LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}); + auto output_literal = LiteralUtil::CreateR3( {{{1, 2}, {3, 4}, {5, 6}}, {{1, 2}, {3, 4}, {5, 6}}}); HloInstruction* literal_instruction = b.AddInstruction( HloInstruction::CreateConstant(std::move(input_literal))); @@ -356,8 +369,8 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) { TEST_P(HloEvaluatorTest, DoesBroadcastScalar) { HloComputation::Builder b(TestName()); - auto input_literal = Literal::CreateR0(111); - auto output_literal = Literal::CreateR2( + auto input_literal = LiteralUtil::CreateR0(111); + auto output_literal = LiteralUtil::CreateR2( {{111, 111}, {111, 111}, {111, 111}, {111, 111}, {111, 111}, {111, 111}}); HloInstruction* literal_instruction = b.AddInstruction( @@ -377,9 +390,9 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) { HloComputation::Builder b(TestName()); HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-1, -2}, {100, 200}}))); + LiteralUtil::CreateR2({{-1, -2}, {100, 200}}))); HloInstruction* operand2 = b.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-2, -3}, {-100, -200}}))); + LiteralUtil::CreateR2({{-2, -3}, {-100, -200}}))); std::vector operands = {operand1, operand2}; @@ -390,8 +403,8 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) { std::unique_ptr result = Evaluate(); - auto expected = - Literal::CreateR2({{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}}); + auto expected = LiteralUtil::CreateR2( + {{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -399,9 +412,9 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) { HloComputation::Builder b(TestName()); HloInstruction* operand1 = b.AddInstruction( - 
HloInstruction::CreateConstant(Literal::CreateR1({100, 200}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({100, 200}))); HloInstruction* operand2 = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({}))); std::vector operands = {operand1, operand2}; @@ -412,16 +425,16 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR1({100, 200}); + auto expected = LiteralUtil::CreateR1({100, 200}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ConvertWithSameLayout) { HloComputation::Builder b(TestName()); - auto input_literal = Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}); + auto input_literal = LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}); auto expected = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}); ASSERT_TRUE(LayoutUtil::LayoutsInShapesEqual(input_literal->shape(), expected->shape())); @@ -438,9 +451,9 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) { TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) { HloComputation::Builder b(TestName()); - auto input_literal = Literal::CreateR2WithLayout( + auto input_literal = LiteralUtil::CreateR2WithLayout( {{1, 2}, {3, 4}, {5, 6}}, LayoutUtil::MakeLayout({0, 1})); - auto expected = Literal::CreateR2WithLayout( + auto expected = LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, LayoutUtil::MakeLayout({1, 0})); ASSERT_FALSE(LayoutUtil::LayoutsInShapesEqual(input_literal->shape(), expected->shape())); @@ -469,13 +482,13 @@ PaddingConfig CreatePaddingConfig( } TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) { - auto operand = Literal::CreateR2({{}, {}}); + auto operand = LiteralUtil::CreateR2({{}, {}}); HloComputation::Builder b(TestName()); auto operand_instruction = 
b.AddInstruction(HloInstruction::CreateConstant(std::move(operand))); constexpr int32 kPadValue = 10; - auto pad_value = Literal::CreateR0(kPadValue); + auto pad_value = LiteralUtil::CreateR0(kPadValue); auto padding_value_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(pad_value))); @@ -487,7 +500,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2( + auto expected = LiteralUtil::CreateR2( {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); @@ -497,11 +510,11 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { HloComputation::Builder b(TestName()); Array4D input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6}); - auto input = Literal::CreateR4FromArray4D(input_array); + auto input = LiteralUtil::CreateR4FromArray4D(input_array); HloInstruction* input_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(input))); constexpr float kPadValue = 1.5; - auto pad_value = Literal::CreateR0(kPadValue); + auto pad_value = LiteralUtil::CreateR0(kPadValue); HloInstruction* pad_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(pad_value))); @@ -514,7 +527,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { std::unique_ptr result = Evaluate(); - auto expected_array = MakeUnique>(8, 5, 1, 1); + auto expected_array = absl::make_unique>(8, 5, 1, 1); expected_array->Fill(kPadValue); (*expected_array)(1, 0, 0, 0) = 1.0f; (*expected_array)(1, 2, 0, 0) = 2.0f; @@ -523,7 +536,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { (*expected_array)(7, 0, 0, 0) = 5.0f; (*expected_array)(7, 2, 0, 0) = 6.0f; - auto expected = Literal::CreateR4FromArray4D(*expected_array); + auto expected = LiteralUtil::CreateR4FromArray4D(*expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -538,14 +551,14 @@ TEST_P(HloEvaluatorTest, 
NegativePadding2D) { // { 9, 10, 11 }, // { 13, 14, 15 }, // } - auto input_array = MakeUnique>(4, 3); + auto input_array = absl::make_unique>(4, 3); input_array->FillUnique(1.0f); - auto input = Literal::CreateR2FromArray2D(*input_array); + auto input = LiteralUtil::CreateR2FromArray2D(*input_array); HloInstruction* input_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(input))); auto pad_value_instruction = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.718f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.718f))); auto r2_padding_on_dim0_dim1 = CreatePaddingConfig({{{-1, -2, 0}}, {{-2, 4, 0}}}); @@ -559,13 +572,13 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) { std::unique_ptr result = Evaluate(); // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 } - auto expected_array = MakeUnique>(1, 5); + auto expected_array = absl::make_unique>(1, 5); (*expected_array)(0, 0) = 7.0f; (*expected_array)(0, 1) = 2.718f; (*expected_array)(0, 2) = 2.718f; (*expected_array)(0, 3) = 2.718f; (*expected_array)(0, 4) = 2.718f; - auto expected = Literal::CreateR2FromArray2D(*expected_array); + auto expected = LiteralUtil::CreateR2FromArray2D(*expected_array); EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(0.031250))); } @@ -579,14 +592,14 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) { // { 9, 10, 11 }, // { 13, 14, 15 }, // } - auto input_array = MakeUnique>(4, 3); + auto input_array = absl::make_unique>(4, 3); input_array->FillUnique(1.0f); - auto input = Literal::CreateR2FromArray2D(*input_array); + auto input = LiteralUtil::CreateR2FromArray2D(*input_array); HloInstruction* input_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(input))); auto pad_value_instruction = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.718f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.718f))); PaddingConfig padding_config = MakeNoPaddingConfig(2); @@ -603,8 
+616,8 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) { std::unique_ptr result = Evaluate(); - auto expected_array = MakeUnique>(0, 9); - auto expected = Literal::CreateR2FromArray2D(*expected_array); + auto expected_array = absl::make_unique>(0, 9); + auto expected = LiteralUtil::CreateR2FromArray2D(*expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -619,15 +632,15 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) { // { 3 }, // { 4 }, // } - auto lhs_array = MakeUnique>(4, 1); + auto lhs_array = absl::make_unique>(4, 1); lhs_array->FillUnique(1.0f); - auto lhs_literal = Literal::CreateR2FromArray2D(*lhs_array); + auto lhs_literal = LiteralUtil::CreateR2FromArray2D(*lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); // rhs: // f32[2] { 1, 2 }, - auto rhs_literal = Literal::CreateR2({{1, 2}}); + auto rhs_literal = LiteralUtil::CreateR2({{1, 2}}); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -649,7 +662,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) { {4.f, 8.f}, }); // clang-format on - auto expected = Literal::CreateR2FromArray2D(expected_array); + auto expected = LiteralUtil::CreateR2FromArray2D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -660,7 +673,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) { // lhs: // f32[3] // { 1, 2, 3 }, - auto lhs_literal = Literal::CreateR1({1, 2, 3}); + auto lhs_literal = LiteralUtil::CreateR1({1, 2, 3}); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -670,9 +683,9 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) { // { 3, 4 }, // { 5, 6 }, // } - auto rhs_array = MakeUnique>(3, 2); + auto rhs_array = absl::make_unique>(3, 2); rhs_array->FillUnique(1.0f); - auto rhs_literal = Literal::CreateR2FromArray2D(*rhs_array); + auto rhs_literal = 
LiteralUtil::CreateR2FromArray2D(*rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -686,7 +699,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR1({22.f, 28.f}); + auto expected = LiteralUtil::CreateR1({22.f, 28.f}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -701,9 +714,9 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) { // { 9, 10, 11 }, // { 13, 14, 15 }, // } - auto lhs_array = MakeUnique>(4, 3); + auto lhs_array = absl::make_unique>(4, 3); lhs_array->FillUnique(1.0f); - auto lhs_literal = Literal::CreateR2FromArray2D(*lhs_array); + auto lhs_literal = LiteralUtil::CreateR2FromArray2D(*lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -713,9 +726,9 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) { // { 3, 4 }, // { 5, 6 }, // } - auto rhs_array = MakeUnique>(3, 2); + auto rhs_array = absl::make_unique>(3, 2); rhs_array->FillUnique(1.0f); - auto rhs_literal = Literal::CreateR2FromArray2D(*rhs_array); + auto rhs_literal = LiteralUtil::CreateR2FromArray2D(*rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -735,7 +748,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) { {94.f, 124.f}, {130.f, 172.f}, }); - auto expected = Literal::CreateR2FromArray2D(expected_array); + auto expected = LiteralUtil::CreateR2FromArray2D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -744,12 +757,12 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) { HloComputation::Builder b(TestName()); Array3D lhs_array = {{{1, 2, 3}}}; - auto lhs_literal = Literal::CreateR3FromArray3D(lhs_array); + auto lhs_literal = LiteralUtil::CreateR3FromArray3D(lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); Array3D 
rhs_array = {{{3.f, 4.f}}}; - auto rhs_literal = Literal::CreateR3FromArray3D(rhs_array); + auto rhs_literal = LiteralUtil::CreateR3FromArray3D(rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -783,7 +796,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) { std::unique_ptr result = Evaluate(); Array3D expected_array = {{{11.f, 18.f, 9.f}}}; - auto expected = Literal::CreateR3FromArray3D(expected_array); + auto expected = LiteralUtil::CreateR3FromArray3D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -800,7 +813,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { {13, 14, 15, 16}, })); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(lhs_array); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -811,7 +824,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { {7, 8}, })); // clang-format on - auto rhs_literal = Literal::CreateR4FromArray4D(rhs_array); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -845,7 +858,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { {149, 160, 171, 80}, })); // clang-format on - auto expected = Literal::CreateR4FromArray4D(expected_array); + auto expected = LiteralUtil::CreateR4FromArray4D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -875,11 +888,11 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) { }}); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(input); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(input); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); - auto rhs_literal = 
Literal::CreateR4FromArray4D(weight); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(weight); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); rhs_instruction = b.AddInstruction(HloInstruction::CreateReverse( @@ -922,9 +935,9 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) { // clang-format off // Result dimensions: [feature=1, height=1, batch=1, width=2] Array4D expected_array({{{{2514, 2685}}}}); - Array4D expected_array_bf16({{{{2512, 2672}}}}); + Array4D expected_array_bf16({{{{2512, 2688}}}}); // clang-format on - auto expected = Literal::CreateR4FromArray4D( + auto expected = LiteralUtil::CreateR4FromArray4D( use_bfloat16_ ? expected_array_bf16 : expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); @@ -955,11 +968,11 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) { }}); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(input); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(input); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); - auto rhs_literal = Literal::CreateR4FromArray4D(weight); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(weight); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -999,9 +1012,9 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) { // clang-format off // Result dimensions: [feature=1, height=1, batch=1, width=2] Array4D expected_array({{{{2514, 2685}}}}); - Array4D expected_array_bf16({{{{2512, 2672}}}}); + Array4D expected_array_bf16({{{{2512, 2688}}}}); // clang-format on - auto expected = Literal::CreateR4FromArray4D( + auto expected = LiteralUtil::CreateR4FromArray4D( use_bfloat16_ ? 
expected_array_bf16 : expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); @@ -1019,7 +1032,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { {13, 14, 15, 16}, })); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(lhs_array); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -1030,7 +1043,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { {7, 8}, })); // clang-format on - auto rhs_literal = Literal::CreateR4FromArray4D(rhs_array); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -1065,7 +1078,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { {91, 112, 98, 120, 105, 128, 112}, {65, 84, 70, 90, 75, 96, 80}, })); - auto expected = Literal::CreateR4FromArray4D(expected_array); + auto expected = LiteralUtil::CreateR4FromArray4D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1082,7 +1095,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { {13, 14, 15, 16}, })); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(lhs_array); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -1093,7 +1106,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { {7, 8}, })); // clang-format on - auto rhs_literal = Literal::CreateR4FromArray4D(rhs_array); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -1129,7 +1142,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { {104, 91, 112, 98, 120, 105, 128, 
112}, {78, 65, 84, 70, 90, 75, 96, 80}, })); - auto expected = Literal::CreateR4FromArray4D(expected_array); + auto expected = LiteralUtil::CreateR4FromArray4D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1147,7 +1160,7 @@ TEST_P(HloEvaluatorTest, {13, 14, 15, 16}, })); // clang-format on - auto lhs_literal = Literal::CreateR4FromArray4D(lhs_array); + auto lhs_literal = LiteralUtil::CreateR4FromArray4D(lhs_array); HloInstruction* lhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); @@ -1158,7 +1171,7 @@ TEST_P(HloEvaluatorTest, {8, 9, 10}, })); // clang-format on - auto rhs_literal = Literal::CreateR4FromArray4D(rhs_array); + auto rhs_literal = LiteralUtil::CreateR4FromArray4D(rhs_array); HloInstruction* rhs_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); @@ -1201,7 +1214,7 @@ TEST_P(HloEvaluatorTest, {0, 0, 0}, {91, 98, 105}, })); - auto expected = Literal::CreateR4FromArray4D(expected_array); + auto expected = LiteralUtil::CreateR4FromArray4D(expected_array); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1216,9 +1229,9 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) { constexpr int kNumElements = 1 << 25; // float += 1 saturates at 1<<24 std::vector v(kNumElements, 1.0f); HloInstruction* arg_instruction = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1(v))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1(v))); HloInstruction* init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder add_computation("add"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1248,14 +1261,14 @@ void BM_ReducePrecisely(int num_iters) { HloComputation::Builder b("BM_ReducePrecisely"); HloModuleConfig config; config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - 
HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config); + HloModule module("BM_ReducePrecisely", config); constexpr int kNumElements = 1 << 25; // float += 1 saturates at 1<<24 std::vector v(kNumElements, 1.0f); HloInstruction* arg_instruction = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1(v))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1(v))); auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder add_computation("add"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1288,15 +1301,15 @@ TEST_P(HloEvaluatorTest, ReduceAdd) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto arg_array = MakeUnique>(2, 3); + auto arg_array = absl::make_unique>(2, 3); arg_array->FillUnique(1.0f); - auto arg_literal = Literal::CreateR2FromArray2D(*arg_array); + auto arg_literal = LiteralUtil::CreateR2FromArray2D(*arg_array); HloInstruction* arg_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal))); auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder add_computation("add"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1317,7 +1330,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR1({6, 18}); + auto expected = LiteralUtil::CreateR1({6, 18}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1330,15 +1343,15 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto arg_array = MakeUnique>(2, 3); + auto arg_array = absl::make_unique>(2, 3); arg_array->FillUnique(1.0f); - auto arg_literal = Literal::CreateR2FromArray2D(*arg_array); + auto arg_literal = LiteralUtil::CreateR2FromArray2D(*arg_array); HloInstruction* arg_instruction 
= b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal))); auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder max_computation("max"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1369,7 +1382,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({{6, 7}}); + auto expected = LiteralUtil::CreateR2({{6, 7}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1381,15 +1394,15 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto arg_array = MakeUnique>(2, 3); + auto arg_array = absl::make_unique>(2, 3); arg_array->FillUnique(1.0f); - auto arg_literal = Literal::CreateR2FromArray2D(*arg_array); + auto arg_literal = LiteralUtil::CreateR2FromArray2D(*arg_array); HloInstruction* arg_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal))); auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder add_computation("add"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1426,7 +1439,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({{1, 3, 5}, {5, 11, 13}}); + auto expected = LiteralUtil::CreateR2({{1, 3, 5}, {5, 11, 13}}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } @@ -1436,13 +1449,13 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time. 
std::vector input_dims(6, 4); std::unique_ptr arg_literal = - Literal::CreateFullWithDescendingLayout(input_dims, 1.0f); + LiteralUtil::CreateFullWithDescendingLayout(input_dims, 1.0f); HloInstruction* arg_instruction = b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal))); auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); HloComputation::Builder add_computation("add"); Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -1489,7 +1502,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { std::vector output_dims = {4, 3, 3, 3, 4, 4}; std::unique_ptr result_literal = - Literal::CreateFullWithDescendingLayout(output_dims, 8.0f); + LiteralUtil::CreateFullWithDescendingLayout(output_dims, 8.0f); EXPECT_TRUE(LiteralTestUtil::Equal(*result_literal, *result)); } @@ -1502,9 +1515,10 @@ TEST_P(HloEvaluatorTest, StridedSlice) { // { 9, 10, 11, 12, 13 }, // { 17, 18, 19, 20, 21 }, // } - auto operand_array = MakeUnique>(3, 5); + auto operand_array = absl::make_unique>(3, 5); operand_array->FillUnique(1.0f); - auto operand_literal = Literal::CreateR2FromArray2D(*operand_array); + auto operand_literal = + LiteralUtil::CreateR2FromArray2D(*operand_array); HloInstruction* operand = b.AddInstruction( HloInstruction::CreateConstant(std::move(operand_literal))); @@ -1518,7 +1532,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({ + auto expected = LiteralUtil::CreateR2({ {3}, {19}, }); @@ -1534,15 +1548,16 @@ TEST_P(HloEvaluatorTest, DynamicSlice) { // { 1, 2, 3, 4 }, // { 5, 6, 7, 8 }, // } - auto operand_array = MakeUnique>(2, 4); + auto operand_array = absl::make_unique>(2, 4); operand_array->FillUnique(1.0f); - auto operand_literal = Literal::CreateR2FromArray2D(*operand_array); + auto operand_literal = + LiteralUtil::CreateR2FromArray2D(*operand_array); HloInstruction* operand = 
b.AddInstruction( HloInstruction::CreateConstant(std::move(operand_literal))); auto start_indices = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 1}))); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand, @@ -1551,7 +1566,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({ + auto expected = LiteralUtil::CreateR2({ {2, 3, 4}, {6, 7, 8}, }); @@ -1569,15 +1584,16 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) { // { 1, 2, 3, 4 }, // { 5, 6, 7, 8 }, // } - auto operand_array = MakeUnique>(2, 4); + auto operand_array = absl::make_unique>(2, 4); operand_array->FillUnique(1.0f); - auto operand_literal = Literal::CreateR2FromArray2D(*operand_array); + auto operand_literal = + LiteralUtil::CreateR2FromArray2D(*operand_array); HloInstruction* operand = b.AddInstruction( HloInstruction::CreateConstant(std::move(operand_literal))); auto start_indices = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2, 1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2, 1}))); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand, @@ -1586,7 +1602,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({ + auto expected = LiteralUtil::CreateR2({ {2, 3, 4}, {6, 7, 8}, }); @@ -1602,18 +1618,19 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto operand_array = MakeUnique>(2, 3); + auto operand_array = absl::make_unique>(2, 3); operand_array->FillUnique(1.0); - auto operand_literal = Literal::CreateR2FromArray2D(*operand_array); + auto operand_literal = + LiteralUtil::CreateR2FromArray2D(*operand_array); HloInstruction* operand = b.AddInstruction( 
HloInstruction::CreateConstant(std::move(operand_literal))); auto start_indices = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 1}))); auto update = b.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-2.0, -3.0}, {-6.0, -7.0}}))); + LiteralUtil::CreateR2({{-2.0, -3.0}, {-6.0, -7.0}}))); Shape shape = ShapeUtil::MakeShape(F64, {2, 3}); b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( @@ -1622,7 +1639,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({ + auto expected = LiteralUtil::CreateR2({ {1, -2, -3}, {5, -6, -7}, }); @@ -1638,14 +1655,15 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto operand_array = MakeUnique>(2, 3); + auto operand_array = absl::make_unique>(2, 3); operand_array->FillUnique(1.0); - auto operand_literal2 = Literal::CreateR2FromArray2D(*operand_array); + auto operand_literal2 = + LiteralUtil::CreateR2FromArray2D(*operand_array); HloInstruction* operand2 = b.AddInstruction( HloInstruction::CreateConstant(std::move(operand_literal2))); HloInstruction* operand1 = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 1}))); auto tuple = b.AddInstruction(HloInstruction::CreateTuple({operand1, operand2})); @@ -1657,7 +1675,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) { std::unique_ptr result = Evaluate(); - auto expected = Literal::CreateR2({ + auto expected = LiteralUtil::CreateR2({ {1, 2, 3}, {5, 6, 7}, }); @@ -1673,13 +1691,13 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) { // { 1, 2, 3 }, // { 5, 6, 7 }, // } - auto operand_array = MakeUnique>(2, 3); + auto operand_array = absl::make_unique>(2, 3); operand_array->FillUnique(1.0); HloInstruction* operand2 = b.AddInstruction(HloInstruction::CreateConstant( - 
Literal::CreateR2FromArray2D(*operand_array))); + LiteralUtil::CreateR2FromArray2D(*operand_array))); HloInstruction* operand1 = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0, 1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0, 1}))); auto tuple1 = b.AddInstruction(HloInstruction::CreateTuple({operand1, operand2})); @@ -1697,8 +1715,8 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) { std::unique_ptr result = Evaluate(); auto result_inner_literal = - Literal::CreateR2FromArray2D(*operand_array); - auto expected = Literal::MakeTuple({ + LiteralUtil::CreateR2FromArray2D(*operand_array); + auto expected = LiteralUtil::MakeTuple({ result_inner_literal.get(), result_inner_literal.get(), }); @@ -1726,7 +1744,7 @@ TEST_P(HloEvaluatorTest, Reverse) { {{23.0f}, {24.0f}}}, }); // clang-format on - auto operand_literal = Literal::CreateR4FromArray4D(input); + auto operand_literal = LiteralUtil::CreateR4FromArray4D(input); HloInstruction* operand = b.AddInstruction( HloInstruction::CreateConstant(std::move(operand_literal))); @@ -1737,7 +1755,7 @@ TEST_P(HloEvaluatorTest, Reverse) { std::unique_ptr result = Evaluate(); // clang-format off - auto expected = Literal::CreateR4FromArray4D({ + auto expected = LiteralUtil::CreateR4FromArray4D({ {{{23.0f}, {24.0f}}, {{21.0f}, {22.0f}}, {{19.0f}, {20.0f}}}, @@ -1773,11 +1791,11 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) { // Evaluate add with param0 = {1, 2, 3, 4}, square = {10, 20, 30, 40}. 
HloEvaluator evaluator; auto result = evaluator.EvaluateWithSubstitutions( - add, {{param0, Literal::CreateR1({1, 2, 3, 4}).get()}, - {square, Literal::CreateR1({10, 20, 30, 40}).get()}}); + add, {{param0, LiteralUtil::CreateR1({1, 2, 3, 4}).get()}, + {square, LiteralUtil::CreateR1({10, 20, 30, 40}).get()}}); TF_ASSERT_OK(result.status()); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); + *LiteralUtil::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); } // Check that EvaluateWithSubstitutions works if one of the operands to the op @@ -1790,18 +1808,18 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) { b.AddInstruction(HloInstruction::CreateParameter(0, shape, "param0")); HloInstruction* square = b.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kMultiply, param0, param0)); - HloInstruction* constant = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); + HloInstruction* constant = b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); HloInstruction* add = b.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, constant, square)); // Evaluate add with square = {10, 20, 30, 40}. 
HloEvaluator evaluator; auto result = evaluator.EvaluateWithSubstitutions( - add, {{square, Literal::CreateR1({10, 20, 30, 40}).get()}}); + add, {{square, LiteralUtil::CreateR1({10, 20, 30, 40}).get()}}); TF_ASSERT_OK(result.status()); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); + *LiteralUtil::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) { @@ -1812,20 +1830,20 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[2,3] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1, 3} + slice_sizes={1, 3} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{1, 2, 3}, {7, 8, 9}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{1, 2, 3}, {7, 8, 9}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) { @@ -1836,20 +1854,20 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[3,2] gather(operand, indices), - output_window_dims={0}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr 
gather_indices = Literal::CreateR1({0, 2}); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{1, 3}, {4, 6}, {7, 9}}), - *Evaluate({operand.get(), gather_indices.get()}))); + *LiteralUtil::CreateR2({{1, 3}, {4, 6}, {7, 9}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) { @@ -1860,22 +1878,22 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,3,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 2}, {2, 1}}); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR3( + *LiteralUtil::CreateR3( {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}), - *Evaluate({operand.get(), gather_indices.get()}))); + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) { @@ -1886,23 +1904,23 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1,2} + slice_sizes={1,1,2} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 
3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 0}, {1, 0}}); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{-1, 1}, {-4, 4}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR2({{-1, 1}, {-4, 4}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, @@ -1914,23 +1932,23 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1,2} + slice_sizes={1,1,2} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 0}, {1, 0}}); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{-2, 2}, {-1, 1}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR2({{-2, 2}, {-1, 1}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) { @@ -1941,20 +1959,20 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[1,1] gather(operand, indices), - output_window_dims={0,1}, - elided_window_dims={}, - 
gather_dims_to_operand_dims={0,1}, + offset_dims={0,1}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({1, 1}); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({1, 1}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{5}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR2({{5}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) { @@ -1965,21 +1983,21 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,1,1] gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} } )"; ParseAndVerifyModule(hlo_text); std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{2, 1}, {1, 1}}); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR3({{{8}}, {{5}}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR3({{{8}}, {{5}}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) { @@ -1990,19 +2008,19 @@ ENTRY main { operand = s32[3,0] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[2,0] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - 
gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1, 0} + slice_sizes={1, 0} } )"; ParseAndVerifyModule(hlo_text); - std::unique_ptr operand = Literal::CreateR2({{}, {}, {}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); + std::unique_ptr operand = LiteralUtil::CreateR2({{}, {}, {}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{}, {}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR2({{}, {}}), + *Evaluate({operand.get(), start_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) { @@ -2013,39 +2031,517 @@ ENTRY main { operand = s32[3] parameter(0) indices = s32[2,2,1] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=2, - window_bounds={1} + slice_sizes={1} } )"; ParseAndVerifyModule(hlo_text); - std::unique_ptr operand = Literal::CreateR1({0, 1, 2}); - std::unique_ptr gather_indices = - Literal::CreateR3({{{0}, {1}}, {{2}, {1}}}); + std::unique_ptr operand = LiteralUtil::CreateR1({0, 1, 2}); + std::unique_ptr start_indices = + LiteralUtil::CreateR3({{{0}, {1}}, {{2}, {1}}}); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{0, 1}, {2, 1}}), - *Evaluate({operand.get(), gather_indices.get()}))); + LiteralTestUtil::Equal(*LiteralUtil::CreateR2({{0, 1}, {2, 1}}), + *Evaluate({operand.get(), start_indices.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + 
indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 20, 30}, {4, 5, 6}, {70, 80, 90}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV2 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={0}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 30}, {40, 60}, {70, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 2, 30}, {40, 5, 60}, {70, 8, 90}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] 
lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{11, 22, 33}, {4, 5, 6}, {77, 88, 99}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=mul_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 40, 90}, {4, 5, 6}, {490, 640, 810}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_f32 (lhs: f32[], rhs: 
f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(f32[] lhs, f32[] rhs) +} + +ENTRY main { + operand = f32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = f32[2,3] parameter(2) + ROOT scatter = f32[3,3] scatter(operand, indices, updates), + to_apply=add_f32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = LiteralUtil::CreateR2( + {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({2, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}}); + EXPECT_TRUE(LiteralTestUtil::Near( + *LiteralUtil::CreateR2( + {{1.1, 2.2, 3.3}, {6.7, 8.6, 8.2}, {8.1, 9.9, 10.6}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}), + ErrorSpec{0.1, 0.01})); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{1, 2, 3}, {84, 105, 126}, {7, 8, 9}}), + *Evaluate({operand.get(), scatter_indices.get(), 
updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) { + const char* hlo_text = R"( +HloModule TensorFlowScatterMultipleBatchDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=2 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{11, 7, 38}, {44, 10, 71}, {77, 13, 104}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNd + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr 
updates = + LiteralUtil::CreateR2({{-10, 10}, {-40, 40}}); + std::unique_ptr expected = + LiteralUtil::CreateR3({{{-10, 10}, {-2, 2}, {-3, 3}}, // + {{-40, 40}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, + EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNdNonDefaultIndexVectorDim + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{-10, 10}, {-20, 20}}); + std::unique_ptr expected = + LiteralUtil::CreateR3({{{-20, 20}, {-10, 10}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule DynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + 
to_apply=update_s32, + update_window_dims={0,1}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates = LiteralUtil::CreateR2({{10}}); + std::unique_ptr expected = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 10, 6}, {7, 8, 9}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule BatchDynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); + std::unique_ptr updates = + LiteralUtil::CreateR3({{{10}}, {{20}}}); + std::unique_ptr expected = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 20, 6}, {7, 10, 9}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) { + const char* hlo_text = R"( +HloModule TensorFlowScatter_ZeroDimBounds + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,0] parameter(0) + indices = s32[2] parameter(1) + 
updates = s32[2,0] parameter(2) + ROOT scatter = s32[3,0] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = LiteralUtil::CreateR2({{}, {}, {}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = LiteralUtil::CreateR2({{}, {}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *operand, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) { + const string hlo_text = R"( +HloModule Scatter_NoUpdateWindowDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3] parameter(0) + indices = s32[2,2,1] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2 +} +)"; + ParseAndVerifyModule(hlo_text); + + std::unique_ptr operand = LiteralUtil::CreateR1({0, 1, 2}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR3({{{0}, {1}}, {{2}, {1}}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20}, {30, 40}}); + std::unique_ptr expected = + LiteralUtil::CreateR1({10, 61, 32}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); } // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise comparison with 2 bfloat16 operands. 
TEST_P(HloEvaluatorTest, DoesCompareBF16) { // lhs >= rhs - auto lhs = Literal::CreateR2( + auto lhs = LiteralUtil::CreateR2( {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)}, {bfloat16(-0.25), bfloat16(-0.35), bfloat16(-0.125)}}); - auto rhs = Literal::CreateR2( + auto rhs = LiteralUtil::CreateR2( {{bfloat16(0.5), bfloat16(0.125), bfloat16(0.125)}, {bfloat16(0.25), bfloat16(-0.375), bfloat16(-0.127)}}); auto expected = - Literal::CreateR2({{false, true, true}, {false, true, true}}); + LiteralUtil::CreateR2({{false, true, true}, {false, true, true}}); TestBinaryOp(HloOpcode::kGe, std::move(expected), std::move(lhs), std::move(rhs)); } +TEST_P(HloEvaluatorTest, Bf16Reduction) { + const string hlo_text = R"( +HloModule Bf16Reduction + +add_bf16 (lhs: bf16[], rhs: bf16[]) -> bf16[] { + lhs = bf16[] parameter(0) + rhs = bf16[] parameter(1) + ROOT add = bf16[] add(bf16[] lhs, bf16[] rhs) +} + +ENTRY main { + arg0 = bf16[4]{0} parameter(0) + init = bf16[] constant(0) + ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16 +} +)"; + ParseAndVerifyModule(hlo_text); + + std::unique_ptr arg = LiteralUtil::CreateR1( + {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)}); + std::unique_ptr expected = + LiteralUtil::CreateR0(bfloat16(44.0f)); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *Evaluate({arg.get()}))); +} + INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest, ::testing::ValuesIn(use_bf16_params)); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index b1b58642ec1b854ff91856b2ec0af513e8c5c161..cb27e13e99c0192a9796d3d32eba2637e7db06bc 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -16,10 +16,16 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_ +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/core/lib/core/casts.h" -#include "tensorflow/core/lib/gtl/optional.h" namespace xla { @@ -34,6 +40,37 @@ using is_complex_t = std::is_same; template using is_complex64_t = std::is_same; +// It's UB to use std::sort with std::less, because of NaNs. Define +// "safe" less functions which are actually strict weak orders. +template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> +bool SafeLess(const NativeT& a, const NativeT& b) { + return a < b; +} + +template ::value || + std::is_same::value>::type* = nullptr> +bool SafeLess(const NativeT& a, const NativeT& b) { + if (std::isnan(b)) { + return !std::isnan(a); + } else { + return a < b; + } +} + +template ::value>::type* = nullptr> +bool SafeLess(const NativeT& a, const NativeT& b) { + if (Eigen::half_impl::isnan(b)) { + return !Eigen::half_impl::isnan(a); + } else { + return a < b; + } +} + // Templated DfsHloVisitor for use by HloEvaluator. // // Typically ReturnT here indicates the resulting literal type of each evaluated @@ -54,6 +91,29 @@ using is_complex64_t = std::is_same; // of this class. template class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { + private: + // Get the value in the given literal static_cast as a double. 
+ template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + double GetAsDouble(const Literal& literal, + absl::Span input_index) { + return static_cast(literal.Get(input_index)); + } + + // Specialization for complex types. In this case it is not possible to + // static_cast value to a double so just CHECK fail. This method is not used + // at run-time, but must be available at compile-time to keep the compiler + // happy. + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + double GetAsDouble(const Literal& literal, + absl::Span input_index) { + LOG(FATAL) << "Trying to get complex literal as double: " + << literal.ToString(); + } + public: explicit HloEvaluatorTypedVisitor(HloEvaluator* p) : parent_(p) {} @@ -85,7 +145,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { Status DefaultAction(HloInstruction* hlo_instruction) override { return Unimplemented("unhandled HLO ops for HloEvaluator: %s.", - HloOpcodeString(hlo_instruction->opcode()).c_str()); + HloOpcodeString(hlo_instruction->opcode())); } // TODO(b/35950897): many of the stl functions used in the handlers are not @@ -269,6 +329,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return HandleFloor(floor); } + Status HandleImag(HloInstruction* imag) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[imag], + ElementWiseUnaryOp(imag, [](ElementwiseT elem_operand) { + return std::imag(elem_operand); + })); + return Status::OK(); + } + Status HandleLog(HloInstruction* log) override { TF_ASSIGN_OR_RETURN(parent_->evaluated_[log], ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) { @@ -485,7 +553,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - Status HandleDivide(HloInstruction* divide) override { + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleDivide(HloInstruction* divide) { 
TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide], ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { @@ -494,6 +566,46 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + template ::value && + std::is_integral::value>::type* = + nullptr> + Status HandleDivide(HloInstruction* divide) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[divide], + ElementWiseBinaryOp( + divide, + [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) -> ElementwiseT { + if (rhs_elem == 0) { + return static_cast(-1); + } + if (rhs_elem == -1 && + lhs_elem == std::numeric_limits::min()) { + return lhs_elem; + } + return lhs_elem / rhs_elem; + })); + return Status::OK(); + } + + template ::value>::type* = + nullptr> + Status HandleDivide(HloInstruction* divide) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide], + ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem, + ElementwiseT rhs_elem) { + return rhs_elem == 0 + ? std::numeric_limits::max() + : (lhs_elem / rhs_elem); + })); + return Status::OK(); + } + + Status HandleDivide(HloInstruction* divide) { + return HandleDivide(divide); + } + template ::value>::type* = nullptr> @@ -572,9 +684,16 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> + Status HandleReal(HloInstruction* real) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[real], + ElementWiseUnaryOp(real, [](ElementwiseT elem_operand) { + return std::real(elem_operand); + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> Status HandleRemainder(HloInstruction* remainder) { TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder], ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el, @@ -584,6 +703,40 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + template ::value>::type* = + nullptr> + Status HandleRemainder(HloInstruction* 
remainder) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder], + ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el, + ElementwiseT rhs_el) { + return rhs_el == 0 ? lhs_el : (lhs_el % rhs_el); + })); + return Status::OK(); + } + + template ::value && + std::is_integral::value>::type* = + nullptr> + Status HandleRemainder(HloInstruction* remainder) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[remainder], + ElementWiseBinaryOp( + remainder, + [](ElementwiseT lhs_el, ElementwiseT rhs_el) -> ElementwiseT { + if (rhs_el == 0) { + return lhs_el; + } + if (rhs_el == -1 && + lhs_el == std::numeric_limits::min()) { + return 0; + } + return lhs_el % rhs_el; + })); + return Status::OK(); + } + template < typename NativeT, typename std::enable_if::value>::type* = nullptr> @@ -610,12 +763,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { template ::value>::type* = nullptr> Status HandleAnd(HloInstruction* and_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[and_], - ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el && rhs_el; - })); - return Status::OK(); + return InvalidArgument("Unsupported type for And"); } template < @@ -644,12 +792,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { template ::value>::type* = nullptr> Status HandleOr(HloInstruction* or_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[or_], - ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el || rhs_el; - })); - return Status::OK(); + return InvalidArgument("Unsupported type for Or"); } template < @@ -663,6 +806,35 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return HandleOr(or_); } + template ::value>::type* = + nullptr> + Status HandleXor(HloInstruction* xor_) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[xor_], + ElementWiseBinaryOp(xor_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { + return lhs_el ^ rhs_el; + })); + return Status::OK(); + } + + 
template ::value>::type* = nullptr> + Status HandleXor(HloInstruction* xor_) { + return InvalidArgument("Unsupported type for Xor"); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleXor(HloInstruction* xor_) { + return InvalidArgument("Unsupported type for Xor"); + } + + Status HandleXor(HloInstruction* xor_) override { + return HandleXor(xor_); + } + template ::value && @@ -778,7 +950,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { Status HandleSelect(HloInstruction* select) override { CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape())); - CHECK(!ShapeUtil::IsTuple(select->shape())); + CHECK(ShapeUtil::IsArray(select->shape())); std::function select_op = [](bool pred, ReturnT on_true, ReturnT on_false) { if (pred) { @@ -806,10 +978,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { << ShapeUtil::HumanString(inferred_return_shape); const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - auto result = MakeUnique(result_shape); + auto result = absl::make_unique(result_shape); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice out_index) { + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span out_index) { std::vector from_index(out_index.begin(), out_index.end()); for (const int64 dim : reverse_dimensions) { from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim]; @@ -876,8 +1048,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window, &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data, - rhs_literal_data]( - tensorflow::gtl::ArraySlice out_index) { + rhs_literal_data](absl::Span out_index) { // Dimension number applicable for input (lhs). 
const int64 input_batch_dim = dnums.input_batch_dimension(); const int64 input_z_dim = dnums.input_feature_dimension(); @@ -958,12 +1129,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { static_cast(rhs_literal_data[rhs_linear_index]); } cnt : {} - } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index)); + } while (IndexUtil::BumpIndices(window_shape, + absl::MakeSpan(rhs_spatial_index))); return static_cast(result_val); }; - auto result = MakeUnique(result_shape); + auto result = absl::make_unique(result_shape); TF_RETURN_IF_ERROR(result->PopulateParallel(func)); parent_->evaluated_[conv] = std::move(result); @@ -1006,83 +1178,47 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { CHECK_EQ(dnums.lhs_batch_dimensions_size(), dnums.rhs_batch_dimensions_size()); - std::vector lhs_non_contracting_dims; + DimensionVector lhs_index(lhs_rank); + DimensionVector rhs_index(rhs_rank); + + // result_index_locations[i] contains one or two pointers to the locations + // in lhs_index or rhs_index where the i'th result index should go. 
+ absl::InlinedVector, kInlineRank> + result_index_locations; + result_index_locations.reserve(lhs_rank + rhs_rank - 2); + + // The first components in the output shape are the LHS and RHS batch + // dimensions: + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); i++) { + result_index_locations.push_back( + {&lhs_index[dnums.lhs_batch_dimensions(i)], + &rhs_index[dnums.rhs_batch_dimensions(i)]}); + } + + // Then we have the LHS and RHS non-contracting dimensions, if any: for (int64 i = 0; i < lhs_rank; i++) { - if (i != lhs_contracting_dimension) { - lhs_non_contracting_dims.push_back(i); + if (i != lhs_contracting_dimension && + !absl::c_linear_search(dnums.lhs_batch_dimensions(), i)) { + result_index_locations.push_back({&lhs_index[i], nullptr}); } } - - std::vector rhs_non_batch_non_contracting_dims; - tensorflow::gtl::FlatSet batch_dims_set( - dnums.rhs_batch_dimensions().begin(), - dnums.rhs_batch_dimensions().end()); for (int64 i = 0; i < rhs_rank; i++) { - if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) { - rhs_non_batch_non_contracting_dims.push_back(i); + if (i != rhs_contracting_dimension && + !absl::c_linear_search(dnums.rhs_batch_dimensions(), i)) { + result_index_locations.push_back({&rhs_index[i], nullptr}); } } - const int64 batch_dim_size = dnums.lhs_batch_dimensions_size(); - const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size(); - - DimensionVector lhs_index(lhs_rank); - DimensionVector rhs_index(rhs_rank); - auto result = MakeUnique(dot->shape()); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice result_index) { + auto result = absl::make_unique(dot->shape()); + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span result_index) { ElementwiseT result_val = static_cast(0); - // Find the corresponding non-contracting indices for lhs and rhs. - // - // For `result_index`, its batch dimension, if exists, will be at the - // same dimension as the batch dimension of lhs and rhs. 
More - // specifically: - // - For lhs, the non-contracting dimensions, including the batch - // dimension have the same index as the `result_index`. - // - For rhs, the batch dimension is set seperately from other - // non-contracting dimensions, since these other non-contracting - // dimensions in rhs follow the non-contracting dimensions of lhs in - // the resulting index. - // - // As an example, for a resulting index: - // result_index [result_batch, result_x, result_y] - // the effecting lhs and rhs indices are: - // lhs [result_batch, lhs_non_contracting_dim, contracting_dim - // rhs [result_batch, contracting_dim, rhs_non_contracting_dim] - // `result_x` is only affected by the lhs_non_contracting_dim and - // likewise `result_y` only depends on rhs_non_contracting_dim. - // - // so we can look up the lhs and rhs indices by: - // - // lhs: - // batch index is the same as `result_batch`. - // non-contracting dimension is the same as - // result_index[lhs_non_contracting_dim] - // rhs: - // batch index: the same as `result_batch`. - // non-contracting dimension index: *not* the same as - // result_index[rhs_non_contractng_dim], since the - // non-contracting dimensions of lhs are included in the - // result_index first. Instead, the non_contracting_dim of rhs must - // be calculated as following: - // lhs_non_contracting_dimensions_size + - // (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1 - // - // Note that (rhs_non_batch_contracting_dim - batch_dim_size) is - // the index offset to the result_index that only depends on - // the non_batch and non-contracting dimensions of rhs. -1 at the - // end translates size to index. 
- for (auto i : lhs_non_contracting_dims) { - lhs_index[i] = result_index[i]; - } - for (auto i : dnums.rhs_batch_dimensions()) { - rhs_index[i] = result_index[i]; - } - for (auto i : rhs_non_batch_non_contracting_dims) { - const int64 rhs_non_batch_non_contracting_dim = - lhs_non_contracting_size + (i - batch_dim_size) - 1; - rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim]; + for (int64 i = 0; i < result_index.size(); i++) { + *result_index_locations[i].first = result_index[i]; + if (result_index_locations[i].second) { + *result_index_locations[i].second = result_index[i]; + } } // Accumulates resulting product along the contracted dimension. @@ -1103,7 +1239,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } Status HandlePad(HloInstruction* pad) override { - CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape())); + CHECK(ShapeUtil::IsArray(pad->operand(0)->shape())); // Padding value must be scalar. CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape())); CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()), @@ -1116,17 +1252,15 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { /*padding_config=*/pad->padding_config())); CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape)) << "return shape is set to: " << ShapeUtil::HumanString(pad->shape()) - << "but is inferred to be: " + << " but is inferred to be: " << ShapeUtil::HumanString(inferred_return_shape); // Create new HLO of padded shape with padding value. 
ReturnT scalar = parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get({}); - auto result = MakeUnique(pad->shape()); + auto result = absl::make_unique(pad->shape()); TF_RETURN_IF_ERROR(result->Populate( - [&scalar](tensorflow::gtl::ArraySlice multi_index) { - return scalar; - })); + [&scalar](absl::Span multi_index) { return scalar; })); const Literal& evaluated_operand = parent_->GetEvaluatedLiteralFor(pad->operand(0)); @@ -1139,7 +1273,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // corresponding index of the resulting padded literal. const PaddingConfig& pad_config = pad->padding_config(); - auto func = [&](tensorflow::gtl::ArraySlice input_index) { + auto func = [&](absl::Span input_index) { for (auto i = 0; i < input_index.size(); ++i) { // Interior padding occurs logically before edge padding, so in the case // of negative edge padding elements are removed from the @@ -1182,7 +1316,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { dynamic_slice->dynamic_slice_sizes())); TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) << "return shape is set to: " << ShapeUtil::HumanString(result_shape) - << "but is inferred to be: " + << " but is inferred to be: " << ShapeUtil::HumanString(inferred_return_shape); TF_RET_CHECK( primitive_util::IsIntegralType(start_indices->shape().element_type())); @@ -1237,7 +1371,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { operand->shape(), update->shape(), start_indices->shape())); TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) << "return shape is set to: " << ShapeUtil::HumanString(result_shape) - << "but is inferred to be: " + << " but is inferred to be: " << ShapeUtil::HumanString(inferred_return_shape); TF_RET_CHECK( primitive_util::IsIntegralType(start_indices->shape().element_type())); @@ -1287,11 +1421,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { auto operands = map->operands(); 
HloComputation* computation = map->to_apply(); - auto result = MakeUnique(map->shape()); + auto result = absl::make_unique(map->shape()); HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { std::vector> arg_literals; arg_literals.reserve(operands.size()); @@ -1302,7 +1436,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { parent_->GetEvaluatedLiteralFor(operand); auto curr_val = arg_literal.Get(multi_index); - auto curr_val_literal = Literal::CreateR0(curr_val); + auto curr_val_literal = LiteralUtil::CreateR0(curr_val); arg_literals.push_back(std::move(curr_val_literal)); } @@ -1378,32 +1512,106 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - Status HandleReduce(HloInstruction* reduce) override { - auto arg = reduce->operand(0); - auto init_value = reduce->operand(1); - tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); + template ::value && + !std::is_same::value>::type* = nullptr> + Status HandleSort(HloInstruction* sort) { + auto keys = sort->operand(0); + auto rank = ShapeUtil::Rank(keys->shape()); + TF_RET_CHECK(rank > 0 && rank <= 2) + << "Sort is only supported for R1 and R2 shapes"; + TF_RET_CHECK(sort->operand_count() == 1) + << "Typed visitor does not support key-value sort"; + + const Literal& keys_literal = parent_->GetEvaluatedLiteralFor(keys); + + auto sort_r1 = [this](const Literal& keys_literal) { + VLOG(3) << "HandleSort keys_literal: " << keys_literal.ToString(); + const auto& keys_data = keys_literal.data(); + + std::vector result_data(keys_data.begin(), keys_data.end()); + std::sort(result_data.begin(), result_data.end(), + [](const ReturnT& a, const ReturnT& b) { + return SafeLess(a, b); + }); + auto result_literal = absl::make_unique(keys_literal.shape()); + 
result_literal->PopulateR1(absl::Span(result_data)); + VLOG(3) << "HandleSort result_literal: " << result_literal->ToString(); + return result_literal; + }; + + if (rank == 1) { + parent_->evaluated_[sort] = std::move(sort_r1(keys_literal)); + } else { + // For R2 sort, the desired semantics are to sort each matrix row + // independently. + auto result_literal = absl::make_unique(keys_literal.shape()); + int64 r1_length = keys->shape().dimensions(1); + for (int64 row = 0; row < keys->shape().dimensions(0); ++row) { + TF_ASSIGN_OR_RETURN(auto r1_slice, + keys_literal.Slice({row, 0}, {row + 1, r1_length}) + ->Reshape({r1_length})); + auto r1_result = sort_r1(*r1_slice); + TF_ASSIGN_OR_RETURN(r1_result, r1_result->Reshape({1, r1_length})); + TF_RETURN_IF_ERROR(result_literal->CopySliceFrom( + *r1_result, {0, 0}, {row, 0}, {1, r1_length})); + } + parent_->evaluated_[sort] = std::move(result_literal); + } + return Status::OK(); + } + + template ::value || + std::is_same::value>::type* = + nullptr> + Status HandleSort(HloInstruction* sort) { + return InvalidArgument("Unsupported type for Sort"); + } + + Status HandleSort(HloInstruction* sort) override { + return HandleSort(sort); + } + + Status HandleReduce(HloInstruction* hlo) override { + HloReduceInstruction* reduce = Cast(hlo); + int64 num_args = reduce->inputs().size(); + bool has_tuple_output = ShapeUtil::IsTuple(reduce->shape()); + absl::Span dimensions(reduce->dimensions()); HloComputation* function = reduce->to_apply(); - TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) == - ShapeUtil::Rank(arg->shape()) - dimensions.size()); + + absl::InlinedVector operand_shapes; + for (const HloInstruction* operand : reduce->operands()) { + operand_shapes.push_back(&operand->shape()); + } TF_ASSIGN_OR_RETURN(auto inferred_return_shape, ShapeInference::InferReduceShape( - /*arg=*/arg->shape(), - /*init_value=*/init_value->shape(), + operand_shapes, /*dimensions_to_reduce=*/dimensions, 
/*to_apply=*/function->ComputeProgramShape())); TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape)) << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape()) - << "but is inferred to be: " + << " but is inferred to be: " << ShapeUtil::HumanString(inferred_return_shape); - const Literal& arg_literal = parent_->GetEvaluatedLiteralFor(arg); - VLOG(3) << "HandleReduce arg_literal: " << arg_literal.ToString(); - const Literal& init_literal = parent_->GetEvaluatedLiteralFor(init_value); - VLOG(3) << "HandleReduce init_literal: " << init_literal.ToString(); - TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); - auto init_scalar = init_literal.Get({}); + absl::InlinedVector arg_literals(num_args); + absl::InlinedVector init_literals(num_args); + for (int64 i = 0; i < num_args; ++i) { + arg_literals[i] = &parent_->GetEvaluatedLiteralFor(reduce->inputs()[i]); + VLOG(3) << "HandleReduce arg_literal: " << arg_literals[i]->ToString(); + init_literals[i] = + &parent_->GetEvaluatedLiteralFor(reduce->init_values()[i]); + VLOG(3) << "HandleReduce init_literal: " << init_literals[i]->ToString(); + TF_RET_CHECK(ShapeUtil::IsScalar(init_literals[i]->shape())); + } - const auto arg_dimensions = AsInt64Slice(arg_literal.shape().dimensions()); + // All args and results have the same dimensions, so pick an arbitrary one. + const Shape& arg_shape = arg_literals[0]->shape(); + const Shape& result_shape = ShapeUtil::IsTuple(reduce->shape()) + ? 
reduce->shape().tuple_shapes(0) + : reduce->shape(); + const auto arg_dimensions = AsInt64Slice(arg_shape.dimensions()); std::vector arg_dim_steps(arg_dimensions.size()); std::vector arg_dim_counts(arg_dimensions.size()); for (const int64 dim : dimensions) { @@ -1421,60 +1629,110 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - auto result = MakeUnique(reduce->shape()); - // For each resulting dimension, calculate and assign computed value. - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { - ReturnT result_val = init_scalar; + absl::InlinedVector, 1> results(num_args); + for (int64 i = 0; i < num_args; ++i) { + results[i] = absl::make_unique(result_shape); + } - std::vector base(arg_dimensions.size()); - for (int64 i = 0; i < multi_index.size(); ++i) { - base[result_to_arg_index[i]] = multi_index[i]; - } + Status eval_status; + // For each resulting dimension, calculate and assign computed values. + // This is really wasteful when num_args > 1, since we re-run the + // reduction num_args time. The alternative is to teach Populate() about + // tuples, which we should probably do. + absl::InlinedVector init_scalars(num_args); + for (int i = 0; i < num_args; ++i) { + init_scalars[i] = init_literals[i]->Get({}); + } + + for (int64 input = 0; input < num_args; ++input) { + TF_RETURN_IF_ERROR(results[input]->Populate( + [&](absl::Span multi_index) { + if (!eval_status.ok()) { + return init_scalars[input]; + } + absl::InlinedVector result_values(init_scalars.begin(), + init_scalars.end()); + std::vector base(arg_dimensions.size()); + for (int64 i = 0; i < multi_index.size(); ++i) { + base[result_to_arg_index[i]] = multi_index[i]; + } - // When the reduction is addition of floats, accumulate in a double - // for better precision. Also, avoid creating Literals for the - // intermediate results; it's much faster. 
- if (ShapeUtil::ElementIsFloating(init_literal.shape()) && - IsScalarAdd(function)) { - double computed_result = 0; - auto func = [&](tensorflow::gtl::ArraySlice input_index) { - computed_result += arg_literal.Get(input_index); + // When the reduction is addition of floats, accumulate in a double + // for better precision. Also, avoid creating Literals for the + // intermediate results; it's much faster. + if (ShapeUtil::ElementIsFloating(init_literals[0]->shape()) && + IsScalarAdd(function)) { + CHECK_EQ(num_args, 1); + double computed_result = 0; + auto func = [&](absl::Span input_index) { + computed_result += + GetAsDouble(*arg_literals[0], input_index); + return true; + }; + ShapeUtil::ForEachIndex(arg_literals[0]->shape(), base, + arg_dim_counts, arg_dim_steps, func); + return static_cast(computed_result); + } + auto func = + [&](absl::Span input_index) -> StatusOr { + absl::InlinedVector arg_values(num_args); + for (int64 i = 0; i < num_args; ++i) { + arg_values[i] = arg_literals[i]->Get(input_index); + } + + // Evaluate computation with specified literal operands. + absl::InlinedVector, 1> + embedded_operands; + for (ReturnT value : result_values) { + embedded_operands.push_back( + LiteralUtil::CreateR0(value)); + } + for (ReturnT value : arg_values) { + embedded_operands.push_back( + LiteralUtil::CreateR0(value)); + } + absl::InlinedVector embedded_operands_ptrs( + embedded_operands.size()); + std::transform(embedded_operands.begin(), embedded_operands.end(), + embedded_operands_ptrs.begin(), + [](const std::unique_ptr& ptr) { + return ptr.get(); + }); + + TF_ASSIGN_OR_RETURN(std::unique_ptr computed_result, + embedded_evaluator.Evaluate( + *function, embedded_operands_ptrs)); + // Clear visit states so that we can use the evaluator again on + // the same computation. + embedded_evaluator.ResetVisitStates(); + // Assign computed result to result_val. 
+ if (!has_tuple_output) { + result_values[0] = computed_result->Get({}); + } else { + for (int64 i = 0; i < num_args; ++i) { + result_values[i] = computed_result->Get( + /*multi_index=*/{}, /*shape_index=*/{i}); + } + } return true; }; - ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, - arg_dim_steps, func); - return static_cast(computed_result); - } - auto func = [&](tensorflow::gtl::ArraySlice input_index) { - auto curr_val = arg_literal.Get(input_index); - - // Evaluate computation with specified literal operands. - auto curr_val_literal = Literal::CreateR0(curr_val); - auto result_val_literal = Literal::CreateR0(result_val); - - std::unique_ptr computed_result = - embedded_evaluator - .Evaluate( - *function, - {result_val_literal.get(), curr_val_literal.get()}) - .ConsumeValueOrDie(); - // Clear visit states so that we can use the evaluator again on - // the same computation. - embedded_evaluator.ResetVisitStates(); - // Assign computed result to result_val. - result_val = computed_result->Get({}); - return true; - }; - // Computes one element of the result, reducing all dimensions that - // contribute to that element. - ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, - arg_dim_steps, func); - return result_val; - })); - - parent_->evaluated_[reduce] = std::move(result); - return Status::OK(); + // Computes one element of the result, reducing all dimensions that + // contribute to that element. 
+ eval_status = ShapeUtil::ForEachIndexWithStatus( + arg_shape, base, arg_dim_counts, arg_dim_steps, func); + return result_values[input]; + })); + } + if (!has_tuple_output) { + parent_->evaluated_[reduce] = std::move(results[0]); + } else { + auto tuple_result = absl::make_unique(reduce->shape()); + for (int64 i = 0; i < num_args; ++i) { + TF_CHECK_OK(tuple_result->MoveFrom(std::move(*results[i]), {i})); + } + parent_->evaluated_[reduce] = std::move(tuple_result); + } + return eval_status; } bool IsScalarAdd(HloComputation* computation) { @@ -1501,13 +1759,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); auto init_scalar = init_literal.Get({}); - auto result = MakeUnique(select_and_scatter->shape()); + auto result = absl::make_unique(select_and_scatter->shape()); // Initialize result array with the init value. TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice output_index) { - return init_scalar; - })); + [&](absl::Span output_index) { return init_scalar; })); std::vector window_dimension_sizes; for (const auto& window_dimension : window.dimensions()) { @@ -1529,10 +1785,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // Used in the dual IterateThroughWindow lambdas below. Hoisted to avoid // dynamic memory allocations. - auto curr_val_literal = Literal::CreateR0(ReturnT()); - auto selected_val_literal = Literal::CreateR0(ReturnT()); - auto source_literal_scatter = Literal::CreateR0(ReturnT()); - auto scattered_literal = Literal::CreateR0(ReturnT()); + auto curr_val_literal = LiteralUtil::CreateR0(ReturnT()); + auto selected_val_literal = LiteralUtil::CreateR0(ReturnT()); + auto source_literal_scatter = LiteralUtil::CreateR0(ReturnT()); + auto scattered_literal = LiteralUtil::CreateR0(ReturnT()); do { // For each element in `source`, we place a window in `operand`. 
For each // window placement, we iterate inside the window twice: @@ -1545,8 +1801,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // 2. Using the selected index, scatter value from `source` to result. We // do this by iterating through the window, and compare each index with // the selected index. - tensorflow::gtl::optional selected_val; - tensorflow::gtl::optional> selected_index; + absl::optional selected_val; + absl::optional> selected_index; IterateThroughWindow( window_shape, window, operand_literal.shape(), source_index, @@ -1593,7 +1849,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { embedded_evaluator.ResetVisitStates(); } }); - } while (IndexUtil::BumpIndices(source->shape(), &source_index)); + } while ( + IndexUtil::BumpIndices(source->shape(), absl::MakeSpan(source_index))); parent_->evaluated_[select_and_scatter] = std::move(result); return Status::OK(); @@ -1613,7 +1870,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { ShapeUtil::Compatible(reduce_window->shape(), inferred_return_shape)) << "return shape is set to: " << ShapeUtil::HumanStringWithLayout(reduce_window->shape()) - << "but is inferred to be: " + << " but is inferred to be: " << ShapeUtil::HumanStringWithLayout(inferred_return_shape); const Literal& operand_literal = @@ -1637,10 +1894,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape())); HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - auto result = MakeUnique(reduce_window->shape()); + auto result = absl::make_unique(reduce_window->shape()); // For each resulting dimension, calculate and assign computed value. 
- TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice output_index) { + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span output_index) { ReturnT result_val = init_scalar; std::fill(window_index.begin(), window_index.end(), 0); @@ -1653,9 +1910,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // Evaluate computation with specified literal operands. const auto curr_val_literal = - Literal::CreateR0(curr_val); + LiteralUtil::CreateR0(curr_val); const auto result_val_literal = - Literal::CreateR0(result_val); + LiteralUtil::CreateR0(result_val); std::unique_ptr computed_result = embedded_evaluator .Evaluate( @@ -1677,6 +1934,386 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + // Reshapes the scatter indices input to have a trailing degenerate `1` + // dimension if necessary. Hands over the ownership of the newly created + // literal (if there is one) to `reshaped_indices`. + StatusOr> ReshapedScatterIndices( + int64 index_vector_dim, const Literal& indices, + std::unique_ptr* reshaped_indices) { + if (indices.shape().dimensions_size() != index_vector_dim) { + return std::cref(indices); + } + + std::vector new_shape(indices.shape().dimensions().begin(), + indices.shape().dimensions().end()); + new_shape.push_back(1); + TF_ASSIGN_OR_RETURN(*reshaped_indices, indices.Reshape(new_shape)); + return std::cref(**reshaped_indices); + } + + // Returns an ShapeUtil::IndexIterationSpace that iterates over the update + // scatter dimensions while keeping the rest of the update dimensions clamped + // to 0. 
+ ShapeUtil::IndexIterationSpace IterationSpaceForUpdateScatterIndices( + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + int64 updates_rank = updates_shape.dimensions_size(); + std::vector index_base(updates_rank, 0); + std::vector index_count(updates_rank, 1); + for (int64 i = 0; i < updates_rank; i++) { + bool is_update_scatter_dim = + !absl::c_binary_search(dim_numbers.update_window_dims(), i); + if (is_update_scatter_dim) { + index_count[i] = updates_shape.dimensions(i); + } + } + return {std::move(index_base), std::move(index_count), + std::vector(updates_rank, 1)}; + } + + // Return an ShapeUtil::IndexIterationSpace that iterates over the update + // window dimensions while keeping the rest of the update dimensions clamped + // to 0. + ShapeUtil::IndexIterationSpace IterationSpaceForUpdateWindowIndices( + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + int64 updates_rank = updates_shape.dimensions_size(); + std::vector index_base(updates_rank, 0); + std::vector index_count(updates_rank, 1); + for (int64 i = 0; i < updates_rank; i++) { + bool is_update_window_dim = + absl::c_binary_search(dim_numbers.update_window_dims(), i); + if (is_update_window_dim) { + index_count[i] = updates_shape.dimensions(i); + } + } + return {std::move(index_base), std::move(index_count), + std::vector(updates_rank, 1)}; + } + + // This functor computes the contribution of scatter_indices to an input index + // corresponding to an update index. That is, given an update index I, it + // picks out the scatter indices in I and uses them to look up a scatter + // index, S, from the scatter indices tensor, and expands S into the input + // space according to scatter_dims_to_operand_dims. + // + // This is similar to the class HloEvaluator::OutputGatherIndexToInputIndex + // that does the corresponding function for Gather. 
+ class UpdateScatterIndexToInputIndex { + public: + // The constructor does some setup work that is amortized across all + // iterations. + explicit UpdateScatterIndexToInputIndex( + const ScatterDimensionNumbers* dim_numbers, const Shape& input_shape, + const Shape& updates_shape, const Literal* scatter_indices) + : dim_numbers_(*dim_numbers), scatter_indices_(*scatter_indices) { + for (int64 i = 0; i < updates_shape.dimensions_size(); i++) { + update_dim_is_scatter_dims_.push_back( + !absl::c_binary_search(dim_numbers_.update_window_dims(), i)); + } + + for (int64 i = 0; i < input_shape.dimensions_size(); i++) { + int64 index_of_input_dim_in_index_vector = + FindIndex(dim_numbers_.scatter_dims_to_operand_dims(), i); + if (index_of_input_dim_in_index_vector == + dim_numbers_.scatter_dims_to_operand_dims_size()) { + input_dim_value_to_index_vector_.push_back(-1); + } else { + input_dim_value_to_index_vector_.push_back( + index_of_input_dim_in_index_vector); + } + } + + index_vector_index_.resize(scatter_indices_.shape().dimensions_size()); + input_index_.resize(input_shape.dimensions_size()); + int64 index_vector_size = + scatter_indices_.shape().dimensions(dim_numbers_.index_vector_dim()); + index_vector_.resize(index_vector_size); + } + + // Returns the contribution of scatter_indices to the input index + // corresponding to update_index. See scatter_inner_loop_body. + // + // This is conceptually a stateless transformation from update_index to the + // scatter input index, but: + // + // - Instead of allocating memory to represent the scatter input index on + // every invocation we reuse the same storage for the result + // (input_index_), mutating it in place. + // - Instead of allocating buffers for temporary values like + // index_vector_index_ and index_vector on every invocation, we reuse the + // same storage for all invocations. + // + // This returns a Span into memory owned by the class. 
+ StatusOr> operator()( + absl::Span update_index) { + PropagateUpdateIndexScatterDimsToIndexVectorIndex(update_index); + TF_RETURN_IF_ERROR(FetchIndexVector()); + PropagateIndexVectorToInputIndex(); + return absl::Span(input_index_); + } + + private: + // Propagates the scatter index dimensions from the update index into + // index_vector_index_ by mutating index_vector_index_ in place. Does not + // update the dim_numbers.index_vector_dim() dimension -- that's the + // dimension we iterate over in FetchIndexVector. + void PropagateUpdateIndexScatterDimsToIndexVectorIndex( + absl::Span update_index) { + int64 index_vector_index_i = 0; + for (int64 i = 0, e = update_index.size(); i < e; i++) { + if (!update_dim_is_scatter_dims_[i]) { + continue; + } + + if (index_vector_index_i == dim_numbers_.index_vector_dim()) { + index_vector_index_i++; + } + + index_vector_index_[index_vector_index_i++] = update_index[i]; + } + } + + // Populates index_vector_ by iterating over scatter_indices_ according to + // index_vector_index_. + Status FetchIndexVector() { + int64 index_vector_dim = dim_numbers_.index_vector_dim(); + for (int64 i = 0, e = index_vector_.size(); i < e; i++) { + index_vector_index_[index_vector_dim] = i; + TF_ASSIGN_OR_RETURN(index_vector_[i], scatter_indices_.GetIntegralAsS64( + index_vector_index_)); + } + return Status::OK(); + } + + // Populates input_index_. + void PropagateIndexVectorToInputIndex() { + for (int64 i = 0, e = input_index_.size(); i < e; i++) { + if (input_dim_value_to_index_vector_[i] != -1) { + input_index_[i] = index_vector_[input_dim_value_to_index_vector_[i]]; + } + + // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i] + // remains 0, as set by the constructor. + } + } + + // input_dim_value_to_index_vector_[i] tells us how to compute dimension i + // of the input index from the index vector. See + // PropagateIndexVectorToInputIndex. 
+ std::vector input_dim_value_to_index_vector_; + + // update_dim_is_scatter_dims_[i] is true iff the update index i is a + // scatter dimension. + std::vector update_dim_is_scatter_dims_; + + // The buffer into which we construct an index into scatter_indices_ to + // fetch the index vector. + std::vector index_vector_index_; + + // The index vector fetched from scatter_indices_. + std::vector index_vector_; + + // The result computed by this functor. operator() returns a Span + // into this vector. + std::vector input_index_; + + const ScatterDimensionNumbers& dim_numbers_; + const Literal& scatter_indices_; + }; + + // This functor computes the contribution of the window indices in an update + // index to an input index. That is, given an update index I it picks out the + // update window indices in I and expands it into a window index into the + // input shape. + // + // This is similar to the class HloEvaluator::OutputWindowIndexToInputIndex + // that does the corresponding function for Gather. + class UpdateWindowIndexToInputIndex { + public: + // The constructor does some setup work that is amortized across all + // iterations. 
+ explicit UpdateWindowIndexToInputIndex( + const ScatterDimensionNumbers& dim_numbers, const Shape& input_shape, + const Shape& updates_shape) { + std::vector window_index_to_update_index; + int64 update_index_count = 0; + for (int64 i = 0; i < updates_shape.dimensions_size(); i++) { + if (absl::c_binary_search(dim_numbers.update_window_dims(), i)) { + window_index_to_update_index.push_back(update_index_count++); + } else { + update_index_count++; + } + } + + int64 window_dim_count = 0; + for (int64 i = 0; i < input_shape.dimensions_size(); i++) { + if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) { + input_dim_value_to_update_index_.push_back(-1); + } else { + input_dim_value_to_update_index_.push_back( + window_index_to_update_index[window_dim_count++]); + } + } + + input_index_.resize(input_shape.dimensions_size()); + } + + // Returns the contribution of the window indices to the input index + // corresponding to update_index. See scatter_inner_loop_body. + // + // This is conceptually a stateless transformation from update_index to the + // window input index, but instead of allocating memory to represent the + // scatter input index on every invocation we reuse the same storage for the + // result (input_index_), mutating it in place. + // + // This returns a Span into memory owned by the class. + StatusOr> operator()( + absl::Span update_index) { + PropagateUpdateIndexWindowDimsToInputIndex(update_index); + return absl::Span(input_index_); + } + + // Returns for a given 'input_dim' the corresponding update dimension index, + // or -1 if 'input_dim' is an elided window dimension. + int64 input_dim_value_to_update_index(int64 input_dim) { + return input_dim_value_to_update_index_[input_dim]; + } + + private: + // Propagates window dimensions from the update index to input_index_ by + // mutating input_index_ in place. 
+ void PropagateUpdateIndexWindowDimsToInputIndex( + absl::Span update_index) { + for (int64 i = 0, e = input_index_.size(); i < e; i++) { + if (input_dim_value_to_update_index_[i] != -1) { + input_index_[i] = update_index[input_dim_value_to_update_index_[i]]; + } + + // If input_dim_value_to_update_index_[i] == -1 then input_index_[i] + // remains 0, as set by the constructor. + } + } + + // input_dim_value_to_update_index_[i] tells us how to compute dimension i + // of the input index from the update index. See + // PropagateUpdateIndexWindowDimsToInputIndex. + std::vector input_dim_value_to_update_index_; + + // The result computed by this functor. operator() returns a Span + // into this vector. + std::vector input_index_; + }; + + Status HandleScatter(HloInstruction* scatter) override { + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + const Literal& operand = + parent_->GetEvaluatedLiteralFor(scatter->operand(0)); + std::unique_ptr reshaped_scatter_indices; + TF_ASSIGN_OR_RETURN(const Literal& scatter_indices, + ReshapedScatterIndices(dim_numbers.index_vector_dim(), + parent_->GetEvaluatedLiteralFor( + scatter->operand(1)), + &reshaped_scatter_indices)); + const Literal& updates = + parent_->GetEvaluatedLiteralFor(scatter->operand(2)); + const Shape& updates_shape = updates.shape(); + const Shape& operand_shape = operand.shape(); + + ShapeUtil::IndexIterationSpace scatter_indices_iteration_space = + IterationSpaceForUpdateScatterIndices(updates_shape, dim_numbers); + ShapeUtil::IndexIterationSpace window_indices_iteration_space = + IterationSpaceForUpdateWindowIndices(updates_shape, dim_numbers); + + std::vector input_index(operand_shape.dimensions_size()); + std::vector update_index(updates_shape.dimensions_size()); + std::vector input_scatter_index_clamped( + operand_shape.dimensions_size()); + + UpdateScatterIndexToInputIndex update_scatter_index_to_input_index( + &scatter->scatter_dimension_numbers(),
/*input_shape=*/operand_shape, + updates_shape, &scatter_indices); + UpdateWindowIndexToInputIndex update_window_index_to_input_index( + scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape, + updates_shape); + + // Initialize the result with the operand. This makes it easier to handle + // the updates even when the indices are repeated. + std::unique_ptr result = operand.CloneToUnique(); + HloEvaluator embedded_evaluator; + auto scatter_inner_loop_body = + [&](absl::Span update_window_index, + absl::Span input_scatter_index, + absl::Span update_scatter_index) -> StatusOr { + TF_ASSIGN_OR_RETURN( + absl::Span input_window_index, + update_window_index_to_input_index(update_window_index)); + for (int i = 0, e = update_index.size(); i < e; i++) { + update_index[i] = update_scatter_index[i] + update_window_index[i]; + DCHECK_LT(update_index[i], updates_shape.dimensions(i)); + } + for (int i = 0, e = input_scatter_index.size(); i < e; i++) { + int64 update_dim = + update_window_index_to_input_index.input_dim_value_to_update_index( + i); + // If 'update_dim' is -1, it means 'i' is an elided window dim. This + // means we set the iteration index to 0, so for the purpose of the + // following calculations we can consider the update dimension size to + // be 1. + int64 update_dim_size = + update_dim == -1 ? 1 : updates_shape.dimensions(update_dim); + // Clamp the scatter index so that the scatter region fits in the + // operand. 
input_scatter_index_clamped[i] = + // clamp(input_scatter_index[i], 0, + // operand_shape.dimensions(i) - + // update_dim_size); + input_scatter_index_clamped[i] = + std::min(operand_shape.dimensions(i) - update_dim_size, + std::max(0LL, input_scatter_index[i])); + } + for (int i = 0, e = input_index.size(); i < e; i++) { + input_index[i] = input_scatter_index_clamped[i] + input_window_index[i]; + DCHECK_GE(input_index[i], 0); + DCHECK_LT(input_index[i], operand_shape.dimensions(i)); + } + + auto result_value_literal = + LiteralUtil::CreateR0(result->Get(input_index)); + auto update_value_literal = + LiteralUtil::CreateR0(updates.Get(update_index)); + std::unique_ptr updated_result = + embedded_evaluator + .Evaluate( + *scatter->to_apply(), + {result_value_literal.get(), update_value_literal.get()}) + .ConsumeValueOrDie(); + // Clear visit states so that we can use the evaluator again on the + // same computation. + embedded_evaluator.ResetVisitStates(); + result->Set(input_index, updated_result->Get({})); + return true; + }; + + auto scatter_outer_loop_body = + [&](absl::Span update_scatter_index) -> StatusOr { + TF_ASSIGN_OR_RETURN( + absl::Span input_scatter_index, + update_scatter_index_to_input_index(update_scatter_index)); + TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( + updates_shape, window_indices_iteration_space, + [&](absl::Span update_window_index) { + return scatter_inner_loop_body( + update_window_index, input_scatter_index, update_scatter_index); + })); + return true; + }; + + TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( + updates_shape, scatter_indices_iteration_space, + scatter_outer_loop_body)); + parent_->evaluated_[scatter] = std::move(result); + return Status::OK(); + } + Status HandleSlice(HloInstruction* slice) override { auto operand = slice->operand(0); const Shape& shape = slice->shape(); @@ -1691,7 +2328,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const int64 rank =
ShapeUtil::Rank(operand->shape()); const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - auto func = [&](tensorflow::gtl::ArraySlice out_index) { + auto func = [&](absl::Span out_index) { DimensionVector operand_index(rank); for (int64 i = 0; i < rank; ++i) { operand_index[i] = @@ -1700,7 +2337,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return operand_literal.Get(operand_index); }; - auto result = Literal::CreateFromDimensions( + auto result = LiteralUtil::CreateFromDimensions( shape.element_type(), AsInt64Slice(shape.dimensions())); TF_RETURN_IF_ERROR(result->Populate(func)); parent_->evaluated_[slice] = std::move(result); @@ -1902,6 +2539,40 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return HandleReducePrecision(reduce_precision); } + template ::value || + std::is_same::value || + std::is_same::value>::type* = nullptr> + Status HandleIota(HloInstruction* instruction) { + auto* iota = Cast(instruction); + std::vector data(iota->shape().dimensions(iota->iota_dimension())); + std::iota(data.begin(), data.end(), 0); + auto result = LiteralUtil::CreateR1(data); + + if (ShapeUtil::Rank(iota->shape()) > 1) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[iota], + result->Broadcast(iota->shape(), {iota->iota_dimension()})); + } else { + TF_RET_CHECK(ShapeUtil::Rank(iota->shape()) == 1); + parent_->evaluated_[iota] = std::move(result); + } + + return Status::OK(); + } + template ::value || + std::is_same::value || + std::is_same::value)>::type* = nullptr> + Status HandleIota(HloInstruction* iota) { + return InvalidArgument("Unsupported type for iota"); + } + Status HandleIota(HloInstruction* iota) override { + return HandleIota(iota); + } + private: // Creates a vector of multipliers which can be used to create a linear index // into shape. @@ -1928,7 +2599,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // bound, call `f` with the base index. 
static void IterateThroughWindow( const Shape& window_shape, const Window& window, const Shape& base_shape, - const tensorflow::gtl::ArraySlice& window_count_index, + const absl::Span& window_count_index, const std::function&)>& f) { const int64 rank = ShapeUtil::Rank(base_shape); DimensionVector window_index(rank); @@ -1947,7 +2618,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { if (!out_of_bound) { f(base_index); } - } while (IndexUtil::BumpIndices(window_shape, &window_index)); + } while ( + IndexUtil::BumpIndices(window_shape, absl::MakeSpan(window_index))); } template @@ -1959,10 +2631,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { start_indices_typed.end()); // Clamp the start indices so the slice is in-bounds w.r.t the operand. - - // TODO(b/74360564): This is implementation defined behavior, but is - // currently respected by all implementations. Change this if we ever decide - // to oficially document different behavior. for (int64 i = 0; i < start.size(); ++i) { start[i] = std::min( std::max(int64{0}, start[i]), @@ -1970,9 +2638,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } std::vector operand_indices(start.size()); - auto result = MakeUnique(result_shape); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { + auto result = absl::make_unique(result_shape); + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { for (int64 i = 0; i < operand_indices.size(); ++i) { CHECK_GE(multi_index[i] + start[i], 0); operand_indices[i] = multi_index[i] + start[i]; @@ -1996,10 +2664,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { start_indices_typed.end()); // Clamp the update start indices so the slice is in-bounds w.r.t the // operand. - - // TODO(b/74360564): This is implementation defined behavior, but is - // currently respected by all implementations. 
Change this if we ever decide - // to oficially document different behavior. for (int64 i = 0; i < rank; ++i) { start[i] = std::min( std::max(0, start[i]), @@ -2007,7 +2671,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } std::vector result_index(rank, 0); - auto func = [&](tensorflow::gtl::ArraySlice update_index) { + auto func = [&](absl::Span update_index) { std::transform(update_index.begin(), update_index.end(), start.begin(), result_index.begin(), std::plus()); result->Set(result_index, @@ -2052,18 +2716,17 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s: ", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str()); + ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape())); } const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - auto result = MakeUnique(shape); + auto result = absl::make_unique(shape); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { return ConvertBinaryFunction(binary_op)( lhs_literal.Get(multi_index), rhs_literal.Get(multi_index)); @@ -2088,20 +2751,19 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Unimplemented( "Implicit broadcasting is currently unsupported in HLO evaluator " "Shape Mismatch: %s vs %s vs %s vs %s: ", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str(), - ShapeUtil::HumanString(ehs->shape()).c_str()); + ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()), + ShapeUtil::HumanString(rhs->shape()), + 
ShapeUtil::HumanString(ehs->shape())); } const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs); - auto result = MakeUnique(shape); + auto result = absl::make_unique(shape); - TF_RETURN_IF_ERROR(result->Populate( - [&](tensorflow::gtl::ArraySlice multi_index) { + TF_RETURN_IF_ERROR( + result->Populate([&](absl::Span multi_index) { return ternary_op(lhs_literal.Get(multi_index), rhs_literal.Get(multi_index), ehs_literal.Get(multi_index)); diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index c3ccbf0f0c75b569b49652807dea52faebdccc31..de3d7a167752f0de790585e50874dd6d2904bd37 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" @@ -49,7 +51,7 @@ std::unique_ptr CreateHloProfilePrinterData( size_t profile_counters_size = hlo_profile_index_map.total_count(); std::unique_ptr profile_printer_data = - MakeUnique(); + absl::make_unique(); profile_printer_data->set_profile_counters_size(profile_counters_size); profile_printer_data->mutable_computation_infos()->Reserve( hlo_profile_index_map.computation_count()); @@ -67,11 +69,11 @@ std::unique_ptr CreateHloProfilePrinterData( // The profile indices were computed deterministically in // HloProfileIndexMap::HloProfileIndexMap. 
- c_sort(computation_and_profile_idx_list, - [](const std::pair& left, - const std::pair& right) { - return left.second < right.second; - }); + absl::c_sort(computation_and_profile_idx_list, + [](const std::pair& left, + const std::pair& right) { + return left.second < right.second; + }); for (const auto& pair : computation_and_profile_idx_list) { CHECK_LT(pair.second, profile_counters_size); diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc index 4900c813fdf037e65c6b42d027f1cbefb6ee9830..460ae2b5eca78659f86df1227e6a0a4e57508611 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc @@ -14,22 +14,22 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace { -using tensorflow::strings::StrCat; +using absl::StrCat; using ::testing::AllOf; using ::testing::ContainsRegex; class HloExecutionProfileTest : public HloTestBase {}; TEST_F(HloExecutionProfileTest, Basic) { - auto hlo_module = tools::Parse(R"( + auto hlo_module = ParseHloString(R"( HloModule test_module ENTRY entry_computation { lhs = f32[30,30]{1,0} parameter(0) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 672b1c017a22e183b679ed799081ca5a8030f906..3041d94fa9f55b1acffc1295d07e48c967322865 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ 
b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -26,8 +26,16 @@ limitations under the License. #include #include +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_replace.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -35,50 +43,25 @@ limitations under the License. #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/regexp.h" -using ::tensorflow::Env; -using ::tensorflow::WriteStringToFile; -using ::tensorflow::gtl::nullopt; -using ::tensorflow::gtl::optional; -using ::tensorflow::io::JoinPath; -using ::tensorflow::str_util::Join; -using ::tensorflow::str_util::StringReplace; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; - namespace xla { namespace hlo_graph_dumper { namespace { -// Helpers for Printf and Appendf. 
-template -struct PrintfConvert { - const T& operator()(const T& t) const { return t; } -}; -template <> -struct PrintfConvert { - const char* operator()(const string& s) const { return s.c_str(); } -}; - -// Like tensorflow::strings::Printf/Appendf, but you don't need to call c_str() -// on strings. -template -string Printf(const char* fmt, const Ts&... ts) { - return tensorflow::strings::Printf(fmt, PrintfConvert()(ts)...); -} -template -void Appendf(string* s, const char* fmt, const Ts&... ts) { - tensorflow::strings::Appendf(s, fmt, PrintfConvert()(ts)...); -} +using absl::nullopt; +using absl::optional; +using absl::StrAppend; +using absl::StrCat; +using absl::StrFormat; +using absl::StrJoin; +using tensorflow::Env; +using tensorflow::WriteStringToFile; +using tensorflow::io::JoinPath; // Used to indicate how we should treat a given HLOInstruction in the graph. // should we treat it like normal, hide it, and so on? @@ -207,17 +190,15 @@ NodeColors NodeColorsForScheme(ColorScheme color) { string NodeColorAttributes(ColorScheme color) { NodeColors node_colors = NodeColorsForScheme(color); - return Printf( - R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", - node_colors.style, node_colors.font_color, node_colors.stroke_color, - node_colors.fill_color); + return StrFormat(R"(style="%s", fontcolor="%s", color="%s", fillcolor="%s")", + node_colors.style, node_colors.font_color, + node_colors.stroke_color, node_colors.fill_color); } // Replaces <> with <>, so that this string is safe(er) for use in a // graphviz HTML-like string. 
-string HtmlLikeStringSanitize(tensorflow::StringPiece s) { - return StringReplace(StringReplace(s, "<", "<", /*replace_all=*/true), ">", - ">", /*replace_all=*/true); +string HtmlLikeStringSanitize(absl::string_view s) { + return absl::StrReplaceAll(s, {{"<", "<"}, {">", ">"}}); } // Tries to generates a human-readable one-word description of the given @@ -320,11 +301,11 @@ optional MatchTrivialComputation(const HloComputation* computation) { // Encapsulates logic for dumping an HLO module to DOT (i.e. graphviz syntax). class HloDotDumper { public: - HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label, + HloDotDumper(const HloComputation* computation, absl::string_view label, const DebugOptions& debug_options, bool show_backend_config, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), - label_(std::string(label)), + label_(label), debug_options_(debug_options), show_backend_config_(show_backend_config), profile_(profile), @@ -446,7 +427,7 @@ string HloDotDumper::Dump() { } string HloDotDumper::Header() { - const char* fmt = R"(digraph G { + constexpr char fmt[] = R"(digraph G { rankdir = TB; compound = true; label = <%s>; @@ -455,7 +436,7 @@ labelloc = t; tooltip = " "; // DOT graphs accept a stylesheet as a URI. So naturally, an inline // stylesheet is a data URI! -stylesheet=" +stylesheet=< data:text/css, @import url(https://fonts.googleapis.com/css?family=Roboto:400,700); svg text { @@ -464,7 +445,7 @@ stylesheet=" } %s -" +> )"; @@ -479,8 +460,8 @@ stylesheet=" } if (profile_ != nullptr) { auto cycles = profile_->total_cycles_executed(*computation_); - Appendf(&graph_label, "
total cycles = %lld (%s)", cycles, - tensorflow::strings::HumanReadableNum(cycles)); + absl::StrAppendFormat(&graph_label, "
total cycles = %d (%s)", cycles, + tensorflow::strings::HumanReadableNum(cycles)); } // Create CSS rules that say, when you hover over the given node or cluster, @@ -507,14 +488,14 @@ stylesheet=" // One could imagine other ways of writing this CSS rule that involve // less duplication, but this way seems to be relatively performant. edge_css_rules.push_back( - Printf(" #%s%d:hover ~ #edge%lld text { fill: %s; }\n" - " #%s%d:hover ~ #edge%lld path { " - "stroke: %s; stroke-width: .2em; }\n" - " #%s%d:hover ~ #edge%lld polygon { " - "fill: %s; stroke: %s; stroke-width: .2em; }\n", - elem_type, elem_id, edge_id, color, // - elem_type, elem_id, edge_id, color, // - elem_type, elem_id, edge_id, color, color)); + StrFormat(" #%s%d:hover ~ #edge%d text { fill: %s; }\n" + " #%s%d:hover ~ #edge%d path { " + "stroke: %s; stroke-width: .2em; }\n" + " #%s%d:hover ~ #edge%d polygon { " + "fill: %s; stroke: %s; stroke-width: .2em; }\n", + elem_type, elem_id, edge_id, color, // + elem_type, elem_id, edge_id, color, // + elem_type, elem_id, edge_id, color, color)); }; // The "to_node" value may be a NULL, indicating that this points to the @@ -557,10 +538,10 @@ stylesheet=" } } - return Printf(fmt, graph_label, Join(edge_css_rules, "\n")); + return StrFormat(fmt, graph_label, StrJoin(edge_css_rules, "\n")); } -string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); } +string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); } bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { CHECK_EQ(instr->opcode(), HloOpcode::kFusion); @@ -590,15 +571,26 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, const HloInstruction* parent_instr) { VLOG(2) << "Dumping subcomputation " << subcomp->name(); - const char* computation_fmt = R"(subgraph %s { -%s -label = <%s>; -labelloc = t; -tooltip = " "; -%s -} // %s + // Add an edge from 
the subcomputation to its parent node. If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, + // so there's no need to link those. + if (parent_instr->opcode() != HloOpcode::kFusion) { + const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction()); + VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name() + << " as " << next_edge_id_; + edge_ids_.insert({{from, parent_instr}, next_edge_id_++}); + constexpr char edge_fmt[] = + R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)"; + edges_.push_back(StrFormat( + edge_fmt, InstructionId(from), InstructionId(parent_instr), + SubcomputationId(subcomp), subcomp->name(), parent_instr->name())); + } -)"; + // Have we already dumped this subcomputation? If so, generating the edge + // linking it and parent_instr is all we want to do in this function. + if (cluster_ids_.find(subcomp) != cluster_ids_.end()) { + return ""; + } cluster_ids_[subcomp] = next_cluster_id_++; @@ -606,9 +598,10 @@ tooltip = " "; string subcomp_label, style; if (parent_instr->opcode() == HloOpcode::kFusion) { - subcomp_label = Printf("Fused expression for %s
%s", - HtmlLikeStringSanitize(parent_instr->name()), - HtmlLikeStringSanitize(parent_instr->ToCategory())); + subcomp_label = + StrFormat("Fused expression for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(parent_instr->ToCategory())); string extra_info = GetInstructionNodeExtraInfo(parent_instr); if (!extra_info.empty()) { StrAppend(&subcomp_label, "
", extra_info); @@ -634,36 +627,27 @@ tooltip = " "; strokecolor = highlight ? "#b71c1c" : "#c2c2c2"; } style = - Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s;")", - fillcolor, strokecolor); + StrFormat(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s;")", + fillcolor, strokecolor); } else { - subcomp_label = Printf("Subcomputation for %s
%s", - HtmlLikeStringSanitize(parent_instr->name()), - HtmlLikeStringSanitize(subcomp->name())); + subcomp_label = StrFormat("Subcomputation for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(subcomp->name())); style = "style=rounded; color=black;"; } string comp_body = DumpComputation(subcomp); - // Add an edge from the subcomputation to its parent node. If subcomp - // belongs to a fusion node, it's drawn in place of the fusion instruction, - // so there's no need to link those. - if (parent_instr->opcode() != HloOpcode::kFusion) { - const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction()); - VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name() - << " as " << next_edge_id_; - edge_ids_.insert({{from, parent_instr}, next_edge_id_++}); - const char* edge_fmt = - R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)"; - edges_.push_back(Printf( - edge_fmt, InstructionId(from), InstructionId(parent_instr), - SubcomputationId(subcomp), subcomp->name(), parent_instr->name())); - } - - string computation = - Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + constexpr char computation_fmt[] = R"(subgraph %s { +%s +label = <%s>; +labelloc = t; +tooltip = " "; +%s +} // %s - return computation; +)"; + return StrFormat(computation_fmt, id, style, subcomp_label, comp_body, id); } string HloDotDumper::DumpComputation(const HloComputation* comp) { @@ -714,18 +698,32 @@ string HloDotDumper::DumpRootTag() { VLOG(2) << "Adding edge from " << from->name() << " to root tag as " << next_edge_id_; edge_ids_.insert({{from, to}, next_edge_id_++}); - edges_.push_back(Printf(R"(%s -> %s [tooltip=" "];)", from_id, to_id)); + edges_.push_back(StrFormat(R"(%s -> %s [tooltip=" "];)", from_id, to_id)); + + return StrFormat(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)" + "\n", + to_id, node_body, node_shape, NodeColorAttributes(color)); +} - return Printf(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)" - "\n", - to_id, node_body, node_shape, NodeColorAttributes(color)); +static const HloConstantInstruction* TryGetFusionParameterConstant( 
+ const HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kParameter || !instr->IsFused()) { + return nullptr; + } + const HloInstruction* fusion = instr->parent()->FusionInstruction(); + const HloInstruction* operand = fusion->operand(instr->parameter_number()); + return DynCast(operand); } bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const { // If a node: // - // - is a tuple-shaped parameter, - // - is not a parameter to a fusion node, + // - is a parameter of a fusion node which is bound to a constant, + // + // or + // + // - is a tuple-shaped parameter, and + // - is not a parameter to a fusion node, and // - has at least kMinUsersToOmit users shown, and // - all of the shown users are get-tuple-elements, // @@ -733,6 +731,9 @@ bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const { // // This helps us handle the common case where a while loop body has one big // tuple-shaped parameter. + if (TryGetFusionParameterConstant(instr) != nullptr) { + return true; + } const int kMinUsersToOmit = 3; return instr->opcode() == HloOpcode::kParameter && ShapeUtil::IsTuple(instr->shape()) && !instr->IsFused() && @@ -796,72 +797,72 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) { } } - return Printf(R"(%s [label=<%s>, shape=%s, tooltip="%s", %s];)" - "\n", - InstructionId(instr), node_body, node_shape, node_metadata, - NodeColorAttributes(color)); + return StrFormat(R"(%s [label=<%s>, shape=%s, tooltip="%s", %s];)" + "\n", + InstructionId(instr), node_body, node_shape, node_metadata, + NodeColorAttributes(color)); } string HloDotDumper::GetInstructionNodeInlinedOperands( const HloInstruction* instr) { - auto stringify_constant = [](const HloInstruction* constant) { + auto stringify_constant = [](const HloConstantInstruction* constant) { const auto& shape = constant->shape(); // If the shape has a dimension of size zero, print it as e.g. // "{} (f32[42, 0, 10])". 
The alternative, calling Literal::ToString(), // enumerates all of its empty dimensions (e.g. "{ { {}, {} }, ..."), which // is just noise. - if (!ShapeUtil::IsTuple(shape) && ShapeUtil::HasZeroElements(shape)) { - return Printf("{} (%s)", ShapeUtil::HumanString(constant->shape())); + if (ShapeUtil::IsZeroElementArray(shape)) { + return StrFormat("{} (%s)", ShapeUtil::HumanString(constant->shape())); } // Print the literal value of constants with <= K elements. optional elem_count; - if (!ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)) { + if (ShapeUtil::IsArray(shape)) { elem_count = 1; for (int64 dim : shape.dimensions()) { *elem_count *= dim; } } + // Allow HloDotDumper to print HloInstruction reconstructed from HloProto + // collected from profiling tools. Those constants may not have a valid + // literal. if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) { - return Printf("%s (%s)", constant->literal().ToString(), - ShapeUtil::HumanString(constant->shape())); + return StrFormat("%s (%s)", constant->literal().ToString(), + ShapeUtil::HumanString(constant->shape())); } // Otherwise, print e.g. "%constant.42 (s32[100])". string constant_name; - if (tensorflow::str_util::StartsWith(constant->name(), "constant")) { + if (absl::StartsWith(constant->name(), "constant")) { constant_name = constant->name(); } else { constant_name = StrCat("constant ", constant->name()); } - return Printf("%s %s", constant_name, - ShapeUtil::HumanString(constant->shape())); + return StrFormat("%s %s", constant_name, + ShapeUtil::HumanString(constant->shape())); }; - // Special case: If instr is a parameter to a fusion node, check whether the - // corresponding operand to the fusion node is a constant. 
- if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) { - const HloInstruction* fusion = instr->parent()->FusionInstruction(); - const HloInstruction* operand = fusion->operand(instr->parameter_number()); - if (operand->opcode() != HloOpcode::kConstant) { - return ""; - } - return StrCat("constant ", stringify_constant(operand)); - } - std::vector lines; for (int64 i = 0; i < instr->operand_count(); ++i) { const HloInstruction* operand = instr->operand(i); + const auto* constant_operand = DynCast(operand); optional operand_str; - if (operand->opcode() == HloOpcode::kConstant) { - operand_str = stringify_constant(operand); + if (constant_operand != nullptr) { + operand_str = stringify_constant(constant_operand); } else if (ShouldMergeIntoUsers(operand)) { - // Special case: If the operand is a parameter, use its parameter number - // rather than its name, because that's generally how people think of the - // node. + // Special case: If the operand is a parameter to a fusion node and it + // always has a constant value, display it like a regular constant. + // + // For other parameters, use the parameter number rather than the proper + // name, because that's generally how people think of the node. 
if (operand->opcode() == HloOpcode::kParameter) { - operand_str = Printf("Parameter %lld", operand->parameter_number()); + if (const HloConstantInstruction* constant = + TryGetFusionParameterConstant(operand)) { + operand_str = stringify_constant(constant); + } else { + operand_str = StrFormat("Parameter %d", operand->parameter_number()); + } } else { operand_str = operand->name(); } @@ -869,13 +870,13 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( if (operand_str) { if (instr->operand_count() > 1) { - lines.push_back(Printf("operand %lld = %s", i, *operand_str)); + lines.push_back(StrFormat("operand %d = %s", i, *operand_str)); } else { - lines.push_back(Printf("operand = %s", *operand_str)); + lines.push_back(StrFormat("operand = %s", *operand_str)); } } } - return Join(lines, "
"); + return StrJoin(lines, "
"); } ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { @@ -895,11 +896,14 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { const auto kParameterColor = kOrange; // Special case: If this instruction has a parameter merged into it, paint it - // the same color as a parameter. + // the same color as a parameter. Unless the merged-in parameter is a + // parameter to a fusion node that is bound to a constant -- these aren't + // "real" parameters from the user's perspective. if (std::any_of(instr->operands().begin(), instr->operands().end(), [&](const HloInstruction* operand) { return operand->opcode() == HloOpcode::kParameter && - ShouldMergeIntoUsers(operand); + ShouldMergeIntoUsers(operand) && + TryGetFusionParameterConstant(operand) == nullptr; })) { return kParameterColor; } @@ -927,6 +931,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGe: case HloOpcode::kGt: case HloOpcode::kImag: + case HloOpcode::kIota: case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLog: @@ -939,11 +944,13 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kNegate: case HloOpcode::kNot: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kPower: case HloOpcode::kReal: case HloOpcode::kRemainder: case HloOpcode::kRng: case HloOpcode::kRoundNearestAfz: + case HloOpcode::kSelect: case HloOpcode::kShiftLeft: case HloOpcode::kShiftRightArithmetic: case HloOpcode::kShiftRightLogical: @@ -962,6 +969,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kBitcast: case HloOpcode::kGetTupleElement: case HloOpcode::kTrace: + case HloOpcode::kAfterAll: case HloOpcode::kTuple: return kWhite; case HloOpcode::kBroadcast: @@ -973,13 +981,12 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { } return kGreen; case HloOpcode::kConcatenate: - case 
HloOpcode::kCopy: case HloOpcode::kDynamicSlice: case HloOpcode::kGather: case HloOpcode::kPad: case HloOpcode::kReshape: case HloOpcode::kReverse: - case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: case HloOpcode::kTranspose: // De-emphasize scalar-shaped data movement ops and all data movement ops // inside fusion nodes, both of which are essentially free. @@ -995,6 +1002,12 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { return kWhite; } return kGreen; + case HloOpcode::kScatter: + // Do not de-emphasize Scatter, since it involves significant work. + case HloOpcode::kCopy: + // Emphasize copy nodes, which are either physical transposes (and thus + // significant), or copies of read-only buffers (and thus dead weight). + return kGreen; case HloOpcode::kConvolution: case HloOpcode::kDot: case HloOpcode::kFft: @@ -1015,6 +1028,8 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kMap: return kGray; case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kRecv: @@ -1025,7 +1040,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kCustomCall: - case HloOpcode::kHostCompute: case HloOpcode::kWhile: return kDarkGreen; case HloOpcode::kConstant: @@ -1046,14 +1060,13 @@ string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { // If we have a parameter, put the param number in the name. if (instr->opcode() == HloOpcode::kParameter) { - return Printf("Parameter %lld", instr->parameter_number()); + return StrFormat("Parameter %d", instr->parameter_number()); } // The HLO instruction name contains usually the opcode, e.g. "%add.42" is // an add instruction. 
In this case we render just the name. - if (tensorflow::str_util::StartsWith(instr->name(), - HloOpcodeString(instr->opcode()))) { - return Printf("%s", HtmlLikeStringSanitize(instr->name())); + if (absl::StartsWith(instr->name(), HloOpcodeString(instr->opcode()))) { + return StrFormat("%s", HtmlLikeStringSanitize(instr->name())); } string extended_opcode = StrCat(HloOpcodeString(instr->opcode()), @@ -1061,8 +1074,8 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { ? "" : StrCat(":", xla::ToString(instr->fusion_kind()))); // If the name does not contain the opcode, render both. - return Printf("%s
%s", HtmlLikeStringSanitize(extended_opcode), - HtmlLikeStringSanitize(instr->name())); + return StrFormat("%s
%s", HtmlLikeStringSanitize(extended_opcode), + HtmlLikeStringSanitize(instr->name())); } string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { @@ -1071,25 +1084,25 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { lines.push_back(HtmlLikeStringSanitize(instr->metadata().op_name())); } if (!instr->metadata().op_type().empty()) { - lines.push_back(Printf( + lines.push_back(StrFormat( "op_type: %s", HtmlLikeStringSanitize(instr->metadata().op_type()))); } if (!instr->metadata().source_file().empty() && instr->metadata().source_line() != 0) { - lines.push_back(Printf("op_type: %s", instr->metadata().source_file(), - instr->metadata().source_line())); + lines.push_back(StrFormat("op_type: %s:%d", instr->metadata().source_file(), + instr->metadata().source_line())); } - return Join(lines, "
"); + return StrJoin(lines, "
"); } string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!show_backend_config_ || instr->backend_config().empty()) { + if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { return ""; } - return StrCat("backend_config=\"", instr->backend_config(), "\""); + return StrCat("backend_config=\"", instr->raw_backend_config_string(), "\""); } string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { @@ -1127,13 +1140,12 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { constexpr int kMaxShapeLen = 64; if (instr_shape.length() > kMaxShapeLen) { instr_shape = StrCat( - tensorflow::StringPiece(instr_shape).substr(0, kMaxShapeLen - 3), - "..."); + absl::string_view(instr_shape).substr(0, kMaxShapeLen - 3), "..."); } lines.push_back(instr_shape); } if (debug_options_.xla_hlo_graph_addresses()) { - lines.push_back(Printf("[%p]", instr)); + lines.push_back(StrFormat("[%p]", instr)); } if (profile_ != nullptr) { double hlo_cycles_executed = profile_->GetCyclesTakenBy(*instr); @@ -1141,11 +1153,11 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { profile_->total_cycles_executed(*instr->parent()); if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { lines.push_back( - Printf("%% of cycles executed=%.2f", - 100 * hlo_cycles_executed / total_cycles_executed)); + StrFormat("%% of cycles executed=%.2f", + 100 * hlo_cycles_executed / total_cycles_executed)); } } - return Join(lines, "
"); + return StrJoin(lines, "
"); } // Gets the total number of array elements in the given shape. For tuples, this @@ -1177,7 +1189,8 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { string edge_label; if (instr->operand_count() > 1 && !control_edge) { - edge_label = Printf(R"( headlabel="%lld", labeldistance=2)", operand_num); + edge_label = + StrFormat(R"( headlabel="%d", labeldistance=2)", operand_num); } else if (control_edge) { edge_label = "style=\"dotted\" color=\"gray\" label=\"ctrl\""; } @@ -1187,10 +1200,11 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { // means. bool is_big_array = TotalElementsInShape(from->shape()) >= 4096; - const char* kEdgeFmt = R"(%s -> %s [arrowhead=%s tooltip="%s -> %s" %s];)"; - edges_.push_back(Printf(kEdgeFmt, InstructionId(from), InstructionId(to), - (is_big_array ? "normal" : "empty"), from->name(), - to->name(), edge_label)); + constexpr char kEdgeFmt[] = + R"(%s -> %s [arrowhead=%s tooltip="%s -> %s" %s];)"; + edges_.push_back(StrFormat(kEdgeFmt, InstructionId(from), InstructionId(to), + (is_big_array ? "normal" : "empty"), + from->name(), to->name(), edge_label)); }; // Add edges from instr's operands to instr. Parameters within fusion @@ -1231,14 +1245,14 @@ string HloDotDumper::GetInstructionTrivialComputationStr( continue; } if (instr->called_computations().size() == 1) { - lines.push_back(Printf("Subcomputation: %s", - HtmlLikeStringSanitize(*computation_type))); + lines.push_back(StrFormat("Subcomputation: %s", + HtmlLikeStringSanitize(*computation_type))); } else { - lines.push_back(Printf("Subcomputation %lld: %s", i, - HtmlLikeStringSanitize(*computation_type))); + lines.push_back(StrFormat("Subcomputation %d: %s", i, + HtmlLikeStringSanitize(*computation_type))); } } - return Join(lines, "
"); + return StrJoin(lines, "
"); } const HloInstruction* HloDotDumper::GetNodeForEdge( diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc index 8e52d926d85f1ce6fabeb2dedd2f8e0fe0c2051d..064c53252c0ac4d4e7b93169ad7cbee4807cb963 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -22,12 +24,11 @@ limitations under the License. #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace { -using ::tensorflow::strings::StrCat; +using absl::StrCat; using ::testing::HasSubstr; string TestName() { @@ -120,8 +121,8 @@ TEST(HloGraphDumperTest, NestedFusion) { TEST(HloGraphDumperTest, Constant) { HloComputation::Builder b("b"); auto instruction = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(-42))); - instruction->set_name("i_am_a_constant_root_instruction"); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(-42))); + instruction->SetAndSanitizeName("i_am_a_constant_root_instruction"); HloModuleConfig config; HloModule m(TestName(), config); HloComputation* root_computation = m.AddEntryComputation(b.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index dc351e99681b59289232043c2c727a8ee7113a1d..6d13f85cbbca2ae4b2a794ca5de975fe21e8212e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ 
b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -16,39 +16,45 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include -#include #include #include #include #include +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/ascii.h" +#include "absl/strings/escaping.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/human_readable_json.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using tensorflow::str_util::CEscape; -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; +using absl::CEscape; +using absl::StrAppend; +using absl::StrCat; +using absl::StrJoin; /* 
static */ StatusOr> HloInstruction::CreateFromProto( @@ -59,106 +65,422 @@ StatusOr> HloInstruction::CreateFromProto( TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode())); TF_RET_CHECK(proto.has_shape()); - auto instruction = WrapUnique(new HloInstruction(opcode, proto.shape())); - for (const int64 operand_id : proto.operand_ids()) { - TF_RET_CHECK(ContainsKey(instruction_map, operand_id)) - << "No instruction with id " << operand_id; - instruction->AppendOperand(instruction_map.at(operand_id)); - } - for (const int64 predecessor_id : proto.control_predecessor_ids()) { - TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id)) - << "No instruction with id " << predecessor_id; - TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id) - ->AddControlDependencyTo(instruction.get())); - } - - // In the proto, fused computations are held exclusively within the - // HloInstructionProto and do not appear as an HloComputationProto within the - // HloModuleProto. - if (instruction->opcode() == HloOpcode::kFusion) { - TF_RET_CHECK(!proto.fusion_kind().empty()); - TF_ASSIGN_OR_RETURN(instruction->fusion_kind_, - StringToFusionKind(proto.fusion_kind())); - - // Find the fused computation and set its fusion instruction. 
- TF_RET_CHECK(proto.called_computation_ids_size() == 1) - << "Expect 1 called computation for fusion instruction, but sees " - << proto.called_computation_ids_size(); - const int64 fusion_id = proto.called_computation_ids(0); - auto* fused_computation = FindPtrOrNull(computation_map, fusion_id); - TF_RET_CHECK(fused_computation != nullptr) - << "No fusion computation with id " << fusion_id; - fused_computation->SetFusionInstruction(instruction.get()); - instruction->called_computations_.push_back(fused_computation); - } else { - for (const int64 computation_id : proto.called_computation_ids()) { - TF_RET_CHECK(ContainsKey(computation_map, computation_id)) - << "No computation with id " << computation_id; - instruction->called_computations_.push_back( - computation_map.at(computation_id)); + std::unique_ptr instruction; + const auto operands = [&instruction_map, &proto](int index) { + return instruction_map.at(proto.operand_ids(index)); + }; + const auto all_operands = [&instruction_map, &proto]() { + std::vector result(proto.operand_ids_size()); + std::transform(proto.operand_ids().begin(), proto.operand_ids().end(), + result.begin(), [&instruction_map](int64 operand_id) { + return instruction_map.at(operand_id); + }); + return result; + }; + const auto computations = [&computation_map, &proto](int index) { + return computation_map.at(proto.called_computation_ids(index)); + }; + switch (opcode) { + // Ops migrated to subclasses. 
+ case HloOpcode::kBatchNormTraining: + TF_RET_CHECK(proto.operand_ids_size() == 3) + << "BatchNormTraining instruction should have 3 operands but sees " + << proto.operand_ids_size(); + instruction = CreateBatchNormTraining( + proto.shape(), operands(0), operands(1), operands(2), proto.epsilon(), + proto.feature_index()); + break; + case HloOpcode::kBatchNormInference: + TF_RET_CHECK(proto.operand_ids_size() == 5) + << "BatchNormInference instruction should have 5 operands but sees " + << proto.operand_ids_size(); + instruction = CreateBatchNormInference( + proto.shape(), operands(0), operands(1), operands(2), operands(3), + operands(4), proto.epsilon(), proto.feature_index()); + break; + case HloOpcode::kBatchNormGrad: + TF_RET_CHECK(proto.operand_ids_size() == 5) + << "BatchNormGrad instruction should have 5 operands but sees " + << proto.operand_ids_size(); + instruction = CreateBatchNormGrad(proto.shape(), operands(0), operands(1), + operands(2), operands(3), operands(4), + proto.epsilon(), proto.feature_index()); + break; + case HloOpcode::kFft: { + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Fft instruction should have 1 operand but sees " + << proto.operand_ids_size(); + std::vector fft_length(proto.fft_length().begin(), + proto.fft_length().end()); + instruction = CreateFft(proto.shape(), operands(0), proto.fft_type(), + absl::Span(fft_length)); + break; + } + case HloOpcode::kSend: + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "Send instruction should have 2 operand but sees " + << proto.operand_ids_size(); + instruction = CreateSend(operands(0), operands(1), proto.channel_id(), + proto.is_host_transfer()); + break; + case HloOpcode::kSendDone: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "SendDone instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = CreateSendDone(operands(0), proto.is_host_transfer()); + break; + case HloOpcode::kRecv: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Recv 
instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = CreateRecv(proto.shape().tuple_shapes(0), operands(0), + proto.channel_id(), proto.is_host_transfer()); + break; + case HloOpcode::kRecvDone: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "RecvDone instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = CreateRecvDone(operands(0), proto.is_host_transfer()); + break; + case HloOpcode::kReverse: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Reverse instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = CreateReverse(proto.shape(), operands(0), + std::vector(proto.dimensions().begin(), + proto.dimensions().end())); + break; + case HloOpcode::kConcatenate: + TF_RET_CHECK(proto.dimensions_size() == 1) + << "Concatenate instruction should have 1 dimension but sees " + << proto.dimensions_size(); + instruction = + CreateConcatenate(proto.shape(), all_operands(), proto.dimensions(0)); + break; + case HloOpcode::kReduce: + TF_RET_CHECK(proto.operand_ids_size() % 2 == 0) + << "Reduce instruction should have an even number of operands but " + "sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "Reduce instruction should have 1 called computation but sees " + << proto.called_computation_ids_size(); + { + const auto reduce_operands = all_operands(); + auto inputs = absl::MakeSpan(reduce_operands) + .subspan(0, reduce_operands.size() / 2); + auto init_values = + absl::MakeSpan(reduce_operands) + .subspan(reduce_operands.size() / 2, reduce_operands.size()); + instruction = + CreateReduce(proto.shape(), inputs, init_values, + std::vector(proto.dimensions().begin(), + proto.dimensions().end()), + computations(0)); + } + break; + case HloOpcode::kSort: { + TF_RET_CHECK(proto.operand_ids_size() == 1 || + proto.operand_ids_size() == 2) + << "Sort instruction should have 1 or 2 operands but has " + << 
proto.operand_ids_size(); + TF_RET_CHECK(proto.dimensions().size() == 1) + << "Sort instruction should have 1 dimension"; + HloInstruction* keys = operands(0); + HloInstruction* values = + proto.operand_ids_size() == 2 ? operands(1) : nullptr; + instruction = + CreateSort(proto.shape(), proto.dimensions(0), keys, values); + break; + } + case HloOpcode::kTranspose: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Transpose instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = + CreateTranspose(proto.shape(), operands(0), + std::vector(proto.dimensions().begin(), + proto.dimensions().end())); + break; + case HloOpcode::kBroadcast: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Broadcast instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = + CreateBroadcast(proto.shape(), operands(0), + std::vector(proto.dimensions().begin(), + proto.dimensions().end())); + break; + case HloOpcode::kMap: + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "Map instruction should have 1 called computation but sees " + << proto.called_computation_ids_size(); + instruction = CreateMap(proto.shape(), all_operands(), computations(0)); + break; + case HloOpcode::kSlice: { + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Slice instruction should have 1 operand but sees " + << proto.operand_ids_size(); + std::vector slice_starts, slice_limits, slice_strides; + for (const HloInstructionProto::SliceDimensions& slice_dimensions : + proto.slice_dimensions()) { + slice_starts.push_back(slice_dimensions.start()); + slice_limits.push_back(slice_dimensions.limit()); + slice_strides.push_back(slice_dimensions.stride()); + } + instruction = CreateSlice(proto.shape(), operands(0), slice_starts, + slice_limits, slice_strides); + break; + } + case HloOpcode::kConstant: { + // TODO(b/110214922): Revert this to CHECK(proto.has_literal()). 
+ if (proto.has_literal()) { + TF_ASSIGN_OR_RETURN(auto literal, + Literal::CreateFromProto(proto.literal())); + instruction = CreateConstant(std::move(literal)); + } else { + instruction = absl::make_unique(proto.shape()); + } + break; + } + case HloOpcode::kTrace: { + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "Trace instruction should have 1 operand but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_literal()); + TF_ASSIGN_OR_RETURN(auto literal, + Literal::CreateFromProto(proto.literal())); + instruction = CreateTrace(literal->GetR1U8AsString(), operands(0)); + break; + } + case HloOpcode::kFusion: { + // In the proto, fused computations are held exclusively within the + // HloInstructionProto and do not appear as an HloComputationProto within + // the HloModuleProto. + TF_RET_CHECK(!proto.fusion_kind().empty()); + TF_ASSIGN_OR_RETURN(FusionKind fusion_kind, + StringToFusionKind(proto.fusion_kind())); + + // Find the fused computation and set its fusion instruction. 
+ TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "Expect 1 called computation for fusion instruction but sees " + << proto.called_computation_ids_size(); + const int64 fusion_id = proto.called_computation_ids(0); + auto* fused_computation = FindPtrOrNull(computation_map, fusion_id); + TF_RET_CHECK(fused_computation != nullptr) + << "No fusion computation with id " << fusion_id; + instruction = CreateFusion(proto.shape(), fusion_kind, all_operands(), + fused_computation); + break; + } + case HloOpcode::kRng: + instruction = + CreateRng(proto.shape(), proto.distribution(), all_operands()); + break; + case HloOpcode::kParameter: + instruction = CreateParameter(proto.parameter_number(), proto.shape(), + proto.name()); + break; + case HloOpcode::kGetTupleElement: + TF_RET_CHECK(proto.operand_ids_size() == 1) + << "GetTupleElement instruction should have 1 operand but sees " + << proto.operand_ids_size(); + instruction = CreateGetTupleElement(proto.shape(), operands(0), + proto.tuple_index()); + break; + case HloOpcode::kReducePrecision: + instruction = + CreateReducePrecision(proto.shape(), operands(0), + proto.exponent_bits(), proto.mantissa_bits()); + break; + case HloOpcode::kInfeed: { + const Shape& data_shape = + ShapeUtil::GetTupleElementShape(proto.shape(), 0); + TF_RET_CHECK(proto.operand_ids_size() == 1); + instruction = + CreateInfeed(data_shape, operands(0), proto.infeed_config()); + } break; + case HloOpcode::kOutfeed: + TF_RET_CHECK(proto.operand_ids_size() == 2); + instruction = CreateOutfeed(proto.outfeed_shape(), operands(0), + operands(1), proto.outfeed_config()); + break; + case HloOpcode::kCrossReplicaSum: { + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "CrossReplicaSum should have 1 called computation but sees " + << proto.called_computation_ids_size(); + absl::optional all_reduce_id; + if (proto.all_reduce_id() > 0) { + all_reduce_id = proto.all_reduce_id(); + } + instruction = CreateCrossReplicaSum( + proto.shape(), 
all_operands(), computations(0), + /*replica_groups=*/ + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end()), + /*barrier=*/proto.cross_replica_sum_barrier(), + /*all_reduce_id=*/all_reduce_id); + break; + } + case HloOpcode::kAllToAll: { + instruction = CreateAllToAll( + proto.shape(), all_operands(), + /*replica_groups=*/ + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end())); + break; + } + case HloOpcode::kCollectivePermute: { + std::vector> source_target_pairs( + proto.source_target_pairs_size()); + for (int i = 0; i < source_target_pairs.size(); i++) { + source_target_pairs[i].first = proto.source_target_pairs(i).source(); + source_target_pairs[i].second = proto.source_target_pairs(i).target(); + } + instruction = CreateCollectivePermute(proto.shape(), operands(0), + source_target_pairs); + break; + } + case HloOpcode::kConvolution: + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "Convolution instruction should have 2 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_window()); + TF_RET_CHECK(proto.has_convolution_dimension_numbers()); + instruction = CreateConvolve( + proto.shape(), operands(0), operands(1), proto.window(), + proto.convolution_dimension_numbers(), + std::max(static_cast(proto.feature_group_count()), 1LL)); + break; + case HloOpcode::kReduceWindow: + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "ReduceWindow instruction should have 2 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "ReduceWindow should have 1 called computation but sees " + << proto.called_computation_ids_size(); + instruction = CreateReduceWindow(proto.shape(), operands(0), operands(1), + proto.window(), computations(0)); + break; + case HloOpcode::kSelectAndScatter: + TF_RET_CHECK(proto.operand_ids_size() == 3) + << "SelectAndScatter instruction should have 3 operands but sees " + << proto.operand_ids_size(); + 
TF_RET_CHECK(proto.called_computation_ids_size() == 2) + << "SelectAndScatter should have 2 called computations but sees " + << proto.called_computation_ids_size(); + instruction = CreateSelectAndScatter( + proto.shape(), operands(0), computations(0), proto.window(), + operands(1), operands(2), computations(1)); + break; + case HloOpcode::kCustomCall: + instruction = CreateCustomCall(proto.shape(), all_operands(), + proto.custom_call_target()); + if (proto.has_window()) { + static_cast(instruction.get()) + ->set_window(proto.window()); + } + if (proto.has_convolution_dimension_numbers()) { + static_cast(instruction.get()) + ->set_convolution_dimension_numbers( + proto.convolution_dimension_numbers()); + } + static_cast(instruction.get()) + ->set_feature_group_count( + std::max(static_cast(proto.feature_group_count()), 1LL)); + break; + case HloOpcode::kPad: + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "Pad instruction should have 2 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_padding_config()); + instruction = CreatePad(proto.shape(), operands(0), operands(1), + proto.padding_config()); + break; + case HloOpcode::kDynamicSlice: { + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "DynamicSlice instruction should have 2 operands but sees " + << proto.operand_ids_size(); + std::vector slice_sizes(proto.dynamic_slice_sizes_size()); + absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin()); + instruction = CreateDynamicSlice(proto.shape(), operands(0), operands(1), + slice_sizes); + break; + } + case HloOpcode::kGather: { + TF_RET_CHECK(proto.operand_ids_size() == 2) + << "Gather instruction should have 2 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_gather_dimension_numbers()) + << "Gather instruction should have GatherDimensionNumbers set."; + std::unique_ptr gather_dimension_numbers = + absl::make_unique( + proto.gather_dimension_numbers()); + std::vector gather_slice_sizes; + for (int64 bound 
: proto.gather_slice_sizes()) { + gather_slice_sizes.push_back(bound); + } + instruction = CreateGather(proto.shape(), operands(0), operands(1), + *gather_dimension_numbers, gather_slice_sizes); + break; + } + case HloOpcode::kScatter: { + TF_RET_CHECK(proto.operand_ids_size() == 3) + << "Scatter instruction should have 3 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_scatter_dimension_numbers()) + << "Scatter instruction should have ScatterDimensionNumbers set."; + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "Scatter instruction should have 1 called computation but sees " + << proto.called_computation_ids_size(); + auto scatter_dimension_numbers = + absl::make_unique( + proto.scatter_dimension_numbers()); + instruction = + CreateScatter(proto.shape(), operands(0), operands(1), operands(2), + computations(0), *scatter_dimension_numbers); + break; + } + case HloOpcode::kIota: + TF_RET_CHECK(proto.dimensions_size() <= 1) + << "Iota instruction should have at most 1 dimension but sees " + << proto.dimensions_size(); + instruction = CreateIota(proto.shape(), proto.dimensions(0)); + break; + default: { + instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape())); + for (const int64 operand_id : proto.operand_ids()) { + TF_RET_CHECK(ContainsKey(instruction_map, operand_id)) + << "No instruction with id " << operand_id; + instruction->AppendOperand(instruction_map.at(operand_id)); + } + for (const int64 predecessor_id : proto.control_predecessor_ids()) { + TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id)) + << "No instruction with id " << predecessor_id; + TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id) + ->AddControlDependencyTo(instruction.get())); + } + if (instruction->opcode() != HloOpcode::kFusion) { + for (const int64 computation_id : proto.called_computation_ids()) { + TF_RET_CHECK(ContainsKey(computation_map, computation_id)) + << "No computation with id " << computation_id; + 
instruction->called_computations_.push_back( + computation_map.at(computation_id)); + } + } + break; } - } - - if (instruction->opcode() == HloOpcode::kTrace) { - TF_RET_CHECK(instruction->operands().size() == 1) - << "Trace instruction should have 1 operand but sees " - << instruction->operands().size(); - instruction->mutable_operand(0)->set_tracing(instruction.get()); } TF_RET_CHECK(!proto.name().empty()); - instruction->name_ = proto.name(); - + instruction->SetAndSanitizeName(proto.name()); instruction->metadata_ = proto.metadata(); - instruction->set_backend_config(proto.backend_config()); - if (proto.has_literal()) { - TF_ASSIGN_OR_RETURN(instruction->literal_, - Literal::CreateFromProto(proto.literal())); - } - instruction->parameter_number_ = proto.parameter_number(); + instruction->backend_config_ = proto.backend_config(); + instruction->precision_config_ = proto.precision_config(); - instruction->tuple_index_ = proto.tuple_index(); - for (int64 dimension : proto.dimensions()) { - instruction->dimensions_.push_back(dimension); - } - if (proto.has_window()) { - instruction->window_ = MakeUnique(proto.window()); - } - if (proto.has_convolution_dimension_numbers()) { - instruction->convolution_dimension_numbers_ = - MakeUnique( - proto.convolution_dimension_numbers()); - } if (proto.has_dot_dimension_numbers()) { instruction->dot_dimension_numbers_ = - MakeUnique(proto.dot_dimension_numbers()); - } - for (const HloInstructionProto::SliceDimensions& slice_dimensions : - proto.slice_dimensions()) { - instruction->slice_starts_.push_back(slice_dimensions.start()); - instruction->slice_limits_.push_back(slice_dimensions.limit()); - instruction->slice_strides_.push_back(slice_dimensions.stride()); - } - instruction->exponent_bits_ = proto.exponent_bits(); - instruction->mantissa_bits_ = proto.mantissa_bits(); - for (int64 dynamic_slice_size : proto.dynamic_slice_sizes()) { - instruction->dynamic_slice_sizes_.push_back(dynamic_slice_size); - } - if 
(proto.has_padding_config()) { - instruction->padding_config_ = - MakeUnique(proto.padding_config()); - } - instruction->outfeed_config_ = proto.outfeed_config(); - instruction->distribution_ = proto.distribution(); - instruction->epsilon_ = proto.epsilon(); - instruction->feature_index_ = proto.feature_index(); - instruction->channel_id_ = proto.channel_id(); - instruction->infeed_config_ = proto.infeed_config(); - instruction->custom_call_target_ = proto.custom_call_target(); - instruction->outfeed_shape_ = proto.outfeed_shape(); - instruction->fft_type_ = proto.fft_type(); - for (int64 fft_len : proto.fft_length()) { - instruction->fft_length_.push_back(fft_len); + absl::make_unique(proto.dot_dimension_numbers()); } if (proto.has_sharding()) { @@ -167,78 +489,51 @@ StatusOr> HloInstruction::CreateFromProto( instruction->set_sharding(sharding); } - if (proto.has_gather_dimension_numbers()) { - instruction->gather_dimension_numbers_ = - MakeUnique(proto.gather_dimension_numbers()); - } - for (int64 bound : proto.gather_window_bounds()) { - instruction->gather_window_bounds_.push_back(bound); - } - - instruction->channel_name_ = proto.channel_name(); - instruction->cost_estimate_ns_ = proto.cost_estimate_ns(); - return std::move(instruction); } /* static */ std::unique_ptr HloInstruction::CreateParameter( int64 parameter_number, const Shape& shape, const string& name) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kParameter, shape)); - instruction->parameter_number_ = parameter_number; - instruction->name_ = name; - return instruction; + return absl::make_unique(parameter_number, shape, + name); } /* static */ std::unique_ptr HloInstruction::CreateTrace( const string& tag, HloInstruction* operand) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil())); - instruction->operands_.push_back(operand); - instruction->literal_ = Literal::CreateR1U8(tag); - operand->set_tracing(instruction.get()); - return 
instruction; + return absl::make_unique(tag, operand); } /* static */ std::unique_ptr HloInstruction::CreateConstant( std::unique_ptr literal) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kConstant, literal->shape())); - instruction->literal_ = std::move(literal); - return instruction; + return absl::make_unique(std::move(literal)); +} + +/* static */ std::unique_ptr HloInstruction::CreateIota( + const Shape& shape, int64 iota_dimension) { + return absl::make_unique(shape, iota_dimension); } /* static */ std::unique_ptr HloInstruction::CreateGetTupleElement(const Shape& shape, HloInstruction* operand, int64 index) { - CHECK(ShapeUtil::IsTuple(operand->shape())); - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kGetTupleElement, shape)); - instruction->tuple_index_ = index; - instruction->AppendOperand(operand); - return instruction; + return absl::make_unique(shape, operand, + index); } /* static */ std::unique_ptr HloInstruction::CreateRng( const Shape& shape, RandomDistribution distribution, - tensorflow::gtl::ArraySlice parameters) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kRng, shape)); - instruction->distribution_ = distribution; - instruction->shape_ = shape; - for (HloInstruction* param : parameters) { - instruction->AppendOperand(param); - } - return instruction; + absl::Span parameters) { + return absl::make_unique(shape, distribution, parameters); } /* static */ std::unique_ptr HloInstruction::CreateNary( const Shape& shape, HloOpcode opcode, - tensorflow::gtl::ArraySlice operands) { + absl::Span operands) { if (opcode == HloOpcode::kCopy) { // It is impossible to copy an opaque shape, we don't know how big it is. 
CHECK(!ShapeUtil::IsOpaque(shape)); } - auto instruction = WrapUnique(new HloInstruction(opcode, shape)); + auto instruction = absl::WrapUnique(new HloInstruction(opcode, shape)); for (auto operand : operands) { instruction->AppendOperand(operand); } @@ -270,7 +565,6 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, case HloOpcode::kReal: case HloOpcode::kSign: case HloOpcode::kSin: - case HloOpcode::kSort: case HloOpcode::kTanh: break; default: @@ -305,6 +599,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, case HloOpcode::kSubtract: case HloOpcode::kAnd: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kShiftLeft: case HloOpcode::kShiftRightArithmetic: case HloOpcode::kShiftRightLogical: @@ -322,8 +617,9 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, // Only certain opcodes are supported with CreateTernary: opcodes of ternary // instructions with no auxiliary fields. switch (opcode) { - case (HloOpcode::kClamp): - case (HloOpcode::kSelect): + case HloOpcode::kClamp: + case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: break; default: LOG(FATAL) << "Invalid ternary instruction opcode " @@ -334,62 +630,41 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, /* static */ std::unique_ptr HloInstruction::CreateVariadic( const Shape& shape, HloOpcode opcode, - tensorflow::gtl::ArraySlice operands) { + absl::Span operands) { CHECK_EQ(HloOpcode::kTuple, opcode); return CreateNary(shape, opcode, operands); } /* static */ std::unique_ptr HloInstruction::CreateMap( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloComputation* map_computation, - tensorflow::gtl::ArraySlice static_operands) { - CHECK(static_operands.empty()) << "static_operands not yet supported"; - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kMap, shape)); - for (auto operand : operands) { - instruction->AppendOperand(operand); - } - instruction->called_computations_.push_back(map_computation); - return 
instruction; + const Shape& shape, absl::Span operands, + HloComputation* map_computation) { + return absl::make_unique(shape, operands, map_computation); } /* static */ std::unique_ptr HloInstruction::CreateConvolve( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, - const Window& window, - const ConvolutionDimensionNumbers& dimension_numbers) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kConvolution, shape)); - if (window_util::HasBaseDilation(window)) { - instruction->name_ = instruction->name() + "-base-dilated"; - } - if (window_util::HasWindowDilation(window)) { - instruction->name_ = instruction->name() + "-window-dilated"; - } - instruction->AppendOperand(lhs); - instruction->AppendOperand(rhs); - instruction->window_ = MakeUnique(window); - instruction->convolution_dimension_numbers_ = - MakeUnique(dimension_numbers); - return instruction; + const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count) { + return absl::make_unique( + shape, lhs, rhs, window, dimension_numbers, feature_group_count); } /* static */ std::unique_ptr HloInstruction::CreateFft( const Shape& shape, HloInstruction* operand, FftType fft_type, - tensorflow::gtl::ArraySlice fft_length) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFft, shape)); - instruction->AppendOperand(operand); - instruction->fft_type_ = fft_type; - instruction->fft_length_.assign(fft_length.begin(), fft_length.end()); - return instruction; + absl::Span fft_length) { + return absl::make_unique(shape, operand, fft_type, + fft_length); } /* static */ std::unique_ptr HloInstruction::CreateDot( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, const DotDimensionNumbers& dimension_numbers) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kDot, shape)); instruction->AppendOperand(lhs); 
instruction->AppendOperand(rhs); instruction->dot_dimension_numbers_ = - MakeUnique(dimension_numbers); + absl::make_unique(dimension_numbers); return instruction; } @@ -398,10 +673,12 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2); CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2); - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kDot, shape)); instruction->AppendOperand(lhs); instruction->AppendOperand(rhs); - instruction->dot_dimension_numbers_ = MakeUnique(); + instruction->dot_dimension_numbers_ = + absl::make_unique(); instruction->dot_dimension_numbers_->add_lhs_contracting_dimensions(1); instruction->dot_dimension_numbers_->add_rhs_contracting_dimensions(0); return instruction; @@ -412,100 +689,109 @@ HloInstruction::CreateReducePrecision(const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kReducePrecision, shape)); - instruction->AppendOperand(operand); - instruction->exponent_bits_ = exponent_bits; - instruction->mantissa_bits_ = mantissa_bits; - return instruction; + return absl::make_unique( + shape, operand, exponent_bits, mantissa_bits); } /* static */ std::unique_ptr HloInstruction::CreateCrossReplicaSum( - const Shape& shape, tensorflow::gtl::ArraySlice operands) { - return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands); + const Shape& shape, absl::Span operands, + HloComputation* reduce_computation, + const std::vector& replica_groups, absl::string_view barrier, + const absl::optional& all_reduce_id) { + return absl::make_unique( + shape, operands, reduce_computation, replica_groups, barrier, + all_reduce_id); +} + +/* static */ std::unique_ptr HloInstruction::CreateAllToAll( + const Shape& shape, absl::Span operands, + const std::vector& replica_groups) { + return 
absl::make_unique(shape, operands, + replica_groups); +} + +/* static */ std::unique_ptr +HloInstruction::CreateCollectivePermute( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs) { + return absl::make_unique( + shape, operand, source_target_pairs); } /* static */ std::unique_ptr HloInstruction::CreateInfeed( - const Shape& shape, const string& config) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kInfeed, shape)); - instruction->set_infeed_config(config); - return instruction; + const Shape& infeed_shape, HloInstruction* token_operand, + const string& config) { + return absl::make_unique(infeed_shape, token_operand, + config); } /* static */ std::unique_ptr HloInstruction::CreateOutfeed( - const Shape& shape, HloInstruction* operand, - tensorflow::StringPiece outfeed_config) { - std::unique_ptr instruction = - WrapUnique(new HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeNil())); - CHECK(ShapeUtil::Compatible(operand->shape(), shape)) - << "Outfeed shape " << shape << " must be compatible with operand shape " - << operand->shape(); - instruction->AppendOperand(operand); - instruction->outfeed_config_ = std::string(outfeed_config); - instruction->outfeed_shape_ = shape; - return instruction; + const Shape& outfeed_shape, HloInstruction* operand, + HloInstruction* token_operand, absl::string_view outfeed_config) { + return absl::make_unique( + outfeed_shape, operand, token_operand, outfeed_config); } /* static */ std::unique_ptr HloInstruction::CreateSend( - HloInstruction* operand, int64 channel_id) { - // Send instruction produces a tuple of {aliased operand, U32 context}. 
- Shape output_shape = ShapeUtil::MakeTupleShape( - {operand->shape(), ShapeUtil::MakeShape(U32, {})}); - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kSend, output_shape)); - instruction->AppendOperand(operand); - instruction->channel_id_ = channel_id; - return instruction; + HloInstruction* operand, HloInstruction* token, int64 channel_id, + bool is_host_transfer) { + return absl::make_unique(operand, token, channel_id, + is_host_transfer); } /* static */ std::unique_ptr HloInstruction::CreateSendDone( - HloInstruction* operand) { - CHECK(operand->opcode() == HloOpcode::kSend) + HloInstruction* operand, bool is_host_transfer) { + auto send_operand = DynCast(operand); + CHECK(send_operand != nullptr) << "SendDone must take the context operand from Send"; - auto instruction = WrapUnique( - new HloInstruction(HloOpcode::kSendDone, ShapeUtil::MakeNil())); - instruction->AppendOperand(operand); - instruction->channel_id_ = operand->channel_id(); - return instruction; + return absl::make_unique(send_operand, + is_host_transfer); } /* static */ std::unique_ptr HloInstruction::CreateRecv( - const Shape& shape, int64 channel_id) { - // Recv instruction produces a tuple of {receive buffer, U32 context}. 
- Shape output_shape = - ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}); - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kRecv, output_shape)); - instruction->channel_id_ = channel_id; - return instruction; + const Shape& shape, HloInstruction* token, int64 channel_id, + bool is_host_transfer) { + return absl::make_unique(shape, token, channel_id, + is_host_transfer); } /* static */ std::unique_ptr HloInstruction::CreateRecvDone( - HloInstruction* operand) { - CHECK(operand->opcode() == HloOpcode::kRecv) + HloInstruction* operand, bool is_host_transfer) { + auto recv_operand = DynCast(operand); + CHECK(recv_operand != nullptr) << "RecvDone must take the context operand from Recv"; - Shape output_shape = ShapeUtil::GetTupleElementShape(operand->shape(), 0); - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kRecvDone, output_shape)); - instruction->AppendOperand(operand); - instruction->channel_id_ = operand->channel_id(); - return instruction; + return absl::make_unique(recv_operand, + is_host_transfer); } /* static */ std::unique_ptr HloInstruction::CreateReverse( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice dimensions) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReverse, shape)); - instruction->AppendOperand(operand); - instruction->dimensions_.assign(dimensions.begin(), dimensions.end()); + absl::Span dimensions) { + return absl::make_unique(shape, operand, dimensions); +} + +/* static */ std::unique_ptr HloInstruction::CreateAfterAll( + absl::Span operands) { + CHECK(!operands.empty()); + auto instruction = absl::WrapUnique( + new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape())); + for (auto operand : operands) { + instruction->AppendOperand(operand); + } return instruction; } +/* static */ std::unique_ptr HloInstruction::CreateToken() { + return absl::WrapUnique( + new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape())); +} + /* 
static */ std::unique_ptr HloInstruction::CreateWhile( const Shape& shape, HloComputation* condition, HloComputation* body, HloInstruction* init) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kWhile, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kWhile, shape)); instruction->AppendOperand(init); // Body comes before condition computation in the vector. instruction->called_computations_.push_back(body); @@ -518,7 +804,7 @@ HloInstruction::CreateCrossReplicaSum( HloInstruction* true_computation_arg, HloComputation* true_computation, HloInstruction* false_computation_arg, HloComputation* false_computation) { auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kConditional, shape)); + absl::WrapUnique(new HloInstruction(HloOpcode::kConditional, shape)); instruction->AppendOperand(pred); instruction->AppendOperand(true_computation_arg); instruction->AppendOperand(false_computation_arg); @@ -532,33 +818,17 @@ HloInstruction::CreateCrossReplicaSum( /* static */ std::unique_ptr HloInstruction::CreateSlice( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape)); - instruction->AppendOperand(operand); - instruction->slice_starts_.assign(start_indices.begin(), start_indices.end()); - instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end()); - instruction->slice_strides_.assign(strides.begin(), strides.end()); - // For backward compatibility with old serialized computations: if there are - // no strides, assume all strides are 1. - // TODO(b/63317920): remove this code. 
- if (instruction->slice_strides_.empty()) { - instruction->slice_strides_ = std::vector(start_indices.size(), 1LL); - } - return instruction; + absl::Span start_indices, + absl::Span limit_indices, absl::Span strides) { + return absl::make_unique(shape, operand, start_indices, + limit_indices, strides); } /* static */ std::unique_ptr HloInstruction::CreateDynamicSlice( const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, - tensorflow::gtl::ArraySlice slice_sizes) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kDynamicSlice, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(start_indices); - instruction->dynamic_slice_sizes_.assign(slice_sizes.begin(), - slice_sizes.end()); - return instruction; + absl::Span slice_sizes) { + return absl::make_unique( + shape, operand, start_indices, slice_sizes); } /* static */ std::unique_ptr @@ -566,8 +836,8 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape, HloInstruction* operand, HloInstruction* update, HloInstruction* start_indices) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape)); + auto instruction = absl::WrapUnique( + new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape)); instruction->AppendOperand(operand); instruction->AppendOperand(update); instruction->AppendOperand(start_indices); @@ -575,20 +845,16 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape, } /* static */ std::unique_ptr HloInstruction::CreateConcatenate( - const Shape& shape, tensorflow::gtl::ArraySlice operands, + const Shape& shape, absl::Span operands, int64 dimension) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kConcatenate, shape)); - for (auto operand : operands) { - instruction->AppendOperand(operand); - } - instruction->dimensions_.push_back(dimension); - return instruction; + return absl::make_unique(shape, operands, + dimension); } /* static */ std::unique_ptr 
HloInstruction::CreateConvert( const Shape& shape, HloInstruction* operand) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kConvert, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kConvert, shape)); instruction->AppendOperand(operand); return instruction; } @@ -597,34 +863,38 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape, HloInstruction::CreateBitcastConvert(const Shape& shape, HloInstruction* operand) { auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape)); + absl::WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape)); instruction->AppendOperand(operand); return instruction; } /* static */ std::unique_ptr HloInstruction::CreateReduce( - const Shape& shape, HloInstruction* arg, HloInstruction* init_value, - tensorflow::gtl::ArraySlice dimensions_to_reduce, + const Shape& shape, HloInstruction* operand, HloInstruction* init_value, + absl::Span dimensions_to_reduce, HloComputation* reduce_computation) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReduce, shape)); - instruction->AppendOperand(arg); - instruction->AppendOperand(init_value); - instruction->dimensions_.assign(dimensions_to_reduce.begin(), - dimensions_to_reduce.end()); - instruction->called_computations_.push_back(reduce_computation); - return instruction; + auto instruction = absl::WrapUnique(new HloReduceInstruction( + shape, {operand, init_value}, dimensions_to_reduce, reduce_computation)); + return std::move(instruction); +} + +/* static */ std::unique_ptr HloInstruction::CreateReduce( + const Shape& shape, absl::Span operands, + absl::Span init_values, + absl::Span dimensions_to_reduce, + HloComputation* reduce_computation) { + std::vector all_args; + all_args.reserve(operands.size() * 2); + all_args.insert(all_args.end(), operands.begin(), operands.end()); + all_args.insert(all_args.end(), init_values.begin(), init_values.end()); + return absl::make_unique( + shape, 
all_args, dimensions_to_reduce, reduce_computation); } /* static */ std::unique_ptr HloInstruction::CreateReduceWindow( const Shape& shape, HloInstruction* operand, HloInstruction* init_value, const Window& window, HloComputation* reduce_computation) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kReduceWindow, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(init_value); - instruction->called_computations_.push_back(reduce_computation); - instruction->window_ = MakeUnique(window); - return instruction; + return absl::make_unique( + shape, operand, init_value, window, reduce_computation); } /* static */ std::unique_ptr @@ -633,14 +903,8 @@ HloInstruction::CreateBatchNormTraining(const Shape& shape, HloInstruction* scale, HloInstruction* offset, float epsilon, int64 feature_index) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBatchNormTraining, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(scale); - instruction->AppendOperand(offset); - instruction->epsilon_ = epsilon; - instruction->feature_index_ = feature_index; - return instruction; + return absl::make_unique( + shape, operand, scale, offset, epsilon, feature_index); } /* static */ std::unique_ptr @@ -648,16 +912,8 @@ HloInstruction::CreateBatchNormInference( const Shape& shape, HloInstruction* operand, HloInstruction* scale, HloInstruction* offset, HloInstruction* mean, HloInstruction* variance, float epsilon, int64 feature_index) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBatchNormInference, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(scale); - instruction->AppendOperand(offset); - instruction->AppendOperand(mean); - instruction->AppendOperand(variance); - instruction->epsilon_ = epsilon; - instruction->feature_index_ = feature_index; - return instruction; + return absl::make_unique( + shape, operand, scale, offset, mean, variance, epsilon, feature_index); 
} /* static */ std::unique_ptr @@ -666,16 +922,9 @@ HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand, HloInstruction* variance, HloInstruction* grad_output, float epsilon, int64 feature_index) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBatchNormGrad, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(scale); - instruction->AppendOperand(mean); - instruction->AppendOperand(variance); - instruction->AppendOperand(grad_output); - instruction->epsilon_ = epsilon; - instruction->feature_index_ = feature_index; - return instruction; + return absl::make_unique( + shape, operand, scale, mean, variance, grad_output, epsilon, + feature_index); } /* static */ std::unique_ptr @@ -683,27 +932,15 @@ HloInstruction::CreateSelectAndScatter( const Shape& shape, HloInstruction* operand, HloComputation* select, const Window& window, HloInstruction* source, HloInstruction* init_value, HloComputation* scatter) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kSelectAndScatter, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(source); - instruction->AppendOperand(init_value); - // Select comes before scatter in the vector. 
- instruction->called_computations_.push_back(select); - instruction->called_computations_.push_back(scatter); - instruction->window_ = MakeUnique(window); - return instruction; + return absl::make_unique( + shape, operand, select, window, source, init_value, scatter); } /* static */ std::unique_ptr HloInstruction::CreateBroadcast( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBroadcast, shape)); - instruction->AppendOperand(operand); - instruction->dimensions_.assign(broadcast_dimensions.begin(), - broadcast_dimensions.end()); - return instruction; + absl::Span broadcast_dimensions) { + return absl::make_unique(shape, operand, + broadcast_dimensions); } /* static */ std::unique_ptr @@ -761,11 +998,8 @@ HloInstruction::CreateBroadcastSequence( /* static */ std::unique_ptr HloInstruction::CreatePad( const Shape& shape, HloInstruction* operand, HloInstruction* padding_value, const PaddingConfig& padding_config) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kPad, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(padding_value); - instruction->padding_config_ = MakeUnique(padding_config); - return instruction; + return absl::make_unique(shape, operand, padding_value, + padding_config); } /* static */ std::unique_ptr HloInstruction::CreateReshape( @@ -774,61 +1008,44 @@ HloInstruction::CreateBroadcastSequence( ShapeUtil::ElementsIn(operand->shape())) << "shape: " << ShapeUtil::HumanString(shape) << " operand: " << ShapeUtil::HumanString(operand->shape()); - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReshape, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kReshape, shape)); instruction->AppendOperand(operand); return instruction; } /* static */ std::unique_ptr HloInstruction::CreateTranspose( const Shape& shape, HloInstruction* operand, - 
tensorflow::gtl::ArraySlice dimensions) { - CHECK_EQ(shape.dimensions().size(), dimensions.size()); - CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size()); - CHECK(std::equal(operand->shape().dimensions().begin(), - operand->shape().dimensions().end(), - Permute(dimensions, shape.dimensions()).begin())) - << "shape: " << ShapeUtil::HumanString(shape) - << ", operand->shape(): " << ShapeUtil::HumanString(shape) - << ", dimensions: {" << Join(dimensions, ", ") << "}"; - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kTranspose, shape)); - instruction->AppendOperand(operand); - instruction->dimensions_.assign(dimensions.begin(), dimensions.end()); - return instruction; + absl::Span dimensions) { + return absl::make_unique(shape, operand, dimensions); +} + +/* static */ std::unique_ptr HloInstruction::CreateSort( + const Shape& shape, int64 dimension, HloInstruction* keys, + HloInstruction* values) { + return absl::make_unique(shape, dimension, keys, values); } /* static */ std::unique_ptr HloInstruction::CreateFusion( const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); - instruction->fusion_kind_ = fusion_kind; - instruction->name_ = "fusion"; - instruction->set_parent(fused_root->parent()); - instruction->set_metadata(fused_root->metadata()); - instruction->CloneAndFuseInternal(fused_root); - return instruction; + return absl::make_unique(shape, fusion_kind, + fused_root); } /* static */ std::unique_ptr HloInstruction::CreateFusion( const Shape& shape, FusionKind fusion_kind, - tensorflow::gtl::ArraySlice operands, + absl::Span operands, HloComputation* fusion_computation) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); - for (auto operand : operands) { - instruction->AppendOperand(operand); - } - instruction->fusion_kind_ = fusion_kind; - instruction->name_ = "fusion"; - 
instruction->called_computations_.push_back(fusion_computation); - fusion_computation->SetFusionInstruction(instruction.get()); - return instruction; + return absl::make_unique(shape, fusion_kind, operands, + fusion_computation); } -void HloInstruction::set_device_sharding(int64 device) { - HloSharding device_sharding = HloSharding::AssignDevice(device); +void HloInstruction::set_single_sharding(const HloSharding& sharding) { + CHECK(!sharding.IsTuple()) << sharding; if (ShapeUtil::IsTuple(shape())) { - set_sharding(HloSharding::Tuple(device_sharding.GetAsShapeTree(shape()))); + set_sharding(HloSharding::Tuple(sharding.GetAsShapeTree(shape()))); } else { - set_sharding(device_sharding); + set_sharding(sharding); } } @@ -840,326 +1057,45 @@ void HloInstruction::SetupDerivedInstruction( derived_instruction->clear_sharding(); } derived_instruction->set_metadata(metadata_); + derived_instruction->set_precision_config(precision_config_); } -HloInstruction* HloInstruction::AddFusionOperand(HloInstruction* new_operand) { - CHECK_EQ(opcode(), HloOpcode::kFusion); - CHECK_EQ(operand_count(), - fused_instructions_computation()->parameter_instructions().size()); - const int64 param_no = operand_count(); - // Name the parameter after the instruction it represents in the outer - // (non-fusion) computation. 
- string param_name = StrCat(new_operand->name(), ".param_", param_no); - HloInstruction* fused_parameter = - fused_instructions_computation()->AddParameter( - HloInstruction::CreateParameter(param_no, new_operand->shape(), - param_name)); - AppendOperand(new_operand); - return fused_parameter; +bool HloInstruction::HasSideEffectNoRecurse() const { + switch (opcode_) { + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kRng: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kTrace: + return true; + case HloOpcode::kCrossReplicaSum: + return all_reduce_id().has_value(); + default: + return false; + } } -void HloInstruction::MergeFusionInstruction( - HloInstruction* instruction_to_merge) { - CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK_EQ(instruction_to_merge->opcode(), HloOpcode::kFusion); - CHECK(std::find(operands().begin(), operands().end(), instruction_to_merge) != - operands().end()); - // Clone the instruction from which to merge fused instructions. - std::unique_ptr clone = instruction_to_merge->Clone(); - // Replace uses of fused parameters with the corresponding operand of the - // fusion. Add all non-parameter fused instructions to 'unfused_instructions' - // to be merged into 'this'. This is done in reverse post order. 
- std::vector unfused_instructions; - auto fused_instructions = - clone->fused_instructions_computation()->MakeInstructionPostOrder(); - for (auto fused_it = fused_instructions.rbegin(); - fused_it != fused_instructions.rend(); ++fused_it) { - auto fused_instruction = *fused_it; - if (fused_instruction->opcode() == HloOpcode::kParameter) { - TF_CHECK_OK(fused_instruction->ReplaceAllUsesWith( - clone->mutable_operand(fused_instruction->parameter_number()))); - } else { - unfused_instructions.push_back(fused_instruction); - } +bool HloInstruction::HasSideEffect() const { + if (HasSideEffectNoRecurse()) { + return true; } - CHECK(unfused_instructions.front() == clone->fused_expression_root()); - // Replace instruction_to_merge use of 'this' with unfused_root. - TF_CHECK_OK( - instruction_to_merge->ReplaceUseWith(this, unfused_instructions.front())); - // Fuse 'unfused_instructions' into 'this'. - for (auto& instruction : unfused_instructions) { - FuseInstruction(instruction); - instruction->DetachFromOperands(); + // Check if any of the called computations has a side effect. + for (const auto& computation : called_computations()) { + if (computation->HasSideEffect()) { + return true; + } } - CHECK_EQ(0, clone->user_count()); - clone->DetachFromOperands(); - TF_CHECK_OK(parent()->parent()->RemoveEmbeddedComputation( - clone->fused_instructions_computation())); -} - -void HloInstruction::MergeFusionInstructionIntoMultiOutput( - HloInstruction* instruction_to_merge) { - CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK_EQ(instruction_to_merge->opcode(), HloOpcode::kFusion); - // Add all non-parameter fused instructions to 'unfused_instructions' to be - // merged into 'this'. `old_to_new' maps the instructions in the fused node - // to the disaseembled fusion instructions. - // Note that we add the unfused instructions to this->parent_ computation. 
- // This is necessary because the unique_id needs for an instruction and - // it's only added when inserting to the computation. - tensorflow::gtl::FlatMap old_to_new; - std::vector unfused_instructions; - auto computation_to_merge = - instruction_to_merge->fused_instructions_computation(); - auto post_order = computation_to_merge->MakeInstructionPostOrder(); - for (auto rit = post_order.rbegin(); rit != post_order.rend(); ++rit) { - auto fused_instruction = *rit; - if (fused_instruction->opcode() == HloOpcode::kParameter) { - InsertOrDie(&old_to_new, fused_instruction, - instruction_to_merge->mutable_operand( - fused_instruction->parameter_number())); - continue; - } - - // Here we clone the insertion and call FuseInstructionIntoMultiOutput() - // which clones again. This can be improved. - auto cloned_instruction = - parent_->AddInstruction(fused_instruction->Clone()); - unfused_instructions.push_back(cloned_instruction); - InsertOrDie(&old_to_new, fused_instruction, cloned_instruction); - } - for (auto unfused_instruction : unfused_instructions) { - for (int64 index = 0; index < unfused_instruction->operand_count(); - index++) { - auto new_operand = - FindOrDie(old_to_new, unfused_instruction->mutable_operand(index)); - TF_CHECK_OK(unfused_instruction->ReplaceOperandWith(index, new_operand)); - } - } - - HloInstruction* unfused_root = unfused_instructions.front(); - TF_CHECK_OK(instruction_to_merge->ReplaceAllUsesWith(unfused_root)); - - TF_CHECK_OK( - instruction_to_merge->parent()->RemoveInstruction(instruction_to_merge)); - if (GetModule()) { - TF_CHECK_OK(GetModule()->RemoveEmbeddedComputation(computation_to_merge)); - } - - // Fuse the root instruction and generate multiple outputs. - FuseInstructionIntoMultiOutput(unfused_root); - TF_CHECK_OK(unfused_root->parent()->RemoveInstruction(unfused_root)); - // The rest instructions are of normal fusing. 
- for (int64 i = 1; i < unfused_instructions.size(); i++) { - auto instruction = unfused_instructions[i]; - FuseInstruction(instruction); - TF_CHECK_OK(instruction->parent()->RemoveInstruction(instruction)); - } -} - -HloInstruction* HloInstruction::FuseInstructionInternal( - HloInstruction* instruction_to_fuse, bool add_output) { - CHECK_EQ(opcode_, HloOpcode::kFusion); - - // When add_output is false, this fusion instruction must be a user of - // instruction_to_fuse. - if (!add_output) { - CHECK(IsUserOf(instruction_to_fuse)); - } - HloInstruction* fused_instruction = - CloneAndFuseInternal(instruction_to_fuse, add_output); - return fused_instruction; -} - -HloInstruction* HloInstruction::CloneAndFuseInternal( - HloInstruction* instruction_to_fuse, bool add_output) { - CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(instruction_to_fuse->IsFusable()) << instruction_to_fuse->ToString(); - VLOG(3) << "CloneAndFuseInternal:\n" << instruction_to_fuse->ToString(); - HloInstruction* clone = nullptr; - if (called_computations_.empty()) { - // New fusion instruction. It should not be a multioutput instruction. - CHECK(!add_output); - auto builder = HloComputation::Builder("fused_computation", this); - builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/"")); - called_computations_.push_back( - CHECK_NOTNULL(GetModule())->AddEmbeddedComputation(builder.Build())); - clone = fused_expression_root(); - } else { - clone = fused_instructions_computation()->AddInstruction( - instruction_to_fuse->Clone(/*suffix=*/"")); - // When add_output is false, instruction_to_fuse is necessarily an operand - // of the fusion instruction. After fusion this will no longer be the case. - // Remove the operand from the operand list and remove its corresponding - // fused parameter instruction. Renumber parameters as necessary to make - // parameter numbers consistent with their index in the - // fused_parameter_ vector. 
- bool in_operand_list = std::find(operands_.begin(), operands_.end(), - instruction_to_fuse) != operands_.end(); - CHECK(add_output || in_operand_list); - const std::vector& fused_parameters = - fused_instructions_computation()->parameter_instructions(); - for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { - if (instruction_to_fuse == operands_[operand_num]) { - // replace the fused parameter instruction's uses with the clone. - HloInstruction* fused_parameter = fused_parameters[operand_num]; - TF_CHECK_OK(fused_parameter->ReplaceAllUsesWith(clone)); - - // Remove the corresponding fused parameter and operand from their - // respective vectors. - TF_CHECK_OK( - fused_instructions_computation()->RemoveParameter(operand_num)); - operands_.erase(operands_.begin() + operand_num); - break; - } - } - // We've cloned instruction_to_fuse into this fusion instruction, so this - // fusion instruction is no longer a use of instruction_to_fuse. - if (in_operand_list) { - instruction_to_fuse->RemoveUser(this); - // When the instruction_to_fuse does not have other users, we don't need - // to generate a multioutput fusion instruction. - if (instruction_to_fuse->user_count() == 0) { - add_output = false; - } - } - } - - // Reread the parameters in the computation. - const std::vector& fused_parameters = - fused_instructions_computation()->parameter_instructions(); - - // Add each operand of the clone as an operand of the fusion instruction. A - // complication is that some clone operands may already be operands of the - // fusion instruction. - for (int64 operand_num = 0; operand_num < clone->operand_count(); - ++operand_num) { - HloInstruction* operand = clone->mutable_operand(operand_num); - - // See if this operand is already an operand of the fusion node. 
- CHECK_EQ(operands_.size(), fused_parameters.size()); - HloInstruction* fused_param = nullptr; - for (int64 i = 0; i < operands_.size(); ++i) { - if (operands_[i] == operand) { - fused_param = fused_parameters[i]; - break; - } - } - - if (fused_param == nullptr) { - // Clone's operand was not already an operand of the fusion - // instruction. Add it as an operand and add a corresponding fused - // parameter instruction. - fused_param = AddFusionOperand(operand); - } - TF_CHECK_OK(clone->ReplaceOperandWith(operand_num, fused_param)); - } - - if (add_output) { - CHECK_GT(instruction_to_fuse->user_count(), 0); - // If this is already a multioutput fusion instruction, expand the root - // tuple by 1. - HloInstruction* fused_root = fused_expression_root(); - HloInstruction::InstructionVector tuple_elements; - bool newly_created_tuple_instr = false; - if (fused_root->opcode() == HloOpcode::kTuple) { - tuple_elements = fused_root->operands(); - } else { - tuple_elements.push_back(fused_root); - newly_created_tuple_instr = true; - } - if (clone->opcode() == HloOpcode::kTuple) { - for (auto inst : clone->operands()) { - tuple_elements.push_back(inst); - } - } else { - tuple_elements.push_back(clone); - } - HloInstruction* new_root = fused_instructions_computation()->AddInstruction( - HloInstruction::CreateTuple(tuple_elements)); - fused_instructions_computation()->set_root_instruction(new_root); - shape_ = new_root->shape(); - if (fused_root->opcode() == HloOpcode::kTuple) { - TF_CHECK_OK( - fused_instructions_computation()->RemoveInstruction(fused_root)); - } - - // If this is a newly created multioutput instruction, we need to update - // the use of the original fusion instruction. 
- if (newly_created_tuple_instr) { - HloInstruction* new_instr = parent_->AddInstruction( - HloInstruction::CreateGetTupleElement(fused_root->shape(), this, 0)); - TF_CHECK_OK(ReplaceAllUsesWith(new_instr)); - } - int64 index = tuple_elements.size(); - if (instruction_to_fuse->opcode() == HloOpcode::kTuple) { - index -= instruction_to_fuse->operand_count(); - std::vector to_be_removed; - for (auto old_gte : instruction_to_fuse->users()) { - CHECK_EQ(old_gte->opcode(), HloOpcode::kGetTupleElement); - int64 old_tuple_index = old_gte->tuple_index(); - HloInstruction* new_gte = - parent_->AddInstruction(HloInstruction::CreateGetTupleElement( - old_gte->shape(), this, index + old_tuple_index)); - TF_CHECK_OK(old_gte->ReplaceAllUsesWith(new_gte)); - to_be_removed.push_back(old_gte); - } - for (auto old_gte : to_be_removed) { - TF_CHECK_OK(parent_->RemoveInstruction(old_gte)); - } - TF_CHECK_OK(fused_instructions_computation()->RemoveInstruction(clone)); - } else { - HloInstruction* new_gte = - parent_->AddInstruction(HloInstruction::CreateGetTupleElement( - clone->shape(), this, index - 1)); - TF_CHECK_OK(instruction_to_fuse->ReplaceAllUsesWith(new_gte)); - } - } - - VLOG(2) << "New clone:\n" << clone->ToString(); - return clone; -} - -RandomDistribution HloInstruction::random_distribution() const { - CHECK_EQ(opcode_, HloOpcode::kRng); - return distribution_; -} - -bool HloInstruction::HasSideEffectNoRecurse() const { - switch (opcode_) { - case HloOpcode::kSend: - case HloOpcode::kSendDone: - case HloOpcode::kRecv: - case HloOpcode::kRecvDone: - case HloOpcode::kRng: - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kTrace: - case HloOpcode::kHostCompute: - return true; - default: - return false; - } -} - -bool HloInstruction::HasSideEffect() const { - if (HasSideEffectNoRecurse()) { - return true; - } - // Check if any of the called computations has a side effect. 
- for (const auto& computation : called_computations()) { - if (computation->HasSideEffect()) { - return true; - } - } - return false; + return false; } /* static */ std::unique_ptr HloInstruction::CreateCall( - const Shape& shape, tensorflow::gtl::ArraySlice operands, + const Shape& shape, absl::Span operands, HloComputation* computation) { std::unique_ptr instruction = - WrapUnique(new HloInstruction(HloOpcode::kCall, shape)); + absl::WrapUnique(new HloInstruction(HloOpcode::kCall, shape)); for (auto operand : operands) { instruction->AppendOperand(operand); } @@ -1168,32 +1104,14 @@ bool HloInstruction::HasSideEffect() const { } /* static */ std::unique_ptr HloInstruction::CreateCustomCall( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - tensorflow::StringPiece custom_call_target) { - std::unique_ptr instruction = - WrapUnique(new HloInstruction(HloOpcode::kCustomCall, shape)); - for (auto operand : operands) { - instruction->AppendOperand(operand); - } - instruction->custom_call_target_ = std::string(custom_call_target); - return instruction; -} - -/* static */ std::unique_ptr HloInstruction::CreateHostCompute( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - tensorflow::StringPiece channel_name, const int64 cost_estimate_ns) { - std::unique_ptr instruction = - WrapUnique(new HloInstruction(HloOpcode::kHostCompute, shape)); - for (auto operand : operands) { - instruction->AppendOperand(operand); - } - instruction->channel_name_ = std::string(channel_name); - instruction->cost_estimate_ns_ = cost_estimate_ns; - return instruction; + const Shape& shape, absl::Span operands, + absl::string_view custom_call_target) { + return absl::make_unique(shape, operands, + custom_call_target); } /* static */ std::unique_ptr HloInstruction::CreateTuple( - tensorflow::gtl::ArraySlice elements) { + absl::Span elements) { std::vector element_shapes; for (auto element : elements) { element_shapes.push_back(element->shape()); @@ -1203,44 +1121,29 @@ bool 
HloInstruction::HasSideEffect() const { } /* static */ std::unique_ptr HloInstruction::CreateGather( - const Shape& shape, HloInstruction* operand, HloInstruction* gather_indices, + const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - tensorflow::gtl::ArraySlice window_bounds) { - std::unique_ptr instruction = - WrapUnique(new HloInstruction(HloOpcode::kGather, shape)); - instruction->AppendOperand(operand); - instruction->AppendOperand(gather_indices); - instruction->gather_dimension_numbers_ = - MakeUnique(gather_dim_numbers); - c_copy(window_bounds, std::back_inserter(instruction->gather_window_bounds_)); - return instruction; + absl::Span slice_sizes) { + return absl::make_unique( + shape, operand, start_indices, gather_dim_numbers, slice_sizes); } -/* static */ GatherDimensionNumbers HloInstruction::MakeGatherDimNumbers( - tensorflow::gtl::ArraySlice output_window_dims, - tensorflow::gtl::ArraySlice elided_window_dims, - tensorflow::gtl::ArraySlice gather_dims_to_operand_dims, - int64 index_vector_dim) { - GatherDimensionNumbers gather_dim_numbers; - for (int64 output_window_dim : output_window_dims) { - gather_dim_numbers.add_output_window_dims(output_window_dim); - } - for (int64 elided_window_dim : elided_window_dims) { - gather_dim_numbers.add_elided_window_dims(elided_window_dim); - } - for (int64 gather_dim_to_input_dim : gather_dims_to_operand_dims) { - gather_dim_numbers.add_gather_dims_to_operand_dims(gather_dim_to_input_dim); - } - - gather_dim_numbers.set_index_vector_dim(index_vector_dim); - return gather_dim_numbers; +/* static */ std::unique_ptr HloInstruction::CreateScatter( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers) { + return absl::make_unique( + shape, operand, scatter_indices, updates, update_computation, + 
scatter_dim_numbers); } /* static */ std::unique_ptr HloInstruction::CreateDomain( const Shape& shape, HloInstruction* operand, std::unique_ptr operand_side_metadata, std::unique_ptr user_side_metadata) { - auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDomain, shape)); + auto instruction = + absl::WrapUnique(new HloInstruction(HloOpcode::kDomain, shape)); instruction->operand_side_metadata_ = std::move(operand_side_metadata); instruction->user_side_metadata_ = std::move(user_side_metadata); instruction->AppendOperand(operand); @@ -1248,8 +1151,7 @@ bool HloInstruction::HasSideEffect() const { } std::unique_ptr HloInstruction::CloneWithNewOperands( - const Shape& shape, - tensorflow::gtl::ArraySlice new_operands, + const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { VLOG(3) << "CloneWithNewOperands:\n " << ToString(); VLOG(3) << " new operands:"; @@ -1262,6 +1164,47 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( // in the face of code changes than copying fields explicitly. This also // properly sets the user fields of the operands. switch (opcode_) { + // Ops migrated to subclasses. + // TODO(b/80131774): Remove this switch when migration is complete. 
+ case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormInference: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kFft: + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kReverse: + case HloOpcode::kConcatenate: + case HloOpcode::kReduce: + case HloOpcode::kTranspose: + case HloOpcode::kBroadcast: + case HloOpcode::kMap: + case HloOpcode::kSlice: + case HloOpcode::kConstant: + case HloOpcode::kTrace: + case HloOpcode::kFusion: + case HloOpcode::kRng: + case HloOpcode::kParameter: + case HloOpcode::kGetTupleElement: + case HloOpcode::kReducePrecision: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kConvolution: + case HloOpcode::kCustomCall: + case HloOpcode::kReduceWindow: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kPad: + case HloOpcode::kDynamicSlice: + case HloOpcode::kSort: + case HloOpcode::kGather: + case HloOpcode::kScatter: + case HloOpcode::kIota: + clone = CloneWithNewOperandsImpl(shape, new_operands, context); + break; // Unary ops. case HloOpcode::kAbs: case HloOpcode::kRoundNearestAfz: @@ -1282,7 +1225,6 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kReal: case HloOpcode::kSign: case HloOpcode::kSin: - case HloOpcode::kSort: case HloOpcode::kTanh: CHECK_EQ(new_operands.size(), 1); clone = CreateUnary(shape, opcode_, new_operands[0]); @@ -1306,6 +1248,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kRemainder: case HloOpcode::kAnd: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kShiftLeft: case HloOpcode::kShiftRightArithmetic: case HloOpcode::kShiftRightLogical: @@ -1315,28 +1258,15 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( // Ternary ops. 
case HloOpcode::kClamp: case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: CHECK_EQ(new_operands.size(), 3); clone = CreateTernary(shape, opcode_, new_operands[0], new_operands[1], new_operands[2]); break; // Other supported ops. - case HloOpcode::kBroadcast: - CHECK_EQ(new_operands.size(), 1); - clone = CreateBroadcast(shape, new_operands[0], dimensions_); - break; case HloOpcode::kCall: clone = CreateCall(shape, new_operands, to_apply()); break; - case HloOpcode::kCustomCall: - clone = CreateCustomCall(shape, new_operands, custom_call_target_); - break; - case HloOpcode::kHostCompute: - clone = CreateHostCompute(shape, new_operands, channel_name_, - cost_estimate_ns_); - break; - case HloOpcode::kConcatenate: - clone = CreateConcatenate(shape, new_operands, dimensions(0)); - break; case HloOpcode::kConvert: CHECK_EQ(new_operands.size(), 1); clone = CreateConvert(shape, new_operands[0]); @@ -1345,85 +1275,20 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( CHECK_EQ(new_operands.size(), 1); clone = CreateBitcastConvert(shape, new_operands[0]); break; - case HloOpcode::kReducePrecision: - CHECK_EQ(new_operands.size(), 1); - clone = CreateReducePrecision(shape, new_operands[0], exponent_bits_, - mantissa_bits_); - break; - case HloOpcode::kConvolution: - CHECK_EQ(new_operands.size(), 2); - clone = CreateConvolve(shape, new_operands[0], new_operands[1], *window_, - *convolution_dimension_numbers_); - break; case HloOpcode::kDot: CHECK_EQ(new_operands.size(), 2); clone = CreateDot(shape, new_operands[0], new_operands[1], *dot_dimension_numbers_); break; - case HloOpcode::kFft: - CHECK_EQ(new_operands.size(), 1); - clone = CreateFft(shape, new_operands[0], fft_type_, fft_length_); - break; - case HloOpcode::kCrossReplicaSum: - clone = CreateCrossReplicaSum(shape, new_operands); - break; - case HloOpcode::kGetTupleElement: - CHECK_EQ(new_operands.size(), 1); - clone = CreateGetTupleElement(shape, new_operands[0], tuple_index()); - break; - case 
HloOpcode::kMap: - clone = CreateMap(shape, new_operands, to_apply()); - break; - case HloOpcode::kPad: - CHECK_EQ(new_operands.size(), 2); - clone = - CreatePad(shape, new_operands[0], new_operands[1], *padding_config_); - break; - case HloOpcode::kReduce: - CHECK_EQ(new_operands.size(), 2); - clone = CreateReduce(shape, new_operands[0], new_operands[1], dimensions_, - to_apply()); - break; - case HloOpcode::kReduceWindow: - CHECK_EQ(new_operands.size(), 2); - clone = CreateReduceWindow(shape, new_operands[0], new_operands[1], - *window_, to_apply()); - break; - case HloOpcode::kSelectAndScatter: - CHECK_EQ(new_operands.size(), 3); - clone = - CreateSelectAndScatter(shape, new_operands[0], select(), *window_, - new_operands[1], new_operands[2], scatter()); - break; - case HloOpcode::kReverse: - CHECK_EQ(new_operands.size(), 1); - clone = CreateReverse(shape, new_operands[0], dimensions_); - break; - case HloOpcode::kRng: - clone = CreateRng(shape, distribution_, new_operands); - break; case HloOpcode::kReshape: CHECK_EQ(new_operands.size(), 1); clone = CreateReshape(shape, new_operands[0]); break; - case HloOpcode::kSlice: - CHECK_EQ(new_operands.size(), 1); - clone = CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_, - slice_strides_); - break; - case HloOpcode::kDynamicSlice: - clone = CreateDynamicSlice(shape, new_operands[0], new_operands[1], - dynamic_slice_sizes_); - break; case HloOpcode::kDynamicUpdateSlice: CHECK_EQ(new_operands.size(), 3); clone = CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1], new_operands[2]); break; - case HloOpcode::kTranspose: - CHECK_EQ(new_operands.size(), 1); - clone = CreateTranspose(shape, new_operands[0], dimensions_); - break; case HloOpcode::kTuple: clone = CreateTuple(new_operands); *clone->mutable_shape() = shape; @@ -1433,95 +1298,30 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( clone = CreateWhile(shape, while_condition(), while_body(), new_operands[0]); break; - case 
HloOpcode::kConstant: - clone = CreateConstant(literal_->CloneToUnique()); - break; - case HloOpcode::kFusion: { - HloModule* module = context != nullptr ? context->module() : GetModule(); - HloComputation* new_fused_computation = nullptr; - if (context != nullptr) { - new_fused_computation = - context->FindComputation(fused_instructions_computation()); - } - if (new_fused_computation == nullptr) { - new_fused_computation = module->AddEmbeddedComputation( - fused_instructions_computation()->Clone("clone", context)); - } - clone = CreateFusion(/*shape=*/shape, /*fusion_kind=*/fusion_kind(), - /*operands=*/new_operands, - /*fusion_computation=*/new_fused_computation); - break; - } - case HloOpcode::kParameter: - clone = CreateParameter(parameter_number_, shape, name_); - break; - case HloOpcode::kBatchNormTraining: - CHECK_EQ(new_operands.size(), 3); - clone = - CreateBatchNormTraining(shape, new_operands[0], new_operands[1], - new_operands[2], epsilon(), feature_index()); - break; - case HloOpcode::kBatchNormInference: - CHECK_EQ(new_operands.size(), 5); - clone = CreateBatchNormInference( - shape, new_operands[0], new_operands[1], new_operands[2], - new_operands[3], new_operands[4], epsilon(), feature_index()); - break; - case HloOpcode::kInfeed: - CHECK_EQ(new_operands.size(), 0); - clone = CreateInfeed(shape, infeed_config()); - break; - case HloOpcode::kOutfeed: - CHECK_EQ(new_operands.size(), 1); - clone = CreateOutfeed(outfeed_shape_, new_operands[0], outfeed_config()); - break; - case HloOpcode::kBatchNormGrad: - CHECK_EQ(new_operands.size(), 5); - clone = CreateBatchNormGrad(shape, new_operands[0], new_operands[1], - new_operands[2], new_operands[3], - new_operands[4], epsilon(), feature_index()); - break; case HloOpcode::kConditional: CHECK_EQ(new_operands.size(), 3); clone = CreateConditional(shape, new_operands[0], new_operands[1], true_computation(), new_operands[2], false_computation()); break; - case HloOpcode::kSend: - CHECK_EQ(new_operands.size(), 
1); - clone = CreateSend(new_operands[0], channel_id()); - break; - case HloOpcode::kSendDone: - CHECK_EQ(new_operands.size(), 1); - clone = CreateSendDone(new_operands[0]); - break; - case HloOpcode::kRecv: - CHECK_EQ(new_operands.size(), 0); - // The shape is a tuple, but CreateRecv() wants the raw data shape. - clone = - CreateRecv(ShapeUtil::GetTupleElementShape(shape, 0), channel_id()); - break; - case HloOpcode::kRecvDone: - CHECK_EQ(new_operands.size(), 1); - clone = CreateRecvDone(new_operands[0]); - break; - case HloOpcode::kGather: - CHECK_EQ(new_operands.size(), 2); - clone = CreateGather(shape, new_operands[0], new_operands[1], - *gather_dimension_numbers_, gather_window_bounds_); - break; case HloOpcode::kDomain: CHECK_EQ(new_operands.size(), 1); clone = CreateDomain(shape, new_operands[0], operand_side_metadata_->Clone(), user_side_metadata_->Clone()); break; - case HloOpcode::kTrace: - LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_); + case HloOpcode::kAfterAll: + if (new_operands.empty()) { + clone = CreateToken(); + } else { + clone = CreateAfterAll(new_operands); + } + break; } + // SetupDerivedInstruction will setup the precision_config_ field. SetupDerivedInstruction(clone.get()); clone->set_parent(parent_); - clone->set_backend_config(backend_config()); + clone->set_raw_backend_config_string(backend_config_); if (context != nullptr) { context->MapInstruction(this, clone.get()); clone->ReplaceCalledComputations([&](HloComputation* callee) { @@ -1533,7 +1333,29 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( return clone; } -HloInstruction::~HloInstruction() {} +HloInstruction::~HloInstruction() { + // Detach from operands. An instruction may be repeated as an operand. To + // avoid calling RemoveUser twice on the same operand, check before remove. 
+ for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { + HloInstruction* operand = operands_[operand_num]; + if (operand == nullptr) { + continue; + } + if (operand->user_set_.find(this) != operand->user_set_.end()) { + operand->RemoveUser(this); + } + operands_[operand_num] = nullptr; + } + + // Update users. Set `nullptr` to the correpsonding operand slot for users. + for (auto& user : this->users()) { + for (int i = 0; i < user->operand_count(); ++i) { + if (user->operands_[i] == this) { + user->operands_[i] = nullptr; + } + } + } +} std::unique_ptr HloInstruction::Clone( const string& suffix, HloCloneContext* context) const { @@ -1562,7 +1384,7 @@ std::unique_ptr HloInstruction::Clone( // If names ends with .suffix[0-9]+ then replace with a suffix with the // numeric value incremented. int64 numeric_suffix; - if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) { + if (absl::SimpleAtoi(after_suffix, &numeric_suffix)) { clone->name_ = StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1); } else { @@ -1598,40 +1420,6 @@ const HloInstruction* HloInstruction::LatestNonGteAncestor() const { return hlo; } -const Literal& HloInstruction::literal() const { - CHECK_EQ(HloOpcode::kConstant, opcode_); - return *literal_; -} - -bool HloInstruction::HasLiteral() const { return literal_ != nullptr; } - -bool HloInstruction::CanHaveDimensionsField() const { - return (opcode() == HloOpcode::kReverse || - opcode() == HloOpcode::kConcatenate || - opcode() == HloOpcode::kReduce || opcode() == HloOpcode::kBroadcast || - opcode() == HloOpcode::kTranspose); -} - -const std::vector& HloInstruction::dimensions() const { - CHECK(CanHaveDimensionsField()); - return dimensions_; -} - -int64 HloInstruction::dimensions(int64 index) const { - return dimensions()[index]; -} - -int64 HloInstruction::concatenate_dimension() const { - CHECK(opcode() == HloOpcode::kConcatenate); - CHECK_EQ(1, dimensions_.size()); - return dimensions(0); -} - 
-int64 HloInstruction::tuple_index() const { - CHECK_EQ(HloOpcode::kGetTupleElement, opcode_); - return tuple_index_; -} - const HloInstruction* HloInstruction::operand(int64 i) const { return operands_[i]; } @@ -1713,15 +1501,35 @@ void HloInstruction::AppendOperand(HloInstruction* operand) { operand->AddUser(this); } -void HloInstruction::AddUser(HloInstruction* user) { - if (!ContainsKey(user_set_, user)) { - user_set_.insert(user); - users_.push_back(user); +void HloInstruction::RemoveOperandsAtAscendingIndices( + absl::Span ascending_indices) { + if (ascending_indices.empty()) { + return; + } + int next_index = 0; + int removed_count = 0; + for (int to_remove : ascending_indices) { + while (next_index < to_remove) { + operands_[next_index - removed_count] = operands_[next_index]; + ++next_index; + } + CHECK_LT(to_remove, operands_.size()); + ++removed_count; + ++next_index; + } + while (next_index < operands_.size()) { + operands_[next_index - removed_count] = operands_[next_index]; + ++next_index; } + CHECK_EQ(removed_count, ascending_indices.size()); + operands_.resize(operands_.size() - removed_count); } -bool HloInstruction::IsConstant() const { - return opcode_ == HloOpcode::kConstant; +void HloInstruction::AddUser(HloInstruction* user) { + if (!ContainsKey(user_set_, user)) { + user_set_.insert(user); + users_.push_back(user); + } } bool HloInstruction::HasConstantOperand() const { @@ -1753,9 +1561,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kConvert: case HloOpcode::kCopy: case HloOpcode::kCos: - case HloOpcode::kCrossReplicaSum: case HloOpcode::kDivide: - case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: case HloOpcode::kEq: case HloOpcode::kExp: @@ -1771,6 +1577,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kAnd: case HloOpcode::kNot: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kLt: case HloOpcode::kMaximum: case HloOpcode::kMinimum: @@ -1791,122 +1598,80 @@ bool 
HloInstruction::IdenticalSlowPath( case HloOpcode::kSubtract: case HloOpcode::kTanh: case HloOpcode::kTuple: + case HloOpcode::kTupleSelect: return true; - // Broadcast, Concatenate, and Transpose need the same dimensions field. - case HloOpcode::kBroadcast: - case HloOpcode::kConcatenate: - case HloOpcode::kTranspose: - return dimensions() == other.dimensions(); - - case HloOpcode::kFusion: - return fusion_kind() == other.fusion_kind() && - eq_computations(fused_instructions_computation(), - other.fused_instructions_computation()); - - // These opcodes have complex or special behavior so just return false. - case HloOpcode::kDomain: - case HloOpcode::kRng: - case HloOpcode::kTrace: - case HloOpcode::kWhile: + // This opcode has complex or special behavior so just return false. + case HloOpcode::kAfterAll: return false; - case HloOpcode::kParameter: - return parameter_number() == other.parameter_number(); - - case HloOpcode::kBatchNormTraining: - case HloOpcode::kBatchNormInference: - case HloOpcode::kBatchNormGrad: - return feature_index() == other.feature_index() && - epsilon() == other.epsilon(); - - // A constant is defined by the value in the literal. - case HloOpcode::kConstant: - return literal() == other.literal(); - - // A reduce-precision operation is determined by the bit sizes. - case HloOpcode::kReducePrecision: - return exponent_bits() == other.exponent_bits() && - mantissa_bits() == other.mantissa_bits(); - - // Convolution has a window and dimensions. - case HloOpcode::kConvolution: - return protobuf_util::ProtobufEquals(window(), other.window()) && - protobuf_util::ProtobufEquals( - convolution_dimension_numbers(), - other.convolution_dimension_numbers()); // Check dot dimension numbers. 
case HloOpcode::kDot: return protobuf_util::ProtobufEquals(dot_dimension_numbers(), other.dot_dimension_numbers()); - case HloOpcode::kGather: - return protobuf_util::ProtobufEquals(gather_dimension_numbers(), - other.gather_dimension_numbers()) && - gather_window_bounds() == other.gather_window_bounds(); - - // FFT has various types & lengths. - case HloOpcode::kFft: - return fft_type() == other.fft_type() && - fft_length() == other.fft_length(); - - // Reduction results are determined by the reduction dimension and the - // reduction computation. - case HloOpcode::kReduce: - return dimensions() == other.dimensions() && - eq_computations(to_apply(), other.to_apply()); - case HloOpcode::kReduceWindow: - return eq_computations(to_apply(), other.to_apply()) && - protobuf_util::ProtobufEquals(window(), other.window()); - - // SelectAndScatter is determined by both select and scatter - // computation as well as the window configuration. - case HloOpcode::kSelectAndScatter: - return eq_computations(select(), other.select()) && - eq_computations(scatter(), other.scatter()) && - protobuf_util::ProtobufEquals(window(), other.window()); - - // Remaining instructions with special values. 
- case HloOpcode::kGetTupleElement: - return tuple_index() == other.tuple_index(); - case HloOpcode::kPad: - return protobuf_util::ProtobufEquals(padding_config(), - other.padding_config()); - case HloOpcode::kSlice: - return slice_starts_ == other.slice_starts_ && - slice_limits_ == other.slice_limits_ && - slice_strides_ == other.slice_strides_; case HloOpcode::kCall: - case HloOpcode::kMap: return eq_computations(to_apply(), other.to_apply()); - case HloOpcode::kCustomCall: - return custom_call_target_ == other.custom_call_target_; - case HloOpcode::kReverse: - return dimensions() == other.dimensions(); case HloOpcode::kConditional: return eq_computations(true_computation(), other.true_computation()) && eq_computations(false_computation(), other.false_computation()); - // These opcodes are not yet supported. - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kSort: - case HloOpcode::kRecv: - case HloOpcode::kRecvDone: + case HloOpcode::kWhile: { + if (eq_computations(while_body(), other.while_body()) && + eq_computations(while_condition(), other.while_condition())) { + return true; + } + return false; + } + + case HloOpcode::kDomain: + return operand_side_metadata().Matches(other.operand_side_metadata()) && + user_side_metadata().Matches(other.user_side_metadata()); + + // Ops migrated to subclasses should never come to this line. + // TODO(b/80131774): Remove this switch when migration is complete. 
+ case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormInference: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kFft: case HloOpcode::kSend: case HloOpcode::kSendDone: - case HloOpcode::kHostCompute: - return false; + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kReverse: + case HloOpcode::kConcatenate: + case HloOpcode::kReduce: + case HloOpcode::kSort: + case HloOpcode::kTranspose: + case HloOpcode::kBroadcast: + case HloOpcode::kMap: + case HloOpcode::kSlice: + case HloOpcode::kConstant: + case HloOpcode::kIota: + case HloOpcode::kTrace: + case HloOpcode::kFusion: + case HloOpcode::kRng: + case HloOpcode::kParameter: + case HloOpcode::kGetTupleElement: + case HloOpcode::kReducePrecision: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: + case HloOpcode::kConvolution: + case HloOpcode::kCustomCall: + case HloOpcode::kReduceWindow: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kPad: + case HloOpcode::kDynamicSlice: + case HloOpcode::kGather: + case HloOpcode::kScatter: + LOG(FATAL) << "Base class impl called for opcode with subclass: " + << opcode(); } -} - -bool HloInstruction::IsRank2Transpose() const { - return (opcode_ == HloOpcode::kTranspose) && - dimensions_ == std::vector({1, 0}) && - shape_.dimensions_size() == 2 && - std::equal(shape_.dimensions().begin(), shape_.dimensions().end(), - operands_[0]->shape_.dimensions().rbegin()); + return false; } void HloInstruction::RemoveUser(HloInstruction* user) { @@ -1938,6 +1703,10 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user, std::replace(user->operands_.begin(), user->operands_.end(), this, new_producer); new_producer->AddUser(user); + if (user->opcode() == HloOpcode::kFusion) { + TF_RETURN_IF_ERROR( + Cast(user)->DeduplicateFusionOperands()); + } return Status::OK(); } @@ -1946,10 +1715,14 @@ Status 
HloInstruction::ReplaceOperandWith(int64 operand_num, TF_RET_CHECK(operand_num >= 0); TF_RET_CHECK(operand_num < operand_count()); HloInstruction* old_operand = mutable_operand(operand_num); + if (old_operand == new_operand) { + return Status::OK(); + } + TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(), new_operand->shape())) - << old_operand->shape().ShortDebugString() << " is not compatible with " - << new_operand->shape().ShortDebugString(); + << old_operand->shape() << " is not compatible with " + << new_operand->shape(); operands_[operand_num] = new_operand; VLOG(3) << "Replacing operand " << operand_num << " of " << name() << " with " @@ -1976,6 +1749,10 @@ Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) { std::replace(user->operands_.begin(), user->operands_.end(), this, new_producer); new_producer->AddUser(user); + if (user->opcode() == HloOpcode::kFusion) { + TF_RETURN_IF_ERROR( + Cast(user)->DeduplicateFusionOperands()); + } } } users_.clear(); @@ -1990,28 +1767,14 @@ Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) { return Status::OK(); } -void HloInstruction::DetachFromOperands() { - VLOG(3) << "DetachFromOperands:\n " << ToString(); - CHECK_EQ(0, user_count()); - // An instruction may be repeated as an operand. To avoid calling RemoveUser - // twice on the same operand, keep a set of already detached operands. 
- std::set detached_operands; - for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { - HloInstruction* operand = operands_[operand_num]; - if (!ContainsKey(detached_operands, operand)) { - operand->RemoveUser(this); - detached_operands.insert(operand); - } - operands_[operand_num] = nullptr; - } -} - HloComputation* HloInstruction::to_apply() const { switch (opcode_) { case HloOpcode::kCall: case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: CHECK_EQ(called_computations_.size(), 1); return called_computations_[0]; default: @@ -2029,6 +1792,8 @@ void HloInstruction::set_to_apply(HloComputation* computation) { case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: CHECK_EQ(called_computations_.size(), 1); called_computations_[0] = computation; break; @@ -2038,16 +1803,6 @@ void HloInstruction::set_to_apply(HloComputation* computation) { } } -const string& HloInstruction::custom_call_target() const { - CHECK_EQ(opcode_, HloOpcode::kCustomCall); - return custom_call_target_; -} - -const string& HloInstruction::outfeed_config() const { - CHECK_EQ(opcode_, HloOpcode::kOutfeed); - return outfeed_config_; -} - HloComputation* HloInstruction::while_condition() const { CHECK_EQ(HloOpcode::kWhile, opcode_); return called_computations_[kConditionComputationIndex]; @@ -2074,32 +1829,6 @@ void HloInstruction::set_while_body(HloComputation* computation) { called_computations_[kBodyComputationIndex] = computation; } -HloComputation* HloInstruction::select() const { - CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - return called_computations_[kSelectComputationIndex]; -} - -HloComputation* HloInstruction::scatter() const { - CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - return called_computations_[kScatterComputationIndex]; -} - -void HloInstruction::set_select(HloComputation* 
computation) { - // Don't allow changing the computation for fused instructions so we don't - // have to recompute called_instructions for the entire fusion instruction. - CHECK(!IsFused()); - CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - called_computations_[kSelectComputationIndex] = computation; -} - -void HloInstruction::set_scatter(HloComputation* computation) { - // Don't allow changing the computation for fused instructions so we don't - // have to recompute called_instructions for the entire fusion instruction. - CHECK(!IsFused()); - CHECK_EQ(HloOpcode::kSelectAndScatter, opcode_); - called_computations_[kScatterComputationIndex] = computation; -} - HloComputation* HloInstruction::true_computation() const { CHECK_EQ(HloOpcode::kConditional, opcode_); return called_computations_[kTrueComputationIndex]; @@ -2128,7 +1857,7 @@ void HloInstruction::set_false_computation(HloComputation* false_computation) { string HloInstruction::SignatureString() const { string operands = - Join(operands_, ", ", [](string* out, HloInstruction* operand) { + StrJoin(operands_, ", ", [](string* out, HloInstruction* operand) { StrAppend(out, ShapeUtil::HumanString(operand->shape())); }); return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape())); @@ -2147,6 +1876,78 @@ string HloInstruction::ToString(const HloPrintOptions& options) const { return ToStringWithCanonicalNameMap(options, &new_map); } +bool HloInstruction::IsElementwiseImpl( + const absl::optional& operand_idx) const { + switch (opcode_) { + // Unary elementwise operations. 
+ case HloOpcode::kAbs: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kCeil: + case HloOpcode::kClz: + case HloOpcode::kConvert: + case HloOpcode::kBitcastConvert: + case HloOpcode::kCopy: + case HloOpcode::kCos: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFloor: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kNot: + case HloOpcode::kNegate: + case HloOpcode::kReal: + case HloOpcode::kReducePrecision: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kTanh: + CHECK_EQ(1, operand_count()); + return true; + + // Binary elementwise operations, the same as in IsElementwiseBinary(). + case HloOpcode::kAdd: + case HloOpcode::kAtan2: + case HloOpcode::kComplex: + case HloOpcode::kDivide: + case HloOpcode::kEq: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kLe: + case HloOpcode::kLt: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNe: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kSubtract: + case HloOpcode::kAnd: + case HloOpcode::kOr: + case HloOpcode::kXor: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + CHECK_EQ(2, operand_count()); + return true; + + // Ternary elementwise operations. 
+ case HloOpcode::kSelect: + case HloOpcode::kClamp: + return true; + + case HloOpcode::kDynamicUpdateSlice: + return operand_idx.has_value() && operand_idx.value() == 0; + + default: + return false; + } +} + +bool HloInstruction::IsCrossModuleAllReduce() const { + return opcode() == HloOpcode::kCrossReplicaSum && all_reduce_id(); +} + string HloInstruction::ToStringWithCanonicalNameMap( const HloPrintOptions& options, CanonicalNameMap* canonical_name_map) const { @@ -2182,8 +1983,8 @@ string HloInstruction::ToStringWithCanonicalNameMap( !metadata_.source_file().empty())) { StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}"); } - if (options.print_backend_config() && !backend_config().empty()) { - StrAppend(&result, ", backend_config=\"", CEscape(backend_config()), "\""); + if (options.print_backend_config() && !backend_config_.empty()) { + StrAppend(&result, ", backend_config=\"", CEscape(backend_config_), "\""); } return result; } @@ -2197,121 +1998,52 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap( const HloPrintOptions& options, CanonicalNameMap* canonical_name_map) const { string operands; - if (opcode() == HloOpcode::kConstant) { - // For constants, show the actual value in place of an empty operand list. - // - // In HloInstruction, sometimes a constant literal is not constructed due - // to its size. Skip the printing in this case. - if (HasLiteral() && ((!ShapeUtil::IsTuple(shape()) && - ShapeUtil::ElementsIn(shape()) <= 10) || - options.print_large_constants())) { - // Literal::ToString emits multidimensional arrays over multiple - // lines. Compact this into one line by stripping out white space. - string tmp = literal().ToString(); - std::replace(tmp.begin(), tmp.end(), '\n', ' '); - std::vector v = tensorflow::str_util::Split(tmp, ' '); - bool first = true; - // Concatenate elements in "v" with spaces separating them, but ignoring - // empty entries. 
- for (const auto& s : v) { - if (s.empty()) { - continue; - } - StrAppend(&operands, (first ? "" : " "), s); - first = false; - } - } else { - // Do not show large constants or tuples. - operands = "{...}"; + absl::Span slice(operands_); + const int64 kMaxOperandsToShowIfCompact = 4; + if (options.compact_operands() && + slice.size() > kMaxOperandsToShowIfCompact) { + slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact); + } + operands = StrJoin(slice, ", ", [&](string* out, HloInstruction* operand) { + // If operand is already been deleted, put `null` to the string output. + if (operand == nullptr) { + StrAppend(out, "null "); + return; } - } else if (opcode() == HloOpcode::kParameter) { - StrAppend(&operands, parameter_number_); - } else { - tensorflow::gtl::ArraySlice slice(operands_); - const int64 kMaxOperandsToShowIfCompact = 4; - if (options.compact_operands() && - slice.size() > kMaxOperandsToShowIfCompact) { - slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact); + std::vector str; + if (options.print_operand_shape()) { + str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape())); } - operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) { - std::vector str; - if (options.print_operand_shape()) { - str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape())); - } - // In a top-level HloInstruction::ToString() call, the operand name is not - // part of the canonical string. 
- if (options.canonicalize_instruction_names() && - options.is_in_nested_computation()) { - str.push_back(PrintName( - canonical_name_map->LookupOrInsert(operand->name()), options)); - } else if (!options.compact_operands()) { - str.push_back(PrintName(operand->name(), options)); - } - StrAppend(out, Join(str, " ")); - }); - const int64 remaining = operands_.size() - slice.size(); - if (slice.size() != operands_.size()) { - StrAppend(&operands, ", ...(+", remaining, ")"); + // In a top-level HloInstruction::ToString() call, the operand name is not + // part of the canonical string. + if (options.canonicalize_instruction_names() && + options.is_in_nested_computation()) { + str.push_back(PrintName( + canonical_name_map->LookupOrInsert(operand->name()), options)); + } else if (!options.compact_operands()) { + str.push_back(PrintName(operand->name(), options)); } + StrAppend(out, StrJoin(str, " ")); + }); + const int64 remaining = operands_.size() - slice.size(); + if (slice.size() != operands_.size()) { + StrAppend(&operands, ", ...(+", remaining, ")"); } return operands; } std::vector HloInstruction::ExtraAttributesToString( const HloPrintOptions& options) const { - std::vector extra; - if (opcode() == HloOpcode::kFusion) { - extra.push_back(StrCat("kind=", xla::ToString(fusion_kind()))); - } - if (CanHaveDimensionsField()) { - extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}")); - } - if (window_ != nullptr && window_->dimensions_size() != 0) { - extra.push_back(StrCat("window={", window_util::ToString(*window_), "}")); - } - if (padding_config_ != nullptr) { - extra.push_back( - StrCat("padding=", xla::PaddingConfigToString(*padding_config_))); - } - if (opcode() == HloOpcode::kSlice) { - std::vector bounds; - bounds.reserve(slice_starts_.size()); - const bool omit_stride = - std::all_of(slice_strides_.begin(), slice_strides_.end(), - [](int64 stride) { return stride == 1; }); - for (int i = 0; i < slice_starts_.size(); ++i) { - string stride_str 
= omit_stride ? "" : StrCat(":", slice_strides_[i]); - bounds.push_back(StrCat("[", slice_starts_[i], ":", slice_limits_[i], - stride_str, "]")); - } - extra.push_back(StrCat("slice={", Join(bounds, ", "), "}")); - } - if (opcode() == HloOpcode::kDynamicSlice) { - extra.push_back( - StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}")); - } - if (opcode() == HloOpcode::kBatchNormTraining || - opcode() == HloOpcode::kBatchNormInference || - opcode() == HloOpcode::kBatchNormGrad) { - extra.push_back(StrCat("epsilon=", epsilon())); - extra.push_back(StrCat("feature_index=", feature_index())); - } + std::vector extra = ExtraAttributesToStringImpl(options); - if (convolution_dimension_numbers_ != nullptr) { - extra.push_back(ConvolutionDimensionNumbersToString()); - } if (dot_dimension_numbers_ != nullptr) { extra.push_back(DotDimensionNumbersToString()); } - if (gather_dimension_numbers_ != nullptr) { - extra.push_back(GatherDimensionNumbersToString()); - extra.push_back( - StrCat("window_bounds={", Join(gather_window_bounds(), ","), "}")); - } - if (opcode() == HloOpcode::kFft) { - extra.push_back(StrCat("fft_type=", FftType_Name(fft_type()))); - extra.push_back(StrCat("fft_length={", Join(fft_length(), ","), "}")); + + string precision_config_string = PrecisionConfigToString(); + if (!precision_config_string.empty()) { + extra.push_back(precision_config_string); } if (options.print_subcomputation_mode() == @@ -2332,16 +2064,18 @@ std::vector HloInstruction::ExtraAttributesToString( PrintName(false_computation()->name(), options))); } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap || opcode() == HloOpcode::kReduceWindow || - opcode() == HloOpcode::kReduce) { + opcode() == HloOpcode::kReduce || + opcode() == HloOpcode::kCrossReplicaSum || + opcode() == HloOpcode::kScatter) { extra.push_back( StrCat("to_apply=", PrintName(to_apply()->name(), options))); } else if (!called_computations().empty()) { extra.push_back(StrCat( - 
"calls=", Join(called_computations(), ", ", - [&](string* out, const HloComputation* computation) { - StrAppend(out, - PrintName(computation->name(), options)); - }))); + "calls=", + StrJoin(called_computations(), ", ", + [&](string* out, const HloComputation* computation) { + StrAppend(out, PrintName(computation->name(), options)); + }))); } } else if (options.print_subcomputation_mode() == HloPrintOptions::PrintSubcomputationMode::kFullBodies) { @@ -2367,69 +2101,40 @@ std::vector HloInstruction::ExtraAttributesToString( case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: extra.push_back( StrCat("to_apply=\n", to_apply()->ToString(new_options))); break; default: if (!called_computations().empty()) { - extra.push_back( - StrCat("calls=\n", - Join(called_computations(), ", ", - [&](string* out, const HloComputation* computation) { - StrAppend(out, computation->ToString(new_options)); - }))); + extra.push_back(StrCat( + "calls=\n", + StrJoin(called_computations(), ", ", + [&](string* out, const HloComputation* computation) { + StrAppend(out, computation->ToString(new_options)); + }))); } break; } } - if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv || - opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) { - extra.push_back(StrCat("channel_id=", channel_id_)); - } - if (opcode() == HloOpcode::kGetTupleElement) { - extra.push_back(StrCat("index=", tuple_index())); - } if (has_sharding()) { extra.push_back(StrCat("sharding=", sharding().ToString())); } if (!control_predecessors_.empty()) { extra.push_back(StrCat("control-predecessors={", - Join(control_predecessors_, ", ", - [&](string* out, HloInstruction* pre) { - StrAppend(out, - PrintName(pre->name(), options)); - }), + StrJoin(control_predecessors_, ", ", + [&](string* out, HloInstruction* pre) { + StrAppend(out, + PrintName(pre->name(), options)); + }), "}")); } - if (opcode() == 
HloOpcode::kInfeed && !infeed_config_.empty()) { - extra.push_back(StrCat("infeed_config=\"", CEscape(infeed_config_), "\"")); - } - if (opcode() == HloOpcode::kOutfeed && !outfeed_config_.empty()) { - extra.push_back( - StrCat("outfeed_config=\"", CEscape(outfeed_config_), "\"")); - } - if (opcode() == HloOpcode::kRng) { - extra.push_back( - StrCat("distribution=", RandomDistributionToString(distribution_))); - } - if (opcode() == HloOpcode::kReducePrecision) { - extra.push_back(StrCat("exponent_bits=", exponent_bits_)); - extra.push_back(StrCat("mantissa_bits=", mantissa_bits_)); - } - if (operand_side_metadata_ != nullptr) { - extra.push_back( - StrCat("operand_side=", operand_side_metadata_->ToString())); - } - if (user_side_metadata_ != nullptr) { - extra.push_back(StrCat("user_side=", user_side_metadata_->ToString())); - } - // By contract, we print the custom call target even if - // options.print_subcomputation_mode() == kOff, because the call target is not - // an HloComputation. 
- if (opcode() == HloOpcode::kCustomCall) { - extra.push_back( - StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\"")); + if (operand_side_metadata_ != nullptr && user_side_metadata_ != nullptr) { + extra.push_back(StrCat("domain={kind=\"", operand_side_metadata_->Kind(), + "\", entry=", user_side_metadata_->ToString(), + ", exit=", operand_side_metadata_->ToString(), "}")); } return extra; @@ -2437,10 +2142,10 @@ std::vector HloInstruction::ExtraAttributesToString( string HloInstruction::ToShortString() const { return StrCat("%", name(), " = ", HloOpcodeString(opcode()), "(", - Join(operands_, ", ", - [](string* out, HloInstruction* operand) { - StrAppend(out, "%", operand->name()); - }), + StrJoin(operands_, ", ", + [](string* out, HloInstruction* operand) { + StrAppend(out, "%", operand->name()); + }), ")"); } @@ -2461,79 +2166,22 @@ HloInstructionProto HloInstruction::ToProto() const { } *proto.mutable_metadata() = metadata_; - proto.set_backend_config(backend_config()); - if (literal_ != nullptr) { - *proto.mutable_literal() = literal_->ToProto(); - } - proto.set_parameter_number(parameter_number_); - if (opcode() == HloOpcode::kFusion) { - proto.set_fusion_kind(xla::ToString(fusion_kind())); - proto.add_called_computation_ids( - fused_instructions_computation()->unique_id()); - } else { + proto.set_backend_config(backend_config_); + *proto.mutable_precision_config() = precision_config_; + if (opcode() != HloOpcode::kFusion) { for (const HloComputation* computation : called_computations_) { proto.add_called_computation_ids(computation->unique_id()); } } - proto.set_tuple_index(tuple_index_); - for (int64 dimension : dimensions_) { - proto.add_dimensions(dimension); - } - if (window_ != nullptr) { - *proto.mutable_window() = *window_; - } - if (convolution_dimension_numbers_ != nullptr) { - *proto.mutable_convolution_dimension_numbers() = - *convolution_dimension_numbers_; - } if (dot_dimension_numbers_ != nullptr) { 
*proto.mutable_dot_dimension_numbers() = *dot_dimension_numbers_; } - if (gather_dimension_numbers_ != nullptr) { - *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_; - } - if (opcode() == HloOpcode::kGather) { - for (int64 bound : gather_window_bounds()) { - proto.add_gather_window_bounds(bound); - } - } - for (int i = 0; i < slice_starts_.size(); ++i) { - auto* slice_dimension = proto.add_slice_dimensions(); - slice_dimension->set_start(slice_starts_[i]); - slice_dimension->set_limit(slice_limits_[i]); - slice_dimension->set_stride(slice_strides_[i]); - } - proto.set_exponent_bits(exponent_bits_); - proto.set_mantissa_bits(mantissa_bits_); - for (int64 slice_size : dynamic_slice_sizes_) { - proto.add_dynamic_slice_sizes(slice_size); - } - if (padding_config_ != nullptr) { - *proto.mutable_padding_config() = *padding_config_; - } - proto.set_outfeed_config(outfeed_config_); - if (opcode() == HloOpcode::kRng) { - proto.set_distribution(distribution_); - } - proto.set_epsilon(epsilon_); - proto.set_feature_index(feature_index_); - proto.set_channel_id(channel_id_); - proto.set_infeed_config(infeed_config_); - proto.set_custom_call_target(custom_call_target_); - *proto.mutable_outfeed_shape() = outfeed_shape_; - proto.set_fft_type(fft_type_); - for (int64 fft_len : fft_length_) { - proto.add_fft_length(fft_len); - } if (has_sharding()) { *proto.mutable_sharding() = sharding().ToProto(); } - proto.set_channel_name(channel_name_); - proto.set_cost_estimate_ns(cost_estimate_ns_); - return proto; } @@ -2543,35 +2191,6 @@ string HloInstruction::ToCategory() const { return "data formatting"; } - if (opcode() == HloOpcode::kConvolution) { - string category = "convolution"; - if (window_util::HasBaseDilation(window())) { - category += " base-dilated"; - } - if (window_util::HasWindowDilation(window())) { - category += " window-dilated"; - } - return category; - } - - // Give transpose-dot and backwards-conv fusions the categories "dot" and - // 
"convolution" so they match the categories of proper kDot and kConvolution - // ops. These fusion categories are really just a way of expressing a - // particular kind of dot or conv, so they should have the same category as a - // vanilla dot/conv. - if (opcode() == HloOpcode::kFusion) { - switch (fusion_kind()) { - case FusionKind::kLoop: - return "loop fusion"; - case FusionKind::kInput: - return "input fusion"; - case FusionKind::kOutput: - return "output fusion"; - case FusionKind::kCustom: - return "custom fusion"; - } - } - if (IsElementwise()) { return "non-fusion elementwise"; } @@ -2585,15 +2204,9 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) { trace_instruction_ = trace_instruction; } -string HloInstruction::TracingTag() const { - CHECK_EQ(HloOpcode::kTrace, opcode()); - CHECK(literal_ != nullptr); - return literal_->GetR1U8AsString(); -} - bool HloInstruction::IsFused() const { return parent_->IsFusionComputation(); } -bool HloInstruction::IsFusable() const { +bool HloInstruction::IsFusible() const { // Instructions which are traced should not be fused. 
if (tracing()) { return false; @@ -2609,57 +2222,12 @@ bool HloInstruction::IsFusable() const { } } -HloComputation* HloInstruction::fused_instructions_computation() const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(!called_computations_.empty()); - auto* fused_instructions_computation = called_computations_.front(); - CHECK(fused_instructions_computation->IsFusionComputation()) - << "Computation " << fused_instructions_computation->name() - << " is not a fusion kind"; - return fused_instructions_computation; -} - -HloInstruction* HloInstruction::fused_expression_root() const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_computation()->root_instruction(); -} - -HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_computation()->parameter_instruction( - parameter_number); -} - -const std::vector& HloInstruction::fused_parameters() const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_computation()->parameter_instructions(); -} - -const tensorflow::gtl::iterator_range>::const_iterator>> -HloInstruction::fused_instructions() const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - const HloComputation* subcomp = fused_instructions_computation(); - return subcomp->instructions(); -} - -const tensorflow::gtl::iterator_range< - UnwrappingIterator>::iterator>> -HloInstruction::fused_instructions() { - CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_computation()->instructions(); -} - -int64 HloInstruction::fused_instruction_count() const { - return fused_instructions_computation()->instruction_count(); -} - -HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape) - : unique_id_(-1), - opcode_(opcode), - shape_(shape), - name_(HloOpcodeString(opcode)) { - TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); +HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape) + : 
unique_id_(-1), + opcode_(opcode), + shape_(shape), + name_(HloOpcodeString(opcode)) { + TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); } template @@ -2708,6 +2276,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleAnd(this); case HloOpcode::kOr: return visitor->HandleOr(this); + case HloOpcode::kXor: + return visitor->HandleXor(this); case HloOpcode::kShiftLeft: return visitor->HandleShiftLeft(this); case HloOpcode::kShiftRightArithmetic: @@ -2732,12 +2302,18 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleRemainder(this); case HloOpcode::kSelect: return visitor->HandleSelect(this); + case HloOpcode::kTupleSelect: + return visitor->HandleTupleSelect(this); case HloOpcode::kConvolution: return visitor->HandleConvolution(this); case HloOpcode::kFft: return visitor->HandleFft(this); case HloOpcode::kCrossReplicaSum: return visitor->HandleCrossReplicaSum(this); + case HloOpcode::kAllToAll: + return visitor->HandleAllToAll(this); + case HloOpcode::kCollectivePermute: + return visitor->HandleCollectivePermute(this); case HloOpcode::kTuple: return visitor->HandleTuple(this); case HloOpcode::kMap: @@ -2806,8 +2382,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleInfeed(this); case HloOpcode::kOutfeed: return visitor->HandleOutfeed(this); - case HloOpcode::kHostCompute: - return visitor->HandleHostCompute(this); case HloOpcode::kRng: return visitor->HandleRng(this); case HloOpcode::kWhile: @@ -2830,8 +2404,14 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleSendDone(this); case HloOpcode::kGather: return visitor->HandleGather(this); + case HloOpcode::kScatter: + return visitor->HandleScatter(this); case HloOpcode::kDomain: return visitor->HandleDomain(this); + case HloOpcode::kAfterAll: + return visitor->HandleAfterAll(this); + case HloOpcode::kIota: + return visitor->HandleIota(this); // These opcodes are not 
handled here. case HloOpcode::kTrace: @@ -2840,15 +2420,14 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return InternalError( "Unhandled HloOpcode for DfsHloVisitor: %s. This should not happen - " "please file a bug for XLA.", - HloOpcodeString(opcode_).c_str()); + HloOpcodeString(opcode_)); } // Explicit instantiations. template Status HloInstruction::Visit(DfsHloVisitor* visitor); template Status HloInstruction::Visit(ConstDfsHloVisitor* visitor); -using DFSStack = - tensorflow::gtl::InlinedVector, 16>; +using DFSStack = absl::InlinedVector, 16>; // Push "child" onto the dfs_stack if not already visited. Returns false if a // cycle was detected, and true otherwise. @@ -2924,7 +2503,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor, if (!TF_PREDICT_TRUE(PushDFSChild(visitor, &dfs_stack, child))) { return FailedPrecondition( "A cycle is detected while visiting instruction %s", - current_node->ToString().c_str()); + current_node->ToString()); } } @@ -2933,7 +2512,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor, if (!TF_PREDICT_TRUE(PushDFSChild(visitor, &dfs_stack, child))) { return FailedPrecondition( "A cycle is detected while visiting instruction %s", - current_node->ToString().c_str()); + current_node->ToString()); } } } @@ -3072,12 +2651,6 @@ Status HloInstruction::AcceptOrdered( return visitor->FinishVisit(this); } -const Shape& HloInstruction::outfeed_shape() const { - DCHECK_EQ(opcode_, HloOpcode::kOutfeed); - TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); - return outfeed_shape_; -} - const Shape& HloInstruction::shape() const { TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_)); return shape_; @@ -3099,87 +2672,7 @@ bool HloInstruction::IsElementwiseBinary() const { } bool HloInstruction::IsElementwise() const { - switch (opcode_) { - // Nullary elementwise operations. - case HloOpcode::kConstant: - return true; - - // Unary elementwise operations. 
- case HloOpcode::kAbs: - case HloOpcode::kRoundNearestAfz: - case HloOpcode::kCeil: - case HloOpcode::kClz: - case HloOpcode::kConvert: - case HloOpcode::kBitcastConvert: - case HloOpcode::kCopy: - case HloOpcode::kCos: - case HloOpcode::kExp: - case HloOpcode::kExpm1: - case HloOpcode::kFloor: - case HloOpcode::kImag: - case HloOpcode::kIsFinite: - case HloOpcode::kLog: - case HloOpcode::kLog1p: - case HloOpcode::kNot: - case HloOpcode::kNegate: - case HloOpcode::kReal: - case HloOpcode::kReducePrecision: - case HloOpcode::kSign: - case HloOpcode::kSin: - case HloOpcode::kTanh: - CHECK_EQ(1, operand_count()); - return true; - - // Binary elementwise operations, the same as in IsElementwiseBinary(). - case HloOpcode::kAdd: - case HloOpcode::kAtan2: - case HloOpcode::kComplex: - case HloOpcode::kDivide: - case HloOpcode::kEq: - case HloOpcode::kGe: - case HloOpcode::kGt: - case HloOpcode::kLe: - case HloOpcode::kLt: - case HloOpcode::kMaximum: - case HloOpcode::kMinimum: - case HloOpcode::kMultiply: - case HloOpcode::kNe: - case HloOpcode::kPower: - case HloOpcode::kRemainder: - case HloOpcode::kSubtract: - case HloOpcode::kAnd: - case HloOpcode::kOr: - case HloOpcode::kShiftLeft: - case HloOpcode::kShiftRightArithmetic: - case HloOpcode::kShiftRightLogical: - CHECK_EQ(2, operand_count()); - return true; - - // Ternary elementwise operations. - case HloOpcode::kSelect: - return !ShapeUtil::IsTuple(shape_); - case HloOpcode::kClamp: - return true; - - // Other operations. 
- case HloOpcode::kRng: - case HloOpcode::kMap: - return true; - case HloOpcode::kFusion: - if (fusion_kind() != FusionKind::kLoop) { - return false; - } - for (auto* fused : fused_instructions()) { - if (fused->opcode() != HloOpcode::kParameter && - !fused->IsElementwise()) { - return false; - } - } - return true; - - default: - return false; - } + return IsElementwiseImpl(absl::nullopt); } bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const { @@ -3187,54 +2680,8 @@ bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const { return !ShapeUtil::SameDimensions(shape(), operand(operand_idx)->shape()); } -namespace { -bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction, - const HloInstruction* operand) { - std::vector operand_indices = instruction->OperandIndices(operand); - return std::all_of( - operand_indices.begin(), operand_indices.end(), - [instruction](int64 operand_index) { - return instruction->IsElementwiseOnOperand(operand_index); - }); -} -} // namespace - bool HloInstruction::IsElementwiseOnOperand(int64 operand_idx) const { - // For all instructions other than kFusion, being elementwise on one of the - // operands is equivalent to being elementwise on all the operands. - if (opcode() != HloOpcode::kFusion) { - return IsElementwise(); - } - - CHECK_EQ(HloOpcode::kFusion, opcode()); - if (fusion_kind() != FusionKind::kLoop) { - return false; - } - - // A loop-fusion is elementwise on an operand if all operations (computed - // using BFS) between the operand and the fused root are elementwise. 
- std::deque worklist; - std::unordered_set visited; - worklist.push_back(fused_parameter(operand_idx)); - visited.insert(fused_parameter(operand_idx)); - while (!worklist.empty()) { - HloInstruction* operand = worklist.front(); - worklist.pop_front(); - for (HloInstruction* user : operand->users()) { - CHECK_GE(user->unique_id(), 0); - if (ContainsKey(visited, user)) { - continue; - } - if (user->IsElementwise() || - IsInstructionElementwiseOnOperand(user, operand)) { - worklist.push_back(user); - visited.insert(user); - } else { - return false; - } - } - } - return true; + return IsElementwiseImpl(operand_idx); } // A helper class for memoized, recursive computation of HloOpcode::kFusion @@ -3256,8 +2703,10 @@ class HloInstruction::FusionReusesParamElements { static UseKind ComputeInternal( int64 i, const HloInstruction& hlo, tensorflow::gtl::FlatMap* cache) { - if (hlo.opcode_ == HloOpcode::kParameter && hlo.parameter_number_ == i) { - return UseKind::kUse; + if (auto hlo_param = DynCast(&hlo)) { + if (hlo_param->parameter_number() == i) { + return UseKind::kUse; + } } auto p = cache->emplace(&hlo, UseKind{}); @@ -3311,10 +2760,13 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const { case HloOpcode::kTranspose: return UseKind::kUsePermutingElements; case HloOpcode::kPad: - case HloOpcode::kReduce: // Pad reuses the padding value but not the padded array elements. - // Reduce reuses the init value but not the operand array elements. return i > 0 ? UseKind::kReuse : UseKind::kUsePermutingElements; + case HloOpcode::kReduce: + // Reduce reuses the init values but not the operand array elements. + return i >= Cast(this)->input_count() + ? UseKind::kReuse + : UseKind::kUsePermutingElements; case HloOpcode::kFusion: // Uses the memoizing, recursive computation defined above. 
return FusionReusesParamElements::Compute(i, *fused_expression_root()); @@ -3379,7 +2831,7 @@ StatusOr StringToFusionKind( if (kind_name == "kCustom") { return HloInstruction::FusionKind::kCustom; } - return InvalidArgument("Unknown fusion kind: %s", kind_name.c_str()); + return InvalidArgument("Unknown fusion kind: %s", kind_name); } string PaddingConfigToString(const PaddingConfig& padding) { @@ -3388,7 +2840,7 @@ string PaddingConfigToString(const PaddingConfig& padding) { [](const PaddingConfig::PaddingConfigDimension& dim) { return dim.interior_padding() != 0; }); - return Join( + return StrJoin( padding.dimensions(), "x", [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) { StrAppend( @@ -3412,49 +2864,19 @@ string OpMetadataToString(const OpMetadata& metadata) { if (metadata.source_line() != 0) { result.push_back(StrCat("source_line=", metadata.source_line())); } - return Join(result, " "); + return StrJoin(result, " "); } string RandomDistributionToString(const RandomDistribution& distribution) { - return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution)); -} - -StatusOr StringToRandomDistribution(const string& name) { - static std::unordered_map* map = [] { - static auto* map = new std::unordered_map; - for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) { - if (RandomDistribution_IsValid(i)) { - auto value = static_cast(i); - (*map)[RandomDistributionToString(value)] = value; - } - } - return map; - }(); - auto found = map->find(tensorflow::str_util::Lowercase(name)); - if (found == map->end()) { - return InvalidArgument("Unknown distribution"); - } - return found->second; + return absl::AsciiStrToLower(RandomDistribution_Name(distribution)); } -std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) { - return os << ToString(kind); +string PrecisionToString(const PrecisionConfigProto::Precision& precision) { + return absl::AsciiStrToLower(PrecisionConfigProto::Precision_Name(precision)); } 
-string HloInstruction::ConvolutionDimensionNumbersToString() const { - string result; - if (convolution_dimension_numbers_ == nullptr) { - return result; - } - const ConvolutionDimensionNumbers& dnums = *convolution_dimension_numbers_; - // Show the given dimension labels in order of major to minor based on the - // shape's layout. - const auto append_dims = [&](const std::vector& dims, - const Shape& shape) { - CHECK_EQ(dims.size(), ShapeUtil::Rank(shape)); - StrAppend(&result, Join(dims, "")); - }; - +string ConvolutionDimensionNumbersToString( + const ConvolutionDimensionNumbers& dnums) { // lhs_dims[i] is the symbol of the logical dimension i for the lhs // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b". std::vector lhs_dims(2 + dnums.input_spatial_dimensions().size()); @@ -3478,19 +2900,8 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const { output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i); } - result += "dim_labels="; - append_dims(lhs_dims, operand(0)->shape()); - result += "_"; - append_dims(rhs_dims, operand(1)->shape()); - result += "->"; - - // A convolution can be represented as a kConvolution HLO or as a CustomCall - // that returns a tuple, the first element of which is the result of the - // convolution. - Shape this_shape = - ShapeUtil::IsTuple(shape()) ? 
shape().tuple_shapes(0) : shape(); - append_dims(output_dims, this_shape); - return result; + return StrCat(StrJoin(lhs_dims, ""), "_", StrJoin(rhs_dims, ""), "->", + StrJoin(output_dims, "")); } string HloInstruction::DotDimensionNumbersToString() const { @@ -3501,39 +2912,80 @@ string HloInstruction::DotDimensionNumbersToString() const { const DotDimensionNumbers& dnums = *dot_dimension_numbers_; if (!dnums.lhs_batch_dimensions().empty()) { result.push_back(StrCat("lhs_batch_dims={", - Join(dnums.lhs_batch_dimensions(), ","), "}")); + StrJoin(dnums.lhs_batch_dimensions(), ","), "}")); } result.push_back(StrCat("lhs_contracting_dims={", - Join(dnums.lhs_contracting_dimensions(), ","), "}")); + StrJoin(dnums.lhs_contracting_dimensions(), ","), + "}")); if (!dnums.rhs_batch_dimensions().empty()) { result.push_back(StrCat("rhs_batch_dims={", - Join(dnums.rhs_batch_dimensions(), ","), "}")); + StrJoin(dnums.rhs_batch_dimensions(), ","), "}")); } result.push_back(StrCat("rhs_contracting_dims={", - Join(dnums.rhs_contracting_dimensions(), ","), "}")); + StrJoin(dnums.rhs_contracting_dimensions(), ","), + "}")); - return Join(result, ", "); + return StrJoin(result, ", "); } -string HloInstruction::GatherDimensionNumbersToString() const { - CHECK_NE(gather_dimension_numbers_.get(), nullptr); - string output_window_dims = - StrCat("output_window_dims={", - Join(gather_dimension_numbers_->output_window_dims(), ","), "}"); - string elided_window_dims = - StrCat("elided_window_dims={", - Join(gather_dimension_numbers_->elided_window_dims(), ","), "}"); - string gather_dims_to_operand_dims = StrCat( - "gather_dims_to_operand_dims={", - Join(gather_dimension_numbers_->gather_dims_to_operand_dims(), ","), "}"); - string index_vector_dim = StrCat( - "index_vector_dim=", gather_dimension_numbers_->index_vector_dim()); +StatusOr StringToRandomDistribution(const string& name) { + static std::unordered_map* map = [] { + static auto* map = new std::unordered_map; + for (int i = 0; i < 
RandomDistribution_ARRAYSIZE; i++) { + if (RandomDistribution_IsValid(i)) { + auto value = static_cast(i); + (*map)[RandomDistributionToString(value)] = value; + } + } + return map; + }(); + auto found = map->find(absl::AsciiStrToLower(name)); + if (found == map->end()) { + return InvalidArgument("Unknown distribution"); + } + return found->second; +} + +string HloInstruction::PrecisionConfigToString() const { + if (precision_config_.operand_precision().empty()) { + return ""; + } + return StrCat( + "operand_precision={", + StrJoin(precision_config_.operand_precision(), ",", + [](string* out, int32 precision) { + CHECK(PrecisionConfigProto::Precision_IsValid(precision)) + << precision; + StrAppend(out, PrecisionToString( + static_cast( + precision))); + }), + "}"); +} + +StatusOr StringToPrecision( + const string& name) { + static std::unordered_map* map = [] { + static auto* map = + new std::unordered_map; + for (int i = 0; i < PrecisionConfigProto::Precision_ARRAYSIZE; i++) { + if (PrecisionConfigProto::Precision_IsValid(i)) { + auto value = static_cast(i); + (*map)[PrecisionToString(value)] = value; + } + } + return map; + }(); + auto found = map->find(absl::AsciiStrToLower(name)); + if (found == map->end()) { + return InvalidArgument("Unknown distribution"); + } + return found->second; +} - return Join>( - {output_window_dims, elided_window_dims, gather_dims_to_operand_dims, - index_vector_dim}, - ", "); +std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) { + return os << ToString(kind); } bool HloInstruction::CouldBeBitcast() const { @@ -3547,6 +2999,31 @@ bool HloInstruction::CouldBeBitcast() const { } } +Status HloInstruction::GetBackendConfigInternal( + tensorflow::protobuf::Message* proto) const { + proto->Clear(); + + // Empty string does not parse as valid JSON, but it's a valid backend config, + // corresponding to the empty proto. 
+ if (backend_config_.empty()) { + return Status::OK(); + } + return tensorflow::HumanReadableJsonToProto(backend_config_, proto); +} + +Status HloInstruction::set_backend_config( + const tensorflow::protobuf::Message& proto) { + TF_ASSIGN_OR_RETURN(backend_config_, BackendConfigToRawString(proto)); + return Status::OK(); +} + +/* static */ StatusOr HloInstruction::BackendConfigToRawString( + const tensorflow::protobuf::Message& proto) { + string ret; + TF_RETURN_IF_ERROR(tensorflow::ProtoToHumanReadableJson(proto, &ret)); + return ret; +} + HloModule* HloInstruction::GetModule() const { if (parent_) { return parent_->parent(); @@ -3564,21 +3041,291 @@ void HloInstruction::set_outer_dimension_partitions( outer_dimension_partitions_ = outer_dimension_partitions; } +// TODO(b/80131774): Remove these temporary methods after transition. +int64 HloInstruction::feature_index() const { + return Cast(this)->feature_index(); +} + +float HloInstruction::epsilon() const { + return Cast(this)->epsilon(); +} + +FftType HloInstruction::fft_type() const { + return Cast(this)->fft_type(); +} + +const std::vector& HloInstruction::fft_length() const { + return Cast(this)->fft_length(); +} + +int64 HloInstruction::channel_id() const { + return Cast(this)->channel_id(); +} + +int64 HloInstruction::concatenate_dimension() const { + return Cast(this)->concatenate_dimension(); +} + +bool HloInstruction::IsRank2Transpose() const { + auto transpose = DynCast(this); + return transpose != nullptr && transpose->IsRank2Transpose(); +} + +int64 HloInstruction::slice_starts(int64 dimension) const { + return Cast(this)->slice_starts(dimension); +} + +const std::vector& HloInstruction::slice_starts() const { + return Cast(this)->slice_starts(); +} + +int64 HloInstruction::slice_limits(int64 dimension) const { + return Cast(this)->slice_limits(dimension); +} + +const std::vector& HloInstruction::slice_limits() const { + return Cast(this)->slice_limits(); +} + +int64 
HloInstruction::slice_strides(int64 dimension) const { + return Cast(this)->slice_strides(dimension); +} + +const std::vector& HloInstruction::slice_strides() const { + return Cast(this)->slice_strides(); +} + +bool HloInstruction::IsInPlaceSlice() const { + return Cast(this)->IsInPlaceSlice(); +} + +const Literal& HloInstruction::literal() const { + return Cast(this)->literal(); +} + +bool HloInstruction::IsConstant() const { + return DynCast(this) != nullptr; +} + void HloInstruction::RelayoutConstant(const Layout& new_layout, const ShapeIndex& shape_index) { - CHECK_EQ(opcode(), HloOpcode::kConstant); - Shape* mutable_array_subshape = - ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index); - CHECK(ShapeUtil::IsArray(*mutable_array_subshape)); + Cast(this)->RelayoutConstant(new_layout, shape_index); +} + +string HloInstruction::TracingTag() const { + return Cast(this)->TracingTag(); +} - // Normally array_subshape will always have a layout, but this invariant is - // temporarily broken in LayoutAssignment::AssignLayouts. +HloInstruction* HloInstruction::AddFusionOperand(HloInstruction* new_operand) { + return Cast(this)->AddFusionOperand(new_operand); +} - if (!mutable_array_subshape->has_layout() || - !LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) { - literal_ = literal_->Relayout(new_layout, shape_index); - *mutable_array_subshape->mutable_layout() = new_layout; +// Delegates to HloFusionInstruction::MergeFusionInstruction. +void HloInstruction::MergeFusionInstruction( + HloInstruction* instruction_to_merge) { + return Cast(this)->MergeFusionInstruction( + Cast(instruction_to_merge)); +} + +// Delegates to HloFusionInstruction::MergeFusionInstructionIntoMultiOutput. 
+void HloInstruction::MergeFusionInstructionIntoMultiOutput( + HloInstruction* instruction_to_merge) { + return Cast(this) + ->MergeFusionInstructionIntoMultiOutput( + Cast(instruction_to_merge)); +} + +HloInstruction* HloInstruction::FuseInstruction( + HloInstruction* instruction_to_fuse) { + return Cast(this)->FuseInstruction(instruction_to_fuse); +} + +HloInstruction* HloInstruction::FuseInstructionIntoMultiOutput( + HloInstruction* instruction_to_fuse) { + return Cast(this)->FuseInstructionIntoMultiOutput( + instruction_to_fuse); +} + +HloComputation* HloInstruction::fused_instructions_computation() const { + return Cast(this)->fused_instructions_computation(); +} + +HloInstruction* HloInstruction::fused_expression_root() const { + return Cast(this)->fused_expression_root(); +} + +const tensorflow::gtl::iterator_range>::const_iterator>> +HloInstruction::fused_instructions() const { + return Cast(this)->fused_instructions(); +} + +const tensorflow::gtl::iterator_range< + UnwrappingIterator>::iterator>> +HloInstruction::fused_instructions() { + return Cast(this)->fused_instructions(); +} + +int64 HloInstruction::fused_instruction_count() const { + return Cast(this)->fused_instruction_count(); +} + +HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const { + return Cast(this)->fused_parameter(parameter_number); +} + +const std::vector& HloInstruction::fused_parameters() const { + return Cast(this)->fused_parameters(); +} + +const bool HloInstruction::IsMultiOutputFusion() const { + const HloFusionInstruction* fusion = DynCast(this); + return fusion != nullptr && fusion->IsMultiOutputFusion(); +} + +HloInstruction::FusionKind HloInstruction::fusion_kind() const { + return Cast(this)->fusion_kind(); +} + +void HloInstruction::set_fusion_kind(FusionKind kind) { + return Cast(this)->set_fusion_kind(kind); +} + +RandomDistribution HloInstruction::random_distribution() const { + return Cast(this)->random_distribution(); +} + +int64 
HloInstruction::parameter_number() const { + return Cast(this)->parameter_number(); +} + +int64 HloInstruction::tuple_index() const { + return Cast(this)->tuple_index(); +} + +int32 HloInstruction::exponent_bits() const { + return Cast(this)->exponent_bits(); +} + +int32 HloInstruction::mantissa_bits() const { + return Cast(this)->mantissa_bits(); +} + +string HloInstruction::infeed_config() const { + return Cast(this)->infeed_config(); +} + +void HloInstruction::set_infeed_config(const string& config) { + return Cast(this)->set_infeed_config(config); +} + +const Shape& HloInstruction::outfeed_shape() const { + return Cast(this)->outfeed_shape(); +} + +const string& HloInstruction::outfeed_config() const { + return Cast(this)->outfeed_config(); +} + +const std::vector& HloInstruction::replica_groups() const { + return Cast(this)->replica_groups(); +} + +const std::vector>& +HloInstruction::source_target_pairs() const { + return Cast(this)->source_target_pairs(); +} + +string HloInstruction::cross_replica_sum_barrier() const { + return Cast(this)->cross_replica_sum_barrier(); +} + +void HloInstruction::set_cross_replica_sum_barrier(const string& barrier) { + return Cast(this)->set_cross_replica_sum_barrier( + barrier); +} + +absl::optional HloInstruction::all_reduce_id() const { + return Cast(this)->all_reduce_id(); +} + +const ConvolutionDimensionNumbers& +HloInstruction::convolution_dimension_numbers() const { + if (auto convolution = DynCast(this)) { + return convolution->convolution_dimension_numbers(); + } + if (auto custom_call = DynCast(this)) { + return custom_call->convolution_dimension_numbers(); + } + LOG(FATAL) << "Unimplemented method."; +} + +void HloInstruction::set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums) { + if (auto convolution = DynCast(this)) { + convolution->set_convolution_dimension_numbers(dnums); + } else if (auto custom_call = DynCast(this)) { + custom_call->set_convolution_dimension_numbers(dnums); + } else 
{ + LOG(FATAL) << "Unimplemented method."; } } +int64 HloInstruction::feature_group_count() const { + if (auto convolution = DynCast(this)) { + return convolution->feature_group_count(); + } + return Cast(this)->feature_group_count(); +} + +void HloInstruction::set_feature_group_count(int64 feature_group_count) { + Cast(this)->set_feature_group_count( + feature_group_count); +} + +HloComputation* HloInstruction::select() const { + return Cast(this)->select(); +} + +HloComputation* HloInstruction::scatter() const { + return Cast(this)->scatter(); +} + +void HloInstruction::set_select(HloComputation* computation) { + return Cast(this)->set_select(computation); +} + +void HloInstruction::set_scatter(HloComputation* computation) { + return Cast(this)->set_scatter(computation); +} + +const string& HloInstruction::custom_call_target() const { + return Cast(this)->custom_call_target(); +} + +const PaddingConfig& HloInstruction::padding_config() const { + return Cast(this)->padding_config(); +} + +int64 HloInstruction::slice_sizes(int64 dimension) const { + return Cast(this)->slice_sizes(dimension); +} + +const std::vector& HloInstruction::dynamic_slice_sizes() const { + return Cast(this)->dynamic_slice_sizes(); +} + +const GatherDimensionNumbers& HloInstruction::gather_dimension_numbers() const { + return Cast(this)->gather_dimension_numbers(); +} + +absl::Span HloInstruction::gather_slice_sizes() const { + return Cast(this)->gather_slice_sizes(); +} + +const ScatterDimensionNumbers& HloInstruction::scatter_dimension_numbers() + const { + return Cast(this)->scatter_dimension_numbers(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 6df97c40bad3893e8707b090377d42a98f249d8b..cca134e8b45f89a1c395c791029ee68eeec3c8f0 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -32,8 +32,13 @@ limitations under the 
License. #include #include +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/iterator_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -45,13 +50,11 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/gtl/iterator_range.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -100,6 +103,7 @@ class HloPrintOptions { return HloPrintOptions() .set_print_subcomputation_mode(PrintSubcomputationMode::kFullBodies) .set_print_metadata(false) + .set_print_backend_config(false) .set_compact_operands(true) .set_print_operand_shape(true) .set_print_program_shape(false) @@ -181,7 +185,7 @@ class HloPrintOptions { return print_subcomputation_mode_; } bool print_metadata() const { return print_metadata_; } - bool print_backend_config() const { return print_metadata_; } + bool print_backend_config() const { return print_backend_config_; } bool compact_operands() const { return compact_operands_; } bool print_operand_shape() const { return print_operand_shape_; } bool print_program_shape() const { return print_program_shape_; } @@ -219,7 +223,7 @@ class CanonicalNameMap { return iter->second; } - string new_name = 
tensorflow::strings::StrCat("tmp_", index++); + string new_name = absl::StrCat("tmp_", index++); canonical_name_map[old_name] = new_name; return new_name; } @@ -321,7 +325,7 @@ class HloInstruction { kCustom, }; - ~HloInstruction(); + virtual ~HloInstruction(); // Creates an instruction from the given proto. Arguments: // @@ -345,6 +349,10 @@ class HloInstruction { static std::unique_ptr CreateConstant( std::unique_ptr literal); + // Creates an Iota instruction. + static std::unique_ptr CreateIota(const Shape& shape, + int64 iota_dimension); + // Creates a get tuple element instruction. static std::unique_ptr CreateGetTupleElement( const Shape& shape, HloInstruction* operand, int64 index); @@ -357,7 +365,7 @@ class HloInstruction { // random numbers from a given distribution. static std::unique_ptr CreateRng( const Shape& shape, RandomDistribution distribution, - tensorflow::gtl::ArraySlice parameters); + absl::Span parameters); // Creates a unary instruction (one operand). // Precondition: opcode must be a legitimate unary operation. @@ -384,27 +392,27 @@ class HloInstruction { // Precondition: opcode must be a legitimate variadic operation. static std::unique_ptr CreateVariadic( const Shape& shape, HloOpcode opcode, - tensorflow::gtl::ArraySlice operands); + absl::Span operands); // Creates a map instruction, where the computation (given by the handle) is // applied element-wise to every element in operands (across the operands, - // at a given index) with the same `static_operands`. + // at a given index) static std::unique_ptr CreateMap( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloComputation* map_computation, - tensorflow::gtl::ArraySlice static_operands = {}); + const Shape& shape, absl::Span operands, + HloComputation* map_computation); // Creates a convolution op, where rhs is the convolutional filter // and window describes how the filter is applied to lhs. 
static std::unique_ptr CreateConvolve( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, const Window& window, - const ConvolutionDimensionNumbers& dimension_numbers); + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1); // Creates an FFT op, of the type indicated by fft_type. static std::unique_ptr CreateFft( const Shape& shape, HloInstruction* operand, FftType fft_type, - tensorflow::gtl::ArraySlice fft_length); + absl::Span fft_length); // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch // dimensions specified in 'dimension_numbers'. @@ -425,10 +433,52 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits); - // Creates a cross replica sum op. + // Creates a cross replica reduction op. + // + // `reduction_computation`: the reduction function. + // + // `replica_groups`: each ReplicaGroup contains a list of replica id. If + // empty, all replicas belong to one group in the order of 0 - (n-1). + // Allreduce will be applied within subgroups. + // For example, we have 4 replicas, then replica_groups={{0,2},{1,3}} means, + // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1. + // + // `all_reduce_id`: for Allreduce nodes from different modules, if they have + // the same all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will + // not be applied cross modules. + // + // TODO(b/79737069): Rename this to AllReduce. static std::unique_ptr CreateCrossReplicaSum( - const Shape& shape, - tensorflow::gtl::ArraySlice operands); + const Shape& shape, absl::Span operands, + HloComputation* reduce_computation, + const std::vector& replica_groups, + absl::string_view barrier, const absl::optional& all_reduce_id); + + // This op handles the communication of an Alltoall operation. On each core, + // the operands are N ops in the same shape, where N is the number of cores + // participating the Alltoall. 
Then the N operands are scattered to N cores, + // e.g., the ith operand is sent to the ith core. Then each core gathers the + // received data into a tuple. + // + // - `replica_groups`: each ReplicaGroup contains a list of replica id. If + // empty, all replicas belong to one group in the order of 0 - (n-1). Alltoall + // will be applied within subgroups in the specified order. For example, + // replica groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied + // within replica 1, 2, 3, and in the gather phase, the received blocks will + // be concatenated in the order of 1, 2, 3; another Alltoall will be applied + // within replica 4, 5, 0, and the concatenation order is 4, 5, 0. + static std::unique_ptr CreateAllToAll( + const Shape& shape, absl::Span operands, + const std::vector& replica_groups); + + // Creates a communitation instructions that permutes data cross replicas. + // Data is sent/received according to the (source_replica_id, + // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a + // target_replica_id in any pair, the output on that replica is a tensor + // conssits of 0(s) in `shape`. + static std::unique_ptr CreateCollectivePermute( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs); // Creates a conversion instruction, where operand is the data to convert and // shape is the target shape for the conversion. @@ -441,52 +491,60 @@ class HloInstruction { const Shape& shape, HloInstruction* operand); // Creates an infeed instruction, which reads data of the given shape from the - // Infeed interface of the device. - static std::unique_ptr CreateInfeed(const Shape& shape, - const string& config); - - // Creates an outfeed instruction, which outputs data. + // Infeed interface of the device. infeed_shape is the shape of the data + // received from the infeed *not* the shape of the infeed instruction which + // is a tuple containing the infeed_shape and the TOKEN. 
+ static std::unique_ptr CreateInfeed( + const Shape& infeed_shape, HloInstruction* token_operand, + const string& config); + + // Creates an outfeed instruction, which outputs data. outfeed_shape is the + // shape of the data being outfed *not* the shape of the outfeed instruction + // which is a TOKEN. static std::unique_ptr CreateOutfeed( - const Shape& shape, HloInstruction* operand, - tensorflow::StringPiece outfeed_config); + const Shape& outfeed_shape, HloInstruction* operand, + HloInstruction* token_operand, absl::string_view outfeed_config); // Creates an asynchronous send instruction with the given channel id, which // initiates sending the operand data to a unique receive instruction in - // another computation that has the same channel id. - static std::unique_ptr CreateSend(HloInstruction* operand, - int64 channel_id); + // another computation that has the same channel id. If is_host_transfer is + // true, then this Send operation transfers data to the host. + static std::unique_ptr CreateSend( + HloInstruction* operand, HloInstruction* token, int64 channel_id, + bool is_host_transfer = false); // Blocks until data transfer for the Send instruction (operand) is complete. // The operand must be kSend. static std::unique_ptr CreateSendDone( - HloInstruction* operand); + HloInstruction* operand, bool is_host_transfer = false); // Creates an asynchronous receive instruction with the given channel id, // which allocates resources to receive data of the given shape from a unique - // send instruction in another computation that has the same channel id. - static std::unique_ptr CreateRecv(const Shape& shape, - int64 channel_id); + // send instruction in another computation that has the same channel id. If + // is_host_transfer is true, then this Send operation transfers data from the + // host. 
+ static std::unique_ptr CreateRecv( + const Shape& shape, HloInstruction* token, int64 channel_id, + bool is_host_transfer = false); // Blocks until data transfer for the Recv instruction (operand) is complete // and returns the receive buffer. The operand must be kRecv. static std::unique_ptr CreateRecvDone( - HloInstruction* operand); + HloInstruction* operand, bool is_host_transfer = false); // Creates a slice instruction, where the operand is sliced by the given // start/limit indices. static std::unique_ptr CreateSlice( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); + absl::Span start_indices, + absl::Span limit_indices, absl::Span strides); // Creates a slice instruction, where the first operand is sliced by // start indices specified in the second operand, and by size specified in // 'slice_sizes'. static std::unique_ptr CreateDynamicSlice( const Shape& shape, HloInstruction* operand, - HloInstruction* start_indices, - tensorflow::gtl::ArraySlice slice_sizes); + HloInstruction* start_indices, absl::Span slice_sizes); // Creates a dynamic update slice instruction, which updates a slice // of 'operand' with 'update' and 'start_indices'. @@ -497,19 +555,36 @@ class HloInstruction { // Creates a concatenate instruction, where the operands are concatenated on // the provided dimension. static std::unique_ptr CreateConcatenate( - const Shape& shape, tensorflow::gtl::ArraySlice operands, + const Shape& shape, absl::Span operands, int64 dimension); // Creates a reduce instruction, where the computation (given by the handle) - // is applied successively to every element in operand. 
That is, if f is the - // function to apply (which either takes 2 [accumulator, value] or 3 - // [accumulator, index, value] arguments) and init is a reduction operator - // specified initial value (for example, 0 for addition), then this operation - // will compute: - // f(f(init, [index0], value0), [index1], value1), ...) + // is applied successively to every element in operand. For example, let f be + // the function to apply, which takes 2 arguments, an accumulator and the + // current value. Let init be an initial value (which is normally chosen to be + // the identity element for f, e.g. 0 if f is addition). + // Then the reduce HLO will compute: + // f(f(init, value0), value1), ...) static std::unique_ptr CreateReduce( const Shape& shape, HloInstruction* operand, HloInstruction* init_value, - tensorflow::gtl::ArraySlice dimensions_to_reduce, + absl::Span dimensions_to_reduce, + HloComputation* reduce_computation); + + // A more general, multiple-argument version of the above. + // The function to apply, f, now takes N arguments: + // [accumulator0, accumulator1, ..., accumulatorN, value0, value1, ..., + // init_valueN], and returns an N-tuple. The performed computation is (for + // commutative and associative f operators) equivalent to: + // + // f_1 = f(init0, ... initN, input0.value0, ..., inputN.value0) + // f_2 = f(f_1.tuple_element(0), ..., f_1.tuple_element(N), input0.value1, + // ..., inputN.value1) + // ... + // TODO(b/112040122): Add support to this in HLO passes and in backends. + static std::unique_ptr CreateReduce( + const Shape& shape, absl::Span operands, + absl::Span init_values, + absl::Span dimensions_to_reduce, HloComputation* reduce_computation); // Creates a reduce-window instruction, where the computation (given @@ -546,7 +621,7 @@ class HloInstruction { // Creates a broadcast instruction. 
static std::unique_ptr CreateBroadcast( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice broadcast_dimensions); + absl::Span broadcast_dimensions); // Creates a sequence of instructions that performs an explicit broadcast of // the operand to the target shape. @@ -576,7 +651,12 @@ class HloInstruction { // Creates a transpose instruction which permutes the operand dimensions. static std::unique_ptr CreateTranspose( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice dimensions); + absl::Span dimensions); + + // Creates a sort op, with a keys operand, and an optional values operand. + static std::unique_ptr CreateSort( + const Shape& shape, int64 dimension, HloInstruction* keys, + HloInstruction* values = nullptr); // Creates a while instruction, given a condition computation, a body // computation, and the initial value for the input of the computations. For @@ -595,9 +675,15 @@ class HloInstruction { static std::unique_ptr CreateGather( const Shape& shape, HloInstruction* operand, - HloInstruction* gather_indices, + HloInstruction* start_indices, const GatherDimensionNumbers& gather_dim_numbers, - tensorflow::gtl::ArraySlice window_bounds); + absl::Span slice_sizes); + + static std::unique_ptr CreateScatter( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers); // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. @@ -615,44 +701,44 @@ class HloInstruction { static std::unique_ptr CreateFusion( const Shape& shape, FusionKind fusion_kind, - tensorflow::gtl::ArraySlice operands, + absl::Span operands, HloComputation* fusion_computation); // Creates a call instruction that applies the given computation on the given // operands. "shape" is the resultant shape. 
static std::unique_ptr CreateCall( - const Shape& shape, tensorflow::gtl::ArraySlice operands, + const Shape& shape, absl::Span operands, HloComputation* computation); // Creates a custom call instruction that applies the given custom call target // to the given operands. "shape" is the resultant shape. static std::unique_ptr CreateCustomCall( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - tensorflow::StringPiece custom_call_target); - - // Creates a HostCompute instruction, which records host-side control and - // data dependencies for use in instruction scheduling. - static std::unique_ptr CreateHostCompute( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - tensorflow::StringPiece channel_name, const int64 cost_estimate_ns); + const Shape& shape, absl::Span operands, + absl::string_view custom_call_target); // Creates a tuple instruction with the given elements. This is a convenience // wrapper around CreateVariadic. static std::unique_ptr CreateTuple( - tensorflow::gtl::ArraySlice elements); + absl::Span elements); // Creates a reverse instruction, which reverses the order of the elements // in the specified dimensions. static std::unique_ptr CreateReverse( const Shape& shape, HloInstruction* operand, - tensorflow::gtl::ArraySlice dimensions); + absl::Span dimensions); + + // Creates an AfterAll instruction used for joining or creating new values of + // token type which thread through side-effecting operations. Operands must + // all be tokens, and there must be at least one operand. + static std::unique_ptr CreateAfterAll( + absl::Span operands); - // Creates an instance of GatherDimensionNumbers. - static GatherDimensionNumbers MakeGatherDimNumbers( - tensorflow::gtl::ArraySlice output_window_dims, - tensorflow::gtl::ArraySlice elided_window_dims, - tensorflow::gtl::ArraySlice gather_dims_to_operand_dims, - int64 index_vector_dim); + // Creates an AfterAll instruction which creates a token type out of thin air + // (no operands).
This is a separate method from CreateAfterAll to facilitate + // the removal of operand-less AfterAll instructions. + // TODO(b/110532604): Remove this capability of creating a token from nothing + // when we plumb a primordial token from the entry computation. + static std::unique_ptr CreateToken(); // Returns the opcode for this instruction. HloOpcode opcode() const { return opcode_; } @@ -682,7 +768,7 @@ class HloInstruction { int64 operand_count() const { return operands_.size(); } // Returns the vector of operands of this instruction. - using InstructionVector = tensorflow::gtl::InlinedVector; + using InstructionVector = absl::InlinedVector; const InstructionVector& operands() const { return operands_; } // Returns the vector of unique operands, in the same order they are found @@ -775,21 +861,33 @@ class HloInstruction { } } + if (backend_config_ != other.backend_config_) { + return false; + } + + if (!absl::c_equal(precision_config_.operand_precision(), + other.precision_config_.operand_precision())) { + return false; + } + return IdenticalSlowPath(other, eq_computations); } // Returns whether the instruction has a constant operand. bool HasConstantOperand() const; - // Returns whether this instruction does a rank-2 transposition. - bool IsRank2Transpose() const; - // Replaces the use of this instruction in "user" with "new_producer". Note // that there might be multiple uses of this instruction in "user"; all will // be replaced. + // + // If user is a fusion instruction, this function will remove any duplicated + // operands of it which could be created due to this replacement. Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer); // Replaces the specified operand with new_operand. + // + // This function does NOT remove duplicated operands even if this instruction + // is a fusion, so that the existing operand numbers do not change.
Status ReplaceOperandWith(int64 operand_no, HloInstruction* new_operand); // Replaces all uses of this instruction with the new producer. If @@ -798,14 +896,10 @@ class HloInstruction { // // If this instruction is the root of its computation, sets the computation's // root to new_producer. - Status ReplaceAllUsesWith(HloInstruction* new_producer); - - // Detaches an instruction from its operands. That is, remove the instruction - // from each operand's user set. This should only be called prior to - // deallocating the instruction. // - // TODO(b/78305363): Make this automatic when deleting an instruction. - void DetachFromOperands(); + // If a user is a fusion instruction, this function will remove any duplicated + // operands of it which could be created due to this replacement. + Status ReplaceAllUsesWith(HloInstruction* new_producer); // Performs a postorder DFS visit using this node as the root. If // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when @@ -852,38 +946,6 @@ class HloInstruction { template Status Visit(DfsHloVisitorBase* visitor); - // Returns the literal associated with this instruction. - // - // Note: only constant and parameter opcodes have an associated literal. - const Literal& literal() const; - - // Returns whether there is literal associated with this instruction. - bool HasLiteral() const; - - // Returns the parameter number associated with this instruction. - // - // Note: only parameter opcodes have an associated parameter number. - int64 parameter_number() const { - CHECK_EQ(HloOpcode::kParameter, opcode_); - return parameter_number_; - } - - // Returns the dimension sizes or numbers associated with this instruction. - // - // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape, - // and reverse. - const std::vector& dimensions() const; - int64 dimensions(int64 index) const; - - // Accessor for the dimension in which a concatenate HLO should occur. 
- // Precondition: opcode() == HloOpcode::kConcatenate - int64 concatenate_dimension() const; - - // Returns the tuple index associated with this instruction. - // - // Precondition: opcode() == HloOpcode::kGetTupleElement - int64 tuple_index() const; - // Returns the first non-GetTupleElement ancestor instruction of 'hlo'. // If the first non-GTE ancestor is tuple-shaped, populates 'index' with the // (possibly nested) tuple indices used on the path from ancestor to 'hlo'. @@ -911,18 +973,6 @@ class HloInstruction { HloComputation* to_apply() const; void set_to_apply(HloComputation* to_apply); - // Returns the custom_call_target for CustomCall. - // Precondition: opcode() == HloOpcode::kCustomCall - const string& custom_call_target() const; - - // Returns the config for the Outfeed instruction. - // Precondition: opcode() == HloOpcode::kOutfeed - const string& outfeed_config() const; - - // Returns the shape for the Outfeed instruction. - // Precondition: opcode() == HloOpcode::kOutfeed - const Shape& outfeed_shape() const; - // Gets/sets the while_condition or while_body HloComputation for While. The // setters should only be called by HloModule or HloComputation methods. // @@ -932,15 +982,6 @@ class HloInstruction { void set_while_condition(HloComputation* while_condition); void set_while_body(HloComputation* while_body); - // Gets/sets the select or scatter HloComputation for SelectAndScatter. The - // setters should only be called by HloModule or HloComputation methods. - // - // Precondition: opcode() == HloOpcode::kSelectAndScatter. - HloComputation* select() const; - HloComputation* scatter() const; - void set_select(HloComputation* select); - void set_scatter(HloComputation* scatter); - // Gets/sets the true and false HloComputation for Conditional. The setters // should only be called by HloModule or HloComputation methods. 
// @@ -978,11 +1019,11 @@ class HloInstruction { string ToShortString() const; // Returns a serialized representation of this instruction. - HloInstructionProto ToProto() const; + virtual HloInstructionProto ToProto() const; // Returns a category for the HLO. This could be something like "convolution" // or "elementwise". - string ToCategory() const; + virtual string ToCategory() const; // Returns a logging instruction, if the output of this instruction is logged. // @@ -990,110 +1031,13 @@ class HloInstruction { HloInstruction* tracing() const; void set_tracing(HloInstruction* trace_instruction); - // Returns the channel id associated with the instruction. The id is - // shared between each Send/Recv pair and is globally unique to identify each - // channel. - // - // Precondition: opcode() == HloOpcode::kSend or HloOpcode::kRecv - int64 channel_id() const { return channel_id_; } - - // Returns the channel name associated with the instruction. The name is - // used to identify host Send/Recv operations. - // - // Precondition: opcode() == HloOpcode::kHostCompute - string channel_name() const { return channel_name_; } - - // Returns feature_index field associated with the instruction. The index - // represents the index of the feature dimension. - // - // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference, - // or kBatchNormGrad. - int64 feature_index() const { return feature_index_; } - - // Returns a epsilon value associated with the instruction. The is a small - // number added to the variance to avoid divide-by-zero error. - // - // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference, - // or kBatchNormGrad. - float epsilon() const { return epsilon_; } - - // Returns the infeed configuration string. The infeed configuration includes - // any metadata needed for the backend compiler (e.g., infeed buffer address) - // and is target-dependent. 
- string infeed_config() const { return infeed_config_; } - void set_infeed_config(const string& config) { infeed_config_ = config; } - - // Returns a tag to be used in tracing. - // - // Precondition: opcode() == HloOpcode::kTrace - string TracingTag() const; - - // Returns whether the instruction is a constant. - bool IsConstant() const; - // Returns true if this instruction is fused, ie contained within a fusion // instruction. bool IsFused() const; - // Returns the computation for this fused instruction. - // - // Precondition: opcode() == HloOpcode::kFusion - HloComputation* fused_instructions_computation() const; - // Returns true if this instruction can be legally fused into a fusion // instruction. - bool IsFusable() const; - - // Returns the root instruction of the fused expression contained within this - // fusion instruction. - // - // Precondition: opcode() == HloOpcode::kFusion - HloInstruction* fused_expression_root() const; - - // Returns the list of fused instructions inside this fusion instruction. The - // returned type is a range of HloInstruction*s. - // - // Precondition: opcode() == HloOpcode::kFusion - const tensorflow::gtl::iterator_range>::const_iterator>> - fused_instructions() const; - - const tensorflow::gtl::iterator_range< - UnwrappingIterator>::iterator>> - fused_instructions(); - - // Gets the number of instructions inside this fusion instruction. - // - // Precondition: opcode() == HloOpcode::kFusion - int64 fused_instruction_count() const; - - // Returns the fused parameter instruction in this fusion instruction - // corresponding to the given parameter number. - // - // Precondition: opcode() == HloOpcode::kFusion - HloInstruction* fused_parameter(int64 parameter_number) const; - - // Returns the vector of fused parameters inside this fusion instruction. 
- // - // Precondition: opcode() == HloOpcode::kFusion - const std::vector& fused_parameters() const; - - // Returns true if this instruction is a fusion instruction that generates - // multiple outputs. - const bool IsMultiOutputFusion() const { - return opcode() == HloOpcode::kFusion && - fused_expression_root()->opcode() == HloOpcode::kTuple; - } - - FusionKind fusion_kind() const { - CHECK_EQ(HloOpcode::kFusion, opcode_); - return fusion_kind_; - } - - void set_fusion_kind(FusionKind kind) { - CHECK_EQ(HloOpcode::kFusion, opcode_); - fusion_kind_ = kind; - } + bool IsFusible() const; // Returns the sharding applied to this operator. // REQUIRES: has_sharding() is true. @@ -1101,26 +1045,32 @@ class HloInstruction { CHECK(has_sharding()); return *sharding_; } + std::shared_ptr sharding_ptr() const { return sharding_; } + // Returns the sharding applied to this operator, or default_ if none exists. const HloSharding& sharding_or_default(const HloSharding& default_) const { return sharding_ ? *sharding_ : default_; } // Returns the sharding unique device, if any. - tensorflow::gtl::optional sharding_unique_device() const { + absl::optional sharding_unique_device() const { if (sharding_ == nullptr) { - return tensorflow::gtl::optional(); + return absl::optional(); } - auto device = sharding_->UniqueDevice(); - return device.ok() ? device.ValueOrDie() - : tensorflow::gtl::optional(); + return sharding_->UniqueDevice(); } // Sets the sharding of this operator. Should only be called by HloModule or // HloComputation methods. void set_sharding(const HloSharding& sharding) { - sharding_ = MakeUnique(sharding); + sharding_ = std::make_shared(sharding); } + void set_sharding(std::shared_ptr sharding) { + sharding_ = std::move(sharding); + } + void set_single_sharding(const HloSharding& sharding); // Sets a sharding that assigns the current instruction to device. 
- void set_device_sharding(int64 device); + void set_device_sharding(int64 device) { + set_single_sharding(HloSharding::AssignDevice(device)); + } // Remove any sharding from this operator. void clear_sharding() { sharding_ = nullptr; } // Return true if this operator has a sharding assigned. @@ -1150,172 +1100,6 @@ class HloInstruction { // instruction. void SetupDerivedInstruction(HloInstruction* derived_instruction) const; - // Adds a new operand the fusion instruction. - HloInstruction* AddFusionOperand(HloInstruction* new_operand); - - // Merges the fused instructions from 'instruction_to_merge' into the - // fused instruction set of 'this', updating operands as necessary. - // - // Precondition: opcode() == HloOpcode::kFusion - // Predondition: 'instruction_to_merge' must be an operand of 'this'. - void MergeFusionInstruction(HloInstruction* instruction_to_merge); - - // Merges the fused instructions from instruction_to_merge into the fused - // instruction set of 'this' and generates multioutput fusion instructions. - // All the users of instruction_to_merge will be redirected to 'this' - // instruction. instruction_to_merge will be removed from its parent - // computation. - // - // Precondition: opcode() == HloOpcode::kFusion - void MergeFusionInstructionIntoMultiOutput( - HloInstruction* instruction_to_merge); - - // Fuses the given instruction in this fusion instruction. instruction_to_fuse - // is cloned and the clone is placed in the fusion - // instruction. instruction_to_fuse is unchanged. Instruction is cloned rather - // than moved to cleanly handle the case where the instruction has a use - // outside the fusion instruction. Moving such an instruction into a fusion - // instruction would violate the single-result invariant of HLO instructions - // and significantly complicate code generation. 
- // - // Precondition: this->opcode() == HloOpcode::kFusion - HloInstruction* FuseInstruction(HloInstruction* instruction_to_fuse) { - return FuseInstructionInternal(instruction_to_fuse); - } - - // Fuses the given instruction in this fusion instruction and generate - // multioutput fusion instruction. A clone of the instruction_to_fuse will - // be part of the output of fusion instructions. The users of - // instruction_to_fuse will be redirected to this fusion instructions. - // instruction_to_fuse will be removed from its parent computation. - // - // Precondition: this->opcode() == HloOpcode::kFusion - HloInstruction* FuseInstructionIntoMultiOutput( - HloInstruction* instruction_to_fuse) { - return FuseInstructionInternal(instruction_to_fuse, /* add_output */ true); - } - - // Returns the start index in the given dimension for a slice node. - // - // Precondition: opcode() == HloOpcode::kSlice - int64 slice_starts(int64 dimension) const { - CHECK_EQ(HloOpcode::kSlice, opcode_); - return slice_starts_[dimension]; - } - const std::vector& slice_starts() const { return slice_starts_; } - - // Returns the (exclusive) limit index in the given dimension for a slice - // node. - // - // Precondition: opcode() == HloOpcode::kSlice - int64 slice_limits(int64 dimension) const { - CHECK_EQ(HloOpcode::kSlice, opcode_); - return slice_limits_[dimension]; - } - const std::vector& slice_limits() const { - CHECK_EQ(HloOpcode::kSlice, opcode_); - return slice_limits_; - } - - // Returns the stride in the given dimension for a slice node. - // - // Precondition: opcode() == HloOpcode::kSlice - int64 slice_strides(int64 dimension) const { - CHECK_EQ(HloOpcode::kSlice, opcode_); - return slice_strides_[dimension]; - } - const std::vector& slice_strides() const { return slice_strides_; } - - // Returns the flag that describes whether a slice must be lowered into an - // offset into the original operand. 
- bool IsInPlaceSlice() const { return is_in_place_slice_; } - - // Sets and returns the flag that describes whether a slice must be lowered - // into an offset into the original operand. - bool SetIsInPlaceSlice(bool value) { - is_in_place_slice_ = value; - return value; - } - - // Returns the size of the slice in the given dimension for a dynamic - // slice node. - // - // Precondition: opcode() == HloOpcode::kDynamicSlice - int64 slice_sizes(int64 dimension) const { - CHECK_EQ(HloOpcode::kDynamicSlice, opcode_); - return dynamic_slice_sizes_[dimension]; - } - const std::vector& dynamic_slice_sizes() const { - CHECK_EQ(HloOpcode::kDynamicSlice, opcode_); - return dynamic_slice_sizes_; - } - - // Returns the number of exponent bits for a reduce-precision node. - // - // Precondition: opcode() == HloOpcode::kReducePrecision - int32 exponent_bits() const { - CHECK_EQ(HloOpcode::kReducePrecision, opcode_); - return exponent_bits_; - } - - // Returns the number of mantissa bits for a reduce-precision node. - // - // Precondition: opcode() == HloOpcode::kReducePrecision - int32 mantissa_bits() const { - CHECK_EQ(HloOpcode::kReducePrecision, opcode_); - return mantissa_bits_; - } - - // Returns data on the window in a windowed operation such as - // convolution. - const Window& window() const { - CHECK(window_ != nullptr); - return *window_; - } - - // Sets the window data in a windowed operation such as convolution. - void set_window(const Window& window) { - window_ = MakeUnique(window); - } - - // Returns the padding configuration for a pad node. - // - // Precondition: opcode() == HloOpcode::kPad - const PaddingConfig& padding_config() const { - CHECK(padding_config_ != nullptr); - return *padding_config_; - } - - // Returns data on the dimension numbers used for a convolution operation, - // which may be a kConvolution instruction or a kCustomCall that implements a - // convolution. 
- const ConvolutionDimensionNumbers& convolution_dimension_numbers() const { - CHECK(convolution_dimension_numbers_ != nullptr); - return *convolution_dimension_numbers_; - } - - // Sets the convolution dimension numbers on this instruction. In general you - // shouldn't need to call this; instead, specify the convolution dimension - // numbers when you create the instruction. - void set_convolution_dimension_numbers( - const ConvolutionDimensionNumbers& dnums) { - convolution_dimension_numbers_ = - MakeUnique(dnums); - } - - FftType fft_type() const { - CHECK_EQ(HloOpcode::kFft, opcode_); - return fft_type_; - } - - const std::vector& fft_length() const { - CHECK_EQ(HloOpcode::kFft, opcode_); - return fft_length_; - } - - // Returns the dump string of the convolution dimension numbers. - string ConvolutionDimensionNumbersToString() const; - // Returns data on the dimension numbers used for a dot operation. const DotDimensionNumbers& dot_dimension_numbers() const { CHECK(dot_dimension_numbers_ != nullptr); @@ -1325,23 +1109,8 @@ class HloInstruction { // Returns the dump string of the dot dimension numbers. string DotDimensionNumbersToString() const; - const GatherDimensionNumbers& gather_dimension_numbers() const { - CHECK(gather_dimension_numbers_ != nullptr); - return *gather_dimension_numbers_; - } - - tensorflow::gtl::ArraySlice gather_window_bounds() const { - CHECK_EQ(opcode(), HloOpcode::kGather); - return gather_window_bounds_; - } - - // Returns the dump string of the gather dimension numbers. - string GatherDimensionNumbersToString() const; - - // Returns the random distribution for this rng node. - // - // Precondition: opcode() == HloOpcode::kRng - RandomDistribution random_distribution() const; + // Returns the dump string of the precision configuration. + string PrecisionConfigToString() const; // Clones the HLO instruction. The clone will have the same opcode, shape, and // operands. After creation the clone has no uses.
"this" (the instruction @@ -1353,7 +1122,7 @@ class HloInstruction { // Clones the HLO instruction as above but with new shape and operands. std::unique_ptr CloneWithNewOperands( - const Shape& shape, tensorflow::gtl::ArraySlice operands, + const Shape& shape, absl::Span new_operands, HloCloneContext* context = nullptr) const; // Returns the computations this instruction directly calls (if any). @@ -1396,6 +1165,9 @@ class HloInstruction { // Returns true if this instruction is elementwise on all its operands. bool IsElementwise() const; + // Returns true if this is a cross-module all-reduce instruction. + bool IsCrossModuleAllReduce() const; + // Returns true if this elementwise instruction implicitly broadcasts operand // `operand_idx`. // @@ -1424,9 +1196,14 @@ class HloInstruction { std::tuple, std::vector> ReshapeMerelyInsertsOrDeletes1SizedDimensions() const; - // Gets/sets the string identifier for this instruction. + // Gets the string identifier for this instruction. const string& name() const { return name_; } - void set_name(tensorflow::StringPiece name) { name_ = std::string(name); } + + // Sets the string identifier for this instruction. Name will be sanitized to + // match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*". + void SetAndSanitizeName(const string& name) { + name_ = NameUniquer::GetSanitizedName(name); + } // Use the given NameUniquer to select a unique name for the instruction based // on the instruction's existing name. @@ -1449,11 +1226,46 @@ class HloInstruction { // this field and they cannot interpret it due to its meaning being backend // specific. // - // TODO(b/78194644): Introduce structured configuration format as per - // go/xla-heuristics. - const string& backend_config() const { return backend_config_; } - void set_backend_config(string backend_config) { - backend_config_ = std::move(backend_config); + // ConfigProto should be a protobuf Message type.
+ template + StatusOr backend_config() const { + ConfigProto proto; + TF_RETURN_IF_ERROR(GetBackendConfigInternal(&proto)); + return std::move(proto); + } + Status set_backend_config(const tensorflow::protobuf::Message& proto); + + // Getter/setter for raw JSON-encoded backend config. Prefer the + // functions above that deal in proto Messages where possible. + const string& raw_backend_config_string() const { return backend_config_; } + void set_raw_backend_config_string(string config_str) { + backend_config_ = std::move(config_str); + } + + // Returns a string representation of a proto in the format used by + // raw_backend_config_string. + // + // This is morally equivalent to: + // + // HloInstruction instr; + // TF_RETURN_IF_ERROR(instr.set_backend_config(proto)); + // return instr.raw_backend_config_string(); + // + static StatusOr BackendConfigToRawString( + const tensorflow::protobuf::Message& proto); + + // Returns the information used to tell the implementation information about + // what sort of precision is requested. The meaning of the field is backend + // specific. At the moment, it is only supported for kConvolution and kDot. + // Transformations on one kDot or kConvolution to another will preserve this + // information. Transformations to other HLOs will not preserve this + // information but it is presumed that the alternate lowering is strictly + // superior. + const PrecisionConfigProto& precision_config() const { + return precision_config_; + } + void set_precision_config(const PrecisionConfigProto& precision_config) { + precision_config_ = precision_config; } // Sets the debug metadata for this instruction. @@ -1486,13 +1298,287 @@ class HloInstruction { void set_outer_dimension_partitions( const std::vector& outer_dimension_partitions); - // Change the layout for an Constant Hlo instruction to match new_layout. For - // tuple shaped constants shape_index is the path to the internal array - // subshape whose layout needs to be changed. 
+ // Old methods kept for smooth subclassing transition BEGIN. + // TODO(b/80131774): Remove this code. + + // Delegates to HloBatchNormInstruction::feature_index. + int64 feature_index() const; + + // Delegates to HloBatchNormInstruction::epsilon. + float epsilon() const; + + // Delegates to HloFftInstruction::fft_type. + FftType fft_type() const; + + // Delegates to HloFftInstruction::fft_length. + const std::vector& fft_length() const; + + // Delegates to HloSendRecvInstruction::channel_id. + int64 channel_id() const; + + // Returns the dimension sizes or numbers associated with this instruction. + virtual const std::vector& dimensions() const { + LOG(FATAL) << "Unimplemented method."; + } + virtual int64 dimensions(int64 index) const { + LOG(FATAL) << "Unimplemented method."; + } + + // Delegates to HloConcatenateInstruction::concatenate_dimension. + int64 concatenate_dimension() const; + + // Returns whether this instruction does a rank-2 transposition. + bool IsRank2Transpose() const; + + // Delegates to HloSliceInstruction::slice_start. + int64 slice_starts(int64 dimension) const; + const std::vector& slice_starts() const; + + // Delegates to HloSliceInstruction::slice_limits. + int64 slice_limits(int64 dimension) const; + const std::vector& slice_limits() const; + + // Delegates to HloSliceInstruction::slice_strides. + int64 slice_strides(int64 dimension) const; + const std::vector& slice_strides() const; + + // Delegates to HloSliceInstruction::IsInPlaceSlice. + bool IsInPlaceSlice() const; + + // Returns the literal associated with this instruction. + const Literal& literal() const; + + // Returns whether the instruction is a constant. + bool IsConstant() const; + + // Delegate to HloConstantInstruction::RelayoutConstant. void RelayoutConstant(const Layout& new_layout, const ShapeIndex& shape_index = {}); + // Delegates to HloTraceInstruction::TracingTag. + string TracingTag() const; + + // Delegates to HloFusionInstruction::AddFusionOperand. 
+ HloInstruction* AddFusionOperand(HloInstruction* new_operand); + + // Delegates to HloFusionInstruction::MergeFusionInstruction. + void MergeFusionInstruction(HloInstruction* instruction_to_merge); + + // Delegates to HloFusionInstruction::MergeFusionInstructionIntoMultiOutput. + void MergeFusionInstructionIntoMultiOutput( + HloInstruction* instruction_to_merge); + + // Delegates to HloFusionInstruction::FuseInstruction. + HloInstruction* FuseInstruction(HloInstruction* instruction_to_fuse); + + // Delegates to HloFusionInstruction::FuseInstructionIntoMultiOutput. + HloInstruction* FuseInstructionIntoMultiOutput( + HloInstruction* instruction_to_fuse); + + // Delegates to HloFusionInstruction::fused_instruction. + HloComputation* fused_instructions_computation() const; + + // Delegates to HloFusionInstruction::fused_expression_root. + HloInstruction* fused_expression_root() const; + + // Delegates to HloFusionInstruction::fused_instructions. + const tensorflow::gtl::iterator_range>::const_iterator>> + fused_instructions() const; + + const tensorflow::gtl::iterator_range< + UnwrappingIterator>::iterator>> + fused_instructions(); + + // Delegates to HloFusionInstruction::fused_instruction_count. + int64 fused_instruction_count() const; + + // Delegates to HloFusionInstruction::fused_parameter. + HloInstruction* fused_parameter(int64 parameter_number) const; + + // Delegates to HloFusionInstruction::fused_parameters. + const std::vector& fused_parameters() const; + + // Returns true if this instruction is a fusion instruction that generates + // multiple outputs. + const bool IsMultiOutputFusion() const; + + // Delegates to HloFusionInstruction::fusion_kind. + FusionKind fusion_kind() const; + + // Delegates to HloFusionInstruction::set_fusion_kind. + void set_fusion_kind(FusionKind kind); + + // Delegates to HloRngInstruction::random_distribution. + RandomDistribution random_distribution() const; + + // Delegates to HloParameterInstruction::parameter_number. 
+ int64 parameter_number() const; + + // Delegates to HloGetTupleElementInstruction::tuple_index. + int64 tuple_index() const; + + // Delegates to HloReducePrecisionInstruction::exponent_bits. + int32 exponent_bits() const; + + // Delegates to HloReducePrecisionInstruction::mantissa_bits. + int32 mantissa_bits() const; + + // Delegates to HloInfeedInstruction::infeed_config. + string infeed_config() const; + + // Delegates to HloInfeedInstruction::set_infeed_config. + void set_infeed_config(const string& config); + + // Returns the config for the Outfeed instruction. + const string& outfeed_config() const; + + // Returns the shape for the Outfeed instruction. + const Shape& outfeed_shape() const; + + // Delegates to HloCollectiveInstruction::replica_groups. + const std::vector& replica_groups() const; + + // Delegates to HloCollectivePermuteInstruction::source_target_pairs. + const std::vector>& source_target_pairs() const; + + // Delegates to HloAllReduceInstruction::cross_replica_sum_barrier. + string cross_replica_sum_barrier() const; + void set_cross_replica_sum_barrier(const string& barrier); + + // Delegates to HloAllReduceInstruction::all_reduce_id. + absl::optional all_reduce_id() const; + + // Returns data on the window in a windowed operation such as + // convolution. + virtual const Window& window() const { + LOG(FATAL) << "Unimplemented method."; + } + + // Sets the window data in a windowed operation such as convolution. + virtual void set_window(const Window& window) { + LOG(FATAL) << "Unimplemented method."; + } + + // Returns data on the dimension numbers used for a convolution operation, + // which may be a kConvolution instruction or a kCustomCall that implements a + // convolution. + const ConvolutionDimensionNumbers& convolution_dimension_numbers() const; + + // Sets the convolution dimension numbers on this instruction. 
In general you + // shouldn't need to call this; instead, specify the convolution dimension + // numbers when you create the instruction. + void set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums); + + // The number of feature groups. Must be a divisor of the input feature + // dimension and output feature dimension. + int64 feature_group_count() const; + + void set_feature_group_count(int64 feature_group_count); + + // Delegates to HloSelectAndScatterInstruction::select. + HloComputation* select() const; + + // Delegates to HloSelectAndScatterInstruction::scatter. + HloComputation* scatter() const; + + // Delegates to HloSelectAndScatterInstruction::set_select. + void set_select(HloComputation* computation); + + // Delegates to HloSelectAndScatterInstruction::set_scatter. + void set_scatter(HloComputation* computation); + + // Delegates to HloCustomCallInstruction::custom_call_target. + const string& custom_call_target() const; + + // Delegates to HloPadInstruction::padding_config. + const PaddingConfig& padding_config() const; + + // Delegates to HloDynamicSliceInstruction::slice_sizes. + int64 slice_sizes(int64 dimension) const; + + // Delegates to HloDynamicSliceInstruction::dynamic_slice_sizes. + const std::vector& dynamic_slice_sizes() const; + + // Delegates to HloGatherInstruction::gather_dimension_numbers. + const GatherDimensionNumbers& gather_dimension_numbers() const; + // Delegates to HloGatherInstruction::gather_slice_sizes. + absl::Span gather_slice_sizes() const; + + // Delegates to HloScatterInstruction::scatter_dimension_numbers(). + const ScatterDimensionNumbers& scatter_dimension_numbers() const; + + // Old methods kept for smooth subclassing transition END. + + protected: + enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse }; + // Helper class for computing OperandElementUse for kFusion. 
+ class FusionReusesParamElements; + + // Internal constructor for a given opcode/shape, other fields must be filled + // by factory methods. + HloInstruction(HloOpcode opcode, const Shape& shape); + + // Appends operand to the list of operands and adds this instruction as a user + // of the operand. + void AppendOperand(HloInstruction* operand); + + void RemoveOperandAt(int index) { + operands_.erase(operands_.begin() + index); + } + + // Removes a list of operands with the given indices in ascending order. + void RemoveOperandsAtAscendingIndices( + absl::Span ascending_indices); + + void AppendComputation(HloComputation* computation) { + called_computations_.push_back(computation); + } + + void DetachFrom(HloInstruction* usee) { usee->RemoveUser(this); } + + void set_called_computation(int index, HloComputation* computation) { + called_computations_[index] = computation; + } + // Indices of computations in called_computations_ for instructions which call + // multiple computations. + enum { + // kWhile computations. + kBodyComputationIndex = 0, + kConditionComputationIndex = 1, + + // kSelectAndScatter computations. + kSelectComputationIndex = 0, + kScatterComputationIndex = 1, + + // kConditional computations. + kTrueComputationIndex = 0, + kFalseComputationIndex = 1, + }; + private: + // Implementation for non-common logic of CloneWithNewOperands. + virtual std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + // TODO(b/80131774): This should be pure virtual. + LOG(FATAL) << "Unimplemented method."; + } + + // Implementation for non-common logic of ExtraAttributesToString. + virtual std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {}; + } + + // Implementation for IsElementwise if operand_idx is nullopt and for + // IsElementwiseOnOperand if otherwise. 
+ // + // NOTE: For all instructions other than kFusion, being elementwise on one of + // the operands is equivalent to being elementwise on all the operands. + virtual bool IsElementwiseImpl( + const absl::optional& operand_idx) const; // Prints an instruction to a string. // // The canonical string representation needs to name operands and instruction @@ -1503,7 +1589,7 @@ class HloInstruction { CanonicalNameMap* canonical_name_map) const; // Prints an operand to a string. - string OperandsToStringWithCanonicalNameMap( + virtual string OperandsToStringWithCanonicalNameMap( const HloPrintOptions& options, CanonicalNameMap* canonical_name_map) const; @@ -1511,13 +1597,8 @@ class HloInstruction { // OperandsToStringWithCanonicalNameMap() functions. friend class HloComputation; - enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse }; - - // Helper class for computing OperandElementUse for kFusion. - class FusionReusesParamElements; - // See comments on Identical(). - bool IdenticalSlowPath( + virtual bool IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const; @@ -1525,11 +1606,7 @@ class HloInstruction { // Creates an n-ary elementwise operation. static std::unique_ptr CreateNary( const Shape& shape, HloOpcode opcode, - tensorflow::gtl::ArraySlice operands); - - // Appends operand to the list of operands and adds this instruction as a user - // of the operand. - void AppendOperand(HloInstruction* operand); + absl::Span operands); // Adds a user for this instruction. void AddUser(HloInstruction* user); @@ -1537,45 +1614,13 @@ class HloInstruction { // Removes a user for this instruction. void RemoveUser(HloInstruction* user); - // Internal constructor for a given opcode/shape, other fields must be filled - // by factory methods. - HloInstruction(HloOpcode opcode, const Shape& shape); - - // Fuses the given instruction into this fusion instruction. 
When add_output - // is false (which is the default), instruction_to_fuse is cloned and the - // clone is placed in the fusion instruction. instruction_to_fuse is - // unchanged. - // - // When add_output is true, a clone of the instruction_to_fuse will be part - // of the output of fusion instructions. The users of instruction_to_fuse - // will be redirected to this fusion instructions. instruction_to_fuse will - // be removed from its parent computation. - // - // Precondition: this->opcode() == HloOpcode::kFusion - HloInstruction* FuseInstructionInternal(HloInstruction* instruction_to_fuse, - bool add_output = false); - - // Clones the given instruction_to_fuse and insert the clone into this fusion - // instruction. If add_output is true, a clone of instruction_to_fuse will - // be in the output of the this fusion instruction (part of the tuple of the - // fusion root). - // - // Precondition: opcode() == HloOpcode::kFusion - HloInstruction* CloneAndFuseInternal(HloInstruction* instruction_to_fuse, - bool add_output = false); - - // Clones a fusion instruction with a new shape and operands. - std::unique_ptr CloneFusionWithNewOperands( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloCloneContext* context = nullptr) const; - - // Returns true if this instruction can legally have the dimensions field - // set. Used for checking precondition of dimensions field accessors. - bool CanHaveDimensionsField() const; - // Returns how this instruction uses elements of its `i`th operand. UseKind OperandElementUse(int64 i) const; + // Helper for implementing backend_config(). Parses backend_config_ into the + // given proto. + Status GetBackendConfigInternal(tensorflow::protobuf::Message* proto) const; + int unique_id_; // Unique to this HloInstruction within a HloModule // Opcode for this instruction. @@ -1600,133 +1645,42 @@ class HloInstruction { // The computation in which this instruction is contained. 
HloComputation* parent_ = nullptr; - // Shape of outfeed request. - Shape outfeed_shape_; - // Result shape of this instruction. Shape shape_; - // Literal, only present for kConstant. - std::unique_ptr literal_; - - // Constant index, only present for kGetTupleElement. - int64 tuple_index_ = -1; - - // Dimensions present for some operations that require reshaping or - // broadcasting, including Reshape, Reduce, ReduceWindow, and Reverse. - std::vector dimensions_; - - // Describes the window in a windowed operation such as convolution. - std::unique_ptr window_; - - // Describes the dimension numbers used for a convolution. - std::unique_ptr convolution_dimension_numbers_; - // Describes the dimension numbers used for a dot. std::unique_ptr dot_dimension_numbers_; - std::unique_ptr gather_dimension_numbers_; - std::vector gather_window_bounds_; - - // Describes FFT type for an FFT instruction. - FftType fft_type_ = FftType::FFT; - - // Indicates the FFT length for an FFT instruction. - std::vector fft_length_; - - // Describes the [begin, end) index range for a slice. - std::vector slice_starts_; - std::vector slice_limits_; - std::vector slice_strides_; - - // Describes whether the slice can be lowered to an offset into the operand. - bool is_in_place_slice_ = false; - - // The bit sizes for a reduce-precision operation. - int32 exponent_bits_ = 0; - int32 mantissa_bits_ = 0; - - // Describes the [start, start + size) range size for a dynamic slice - // ('start' is specified dynamically in the second operand of the operation). - std::vector dynamic_slice_sizes_; - - // The padding configuration that describes the edge padding and interior - // padding of this pad instruction. Only set for pad instructions. - std::unique_ptr padding_config_; - - // The type of the fusion. Used by kFusion only. - FusionKind fusion_kind_; + // Used to tag kCopy instructions that are eligible for copy elision. + bool copy_elision_allowed_ = true; // The sharding, if one exists. 
- std::unique_ptr sharding_; + // Uses std::shared_ptr to allow reuse of the same sharding object between + // HloInstructions and other components as HloSharding can be very large for + // many element tuples. + std::shared_ptr sharding_; // Fields used by the kDomain instruction. std::unique_ptr operand_side_metadata_; std::unique_ptr user_side_metadata_; - // For parameter instructions this field holds the parameter number. - int64 parameter_number_ = 0; - - // Name of a global symbol to call, only present for kCustomCall. - string custom_call_target_; - - // Name to use for host send/recv channels, only present for kHostCompute. - string channel_name_; - - // Estimate of the duration of a host computation in nanoseconds. - int64 cost_estimate_ns_ = 0; - // Computations called by this instruction. std::vector called_computations_; - // Indices of computations in called_computations_ for instructions which call - // multiple computations. - enum { - // kWhile computations. - kBodyComputationIndex = 0, - kConditionComputationIndex = 1, - - // kSelectAndScatter computations. - kSelectComputationIndex = 0, - kScatterComputationIndex = 1, - - // kConditional computations. - kTrueComputationIndex = 0, - kFalseComputationIndex = 1, - }; - - // Outfeed configuration information, only present for kOutfeed. - string outfeed_config_; - // A trace instruction that consumes this instruction. // // Invariant: if trace_instruction_ != nullptr, trace_instruction has this as // an operand. HloInstruction* trace_instruction_ = nullptr; - // The distribution requested for random number generation. - // Only present for kRng. - RandomDistribution distribution_; - - // A small float number added to the variance to avoid divide-by-zero error. - // Only present for kBatchNormTraining. - float epsilon_ = 0.0f; - - // An integer value representing the index of the feature dimension. - // Only present for kBatchNormTraining. 
- int64 feature_index_ = -1; - - // Represents a unique identifier for each Send/Recv instruction pair. - // Only present for kSend or kRecv. - int64 channel_id_ = -1; - - // The string representation of the infeed configuration. - string infeed_config_; - // The backend-specific configuration for how a backend should compile this // HLO. See the documentation on backend_config(). string backend_config_; + // Information used to communicate to the implementation about the algorithm + // used to produce results. See the documentation on precision_config(). + PrecisionConfigProto precision_config_; + // String identifier for instruction. string name_; @@ -1749,7 +1703,12 @@ StatusOr StringToFusionKind( string PaddingConfigToString(const PaddingConfig& padding); string OpMetadataToString(const OpMetadata& metadata); string RandomDistributionToString(const RandomDistribution& distribution); +string PrecisionToString(const PrecisionConfigProto::Precision& precision); +string ConvolutionDimensionNumbersToString( + const ConvolutionDimensionNumbers& dnums); + StatusOr StringToRandomDistribution(const string& name); +StatusOr StringToPrecision(const string& name); std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind); diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index e91cf2076f296c2584e2b21e2e4a6a8851a30360..76b0e940a656ee2f54781b927fdca367a83056c6 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -20,16 +20,18 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" namespace xla { namespace { @@ -37,10 +39,8 @@ namespace { using ::testing::ElementsAre; using ::testing::UnorderedElementsAre; -class HloInstructionTest : public HloTestBase { +class HloInstructionTest : public HloVerifiedTestBase { protected: - HloInstructionTest() {} - Shape r0f32_ = ShapeUtil::MakeShape(F32, {}); }; @@ -51,7 +51,7 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault { public: Status DefaultAction(HloInstruction* hlo_instruction) override { return Unimplemented("not implemented %s", - HloOpcodeString(hlo_instruction->opcode()).c_str()); + HloOpcodeString(hlo_instruction->opcode())); } Status HandleParameter(HloInstruction* parameter) override { @@ -248,7 +248,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) { auto param1 = builder.AddInstruction( HloInstruction::CreateParameter(1, r0f32_, "param1")); auto c0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto addleft = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, param0, c0)); auto addright = 
builder.AddInstruction( @@ -293,7 +293,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) { auto param1 = builder.AddInstruction( HloInstruction::CreateParameter(1, r0f32_, "param1")); auto c0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto neg1 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, c0)); auto addleft = builder.AddInstruction( @@ -333,7 +333,7 @@ TEST_F(HloInstructionTest, TrivialMap) { auto param = embedded_builder.AddInstruction( HloInstruction::CreateParameter(0, r0f32, "x")); auto value = embedded_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); embedded_builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, value)); auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build()); @@ -341,7 +341,7 @@ TEST_F(HloInstructionTest, TrivialMap) { // Builds a parameter and feeds it to the map. HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32a100x10, "")); + HloInstruction::CreateParameter(0, f32a100x10, "p")); auto map = builder.AddInstruction( HloInstruction::CreateMap(f32a100x10, {param0}, add_f32)); module->AddEntryComputation(builder.Build()); @@ -380,11 +380,11 @@ TEST_F(HloInstructionTest, TrivialReduce) { // Builds a parameter and an initial value and feeds them to the reduce. 
HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, f32a100x10, "")); + HloInstruction::CreateParameter(0, f32a100x10, "p")); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto reduce = builder.AddInstruction( HloInstruction::CreateReduce(f32v100, param0, const0, /*dimensions_to_reduce=*/{1}, add_f32)); @@ -625,7 +625,7 @@ TEST_F(HloInstructionTest, SingletonFusionOp) { HloComputation::Builder builder(TestName()); // Create a fusion instruction containing a single unary operation. auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto exp = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); auto module = CreateNewModule(); @@ -641,9 +641,9 @@ TEST_F(HloInstructionTest, BinaryFusionOp) { HloComputation::Builder builder(TestName()); // Create a fusion instruction containing a single binary operation. auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.1f))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); auto module = CreateNewModule(); @@ -660,7 +660,7 @@ TEST_F(HloInstructionTest, ChainFusionOp) { HloComputation::Builder builder(TestName()); // Create a chain of fused unary ops. 
auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto exp1 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); auto exp2 = builder.AddInstruction( @@ -681,7 +681,7 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { HloComputation::Builder builder(TestName()); // Create a chain of fused unary ops. auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto exp1 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); auto exp2 = builder.AddInstruction( @@ -709,16 +709,17 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { TEST_F(HloInstructionTest, PreserveOutfeedShapeThroughClone) { HloComputation::Builder builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({ + HloInstruction::CreateConstant(LiteralUtil::CreateR2({ {1, 2}, {3, 4}, }))); auto shape10 = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}); auto shape01 = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1}); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto outfeed10 = builder.AddInstruction( - HloInstruction::CreateOutfeed(shape10, constant, "")); + HloInstruction::CreateOutfeed(shape10, constant, token, "")); auto outfeed01 = builder.AddInstruction( - HloInstruction::CreateOutfeed(shape01, constant, "")); + HloInstruction::CreateOutfeed(shape01, constant, token, "")); auto clone01 = builder.AddInstruction(outfeed01->Clone()); auto clone10 = builder.AddInstruction(outfeed10->Clone()); @@ -730,7 +731,7 @@ TEST_F(HloInstructionTest, PreserveOutfeedShapeThroughClone) { TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) { HloComputation::Builder builder(TestName()); auto* constant = 
builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({ + HloInstruction::CreateConstant(LiteralUtil::CreateR2({ {1, 2}, {3, 4}, }))); @@ -761,13 +762,13 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { HloComputation::Builder builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); - auto map_1_x = builder.AddInstruction(HloInstruction::CreateMap( - scalar_shape, {constant}, computation_x, /*static_operands=*/{})); - auto map_2_x = builder.AddInstruction(HloInstruction::CreateMap( - scalar_shape, {map_1_x}, computation_x, /*static_operands=*/{})); - auto map_3_y = builder.AddInstruction(HloInstruction::CreateMap( - scalar_shape, {map_2_x}, computation_y, /*static_operands=*/{})); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); + auto map_1_x = builder.AddInstruction( + HloInstruction::CreateMap(scalar_shape, {constant}, computation_x)); + auto map_2_x = builder.AddInstruction( + HloInstruction::CreateMap(scalar_shape, {map_1_x}, computation_x)); + auto map_3_y = builder.AddInstruction( + HloInstruction::CreateMap(scalar_shape, {map_2_x}, computation_y)); auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( @@ -796,11 +797,11 @@ TEST_F(HloInstructionTest, ComplexFusionOp) { // Notable complexities are repeated operands in the same instruction, // different shapes, use of value in different expressions. 
auto c1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.1f))); auto c2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.1f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.1f))); auto c3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(9.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(9.0f))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c1, c2)); @@ -871,11 +872,11 @@ TEST_F(HloInstructionTest, IdenticalInstructions) { // Create a set of random constant operands to use below. Make them matrices // so dimensions are interesting. auto operand1 = HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}})); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}})); auto operand2 = HloInstruction::CreateConstant( - Literal::CreateR2({{10.0, 20.0}, {30.0, 40.0}})); - auto vector_operand = - HloInstruction::CreateConstant(Literal::CreateR1({42.0, 123.0})); + LiteralUtil::CreateR2({{10.0, 20.0}, {30.0, 40.0}})); + auto vector_operand = HloInstruction::CreateConstant( + LiteralUtil::CreateR1({42.0, 123.0})); Shape shape = operand1->shape(); // Convenient short names for the operands. 
@@ -922,6 +923,40 @@ TEST_F(HloInstructionTest, IdenticalInstructions) { *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2))); } +TEST_F(HloInstructionTest, IdenticalCallInstructions) { + const char* const hlo_string = R"( +HloModule Module + +subcomp1 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] sine(x) +} + +subcomp2 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] cosine(x) +} + +ENTRY entry (param: f32[]) -> (f32[], f32[], f32[]) { + p = f32[] parameter(0) + t1 = f32[] call(p), to_apply=subcomp1 + t2 = f32[] call(p), to_apply=subcomp1 + t3 = f32[] call(p), to_apply=subcomp2 + ROOT t = (f32[], f32[], f32[]) tuple(t1, t2, t3) + } +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + auto* root = module->entry_computation()->root_instruction(); + auto* t1 = root->operand(0); + auto* t2 = root->operand(1); + auto* t3 = root->operand(2); + + EXPECT_TRUE(StructuralEqual(*t1, *t2)); + EXPECT_FALSE(StructuralEqual(*t1, *t3)); +} + TEST_F(HloInstructionTest, FunctionVisitor) { // Verify the function visitor HloInstruction::Accept visits all instructions // from a root properly given the following graph: @@ -979,6 +1014,23 @@ TEST_F(HloInstructionTest, FullyElementwise) { } } +TEST_F(HloInstructionTest, MapIsElementwise) { + auto module = CreateNewModule(); + const Shape r2f32 = ShapeUtil::MakeShapeWithLayout(F32, {10, 10}, {1, 0}); + HloComputation::Builder builder(TestName()); + HloComputation::Builder map_builder("id"); + map_builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); + auto map_computation = module->AddEmbeddedComputation(map_builder.Build()); + auto x = + builder.AddInstruction(HloInstruction::CreateParameter(0, r2f32, "x")); + auto map = builder.AddInstruction( + HloInstruction::CreateMap(r2f32, {x}, map_computation)); + module->AddEntryComputation(builder.Build()); + + EXPECT_TRUE(map->IsElementwise()); +} + 
TEST_F(HloInstructionTest, PartiallyElementwise) { const Shape r1f32 = ShapeUtil::MakeShape(F32, {5}); const Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 5}); @@ -1032,16 +1084,14 @@ TEST_F(HloInstructionTest, PartiallyElementwise) { TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { // Fused expression: - // - // x y - // \ / \ - // min broadcast + // y + // / + // x broadcast + // \ / | + // min | // \ / // sub // - // The fusion instruction is elementwise on `x` because the only path from x - // to sub contains only elementwise operations. It is not elementwise on `y` - // because the path y->broadcast->sub is not all elementwise. const Shape r0f32 = ShapeUtil::MakeShape(F32, {}); const Shape r1f32 = ShapeUtil::MakeShape(F32, {5}); @@ -1050,10 +1100,10 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x")); HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32, "y")); - HloInstruction* min = builder.AddInstruction( - HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, x, y)); HloInstruction* broadcast = - builder.AddInstruction(HloInstruction::CreateBroadcast(r1f32, y, {0})); + builder.AddInstruction(HloInstruction::CreateBroadcast(r1f32, y, {})); + HloInstruction* min = builder.AddInstruction( + HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, x, broadcast)); HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary( r1f32, HloOpcode::kSubtract, min, broadcast)); @@ -1064,10 +1114,10 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { EXPECT_FALSE(fusion->IsElementwise()); for (int64 operand_idx = 0; operand_idx < fusion->operand_count(); ++operand_idx) { - if (fusion->operand(operand_idx) == x) { - EXPECT_TRUE(fusion->IsElementwiseOnOperand(operand_idx)); - } else { + if (fusion->operand(operand_idx) == y) { EXPECT_FALSE(fusion->IsElementwiseOnOperand(operand_idx)); + } else { + 
EXPECT_TRUE(fusion->IsElementwiseOnOperand(operand_idx)); } } } @@ -1118,6 +1168,40 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { EXPECT_TRUE(StructuralEqual(*fusion, *fusion2)); } +TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) { + // Fused expression: + // + // x y + // | | + // | transpose + // \ / + // dot + const Shape s = ShapeUtil::MakeShape(F32, {10, 10}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateDot(s, x, reshape, dot_dnums)); + + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + HloInstruction* fusion = computation->CreateFusionInstruction( + {dot, reshape}, HloInstruction::FusionKind::kLoop); + + EXPECT_TRUE(x->ReplaceAllUsesWith(y).ok()); + + EXPECT_THAT(fusion->operands(), UnorderedElementsAre(y)); + EXPECT_EQ(fusion->fused_instructions_computation()->num_parameters(), 1); +} + TEST_F(HloInstructionTest, FusionEquality) { auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); @@ -1147,9 +1231,9 @@ TEST_F(HloInstructionTest, NestedFusionEquality) { // Build a nested fusion computation. 
Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); auto a = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); + LiteralUtil::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); auto b = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); auto b_t = builder.AddInstruction( HloInstruction::CreateTranspose(data_shape, b, {1, 0})); DotDimensionNumbers dot_dnums; @@ -1158,9 +1242,9 @@ TEST_F(HloInstructionTest, NestedFusionEquality) { auto dot = builder.AddInstruction( HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums)); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto add_operand = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape, one, {1})); + HloInstruction::CreateBroadcast(data_shape, one, {})); auto add = builder.AddInstruction(HloInstruction::CreateBinary( data_shape, HloOpcode::kAdd, dot, add_operand)); auto sub = builder.AddInstruction(HloInstruction::CreateBinary( @@ -1255,7 +1339,7 @@ TEST_F(HloInstructionTest, Stringification) { "condition=%TransposeDot, body=%TransposeDot"); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); HloInstruction* conditional = builder.AddInstruction(HloInstruction::CreateConditional( sout, pred, x, computation, x, computation)); @@ -1267,7 +1351,7 @@ TEST_F(HloInstructionTest, Stringification) { TEST_F(HloInstructionTest, StringifyGather_0) { Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); - Shape gather_indices_tensor_shape = + Shape start_indices_tensor_shape = ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 5}); Shape gather_result_shape = ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}); @@ -1275,19 
+1359,18 @@ TEST_F(HloInstructionTest, StringifyGather_0) { HloComputation::Builder builder("Gather"); HloInstruction* input = builder.AddInstruction( HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor")); - HloInstruction* gather_indices = + HloInstruction* start_indices = builder.AddInstruction(HloInstruction::CreateParameter( - 1, gather_indices_tensor_shape, "gather_indices")); - - HloInstruction* gather_instruction = - builder.AddInstruction(HloInstruction::CreateGather( - gather_result_shape, input, gather_indices, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, - /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26})); + 1, start_indices_tensor_shape, "start_indices")); + + HloInstruction* gather_instruction = builder.AddInstruction( + HloInstruction::CreateGather(gather_result_shape, input, start_indices, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4), + /*slice_sizes=*/{30, 29, 28, 27, 26})); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); @@ -1295,15 +1378,15 @@ TEST_F(HloInstructionTest, StringifyGather_0) { EXPECT_EQ(gather_instruction->ToString(), "%gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} " "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, " - "s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), " - "output_window_dims={4,5,6,7,8}, elided_window_dims={}, " - "gather_dims_to_operand_dims={0,1,2,3,4}, " - "index_vector_dim=4, window_bounds={30,29,28,27,26}"); + "s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), " + "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, " + "start_index_map={0,1,2,3,4}, " + "index_vector_dim=4, slice_sizes={30,29,28,27,26}"); } TEST_F(HloInstructionTest, StringifyGather_1) { Shape input_tensor_shape = 
ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); - Shape gather_indices_tensor_shape = + Shape start_indices_tensor_shape = ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6}); Shape gather_result_shape = ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}); @@ -1311,19 +1394,18 @@ TEST_F(HloInstructionTest, StringifyGather_1) { HloComputation::Builder builder("Gather"); HloInstruction* input = builder.AddInstruction( HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor")); - HloInstruction* gather_indices = + HloInstruction* start_indices = builder.AddInstruction(HloInstruction::CreateParameter( - 1, gather_indices_tensor_shape, "gather_indices")); - - HloInstruction* gather_instruction = - builder.AddInstruction(HloInstruction::CreateGather( - gather_result_shape, input, gather_indices, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, - /*index_vector_dim=*/2), - /*window_bounds=*/{30, 29, 28, 27, 26})); + 1, start_indices_tensor_shape, "start_indices")); + + HloInstruction* gather_instruction = builder.AddInstruction( + HloInstruction::CreateGather(gather_result_shape, input, start_indices, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/2), + /*slice_sizes=*/{30, 29, 28, 27, 26})); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); @@ -1331,10 +1413,59 @@ TEST_F(HloInstructionTest, StringifyGather_1) { EXPECT_EQ(gather_instruction->ToString(), "%gather = f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} " "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, " - "s64[10,9,5,7,6]{4,3,2,1,0} %gather_indices), " - "output_window_dims={4,5,6,7,8}, elided_window_dims={}, " - "gather_dims_to_operand_dims={0,1,2,3,4}, " - "index_vector_dim=2, window_bounds={30,29,28,27,26}"); + 
"s64[10,9,5,7,6]{4,3,2,1,0} %start_indices), " + "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, " + "start_index_map={0,1,2,3,4}, " + "index_vector_dim=2, slice_sizes={30,29,28,27,26}"); +} + +TEST_F(HloInstructionTest, StringifyScatter) { + Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); + Shape scatter_indices_tensor_shape = + ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6}); + Shape scatter_updates_shape = + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}); + + HloComputation::Builder builder("Scatter"); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor")); + HloInstruction* scatter_indices = + builder.AddInstruction(HloInstruction::CreateParameter( + 1, scatter_indices_tensor_shape, "scatter_indices")); + HloInstruction* scatter_updates = + builder.AddInstruction(HloInstruction::CreateParameter( + 2, scatter_updates_shape, "scatter_updates")); + + HloComputation::Builder update_builder("Scatter.update"); + update_builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p1")); + update_builder.AddInstruction( + HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p2")); + + auto module = CreateNewModule(); + auto* update_computation = + module->AddEmbeddedComputation(update_builder.Build()); + + HloInstruction* scatter_instruction = + builder.AddInstruction(HloInstruction::CreateScatter( + input_tensor_shape, input, scatter_indices, scatter_updates, + update_computation, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/2))); + module->AddEntryComputation(builder.Build()); + + EXPECT_EQ( + scatter_instruction->ToString(), + "%scatter = f32[50,49,48,47,46]{4,3,2,1,0} " + "scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, " + "s64[10,9,5,7,6]{4,3,2,1,0} 
%scatter_indices, " + "f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %scatter_updates), " + "update_window_dims={4,5,6,7,8}, inserted_window_dims={}, " + "scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=2, " + "to_apply=%Scatter.update"); } TEST_F(HloInstructionTest, CanonnicalStringificationFusion) { @@ -1368,15 +1499,15 @@ TEST_F(HloInstructionTest, CanonnicalStringificationFusion) { HloInstruction* fusion = computation->CreateFusionInstruction( {dot, reshape}, HloInstruction::FusionKind::kLoop); - EXPECT_EQ( - fusion->ToString(options), + const string expected_fusion = R"(f32[5,20]{1,0} fusion(f32[5,10]{1,0}, f32[20,10]{1,0}), kind=kLoop, calls= { tmp_0 = f32[5,10]{1,0} parameter(0) tmp_1 = f32[20,10]{1,0} parameter(1) tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} -})"); +})"; + EXPECT_EQ(fusion->ToString(options), expected_fusion); } TEST_F(HloInstructionTest, CanonnicalStringificationWhile) { @@ -1408,8 +1539,8 @@ TEST_F(HloInstructionTest, CanonnicalStringificationWhile) { HloInstruction::CreateWhile(sout, computation, computation, x)); auto options = HloPrintOptions().Canonical(); - EXPECT_EQ(loop->ToString(options), - R"(f32[5,20]{1,0} while(f32[5,10]{1,0}), condition= + const string expected_loop = + R"(f32[5,20]{1,0} while(f32[5,10]{1,0}), condition= { tmp_0 = f32[5,10]{1,0} parameter(0) tmp_1 = f32[20,10]{1,0} parameter(1) @@ -1431,7 +1562,8 @@ TEST_F(HloInstructionTest, CanonnicalStringificationWhile) { tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} } -})"); +})"; + EXPECT_EQ(loop->ToString(options), expected_loop); } TEST_F(HloInstructionTest, CanonnicalStringificationConditional) { @@ -1463,13 +1595,12 @@ 
TEST_F(HloInstructionTest, CanonnicalStringificationConditional) { HloInstruction::CreateWhile(sout, computation, computation, x)); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); HloInstruction* conditional = builder.AddInstruction(HloInstruction::CreateConditional( sout, pred, x, computation, x, computation)); auto options = HloPrintOptions().Canonical(); - EXPECT_EQ( - conditional->ToString(options), + const string expected_conditional = R"(f32[5,20]{1,0} conditional(pred[], f32[5,10]{1,0}, f32[5,10]{1,0}), true_computation= { tmp_0 = f32[5,10]{1,0} parameter(0) @@ -1492,7 +1623,8 @@ TEST_F(HloInstructionTest, CanonnicalStringificationConditional) { tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} } -})"); +})"; + EXPECT_EQ(conditional->ToString(options), expected_conditional); } TEST_F(HloInstructionTest, CheckDeepClone) { @@ -1532,7 +1664,7 @@ ENTRY entry (param: s32[]) -> s32[] { // Check that deep clones really deep clones every instruction and // computations, without leaving dangling pointers to the old module. 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); std::unique_ptr clone = module->Clone(); for (HloComputation* computation : clone->computations()) { EXPECT_EQ(computation->parent(), clone.get()); @@ -1542,5 +1674,88 @@ ENTRY entry (param: s32[]) -> s32[] { } } +TEST_F(HloInstructionTest, IdenticalAccountsForBackendConfig) { + const Shape shape = ShapeUtil::MakeShape(F32, {42}); + HloComputation::Builder builder("test"); + HloInstruction* p = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p")); + + HloInstruction* add1 = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p)); + HloInstruction* add2 = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p)); + + EXPECT_TRUE(add1->Identical(*add2)); + add1->set_raw_backend_config_string("abc"); + EXPECT_FALSE(add1->Identical(*add2)); +} + +TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallWindow) { + auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}), + /*operands=*/{}, + /*custom_call_target=*/"foo"); + auto instr2 = instr1->Clone(); + EXPECT_TRUE(instr1->Identical(*instr2)); + + Window w = window_util::MakeWindow({1, 2, 3}); + instr1->set_window(w); + EXPECT_FALSE(instr1->Identical(*instr2)); +} + +TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallDnums) { + auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}), + /*operands=*/{}, + /*custom_call_target=*/"foo"); + auto instr2 = instr1->Clone(); + EXPECT_TRUE(instr1->Identical(*instr2)); + + ConvolutionDimensionNumbers dnums; + dnums.set_output_batch_dimension(42); + instr1->set_convolution_dimension_numbers(dnums); + EXPECT_FALSE(instr1->Identical(*instr2)); +} + +TEST_F(HloInstructionTest, CloneWindowOnCustomCall) { + auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}), + /*operands=*/{}, + /*custom_call_target=*/"foo"); + Window w = 
window_util::MakeWindow({1, 2, 3}); + instr->set_window(w); + auto clone = instr->Clone(); + EXPECT_TRUE(protobuf_util::ProtobufEquals(clone->window(), w)) + << clone->window().DebugString(); +} + +TEST_F(HloInstructionTest, CloneDnumsOnCustomCall) { + auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}), + /*operands=*/{}, + /*custom_call_target=*/"foo"); + ConvolutionDimensionNumbers dnums; + dnums.set_output_batch_dimension(42); + instr->set_convolution_dimension_numbers(dnums); + auto clone = instr->Clone(); + EXPECT_TRUE(protobuf_util::ProtobufEquals( + clone->convolution_dimension_numbers(), dnums)) + << clone->convolution_dimension_numbers().DebugString(); +} + +TEST_F(HloInstructionTest, PreserveOperandPrecisionOnCloneConv) { + constexpr char kHloString[] = R"( + HloModule test_module + ENTRY test { + arg0 = f32[1,2,1] parameter(0) + arg1 = f32[1,1,1] parameter(1) + ROOT conv = f32[1,2,1] convolution(arg0, arg1), window={size=1}, + dim_labels=b0f_0io->b0f, operand_precision={high,default} + })"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kHloString)); + auto* conv = module->entry_computation()->root_instruction(); + + auto clone = conv->Clone(); + EXPECT_THAT(clone->precision_config().operand_precision(), + ::testing::ElementsAre(PrecisionConfigProto::HIGH, + PrecisionConfigProto::DEFAULT)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc new file mode 100644 index 0000000000000000000000000000000000000000..e46afa764f519c9f7b6e3e9a8a37c84bd173b9a2 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -0,0 +1,2160 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_instructions.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace xla { +namespace { + +using absl::CEscape; +using absl::StrAppend; +using absl::StrCat; +using absl::StrJoin; + +bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction, + const HloInstruction* operand) { + std::vector operand_indices = instruction->OperandIndices(operand); + return std::all_of( + operand_indices.begin(), operand_indices.end(), + [instruction](int64 operand_index) { + return instruction->IsElementwiseOnOperand(operand_index); + }); +} +} // namespace + +HloBatchNormInstruction::HloBatchNormInstruction( + HloOpcode opcode, const Shape& shape, HloInstruction* operand, + HloInstruction* scale, float epsilon, int64 feature_index) + : HloInstruction(opcode, shape), + epsilon_(epsilon), + feature_index_(feature_index) { + AppendOperand(operand); + AppendOperand(scale); +} + +bool HloBatchNormInstruction::IdenticalSlowPath( + const 
HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return feature_index() == casted_other.feature_index() && + epsilon() == casted_other.epsilon(); +} + +HloInstructionProto HloBatchNormInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_epsilon(epsilon_); + proto.set_feature_index(feature_index_); + return proto; +} + +std::vector HloBatchNormInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("epsilon=", epsilon()), + StrCat("feature_index=", feature_index())}; +} + +HloBatchNormTrainingInstruction::HloBatchNormTrainingInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* offset, float epsilon, int64 feature_index) + : HloBatchNormInstruction(HloOpcode::kBatchNormTraining, shape, operand, + scale, epsilon, feature_index) { + AppendOperand(offset); +} + +std::unique_ptr +HloBatchNormTrainingInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 3); + return absl::make_unique( + shape, new_operands[0], new_operands[1], new_operands[2], epsilon(), + feature_index()); +} + +HloBatchNormInferenceInstruction::HloBatchNormInferenceInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* offset, HloInstruction* mean, HloInstruction* variance, + float epsilon, int64 feature_index) + : HloBatchNormInstruction(HloOpcode::kBatchNormInference, shape, operand, + scale, epsilon, feature_index) { + AppendOperand(offset); + AppendOperand(mean); + AppendOperand(variance); +} + +std::unique_ptr +HloBatchNormInferenceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 5); + return absl::make_unique( + shape, new_operands[0], 
new_operands[1], new_operands[2], new_operands[3], + new_operands[4], epsilon(), feature_index()); +} + +HloBatchNormGradInstruction::HloBatchNormGradInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* mean, HloInstruction* variance, HloInstruction* grad_output, + float epsilon, int64 feature_index) + : HloBatchNormInstruction(HloOpcode::kBatchNormGrad, shape, operand, scale, + epsilon, feature_index) { + AppendOperand(mean); + AppendOperand(variance); + AppendOperand(grad_output); +} + +std::unique_ptr +HloBatchNormGradInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 5); + return absl::make_unique( + shape, new_operands[0], new_operands[1], new_operands[2], new_operands[3], + new_operands[4], epsilon(), feature_index()); +} + +HloFftInstruction::HloFftInstruction(const Shape& shape, + HloInstruction* operand, FftType fft_type, + absl::Span fft_length) + : HloInstruction(HloOpcode::kFft, shape), fft_type_(fft_type) { + fft_length_.assign(fft_length.begin(), fft_length.end()); + AppendOperand(operand); +} + +HloInstructionProto HloFftInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_fft_type(fft_type_); + for (int64 fft_len : fft_length_) { + proto.add_fft_length(fft_len); + } + return proto; +} + +std::vector HloFftInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("fft_type=", FftType_Name(fft_type())), + StrCat("fft_length={", StrJoin(fft_length(), ","), "}")}; +} + +bool HloFftInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return fft_type() == casted_other.fft_type() && + fft_length() == casted_other.fft_length(); +} + +std::unique_ptr HloFftInstruction::CloneWithNewOperandsImpl( + const Shape& 
shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique(shape, new_operands[0], fft_type_, + fft_length_); +} + +HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode, + const Shape& shape, + int64 channel_id, + bool is_host_transfer) + : HloInstruction(opcode, shape), + channel_id_(channel_id), + is_host_transfer_(is_host_transfer) {} + +HloInstructionProto HloSendRecvInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_channel_id(channel_id_); + return proto; +} + +std::vector HloSendRecvInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector attrs; + attrs.push_back(StrCat("channel_id=", channel_id_)); + if (is_host_transfer()) { + attrs.push_back("is_host_transfer=true"); + } + return attrs; +} + +bool HloSendRecvInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + // Not yet supported. + return false; +} + +// Send instruction produces a tuple of {aliased operand, U32 context, token}. 
+HloSendInstruction::HloSendInstruction(HloInstruction* operand, + HloInstruction* token, int64 channel_id, + bool is_host_transfer) + : HloSendRecvInstruction( + HloOpcode::kSend, + ShapeUtil::MakeTupleShape({CHECK_NOTNULL(operand)->shape(), + ShapeUtil::MakeShape(U32, {}), + ShapeUtil::MakeTokenShape()}), + channel_id, is_host_transfer) { + AppendOperand(operand); + AppendOperand(token); +} + +std::unique_ptr HloSendInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + new_operands[0], new_operands[1], channel_id(), is_host_transfer()); +} + +HloSendDoneInstruction::HloSendDoneInstruction(HloSendInstruction* operand, + bool is_host_transfer) + : HloSendRecvInstruction(HloOpcode::kSendDone, ShapeUtil::MakeTokenShape(), + CHECK_NOTNULL(operand)->channel_id(), + is_host_transfer) { + AppendOperand(operand); +} + +std::unique_ptr +HloSendDoneInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + Cast(new_operands[0]), is_host_transfer()); +} + +// Recv instruction produces a tuple of {receive buffer, U32 context, token}. 
+HloRecvInstruction::HloRecvInstruction(const Shape& shape, + HloInstruction* token, int64 channel_id, + bool is_host_transfer) + : HloSendRecvInstruction( + HloOpcode::kRecv, + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {}), + ShapeUtil::MakeTokenShape()}), + channel_id, is_host_transfer) { + AppendOperand(token); +} + +std::unique_ptr HloRecvInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + ShapeUtil::GetTupleElementShape(shape, 0), new_operands[0], channel_id(), + is_host_transfer()); +} + +HloRecvDoneInstruction::HloRecvDoneInstruction(HloRecvInstruction* operand, + bool is_host_transfer) + : HloSendRecvInstruction( + HloOpcode::kRecvDone, + ShapeUtil::MakeTupleShape( + {ShapeUtil::GetTupleElementShape(CHECK_NOTNULL(operand)->shape(), 0), + ShapeUtil::MakeTokenShape()}), + CHECK_NOTNULL(operand)->channel_id(), is_host_transfer) { + AppendOperand(operand); +} + +std::unique_ptr +HloRecvDoneInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + Cast(new_operands[0]), is_host_transfer()); +} + +HloCollectiveInstruction::HloCollectiveInstruction( + HloOpcode opcode, const Shape& shape, + absl::Span operands, + const std::vector& replica_groups) + : HloInstruction(opcode, shape), replica_groups_(replica_groups) { + for (auto operand : operands) { + AppendOperand(operand); + } +} + +HloInstructionProto HloCollectiveInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_replica_groups() = {replica_groups_.begin(), + replica_groups_.end()}; + return proto; +} + +std::vector HloCollectiveInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& /*options*/) const { + std::vector result; + std::vector replica_group_str; + for (const 
ReplicaGroup& group : replica_groups()) { + replica_group_str.push_back( + StrCat("{", StrJoin(group.replica_ids(), ","), "}")); + } + result.push_back( + StrCat("replica_groups={", StrJoin(replica_group_str, ","), "}")); + return result; +} + +bool HloCollectiveInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + /*eq_computations*/) const { + const auto& casted_other = + static_cast(other); + return absl::c_equal(replica_groups(), casted_other.replica_groups(), + [](const ReplicaGroup& a, const ReplicaGroup& b) { + return absl::c_equal(a.replica_ids(), b.replica_ids()); + }); +} + +HloAllReduceInstruction::HloAllReduceInstruction( + const Shape& shape, absl::Span operands, + HloComputation* reduce_computation, + const std::vector& replica_groups, absl::string_view barrier, + const absl::optional& all_reduce_id) + : HloCollectiveInstruction(HloOpcode::kCrossReplicaSum, shape, operands, + replica_groups), + cross_replica_sum_barrier_(barrier), + all_reduce_id_(all_reduce_id) { + AppendComputation(reduce_computation); +} + +HloInstructionProto HloAllReduceInstruction::ToProto() const { + HloInstructionProto proto = HloCollectiveInstruction::ToProto(); + // Proto3 is so sad. 
+ if (all_reduce_id_) { + proto.set_all_reduce_id(*all_reduce_id_); + } + proto.set_cross_replica_sum_barrier(cross_replica_sum_barrier_); + return proto; +} + +std::vector HloAllReduceInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result = + HloCollectiveInstruction::ExtraAttributesToStringImpl(options); + if (!cross_replica_sum_barrier().empty()) { + result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\"")); + } + if (all_reduce_id_) { + result.push_back(StrCat("all_reduce_id=", *all_reduce_id_)); + } + return result; +} + +bool HloAllReduceInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + eq_computations(to_apply(), casted_other.to_apply()) && + cross_replica_sum_barrier() == + casted_other.cross_replica_sum_barrier() && + all_reduce_id() == casted_other.all_reduce_id(); +} + +std::unique_ptr +HloAllReduceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique( + shape, new_operands, to_apply(), replica_groups(), + cross_replica_sum_barrier(), all_reduce_id()); +} + +HloAllToAllInstruction::HloAllToAllInstruction( + const Shape& shape, absl::Span operands, + const std::vector& replica_groups) + : HloCollectiveInstruction(HloOpcode::kAllToAll, shape, operands, + replica_groups) {} + +std::unique_ptr +HloAllToAllInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique(shape, new_operands, + replica_groups()); +} + +HloCollectivePermuteInstruction::HloCollectivePermuteInstruction( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs) + : HloInstruction(HloOpcode::kCollectivePermute, 
shape), + source_target_pairs_(source_target_pairs) { + AppendOperand(operand); +} + +HloInstructionProto HloCollectivePermuteInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (const auto& pair : source_target_pairs()) { + auto* proto_pair = proto.add_source_target_pairs(); + proto_pair->set_source(pair.first); + proto_pair->set_target(pair.second); + } + return proto; +} + +std::vector +HloCollectivePermuteInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& /*options*/) const { + std::vector result; + std::vector strs; + for (const auto& pair : source_target_pairs()) { + strs.push_back(StrCat("{", pair.first, ",", pair.second, "}")); + } + result.push_back(StrCat("source_target_pairs={", StrJoin(strs, ","), "}")); + return result; +} + +bool HloCollectivePermuteInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + /*eq_computations*/) const { + const auto& casted_other = + static_cast(other); + return absl::c_equal(source_target_pairs(), + casted_other.source_target_pairs(), + [](const std::pair& a, + const std::pair& b) { return a == b; }); +} + +std::unique_ptr +HloCollectivePermuteInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique( + shape, new_operands[0], source_target_pairs()); +} + +HloReverseInstruction::HloReverseInstruction(const Shape& shape, + HloInstruction* operand, + absl::Span dimensions) + : HloInstruction(HloOpcode::kReverse, shape), + dimensions_(dimensions.begin(), dimensions.end()) { + AppendOperand(operand); +} + +HloInstructionProto HloReverseInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloReverseInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return 
{StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloReverseInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return dimensions() == casted_other.dimensions(); +} + +std::unique_ptr HloReverseInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique(shape, new_operands[0], + dimensions()); +} + +HloConcatenateInstruction::HloConcatenateInstruction( + const Shape& shape, absl::Span operands, + int64 dimension) + : HloInstruction(HloOpcode::kConcatenate, shape), dimensions_({dimension}) { + for (auto operand : operands) { + AppendOperand(operand); + } +} + +HloInstructionProto HloConcatenateInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloConcatenateInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloConcatenateInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + return dimensions() == casted_other.dimensions(); +} + +std::unique_ptr +HloConcatenateInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(shape, new_operands, + dimensions(0)); +} + +HloReduceInstruction::HloReduceInstruction( + const Shape& shape, absl::Span args, + absl::Span dimensions_to_reduce, + HloComputation* reduce_computation) + : HloInstruction(HloOpcode::kReduce, shape), + dimensions_(dimensions_to_reduce.begin(), dimensions_to_reduce.end()) { + for (HloInstruction* arg 
: args) { + AppendOperand(arg); + } + AppendComputation(reduce_computation); +} + +HloInstructionProto HloReduceInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloReduceInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloReduceInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + // Reduction results are determined by the reduction dimension and the + // reduction computation. + return dimensions() == casted_other.dimensions() && + eq_computations(to_apply(), casted_other.to_apply()); +} + +std::unique_ptr HloReduceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size() % 2, 0); + return absl::make_unique(shape, new_operands, + dimensions(), to_apply()); +} + +HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension, + HloInstruction* keys, + HloInstruction* values) + : HloInstruction(HloOpcode::kSort, shape), dimensions_({dimension}) { + AppendOperand(keys); + if (values) { + AppendOperand(values); + } +} + +HloInstructionProto HloSortInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloSortInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloSortInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return 
dimensions() == casted_other.dimensions(); +} + +std::unique_ptr HloSortInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + HloInstruction* keys = new_operands[0]; + HloInstruction* values = new_operands.size() == 2 ? new_operands[1] : nullptr; + return absl::make_unique(shape, dimensions(0), keys, + values); +} + +HloTransposeInstruction::HloTransposeInstruction( + const Shape& shape, HloInstruction* operand, + absl::Span dimensions) + : HloInstruction(HloOpcode::kTranspose, shape), + dimensions_(dimensions.begin(), dimensions.end()) { + CHECK_EQ(shape.dimensions().size(), dimensions.size()); + CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size()); + CHECK(std::equal(operand->shape().dimensions().begin(), + operand->shape().dimensions().end(), + Permute(dimensions, shape.dimensions()).begin())) + << "shape: " << ShapeUtil::HumanString(shape) + << ", operand->shape(): " << ShapeUtil::HumanString(operand->shape()) + << ", dimensions: {" << StrJoin(dimensions, ", ") << "}"; + AppendOperand(operand); +} + +bool HloTransposeInstruction::IsRank2Transpose() const { + return dimensions() == std::vector({1, 0}) && + shape().dimensions_size() == 2 && + std::equal(shape().dimensions().begin(), shape().dimensions().end(), + operand(0)->shape().dimensions().rbegin()); +} + +HloInstructionProto HloTransposeInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloTransposeInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloTransposeInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return dimensions() == casted_other.dimensions(); 
+} + +std::unique_ptr +HloTransposeInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique(shape, new_operands[0], + dimensions()); +} + +HloBroadcastInstruction::HloBroadcastInstruction( + const Shape& shape, HloInstruction* operand, + absl::Span broadcast_dimension) + : HloInstruction(HloOpcode::kBroadcast, shape), + dimensions_(broadcast_dimension.begin(), broadcast_dimension.end()) { + AppendOperand(operand); +} + +HloInstructionProto HloBroadcastInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +std::vector HloBroadcastInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloBroadcastInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return dimensions() == casted_other.dimensions(); +} + +std::unique_ptr +HloBroadcastInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique(shape, new_operands[0], + dimensions()); +} + +HloMapInstruction::HloMapInstruction(const Shape& shape, + absl::Span operands, + HloComputation* map_computation) + : HloInstruction(HloOpcode::kMap, shape) { + for (auto operand : operands) { + AppendOperand(operand); + } + AppendComputation(map_computation); + // TODO(b/65689298) Remove code below once Map is generalized to accept + // arbitrary map dimensions. 
+ dimensions_.resize(ShapeUtil::Rank(shape)); + std::iota(dimensions_.begin(), dimensions_.end(), 0); +} + +HloInstructionProto HloMapInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 dimension : dimensions_) { + proto.add_dimensions(dimension); + } + return proto; +} + +bool HloMapInstruction::IsElementwiseImpl( + const absl::optional& operand_idx) const { + if (!dimensions().empty()) { + // Check that the map is executed in elementwise compatible dimensions. + if (dimensions().size() != shape().dimensions_size()) { + return false; + } + for (int i = 0; i < dimensions().size(); ++i) { + if (dimensions()[i] != i) { + return false; + } + } + } + return true; +} + +std::vector HloMapInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")}; +} + +bool HloMapInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + return eq_computations(to_apply(), other.to_apply()); +} + +std::unique_ptr HloMapInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(shape, new_operands, to_apply()); +} + +HloSliceInstruction::HloSliceInstruction(const Shape& shape, + HloInstruction* operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides) + : HloInstruction(HloOpcode::kSlice, shape), + slice_starts_(start_indices.begin(), start_indices.end()), + slice_limits_(limit_indices.begin(), limit_indices.end()), + slice_strides_(strides.begin(), strides.end()) { + AppendOperand(operand); + // For backward compatibility with old serialized computations: if there are + // no strides, assume all strides are 1. + // TODO(b/63317920): remove this code. 
+ if (slice_strides_.empty()) { + slice_strides_ = std::vector(start_indices.size(), 1LL); + } +} + +HloInstructionProto HloSliceInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int i = 0; i < slice_starts_.size(); ++i) { + auto* slice_dimension = proto.add_slice_dimensions(); + slice_dimension->set_start(slice_starts_[i]); + slice_dimension->set_limit(slice_limits_[i]); + slice_dimension->set_stride(slice_strides_[i]); + } + return proto; +} + +std::vector HloSliceInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector bounds; + bounds.reserve(slice_starts_.size()); + const bool omit_stride = + std::all_of(slice_strides_.begin(), slice_strides_.end(), + [](int64 stride) { return stride == 1; }); + for (int i = 0; i < slice_starts_.size(); ++i) { + string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]); + bounds.push_back( + StrCat("[", slice_starts_[i], ":", slice_limits_[i], stride_str, "]")); + } + return {StrCat("slice={", StrJoin(bounds, ", "), "}")}; +} + +bool HloSliceInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& other_slice = static_cast(other); + return slice_starts_ == other_slice.slice_starts_ && + slice_limits_ == other_slice.slice_limits_ && + slice_strides_ == other_slice.slice_strides_; +} + +std::unique_ptr HloSliceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], slice_starts_, slice_limits_, slice_strides_); +} + +HloConstantInstruction::HloConstantInstruction(std::unique_ptr literal) + : HloInstruction(HloOpcode::kConstant, CHECK_NOTNULL(literal)->shape()), + literal_(std::move(literal)) {} + +HloConstantInstruction::HloConstantInstruction(const Shape& shape) + : HloInstruction(HloOpcode::kConstant, shape) 
{} + +HloInstructionProto HloConstantInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + if (literal_ != nullptr) { + *proto.mutable_literal() = literal_->ToProto(); + } + return proto; +} + +bool HloConstantInstruction::IsElementwiseImpl( + const absl::optional& operand_idx) const { + return true; +} + +void HloConstantInstruction::RelayoutConstant(const Layout& new_layout, + const ShapeIndex& shape_index) { + Shape* mutable_array_subshape = + ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index); + CHECK(ShapeUtil::IsArray(*mutable_array_subshape)); + + // Normally array_subshape will always have a layout, but this invariant is + // temporarily broken in LayoutAssignment::AssignLayouts. + + if (!mutable_array_subshape->has_layout() || + !LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) { + literal_ = literal_->Relayout(new_layout, shape_index); + *mutable_array_subshape->mutable_layout() = new_layout; + } +} + +bool HloConstantInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& other_slice = static_cast(other); + return literal() == other_slice.literal(); +} + +std::unique_ptr +HloConstantInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(literal_->CloneToUnique()); +} + +string HloConstantInstruction::OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const { + string operands; + // For constants, show the actual value in place of an empty operand list. + if (literal_ != nullptr && + ((ShapeUtil::IsArray(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) || + options.print_large_constants())) { + // Literal::ToString emits multidimensional arrays over multiple + // lines. Compact this into one line by stripping out white space. 
+ string tmp = literal().ToString(); + std::replace(tmp.begin(), tmp.end(), '\n', ' '); + std::vector v = absl::StrSplit(tmp, ' '); + bool first = true; + // Concatenate elements in "v" with spaces separating them, but ignoring + // empty entries. + for (const auto& s : v) { + if (s.empty()) { + continue; + } + StrAppend(&operands, (first ? "" : " "), s); + first = false; + } + } else { + // Do not show large constants or tuples. + operands = "{...}"; + } + return operands; +} + +HloTraceInstruction::HloTraceInstruction(const string& tag, + HloInstruction* operand) + : HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()), + literal_(LiteralUtil::CreateR1U8(tag)) { + AppendOperand(operand); + operand->set_tracing(this); +} + +HloInstructionProto HloTraceInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_literal() = literal_->ToProto(); + return proto; +} + +bool HloTraceInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + return false; +} + +std::unique_ptr HloTraceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode()); +} + +HloFusionInstruction::HloFusionInstruction(const Shape& shape, + FusionKind fusion_kind, + HloInstruction* fused_root) + : HloInstruction(HloOpcode::kFusion, shape), fusion_kind_(fusion_kind) { + CHECK(fused_root != nullptr); + SetAndSanitizeName("fusion"); + set_parent(fused_root->parent()); + set_metadata(fused_root->metadata()); + CloneAndFuseInternal(fused_root); +} + +HloFusionInstruction::HloFusionInstruction( + const Shape& shape, FusionKind fusion_kind, + absl::Span operands, + HloComputation* fusion_computation) + : HloInstruction(HloOpcode::kFusion, shape), fusion_kind_(fusion_kind) { + for (auto operand : operands) { + AppendOperand(operand); + } + 
SetAndSanitizeName("fusion"); + AppendComputation(fusion_computation); + fusion_computation->SetFusionInstruction(this); +} + +string HloFusionInstruction::ToCategory() const { + switch (fusion_kind()) { + case FusionKind::kLoop: + return "loop fusion"; + case FusionKind::kInput: + return "input fusion"; + case FusionKind::kOutput: + return "output fusion"; + case FusionKind::kCustom: + return "custom fusion"; + } +} + +HloInstructionProto HloFusionInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_fusion_kind(xla::ToString(fusion_kind())); + proto.add_called_computation_ids( + fused_instructions_computation()->unique_id()); + return proto; +} + +bool HloFusionInstruction::IsElementwiseImpl( + const absl::optional& operand_idx) const { + if (!operand_idx.has_value()) { + for (auto* fused : fused_instructions()) { + if (fused->opcode() != HloOpcode::kParameter && !fused->IsElementwise()) { + return false; + } + } + return true; + } + // A loop-fusion is elementwise on an operand if all operations (computed + // using BFS) between the operand and the fused root are elementwise. 
+ std::deque worklist; + std::unordered_set visited; + worklist.push_back(fused_parameter(operand_idx.value())); + visited.insert(fused_parameter(operand_idx.value())); + while (!worklist.empty()) { + HloInstruction* operand = worklist.front(); + worklist.pop_front(); + for (HloInstruction* user : operand->users()) { + CHECK_GE(user->unique_id(), 0); + if (ContainsKey(visited, user)) { + continue; + } + if (user->IsElementwise() || + IsInstructionElementwiseOnOperand(user, operand)) { + worklist.push_back(user); + visited.insert(user); + } else { + return false; + } + } + } + return true; +} + +HloInstruction* HloFusionInstruction::AddFusionOperand( + HloInstruction* new_operand) { + CHECK_EQ(operand_count(), + fused_instructions_computation()->parameter_instructions().size()); + const int64 param_no = operand_count(); + // Name the parameter after the instruction it represents in the outer + // (non-fusion) computation. + string param_name = StrCat(new_operand->name(), ".param_", param_no); + HloInstruction* fused_parameter = + fused_instructions_computation()->AddParameter( + HloInstruction::CreateParameter(param_no, new_operand->shape(), + param_name)); + AppendOperand(new_operand); + return fused_parameter; +} + +void HloFusionInstruction::MergeFusionInstruction( + HloFusionInstruction* instruction_to_merge) { + CHECK(std::find(operands().begin(), operands().end(), instruction_to_merge) != + operands().end()); + // Clone the instruction from which to merge fused instructions. + std::unique_ptr cloned = instruction_to_merge->Clone(); + HloFusionInstruction* cloned_fusion = + static_cast(cloned.get()); + // Replace uses of fused parameters with the corresponding operand of the + // fusion. Add all non-parameter fused instructions to + // 'unfused_instructions' to be merged into 'this'. This is done in reverse + // post order. 
+ std::vector unfused_instructions; + auto fused_instructions = cloned_fusion->fused_instructions_computation() + ->MakeInstructionPostOrder(); + for (auto fused_it = fused_instructions.rbegin(); + fused_it != fused_instructions.rend(); ++fused_it) { + auto fused_instruction = *fused_it; + if (fused_instruction->opcode() == HloOpcode::kParameter) { + TF_CHECK_OK( + fused_instruction->ReplaceAllUsesWith(cloned_fusion->mutable_operand( + fused_instruction->parameter_number()))); + } else { + unfused_instructions.push_back(fused_instruction); + } + } + CHECK(unfused_instructions.front() == cloned_fusion->fused_expression_root()); + // Replace instruction_to_merge use of 'this' with unfused_root. + TF_CHECK_OK( + instruction_to_merge->ReplaceUseWith(this, unfused_instructions.front())); + // Fuse 'unfused_instructions' into 'this'. + for (auto& instruction : unfused_instructions) { + FuseInstruction(instruction); + } + CHECK_EQ(0, cloned_fusion->user_count()); + TF_CHECK_OK(parent()->parent()->RemoveEmbeddedComputation( + cloned_fusion->fused_instructions_computation())); +} + +void HloFusionInstruction::MergeFusionInstructionIntoMultiOutput( + HloFusionInstruction* instruction_to_merge) { + // Add all non-parameter fused instructions to 'unfused_instructions' to be + // merged into 'this'. `old_to_new' maps the instructions in the fused node + // to the disaseembled fusion instructions. + // Note that we add the unfused instructions to this->parent_ computation. + // This is necessary because the unique_id needs for an instruction and + // it's only added when inserting to the computation. 
+ tensorflow::gtl::FlatMap old_to_new; + std::vector unfused_instructions; + auto computation_to_merge = + instruction_to_merge->fused_instructions_computation(); + auto post_order = computation_to_merge->MakeInstructionPostOrder(); + for (auto rit = post_order.rbegin(); rit != post_order.rend(); ++rit) { + auto fused_instruction = *rit; + if (fused_instruction->opcode() == HloOpcode::kParameter) { + InsertOrDie(&old_to_new, fused_instruction, + instruction_to_merge->mutable_operand( + fused_instruction->parameter_number())); + continue; + } + + // Here we clone the insertion and call FuseInstructionIntoMultiOutput() + // which clones again. This can be improved. + auto cloned_instruction = + parent()->AddInstruction(fused_instruction->Clone()); + unfused_instructions.push_back(cloned_instruction); + InsertOrDie(&old_to_new, fused_instruction, cloned_instruction); + } + for (auto unfused_instruction : unfused_instructions) { + for (int64 index = 0; index < unfused_instruction->operand_count(); + index++) { + auto new_operand = + FindOrDie(old_to_new, unfused_instruction->mutable_operand(index)); + TF_CHECK_OK(unfused_instruction->ReplaceOperandWith(index, new_operand)); + } + } + + HloInstruction* unfused_root = unfused_instructions.front(); + TF_CHECK_OK(instruction_to_merge->ReplaceAllUsesWith(unfused_root)); + + TF_CHECK_OK( + instruction_to_merge->parent()->RemoveInstruction(instruction_to_merge)); + if (GetModule()) { + TF_CHECK_OK(GetModule()->RemoveEmbeddedComputation(computation_to_merge)); + } + + // Fuse the root instruction and generate multiple outputs. + FuseInstructionIntoMultiOutput(unfused_root); + TF_CHECK_OK(unfused_root->parent()->RemoveInstruction(unfused_root)); + // The rest instructions are of normal fusing. 
+ for (int64 i = 1; i < unfused_instructions.size(); i++) { + auto instruction = unfused_instructions[i]; + FuseInstruction(instruction); + TF_CHECK_OK(instruction->parent()->RemoveInstruction(instruction)); + } +} + +HloComputation* HloFusionInstruction::fused_instructions_computation() const { + CHECK(!called_computations().empty()); + auto* fused_instructions_computation = called_computations().front(); + CHECK(fused_instructions_computation->IsFusionComputation()) + << "Computation " << fused_instructions_computation->name() + << " is not a fusion kind"; + return fused_instructions_computation; +} + +HloInstruction* HloFusionInstruction::fused_expression_root() const { + return fused_instructions_computation()->root_instruction(); +} + +HloInstruction* HloFusionInstruction::fused_parameter( + int64 parameter_number) const { + return fused_instructions_computation()->parameter_instruction( + parameter_number); +} + +const std::vector& HloFusionInstruction::fused_parameters() + const { + return fused_instructions_computation()->parameter_instructions(); +} + +const tensorflow::gtl::iterator_range>::const_iterator>> +HloFusionInstruction::fused_instructions() const { + const HloComputation* subcomp = fused_instructions_computation(); + return subcomp->instructions(); +} + +const tensorflow::gtl::iterator_range< + UnwrappingIterator>::iterator>> +HloFusionInstruction::fused_instructions() { + return fused_instructions_computation()->instructions(); +} + +int64 HloFusionInstruction::fused_instruction_count() const { + return fused_instructions_computation()->instruction_count(); +} + +HloInstruction* HloFusionInstruction::FuseInstructionInternal( + HloInstruction* instruction_to_fuse, bool add_output) { + // When add_output is false, this fusion instruction must be a user of + // instruction_to_fuse. 
+ if (!add_output) { + CHECK(IsUserOf(instruction_to_fuse)); + } + HloInstruction* fused_instruction = + CloneAndFuseInternal(instruction_to_fuse, add_output); + return fused_instruction; +} + +HloInstruction* HloFusionInstruction::CloneAndFuseInternal( + HloInstruction* instruction_to_fuse, bool add_output) { + CHECK(instruction_to_fuse->IsFusible()) << instruction_to_fuse->ToString(); + VLOG(3) << "CloneAndFuseInternal:\n" << instruction_to_fuse->ToString(); + HloInstruction* clone = nullptr; + if (called_computations().empty()) { + // New fusion instruction. It should not be a multioutput instruction. + CHECK(!add_output); + auto builder = HloComputation::Builder("fused_computation", this); + builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/"")); + AppendComputation( + CHECK_NOTNULL(GetModule())->AddEmbeddedComputation(builder.Build())); + clone = fused_expression_root(); + } else { + // When add_output is false, instruction_to_fuse is necessarily an operand + // of the fusion instruction. After fusion this will no longer be the + // case. Remove the operand from the operand list and remove its + // corresponding fused parameter instruction. Renumber parameters as + // necessary to make parameter numbers consistent with their index in the + // fused_parameter_ vector. + bool in_operand_list = std::find(operands().begin(), operands().end(), + instruction_to_fuse) != operands().end(); + CHECK(add_output || in_operand_list); + if (instruction_to_fuse->opcode() == HloOpcode::kTuple) { + // We assume all uses of a kTuple operation are GTE ops, not another + // fusion node. In this case, we don't need to clone + // 'instruction_to_fuse'. 
+ CHECK(!in_operand_list); + clone = instruction_to_fuse; + } else { + clone = fused_instructions_computation()->AddInstruction( + instruction_to_fuse->Clone(/*suffix=*/"")); + } + const std::vector& fused_parameters = + fused_instructions_computation()->parameter_instructions(); + for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { + if (instruction_to_fuse == operand(operand_num)) { + // replace the fused parameter instruction's uses with the clone. + HloInstruction* fused_parameter = fused_parameters[operand_num]; + TF_CHECK_OK(fused_parameter->ReplaceAllUsesWith(clone)); + + // Remove the corresponding fused parameter and operand from their + // respective vectors. + TF_CHECK_OK( + fused_instructions_computation()->RemoveParameter(operand_num)); + RemoveOperandAt(operand_num); + break; + } + } + // We've cloned instruction_to_fuse into this fusion instruction, so this + // fusion instruction is no longer a use of instruction_to_fuse. + if (in_operand_list) { + DetachFrom(instruction_to_fuse); + // When the instruction_to_fuse does not have other users, we don't need + // to generate a multioutput fusion instruction. + if (instruction_to_fuse->user_count() == 0) { + add_output = false; + } + } + } + + // Reread the parameters in the computation. + const std::vector& fused_parameters = + fused_instructions_computation()->parameter_instructions(); + + // Add each operand of the clone as an operand of the fusion instruction. A + // complication is that some clone operands may already be operands of the + // fusion instruction. + for (int64 operand_num = 0; operand_num < clone->operand_count(); + ++operand_num) { + HloInstruction* operand = clone->mutable_operand(operand_num); + + // See if this operand is already an operand of the fusion node. 
+ CHECK_EQ(operands().size(), fused_parameters.size()); + HloInstruction* fused_param = nullptr; + for (int64 i = 0; i < operands().size(); ++i) { + if (this->operand(i) == operand) { + fused_param = fused_parameters[i]; + break; + } + } + + if (fused_param == nullptr) { + // Clone's operand was not already an operand of the fusion + // instruction. Add it as an operand and add a corresponding fused + // parameter instruction. + fused_param = AddFusionOperand(operand); + } + TF_CHECK_OK(clone->ReplaceOperandWith(operand_num, fused_param)); + } + + if (add_output) { + CHECK_GT(instruction_to_fuse->user_count(), 0); + // If this is already a multioutput fusion instruction, expand the root + // tuple by 1. + HloInstruction* fused_root = fused_expression_root(); + HloInstruction::InstructionVector tuple_elements; + bool newly_created_tuple_instr = false; + if (fused_root->opcode() == HloOpcode::kTuple) { + tuple_elements = fused_root->operands(); + } else { + tuple_elements.push_back(fused_root); + newly_created_tuple_instr = true; + } + if (clone->opcode() == HloOpcode::kTuple) { + for (auto inst : clone->operands()) { + tuple_elements.push_back(inst); + } + } else { + tuple_elements.push_back(clone); + } + HloInstruction* new_root = fused_instructions_computation()->AddInstruction( + HloInstruction::CreateTuple(tuple_elements)); + fused_instructions_computation()->set_root_instruction(new_root); + *mutable_shape() = new_root->shape(); + if (fused_root->opcode() == HloOpcode::kTuple) { + TF_CHECK_OK( + fused_instructions_computation()->RemoveInstruction(fused_root)); + } + + // If this is a newly created multioutput instruction, we need to update + // the use of the original fusion instruction. 
+ if (newly_created_tuple_instr) { + HloInstruction* new_instr = parent()->AddInstruction( + HloInstruction::CreateGetTupleElement(fused_root->shape(), this, 0)); + TF_CHECK_OK(ReplaceAllUsesWith(new_instr)); + } + int64 index = tuple_elements.size(); + if (instruction_to_fuse->opcode() == HloOpcode::kTuple) { + CHECK_EQ(clone, instruction_to_fuse); + index -= clone->operand_count(); + std::vector to_be_removed; + for (auto old_gte : clone->users()) { + CHECK_EQ(old_gte->opcode(), HloOpcode::kGetTupleElement); + int64 old_tuple_index = old_gte->tuple_index(); + HloInstruction* new_gte = + parent()->AddInstruction(HloInstruction::CreateGetTupleElement( + old_gte->shape(), this, index + old_tuple_index)); + TF_CHECK_OK(old_gte->ReplaceAllUsesWith(new_gte)); + to_be_removed.push_back(old_gte); + } + for (auto old_gte : to_be_removed) { + TF_CHECK_OK(parent()->RemoveInstruction(old_gte)); + } + } else { + HloInstruction* new_gte = + parent()->AddInstruction(HloInstruction::CreateGetTupleElement( + clone->shape(), this, index - 1)); + TF_CHECK_OK(instruction_to_fuse->ReplaceAllUsesWith(new_gte)); + } + } + + if (clone != instruction_to_fuse) { + VLOG(2) << "New clone:\n" << clone->ToString(); + } + return clone; +} + +std::vector HloFusionInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("kind=", xla::ToString(fusion_kind()))}; +} + +bool HloFusionInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + return fusion_kind() == other.fusion_kind() && + eq_computations(fused_instructions_computation(), + other.fused_instructions_computation()); +} + +std::unique_ptr HloFusionInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + HloModule* module = context != nullptr ? 
context->module() : GetModule(); + HloComputation* new_fused_computation = nullptr; + if (context != nullptr) { + new_fused_computation = + context->FindComputation(fused_instructions_computation()); + } + if (new_fused_computation == nullptr) { + new_fused_computation = module->AddEmbeddedComputation( + fused_instructions_computation()->Clone("clone", context)); + } + return absl::make_unique( + shape, fusion_kind(), new_operands, new_fused_computation); +} + +Status HloFusionInstruction::DeduplicateFusionOperands() { + tensorflow::gtl::FlatMap operand_indices; + std::vector operands_to_remove; + for (int i = 0; i < operand_count(); ++i) { + auto emplace_result = operand_indices.emplace(operand(i), i); + if (!emplace_result.second) { + TF_RETURN_IF_ERROR(fused_parameter(i)->ReplaceAllUsesWith( + fused_parameter(emplace_result.first->second))); + operands_to_remove.push_back(i); + } + } + if (operands_to_remove.empty()) { + return Status::OK(); + } + TF_RETURN_IF_ERROR( + fused_instructions_computation()->RemoveUnusedParameters()); + RemoveOperandsAtAscendingIndices(operands_to_remove); + return Status::OK(); +} + +HloRngInstruction::HloRngInstruction( + const Shape& shape, RandomDistribution distribution, + absl::Span parameters) + : HloInstruction(HloOpcode::kRng, shape), distribution_(distribution) { + for (HloInstruction* param : parameters) { + AppendOperand(param); + } +} + +HloInstructionProto HloRngInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_distribution(distribution_); + return proto; +} + +std::vector HloRngInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("distribution=", RandomDistributionToString(distribution_))}; +} + +bool HloRngInstruction::IsElementwiseImpl( + const absl::optional& operand_idx) const { + return true; +} + +bool HloRngInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const 
{ + return false; +} + +std::unique_ptr HloRngInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(shape, distribution_, + new_operands); +} + +HloParameterInstruction::HloParameterInstruction(int64 parameter_number, + const Shape& shape, + const string& name) + : HloInstruction(HloOpcode::kParameter, shape), + parameter_number_(parameter_number) { + SetAndSanitizeName(name); +} + +HloInstructionProto HloParameterInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_parameter_number(parameter_number_); + return proto; +} + +string HloParameterInstruction::OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const { + return StrCat(parameter_number_); +} + +bool HloParameterInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return parameter_number() == casted_other.parameter_number(); +} + +std::unique_ptr +HloParameterInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(parameter_number_, shape, + name()); +} + +HloGetTupleElementInstruction::HloGetTupleElementInstruction( + const Shape& shape, HloInstruction* operand, int64 index) + : HloInstruction(HloOpcode::kGetTupleElement, shape), tuple_index_(index) { + CHECK(ShapeUtil::IsTuple(operand->shape())); + AppendOperand(operand); +} + +HloInstructionProto HloGetTupleElementInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_tuple_index(tuple_index_); + return proto; +} + +std::vector HloGetTupleElementInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("index=", tuple_index())}; +} + +bool 
HloGetTupleElementInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + return tuple_index() == casted_other.tuple_index(); +} + +std::unique_ptr +HloGetTupleElementInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], tuple_index()); +} + +HloReducePrecisionInstruction::HloReducePrecisionInstruction( + const Shape& shape, HloInstruction* operand, const int exponent_bits, + const int mantissa_bits) + : HloInstruction(HloOpcode::kReducePrecision, shape), + exponent_bits_(exponent_bits), + mantissa_bits_(mantissa_bits) { + AppendOperand(operand); +} + +HloInstructionProto HloReducePrecisionInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_exponent_bits(exponent_bits_); + proto.set_mantissa_bits(mantissa_bits_); + return proto; +} + +std::vector HloReducePrecisionInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("exponent_bits=", exponent_bits_), + StrCat("mantissa_bits=", mantissa_bits_)}; +} + +bool HloReducePrecisionInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + // A reduce-precision operation is determined by the bit sizes. 
+ return exponent_bits() == casted_other.exponent_bits() && + mantissa_bits() == casted_other.mantissa_bits(); +} + +std::unique_ptr +HloReducePrecisionInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], exponent_bits(), mantissa_bits()); +} + +HloInfeedInstruction::HloInfeedInstruction(const Shape& infeed_shape, + HloInstruction* token_operand, + const string& config) + : HloInstruction(HloOpcode::kInfeed, + ShapeUtil::MakeTupleShape( + {infeed_shape, ShapeUtil::MakeTokenShape()})), + infeed_config_(config) { + AppendOperand(token_operand); +} + +HloInstructionProto HloInfeedInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_infeed_config(infeed_config_); + return proto; +} + +std::vector HloInfeedInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + if (infeed_config_.empty()) { + return {}; + } + return {StrCat("infeed_config=\"", CEscape(infeed_config_), "\"")}; +} + +bool HloInfeedInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + // Not yet supported. 
+ return false; +} + +std::unique_ptr HloInfeedInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + infeed_shape(), new_operands[0], infeed_config()); +} + +HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape, + HloInstruction* operand, + HloInstruction* token_operand, + absl::string_view outfeed_config) + : HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeTokenShape()), + outfeed_shape_(outfeed_shape), + outfeed_config_(outfeed_config) { + CHECK(ShapeUtil::Compatible(operand->shape(), outfeed_shape)) + << "Outfeed shape " << outfeed_shape + << " must be compatible with operand shape " << operand->shape(); + AppendOperand(operand); + AppendOperand(token_operand); +} + +HloInstructionProto HloOutfeedInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_outfeed_config(outfeed_config()); + *proto.mutable_outfeed_shape() = outfeed_shape(); + return proto; +} + +std::vector HloOutfeedInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + if (outfeed_config_.empty()) { + return {}; + } + return {StrCat("outfeed_config=\"", CEscape(outfeed_config_), "\"")}; +} + +bool HloOutfeedInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + // Not yet supported. 
+ return false; +} + +std::unique_ptr HloOutfeedInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + outfeed_shape(), new_operands[0], new_operands[1], outfeed_config()); +} + +HloConvolutionInstruction::HloConvolutionInstruction( + const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, + const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count) + : HloInstruction(HloOpcode::kConvolution, shape), + window_(window), + convolution_dimension_numbers_(dimension_numbers), + feature_group_count_(feature_group_count) { + if (window_util::HasBaseDilation(window)) { + SetAndSanitizeName(StrCat(name(), "-base-dilated")); + } + if (window_util::HasWindowDilation(window)) { + SetAndSanitizeName(StrCat(name(), "-window-dilated")); + } + AppendOperand(lhs); + AppendOperand(rhs); +} + +string HloConvolutionInstruction::ToCategory() const { + string category = "convolution"; + if (window_util::HasBaseDilation(window())) { + category += " base-dilated"; + } + if (window_util::HasWindowDilation(window())) { + category += " window-dilated"; + } + return category; +} + +HloInstructionProto HloConvolutionInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_window() = window_; + *proto.mutable_convolution_dimension_numbers() = + convolution_dimension_numbers_; + proto.set_feature_group_count(feature_group_count_); + return proto; +} + +std::vector HloConvolutionInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector extra; + if (window_.dimensions_size() != 0) { + extra.push_back(StrCat("window={", window_util::ToString(window()), "}")); + } + extra.push_back(StrCat("dim_labels=", ConvolutionDimensionNumbersToString( + convolution_dimension_numbers_))); + extra.push_back(StrCat("feature_group_count=", 
feature_group_count_)); + return extra; +} + +bool HloConvolutionInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + if (feature_group_count_ != other.feature_group_count()) { + return false; + } + return protobuf_util::ProtobufEquals(window(), casted_other.window()) && + protobuf_util::ProtobufEquals( + convolution_dimension_numbers(), + casted_other.convolution_dimension_numbers()); +} + +std::unique_ptr +HloConvolutionInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + shape, new_operands[0], new_operands[1], window(), + convolution_dimension_numbers_, feature_group_count_); +} + +HloReduceWindowInstruction::HloReduceWindowInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* init_value, + const Window& window, HloComputation* reduce_computation) + : HloInstruction(HloOpcode::kReduceWindow, shape), window_(window) { + AppendOperand(operand); + AppendOperand(init_value); + AppendComputation(reduce_computation); +} + +HloInstructionProto HloReduceWindowInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_window() = window_; + return proto; +} + +std::vector HloReduceWindowInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector extra; + if (window_.dimensions_size() != 0) { + extra.push_back(StrCat("window={", window_util::ToString(window()), "}")); + } + return extra; +} + +bool HloReduceWindowInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + return eq_computations(to_apply(), casted_other.to_apply()) && + protobuf_util::ProtobufEquals(window(), casted_other.window()); +} + +std::unique_ptr 
+HloReduceWindowInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + shape, new_operands[0], new_operands[1], window(), to_apply()); +} + +HloSelectAndScatterInstruction::HloSelectAndScatterInstruction( + const Shape& shape, HloInstruction* operand, HloComputation* select, + const Window& window, HloInstruction* source, HloInstruction* init_value, + HloComputation* scatter) + : HloInstruction(HloOpcode::kSelectAndScatter, shape), window_(window) { + AppendOperand(operand); + AppendOperand(source); + AppendOperand(init_value); + // Select comes before scatter in the vector. + AppendComputation(select); + AppendComputation(scatter); +} + +HloInstructionProto HloSelectAndScatterInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_window() = window_; + return proto; +} + +std::vector HloSelectAndScatterInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector extra; + if (window_.dimensions_size() != 0) { + extra.push_back(StrCat("window={", window_util::ToString(window()), "}")); + } + return extra; +} + +bool HloSelectAndScatterInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + return eq_computations(select(), casted_other.select()) && + eq_computations(scatter(), casted_other.scatter()) && + protobuf_util::ProtobufEquals(window(), casted_other.window()); +} + +std::unique_ptr +HloSelectAndScatterInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 3); + return absl::make_unique( + shape, new_operands[0], select(), window(), new_operands[1], + new_operands[2], scatter()); +} + +HloCustomCallInstruction::HloCustomCallInstruction( + 
const Shape& shape, absl::Span operands, + absl::string_view custom_call_target) + : HloInstruction(HloOpcode::kCustomCall, shape), + custom_call_target_(custom_call_target.begin(), custom_call_target.end()), + feature_group_count_(1) { + for (auto operand : operands) { + AppendOperand(operand); + } +} + +HloInstructionProto HloCustomCallInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + if (window_ != nullptr) { + *proto.mutable_window() = *window_; + } + if (convolution_dimension_numbers_ != nullptr) { + *proto.mutable_convolution_dimension_numbers() = + *convolution_dimension_numbers_; + } + proto.set_custom_call_target(custom_call_target_); + proto.set_feature_group_count(feature_group_count_); + return proto; +} + +std::vector HloCustomCallInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector extra; + if (window_ != nullptr && window_->dimensions_size() != 0) { + extra.push_back(StrCat("window={", window_util::ToString(*window_), "}")); + } + if (convolution_dimension_numbers_ != nullptr) { + extra.push_back(StrCat( + "dim_labels=", + ConvolutionDimensionNumbersToString(*convolution_dimension_numbers_))); + } + if (feature_group_count_ != 1) { + extra.push_back(StrCat("feature_group_count=", feature_group_count_)); + } + // By contract, we print the custom call target even if + // options.print_subcomputation_mode() == kOff, because the call target is not + // an HloComputation. 
+ extra.push_back( + StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\"")); + return extra; +} + +bool HloCustomCallInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = + static_cast(other); + if ((window_ == nullptr) != (casted_other.window_ == nullptr) || + (window_ != nullptr && + !protobuf_util::ProtobufEquals(*window_, *casted_other.window_))) { + return false; + } + if ((convolution_dimension_numbers_ == nullptr) != + (casted_other.convolution_dimension_numbers_ == nullptr) || + (convolution_dimension_numbers_ != nullptr && + !protobuf_util::ProtobufEquals( + convolution_dimension_numbers(), + casted_other.convolution_dimension_numbers()))) { + return false; + } + if (feature_group_count_ != casted_other.feature_group_count_) { + return false; + } + return custom_call_target_ == casted_other.custom_call_target_; +} + +std::unique_ptr +HloCustomCallInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + auto cloned = absl::make_unique( + shape, new_operands, custom_call_target()); + if (window_ != nullptr) { + cloned->set_window(*window_); + } + if (convolution_dimension_numbers_ != nullptr) { + cloned->set_convolution_dimension_numbers(*convolution_dimension_numbers_); + } + cloned->set_feature_group_count(feature_group_count_); + return std::move(cloned); +} + +HloPadInstruction::HloPadInstruction(const Shape& shape, + HloInstruction* operand, + HloInstruction* padding_value, + const PaddingConfig& padding_config) + : HloInstruction(HloOpcode::kPad, shape), padding_config_(padding_config) { + AppendOperand(operand); + AppendOperand(padding_value); +} + +HloInstructionProto HloPadInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_padding_config() = padding_config_; + return proto; +} + +std::vector 
HloPadInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("padding=", xla::PaddingConfigToString(padding_config_))}; +} + +bool HloPadInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return protobuf_util::ProtobufEquals(padding_config(), + casted_other.padding_config()); +} + +std::unique_ptr HloPadInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique(shape, new_operands[0], + new_operands[1], padding_config_); +} + +HloDynamicSliceInstruction::HloDynamicSliceInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, + absl::Span slice_sizes) + : HloInstruction(HloOpcode::kDynamicSlice, shape), + dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) { + AppendOperand(operand); + AppendOperand(start_indices); +} + +HloInstructionProto HloDynamicSliceInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + for (int64 slice_size : dynamic_slice_sizes_) { + proto.add_dynamic_slice_sizes(slice_size); + } + return proto; +} + +std::vector HloDynamicSliceInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("dynamic_slice_sizes={", StrJoin(dynamic_slice_sizes(), ","), + "}")}; +} + +bool HloDynamicSliceInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + return true; +} + +std::unique_ptr +HloDynamicSliceInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + shape, new_operands[0], new_operands[1], dynamic_slice_sizes_); +} + +HloGatherInstruction::HloGatherInstruction( + 
const Shape& shape, HloInstruction* operand, HloInstruction* start_indices, + const GatherDimensionNumbers& gather_dim_numbers, + absl::Span slice_sizes) + : HloInstruction(HloOpcode::kGather, shape) { + AppendOperand(operand); + AppendOperand(start_indices); + gather_dimension_numbers_ = + absl::make_unique(gather_dim_numbers); + absl::c_copy(slice_sizes, std::back_inserter(gather_slice_sizes_)); +} + +string HloGatherInstruction::GatherDimensionNumbersToString() const { + CHECK(gather_dimension_numbers_ != nullptr); + string offset_dims = + StrCat("offset_dims={", + StrJoin(gather_dimension_numbers_->offset_dims(), ","), "}"); + string collapsed_slice_dims = StrCat( + "collapsed_slice_dims={", + StrJoin(gather_dimension_numbers_->collapsed_slice_dims(), ","), "}"); + string start_index_map = + StrCat("start_index_map={", + StrJoin(gather_dimension_numbers_->start_index_map(), ","), "}"); + string index_vector_dim = StrCat( + "index_vector_dim=", gather_dimension_numbers_->index_vector_dim()); + + return StrJoin>( + {offset_dims, collapsed_slice_dims, start_index_map, index_vector_dim}, + ", "); +} + +/* static */ GatherDimensionNumbers HloGatherInstruction::MakeGatherDimNumbers( + absl::Span offset_dims, + absl::Span collapsed_slice_dims, + absl::Span start_index_map, int64 index_vector_dim) { + GatherDimensionNumbers gather_dim_numbers; + for (int64 output_window_dim : offset_dims) { + gather_dim_numbers.add_offset_dims(output_window_dim); + } + for (int64 elided_window_dim : collapsed_slice_dims) { + gather_dim_numbers.add_collapsed_slice_dims(elided_window_dim); + } + for (int64 gather_dim_to_input_dim : start_index_map) { + gather_dim_numbers.add_start_index_map(gather_dim_to_input_dim); + } + + gather_dim_numbers.set_index_vector_dim(index_vector_dim); + return gather_dim_numbers; +} + +HloInstructionProto HloGatherInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_gather_dimension_numbers() = 
gather_dimension_numbers(); + for (int64 bound : gather_slice_sizes()) { + proto.add_gather_slice_sizes(bound); + } + return proto; +} + +std::vector HloGatherInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {GatherDimensionNumbersToString(), + StrCat("slice_sizes={", StrJoin(gather_slice_sizes(), ","), "}")}; +} + +bool HloGatherInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return protobuf_util::ProtobufEquals( + gather_dimension_numbers(), + casted_other.gather_dimension_numbers()) && + gather_slice_sizes() == casted_other.gather_slice_sizes(); +} + +std::unique_ptr HloGatherInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 2); + return absl::make_unique( + shape, new_operands[0], new_operands[1], gather_dimension_numbers(), + gather_slice_sizes()); +} + +HloScatterInstruction::HloScatterInstruction( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers) + : HloInstruction(HloOpcode::kScatter, shape) { + AppendOperand(operand); + AppendOperand(scatter_indices); + AppendOperand(updates); + AppendComputation(update_computation); + scatter_dimension_numbers_ = + absl::make_unique(scatter_dim_numbers); +} + +string HloScatterInstruction::ScatterDimensionNumbersToString() const { + string update_window_dims = StrCat( + "update_window_dims={", + StrJoin(scatter_dimension_numbers().update_window_dims(), ","), "}"); + string inserted_window_dims = StrCat( + "inserted_window_dims={", + StrJoin(scatter_dimension_numbers().inserted_window_dims(), ","), "}"); + string scatter_dims_to_operand_dims = StrCat( + "scatter_dims_to_operand_dims={", + 
StrJoin(scatter_dimension_numbers().scatter_dims_to_operand_dims(), ","), + "}"); + string index_vector_dim = StrCat( + "index_vector_dim=", scatter_dimension_numbers().index_vector_dim()); + + return StrJoin>( + {update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims, + index_vector_dim}, + ", "); +} + +/* static */ ScatterDimensionNumbers +HloScatterInstruction::MakeScatterDimNumbers( + absl::Span update_window_dims, + absl::Span inserted_window_dims, + absl::Span scatter_dims_to_operand_dims, + int64 index_vector_dim) { + ScatterDimensionNumbers scatter_dim_numbers; + for (int64 update_window_dim : update_window_dims) { + scatter_dim_numbers.add_update_window_dims(update_window_dim); + } + for (int64 inserted_window_dim : inserted_window_dims) { + scatter_dim_numbers.add_inserted_window_dims(inserted_window_dim); + } + for (int64 scatter_dim_to_operand_dim : scatter_dims_to_operand_dims) { + scatter_dim_numbers.add_scatter_dims_to_operand_dims( + scatter_dim_to_operand_dim); + } + scatter_dim_numbers.set_index_vector_dim(index_vector_dim); + return scatter_dim_numbers; +} + +HloInstructionProto HloScatterInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); + return proto; +} + +std::vector HloScatterInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {ScatterDimensionNumbersToString()}; +} + +bool HloScatterInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return protobuf_util::ProtobufEquals( + scatter_dimension_numbers(), + casted_other.scatter_dimension_numbers()) && + eq_computations(to_apply(), casted_other.to_apply()); +} + +std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + 
CHECK_EQ(new_operands.size(), 3); + return absl::make_unique( + shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), + scatter_dimension_numbers()); +} + +HloIotaInstruction::HloIotaInstruction(const Shape& shape, int64 iota_dimension) + : HloInstruction(HloOpcode::kIota, shape), + iota_dimension_(iota_dimension) {} + +HloInstructionProto HloIotaInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.add_dimensions(iota_dimension()); + return proto; +} + +std::vector HloIotaInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {StrCat("iota_dimension=", iota_dimension())}; +} + +bool HloIotaInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return iota_dimension() == casted_other.iota_dimension(); +} + +std::unique_ptr HloIotaInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + return absl::make_unique(shape, iota_dimension()); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h new file mode 100644 index 0000000000000000000000000000000000000000..323038357993c4e9b99d1527aa8f593ada92f1c8 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -0,0 +1,1275 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// All HloInstruction subclasses are put in this file. + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_ + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +namespace xla { + +class HloBatchNormInstruction : public HloInstruction { + public: + // Returns feature_index field associated with the instruction. The index + // represents the index of the feature dimension. + int64 feature_index() const { return feature_index_; } + + // Returns an epsilon value associated with the instruction. This is a small + // number added to the variance to avoid divide-by-zero error. + float epsilon() const { return epsilon_; } + + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + protected: + explicit HloBatchNormInstruction(HloOpcode opcode, const Shape& shape, + HloInstruction* operand, + HloInstruction* scale, float epsilon, + int64 feature_index); + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // A small float number added to the variance to avoid divide-by-zero error. + float epsilon_ = 0.0f; + + // An integer value representing the index of the feature dimension. + int64 feature_index_ = -1; +}; + +class HloBatchNormTrainingInstruction : public HloBatchNormInstruction { + public: + explicit HloBatchNormTrainingInstruction(const Shape& shape, + HloInstruction* operand, + HloInstruction* scale, + HloInstruction* offset, + float epsilon, int64 feature_index); + + private: + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloBatchNormInferenceInstruction : public HloBatchNormInstruction { + public: + explicit HloBatchNormInferenceInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* offset, HloInstruction* mean, HloInstruction* variance, + float epsilon, int64 feature_index); + + private: + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloBatchNormGradInstruction : public HloBatchNormInstruction { + public: + explicit HloBatchNormGradInstruction( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* mean, HloInstruction* variance, + HloInstruction* grad_output, float epsilon, int64 feature_index); + + private: + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloFftInstruction : public HloInstruction { + public: + explicit HloFftInstruction(const Shape& shape, HloInstruction* operand, + FftType fft_type, + absl::Span fft_length); + FftType fft_type() const { return fft_type_; } + + const std::vector& fft_length() const { return fft_length_; } + + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Describes FFT type for an FFT instruction. + FftType fft_type_ = FftType::FFT; + + // Indicates the FFT length for an FFT instruction. + std::vector fft_length_; +}; + +class HloSendRecvInstruction : public HloInstruction { + public: + // Returns the channel id associated with the instruction. The id is + // shared between each Send/Recv pair and is globally unique to identify each + // channel. + int64 channel_id() const { return channel_id_; } + + // Returns whether this send/recv instruction sends data to/from the host. + bool is_host_transfer() const { return is_host_transfer_; } + + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + protected: + explicit HloSendRecvInstruction(HloOpcode opcode, const Shape& shape, + int64 channel_id, bool is_host_transfer); + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Represents a unique identifier for each Send/Recv instruction pair. + int64 channel_id_; + + // Whether this send/recv instruction sends data to/from the host. + bool is_host_transfer_; +}; + +class HloSendInstruction : public HloSendRecvInstruction { + public: + explicit HloSendInstruction(HloInstruction* operand, HloInstruction* token, + int64 channel_id, bool is_host_transfer); + + private: + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloSendDoneInstruction : public HloSendRecvInstruction { + public: + explicit HloSendDoneInstruction(HloSendInstruction* operand, + bool is_host_transfer); + + private: + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloRecvInstruction : public HloSendRecvInstruction { + public: + explicit HloRecvInstruction(const Shape& shape, HloInstruction* token, + int64 channel_id, bool is_host_transfer); + + private: + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloRecvDoneInstruction : public HloSendRecvInstruction { + public: + explicit HloRecvDoneInstruction(HloRecvInstruction* operand, + bool is_host_transfer); + + private: + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloCollectiveInstruction : public HloInstruction { + public: + const std::vector& replica_groups() const { + return replica_groups_; + } + + protected: + explicit HloCollectiveInstruction( + HloOpcode opcode, const Shape& shape, + absl::Span operands, + const std::vector& replica_groups); + + HloInstructionProto ToProto() const override; + + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + std::vector replica_groups_; +}; + +class HloAllReduceInstruction : public HloCollectiveInstruction { + public: + explicit HloAllReduceInstruction( + const Shape& shape, absl::Span operands, + HloComputation* reduce_computation, + const std::vector& replica_groups, + absl::string_view barrier, const absl::optional& all_reduce_id); + + // Returns the barrier config used for the CrossReplicaSum implementation of + // each backend. + string cross_replica_sum_barrier() const { + return cross_replica_sum_barrier_; + } + void set_cross_replica_sum_barrier(string barrier) { + cross_replica_sum_barrier_ = barrier; + } + + absl::optional all_reduce_id() const { return all_reduce_id_; } + + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The string representation of the barrier config used for CrossReplicaSum. + string cross_replica_sum_barrier_; + + // For Allreduce nodes from different modules, if they have the same + // all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will not be + // applied cross modules. + absl::optional all_reduce_id_; +}; + +class HloAllToAllInstruction : public HloCollectiveInstruction { + public: + explicit HloAllToAllInstruction( + const Shape& shape, absl::Span operands, + const std::vector& replica_groups); + + private: + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; +}; + +class HloCollectivePermuteInstruction : public HloInstruction { + public: + explicit HloCollectivePermuteInstruction( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs); + + const std::vector>& source_target_pairs() const { + return source_target_pairs_; + } + + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + const std::vector> source_target_pairs_; +}; + +class HloReverseInstruction : public HloInstruction { + public: + explicit HloReverseInstruction(const Shape& shape, HloInstruction* operand, + absl::Span dimensions); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloConcatenateInstruction : public HloInstruction { + public: + explicit HloConcatenateInstruction(const Shape& shape, + absl::Span operands, + int64 dimension); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Accessor for the dimension in which a concatenate HLO should occur. + int64 concatenate_dimension() const { return dimensions(0); } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloReduceInstruction : public HloInstruction { + public: + explicit HloReduceInstruction(const Shape& shape, + absl::Span args, + absl::Span dimensions_to_reduce, + HloComputation* reduce_computation); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + // Returns the number of input arrays (and, consequently, the number of + // init values) this reduce has. + int64 input_count() const { return operand_count() / 2; } + + // Returns the input tensors to be reduced. + absl::Span inputs() const { + return absl::MakeSpan(operands()).subspan(0, input_count()); + } + + // Returns the init values of the reduction. + absl::Span init_values() const { + return absl::MakeSpan(operands()).subspan(input_count(), operand_count()); + } + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloSortInstruction : public HloInstruction { + public: + explicit HloSortInstruction(const Shape& shape, int64 dimension, + HloInstruction* keys, + HloInstruction* values = nullptr); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns the sort dimension for this instruction, i.e. dimensions(0). + int64 sort_dimension() const { return dimensions(0); } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloTransposeInstruction : public HloInstruction { + public: + explicit HloTransposeInstruction(const Shape& shape, HloInstruction* operand, + absl::Span dimensions); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns whether this instruction does a rank-2 transposition. + bool IsRank2Transpose() const; + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloBroadcastInstruction : public HloInstruction { + public: + explicit HloBroadcastInstruction(const Shape& shape, HloInstruction* operand, + absl::Span broadcast_dimension); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloMapInstruction : public HloInstruction { + public: + explicit HloMapInstruction(const Shape& shape, + absl::Span operands, + HloComputation* map_computation); + // Returns the dimension sizes or numbers associated with this instruction. + const std::vector& dimensions() const override { return dimensions_; } + int64 dimensions(int64 index) const override { return dimensions()[index]; } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + bool IsElementwiseImpl( + const absl::optional& operand_idx) const override; + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::vector dimensions_; +}; + +class HloSliceInstruction : public HloInstruction { + public: + explicit HloSliceInstruction(const Shape& shape, HloInstruction* operand, + absl::Span start_indices, + absl::Span limit_indices, + absl::Span strides); + + HloInstructionProto ToProto() const override; + + // Returns the start index in the given dimension for a slice node. + int64 slice_starts(int64 dimension) const { return slice_starts_[dimension]; } + const std::vector& slice_starts() const { return slice_starts_; } + + // Returns the (exclusive) limit index in the given dimension for a slice + // node. + int64 slice_limits(int64 dimension) const { return slice_limits_[dimension]; } + const std::vector& slice_limits() const { return slice_limits_; } + + // Returns the stride in the given dimension for a slice node. + int64 slice_strides(int64 dimension) const { + return slice_strides_[dimension]; + } + const std::vector& slice_strides() const { return slice_strides_; } + + // Returns the flag that describes whether a slice must be lowered into an + // offset into the original operand. + bool IsInPlaceSlice() const { return is_in_place_slice_; } + + // Sets and returns the flag that describes whether a slice must be lowered + // into an offset into the original operand. 
+ bool SetIsInPlaceSlice(bool value) { + is_in_place_slice_ = value; + return value; + } + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Describes the [begin, end) index range for a slice. + std::vector slice_starts_; + std::vector slice_limits_; + std::vector slice_strides_; + + // Describes whether the slice can be lowered to an offset into the operand. + bool is_in_place_slice_ = false; +}; + +class HloConstantInstruction : public HloInstruction { + public: + explicit HloConstantInstruction(std::unique_ptr literal); + // Used when the literal is too large and dropped. + explicit HloConstantInstruction(const Shape& shape); + // Returns the literal; only valid when HasLiteral() is true. + const Literal& literal() const { return *literal_; } + // Returns whether there is a literal associated with this instruction. + bool HasLiteral() const { return literal_ != nullptr; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + // Changes the layout of a constant HLO instruction to match new_layout. For + // tuple-shaped constants, shape_index is the path to the internal array + // subshape whose layout needs to be changed. 
+ void RelayoutConstant(const Layout& new_layout, + const ShapeIndex& shape_index = {}); + + private: + bool IsElementwiseImpl( + const absl::optional& operand_idx) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + string OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + // TODO(b/36360764): Remove unique_ptr wrapping. + std::unique_ptr literal_; +}; + +class HloTraceInstruction : public HloInstruction { + public: + explicit HloTraceInstruction(const string& tag, HloInstruction* operand); + // Returns a tag to be used in tracing. + string TracingTag() const { return literal_->GetR1U8AsString(); } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + // TODO(b/36360764): Remove unique_ptr wrapping. + std::unique_ptr literal_; +}; + +class HloFusionInstruction : public HloInstruction { + public: + explicit HloFusionInstruction(const Shape& shape, FusionKind fusion_kind, + HloInstruction* fused_root); + + explicit HloFusionInstruction(const Shape& shape, FusionKind fusion_kind, + absl::Span operands, + HloComputation* fusion_computation); + + string ToCategory() const override; + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + // Adds a new operand to the fusion instruction. + HloInstruction* AddFusionOperand(HloInstruction* new_operand); + + // Merges the fused instructions from 'instruction_to_merge' into the + // fused instruction set of 'this', updating operands as necessary. + // + // Precondition: 'instruction_to_merge' must be an operand of 'this'. + void MergeFusionInstruction(HloFusionInstruction* instruction_to_merge); + + // Merges the fused instructions from instruction_to_merge into the fused + // instruction set of 'this' and generates multioutput fusion instructions. + // All the users of instruction_to_merge will be redirected to 'this' + // instruction. instruction_to_merge will be removed from its parent + // computation. + void MergeFusionInstructionIntoMultiOutput( + HloFusionInstruction* instruction_to_merge); + + // Fuses the given instruction into this fusion instruction. + // instruction_to_fuse is cloned and the clone is placed in the fusion + // instruction. instruction_to_fuse is unchanged. Instruction is cloned rather + // than moved to cleanly handle the case where the instruction has a use + // outside the fusion instruction. Moving such an instruction into a fusion + // instruction would violate the single-result invariant of HLO instructions + // and significantly complicate code generation. + HloInstruction* FuseInstruction(HloInstruction* instruction_to_fuse) { + return FuseInstructionInternal(instruction_to_fuse); + } + + // Fuses the given instruction into this fusion instruction and generates a + // multioutput fusion instruction. A clone of the instruction_to_fuse will + // be part of the output of the fusion instruction. The users of + // instruction_to_fuse will be redirected to this fusion instruction. + // instruction_to_fuse will be removed from its parent computation. 
+ HloInstruction* FuseInstructionIntoMultiOutput( + HloInstruction* instruction_to_fuse) { + return FuseInstructionInternal(instruction_to_fuse, /* add_output */ true); + } + + // Returns the computation for this fused instruction. + HloComputation* fused_instructions_computation() const; + + // Returns the root instruction of the fused expression contained within this + // fusion instruction. + HloInstruction* fused_expression_root() const; + + // Returns the list of fused instructions inside this fusion instruction. The + // returned type is a range of HloInstruction*s. + const tensorflow::gtl::iterator_range>::const_iterator>> + fused_instructions() const; + + const tensorflow::gtl::iterator_range< + UnwrappingIterator>::iterator>> + fused_instructions(); + + // Gets the number of instructions inside this fusion instruction. + int64 fused_instruction_count() const; + + // Returns the fused parameter instruction in this fusion instruction + // corresponding to the given parameter number. + HloInstruction* fused_parameter(int64 parameter_number) const; + + // Returns the vector of fused parameters inside this fusion instruction. + const std::vector& fused_parameters() const; + + // Returns true if this fusion generates multiple outputs, which is detected + // by the fused expression root having opcode kTuple. + bool IsMultiOutputFusion() const { + return fused_expression_root()->opcode() == HloOpcode::kTuple; + } + + FusionKind fusion_kind() const { return fusion_kind_; } + + void set_fusion_kind(FusionKind kind) { fusion_kind_ = kind; } + + // If multiple operands are the same instruction, keeps only one of them. + Status DeduplicateFusionOperands(); + + private: + // Fuses the given instruction into this fusion instruction. When add_output + // is false (which is the default), instruction_to_fuse is cloned and the + // clone is placed in the fusion instruction. instruction_to_fuse is + // unchanged. 
+ // + // When add_output is true, a clone of the instruction_to_fuse will be part + // of the output of fusion instructions. The users of instruction_to_fuse + // will be redirected to this fusion instructions. instruction_to_fuse will + // be removed from its parent computation. + HloInstruction* FuseInstructionInternal(HloInstruction* instruction_to_fuse, + bool add_output = false); + // Clones the given instruction_to_fuse and insert the clone into this fusion + // instruction. If add_output is true, a clone of instruction_to_fuse will + // be in the output of the this fusion instruction (part of the tuple of the + // fusion root). + HloInstruction* CloneAndFuseInternal(HloInstruction* instruction_to_fuse, + bool add_output = false); + + bool IsElementwiseImpl( + const absl::optional& operand_idx) const override; + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The type of the fusion. Used by kFusion only. + FusionKind fusion_kind_; +}; + +class HloRngInstruction : public HloInstruction { + public: + explicit HloRngInstruction(const Shape& shape, + RandomDistribution distribution, + absl::Span parameters); + // Returns the random distribution for this rng node. + RandomDistribution random_distribution() const { return distribution_; } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + bool IsElementwiseImpl( + const absl::optional& operand_idx) const override; + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The distribution requested for random number generation. + RandomDistribution distribution_; +}; + +class HloParameterInstruction : public HloInstruction { + public: + explicit HloParameterInstruction(int64 parameter_number, const Shape& shape, + const string& name); + int64 parameter_number() const { return parameter_number_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + string OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 parameter_number_ = 0; +}; + +class HloGetTupleElementInstruction : public HloInstruction { + public: + explicit HloGetTupleElementInstruction(const Shape& shape, + HloInstruction* operand, int64 index); + // Returns the tuple index associated with this instruction. + int64 tuple_index() const { return tuple_index_; } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 tuple_index_ = -1; +}; + +class HloReducePrecisionInstruction : public HloInstruction { + public: + explicit HloReducePrecisionInstruction(const Shape& shape, + HloInstruction* operand, + const int exponent_bits, + const int mantissa_bits); + // Returns the number of exponent bits for a reduce-precision node. + int32 exponent_bits() const { return exponent_bits_; } + // Returns the number of mantissa bits for a reduce-precision node. + int32 mantissa_bits() const { return mantissa_bits_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The bit sizes for a reduce-precision operation. + int32 exponent_bits_ = 0; + int32 mantissa_bits_ = 0; +}; + +class HloInfeedInstruction : public HloInstruction { + public: + explicit HloInfeedInstruction(const Shape& infeed_shape, + HloInstruction* token_operand, + const string& config); + // Returns the infeed configuration string. The infeed configuration includes + // any metadata needed for the backend compiler (e.g., infeed buffer address) + // and is target-dependent. 
+ string infeed_config() const { return infeed_config_; } + void set_infeed_config(const string& config) { infeed_config_ = config; } + // Returns the shape of the data received by the infeed. This is not the same + // as the shape of the infeed instruction which produces a tuple containing + // the infeed data shape and a TOKEN. + const Shape& infeed_shape() const { + TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape())); + return ShapeUtil::GetSubshape(shape(), {0}); + } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The string representation of the infeed configuration. + string infeed_config_; +}; + +class HloOutfeedInstruction : public HloInstruction { + public: + explicit HloOutfeedInstruction(const Shape& outfeed_shape, + HloInstruction* operand, + HloInstruction* token_operand, + absl::string_view outfeed_config); + // Returns the shape for the Outfeed instruction. + const Shape& outfeed_shape() const { + TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape_)); + return outfeed_shape_; + } + // Returns the config for the Outfeed instruction. + const string& outfeed_config() const { return outfeed_config_; } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Shape of outfeed request. + Shape outfeed_shape_; + // Outfeed configuration information, only present for kOutfeed. + string outfeed_config_; +}; + +class HloConvolutionInstruction : public HloInstruction { + public: + explicit HloConvolutionInstruction( + const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, + const Window& window, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count); + const Window& window() const override { return window_; } + void set_window(const Window& window) override { window_ = window; } + const ConvolutionDimensionNumbers& convolution_dimension_numbers() const { + return convolution_dimension_numbers_; + } + void set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums) { + convolution_dimension_numbers_ = dnums; + } + // The number of feature groups. Must be a divisor of the input feature + // dimension and output feature dimension. + int64 feature_group_count() const { return feature_group_count_; } + string ToCategory() const override; + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + Window window_; + // Describes the dimension numbers used for a convolution. + ConvolutionDimensionNumbers convolution_dimension_numbers_; + // The number of feature groups. Must be a divisor of the input feature + // dimension and output feature dimension. + int64 feature_group_count_; +}; + +class HloReduceWindowInstruction : public HloInstruction { + public: + explicit HloReduceWindowInstruction(const Shape& shape, + HloInstruction* operand, + HloInstruction* init_value, + const Window& window, + HloComputation* reduce_computation); + const Window& window() const override { return window_; } + void set_window(const Window& window) override { window_ = window; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + Window window_; +}; + +class HloSelectAndScatterInstruction : public HloInstruction { + public: + explicit HloSelectAndScatterInstruction( + const Shape& shape, HloInstruction* operand, HloComputation* select, + const Window& window, HloInstruction* source, HloInstruction* init_value, + HloComputation* scatter); + const Window& window() const override { return window_; } + void set_window(const Window& window) override { window_ = window; } + // Gets/sets the select or scatter HloComputation for SelectAndScatter. The + // setters should only be called by HloModule or HloComputation methods. 
+ HloComputation* select() const { + return called_computations()[kSelectComputationIndex]; + } + + HloComputation* scatter() const { + return called_computations()[kScatterComputationIndex]; + } + + void set_select(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); + set_called_computation(kSelectComputationIndex, computation); + } + + void set_scatter(HloComputation* computation) { + // Don't allow changing the computation for fused instructions so we don't + // have to recompute called_instructions for the entire fusion instruction. + CHECK(!IsFused()); + set_called_computation(kScatterComputationIndex, computation); + } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + Window window_; +}; + +class HloCustomCallInstruction : public HloInstruction { + public: + explicit HloCustomCallInstruction(const Shape& shape, + absl::Span operands, + absl::string_view custom_call_target); + const Window& window() const override { + CHECK(window_ != nullptr); + return *window_; + } + + void set_window(const Window& window) override { + window_ = absl::make_unique(window); + } + + const ConvolutionDimensionNumbers& convolution_dimension_numbers() const { + CHECK(convolution_dimension_numbers_ != nullptr); + return *convolution_dimension_numbers_; + } + + void set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums) { + convolution_dimension_numbers_ = + absl::make_unique(dnums); + } + const string& custom_call_target() const { return custom_call_target_; } + void set_feature_group_count(int64 feature_group_count) { + feature_group_count_ = feature_group_count; + } + int64 feature_group_count() const { return feature_group_count_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + // Name of a global symbol to call, only present for kCustomCall. + string custom_call_target_; + // Describes the window in a windowed operation such as convolution. + std::unique_ptr window_; + // Describes the dimension numbers used for a convolution. + std::unique_ptr convolution_dimension_numbers_; + // The number of feature groups. 
This is used for grouped convolutions. + int64 feature_group_count_; +}; + +class HloPadInstruction : public HloInstruction { + public: + explicit HloPadInstruction(const Shape& shape, HloInstruction* operand, + HloInstruction* padding_value, + const PaddingConfig& padding_config); + // Returns the padding configuration for a pad node. + const PaddingConfig& padding_config() const { return padding_config_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // The padding configuration that describes the edge padding and interior + // padding of this pad instruction. + PaddingConfig padding_config_; +}; + +class HloDynamicSliceInstruction : public HloInstruction { + public: + explicit HloDynamicSliceInstruction(const Shape& shape, + HloInstruction* operand, + HloInstruction* start_indices, + absl::Span slice_sizes); + // Old methods kept for smooth subclassing transition END. + // Returns the size of the slice in the given dimension for a dynamic + // slice node. + int64 slice_sizes(int64 dimension) const { + return dynamic_slice_sizes_[dimension]; + } + const std::vector& dynamic_slice_sizes() const { + return dynamic_slice_sizes_; + } + // Returns a serialized representation of this instruction. 
+ HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Describes the [start, start + size) range size for a dynamic slice + // ('start' is specified dynamically in the second operand of the operation). + std::vector dynamic_slice_sizes_; +}; + +class HloGatherInstruction : public HloInstruction { + public: + explicit HloGatherInstruction( + const Shape& shape, HloInstruction* operand, + HloInstruction* start_indices, + const GatherDimensionNumbers& gather_dim_numbers, + absl::Span slice_sizes); + const GatherDimensionNumbers& gather_dimension_numbers() const { + CHECK(gather_dimension_numbers_ != nullptr); + return *gather_dimension_numbers_; + } + absl::Span gather_slice_sizes() const { + return gather_slice_sizes_; + } + // Returns the dump string of the gather dimension numbers. + string GatherDimensionNumbersToString() const; + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + // Creates an instance of GatherDimensionNumbers. 
+ static GatherDimensionNumbers MakeGatherDimNumbers( + absl::Span offset_dims, + absl::Span collapsed_slice_dims, + absl::Span start_index_map, int64 index_vector_dim); + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::unique_ptr gather_dimension_numbers_; + std::vector gather_slice_sizes_; +}; + +class HloScatterInstruction : public HloInstruction { + public: + explicit HloScatterInstruction( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers); + const ScatterDimensionNumbers& scatter_dimension_numbers() const { + CHECK(scatter_dimension_numbers_ != nullptr); + return *scatter_dimension_numbers_; + } + // Returns the dump string of the scatter dimension numbers. + string ScatterDimensionNumbersToString() const; + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + // Creates an instance of ScatterDimensionNumbers. + static ScatterDimensionNumbers MakeScatterDimNumbers( + absl::Span update_window_dims, + absl::Span inserted_window_dims, + absl::Span scatter_dims_to_operand_dims, + int64 index_vector_dim); + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. 
+ std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + std::unique_ptr scatter_dimension_numbers_; +}; + +class HloIotaInstruction : public HloInstruction { + public: + explicit HloIotaInstruction(const Shape& shape, int64 iota_dimension); + // Returns the dimension sizes or numbers associated with this instruction. + int64 iota_dimension() const { return iota_dimension_; } + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + const int64 iota_dimension_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_ diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc similarity index 81% rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.cc rename to tensorflow/compiler/xla/service/hlo_lexer.cc index 350db126535e418cbfa914edd958f47ba90a3ee5..d9be841dd751651ba029998fd062fcaec3691945 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -13,25 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h" +#include "tensorflow/compiler/xla/service/hlo_lexer.h" #include +#include "absl/strings/escaping.h" +#include "absl/strings/numbers.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/regexp.h" namespace xla { -namespace tools { - -using tensorflow::StringPiece; - namespace { +using absl::string_view; + constexpr int kEOF = -1; constexpr int kError = -2; @@ -67,12 +66,12 @@ bool HloLexer::CanDereference(const char* ptr) const { return ptr < buf_.end() && ptr >= buf_.begin(); } -StringPiece HloLexer::StringPieceFromPointers(const char* begin, - const char* end) const { +absl::string_view HloLexer::StringPieceFromPointers(const char* begin, + const char* end) const { CHECK(begin <= end); CHECK(begin == buf_.end() || CanDereference(begin)); CHECK(end == buf_.end() || CanDereference(end)); - return StringPiece(begin, end - begin); + return absl::string_view(begin, end - begin); } tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers( @@ -144,8 +143,47 @@ TokKind HloLexer::LexToken() { return TokKind::kLparen; case ')': return TokKind::kRparen; - case '/': - return LexComment(); + case '/': { + if (PeekCurrentChar() == '*') { + // This is the start of a /*...*/ delimited comment. Save the current + // location in case the comment is unterminated so the error message + // will point to the beginning of the comment. + const char* comment_start = current_ptr_; + current_ptr_++; + // Advance until '*/' is found. 
+ while (true) { + int current = GetNextChar(); + if (current == '*' && PeekCurrentChar() == '/') { + // End of comment. + current_ptr_++; + break; + } + if (current == kEOF) { + // Unterminated comment. + current_ptr_ = comment_start; + return TokKind::kError; + } + } + // Return no token for the comment. Keep lexing. + continue; + } else if (PeekCurrentChar() == '/') { + // This is the start of a '//' delimited comment. Throw away + // everything until end of line or file. The end-of-line character(s) + // are left unlexed in the buffer which is harmless because these are + // skipped later by the lexer. This approach enables support for + // different end-of-line encodings. + while (true) { + int current = PeekCurrentChar(); + if (current == kEOF || current == '\n' || current == '\r') { + break; + } + current_ptr_++; + } + continue; + } + // A lone '/' is an error. + return TokKind::kError; + } case '"': return LexString(); } @@ -197,7 +235,8 @@ TokKind HloLexer::LexIdentifier() { return TokKind::kAttributeName; } - StringPiece identifier = StringPieceFromPointers(token_start_, current_ptr_); + absl::string_view identifier = + StringPieceFromPointers(token_start_, current_ptr_); // See if this is a keyword. 
#define KEYWORD(STR) \ @@ -230,7 +269,7 @@ TokKind HloLexer::LexIdentifier() { } } - str_val_ = std::string(identifier); + str_val_ = string(identifier); return TokKind::kIdent; } @@ -267,8 +306,7 @@ TokKind HloLexer::LexNumberOrPattern() { R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"}; if (RE2::Consume(&consumable, *float_pattern)) { current_ptr_ = consumable.begin(); - tensorflow::strings::safe_strtod(string(token_start_, current_ptr_).c_str(), - &decimal_val_); + CHECK(absl::SimpleAtod(string(token_start_, current_ptr_), &decimal_val_)); return TokKind::kDecimal; } @@ -299,9 +337,12 @@ TokKind HloLexer::LexNumberOrPattern() { static LazyRE2 int_pattern = {R"([-]?\d+)"}; if (RE2::Consume(&consumable, *int_pattern)) { current_ptr_ = consumable.begin(); - tensorflow::strings::safe_strto64( - StringPieceFromPointers(token_start_, current_ptr_), &int64_val_); - return TokKind::kInt; + auto slice = StringPieceFromPointers(token_start_, current_ptr_); + if (absl::SimpleAtoi(slice, &int64_val_)) { + return TokKind::kInt; + } + LOG(ERROR) << "Failed to parse int literal: " << slice; + return TokKind::kError; } static LazyRE2 neg_inf = {"-inf"}; @@ -323,6 +364,7 @@ std::pair HloLexer::GetLineAndColumn(LocTy location) const { line_no = line_no_cache_.line_no_of_query; } for (; ptr != location; ptr++) { + CHECK_LT(ptr, buf_.end()); if (*ptr == '\n') { line_no++; } @@ -332,37 +374,28 @@ std::pair HloLexer::GetLineAndColumn(LocTy location) const { line_no_cache_.last_query = ptr; line_no_cache_.line_no_of_query = line_no; size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n'); - if (line_offset == StringPiece::npos) { + if (line_offset == absl::string_view::npos) { line_offset = 0; } return {line_no, ptr - start - line_offset}; } -StringPiece HloLexer::GetLine(LocTy loc) const { +absl::string_view HloLexer::GetLine(LocTy loc) const { if (!CanDereference(loc)) { return "LINE OUT OF RANGE"; } size_t line_start = 
StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n'); - const char* start = line_start == StringPiece::npos + const char* start = line_start == absl::string_view::npos ? buf_.begin() : buf_.begin() + line_start + 1; size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n'); - const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end; + const char* end = + line_end == absl::string_view::npos ? buf_.end() : loc + line_end; return StringPieceFromPointers(start, end); } -TokKind HloLexer::LexComment() { - auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end()); - static LazyRE2 comment_pattern = {R"(\/\*.*?\*\/)"}; - if (RE2::Consume(&consumable, *comment_pattern)) { - current_ptr_ = consumable.begin(); - return TokKind::kComment; - } - return TokKind::kError; -} - // Lexes quoted string with escaping characters. If matched, the quoted string // will be unescaped and stored to str_val_. TokKind HloLexer::LexString() { @@ -370,10 +403,10 @@ TokKind HloLexer::LexString() { static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"}; if (RE2::Consume(&consumable, *escaping_pattern)) { current_ptr_ = consumable.begin(); - StringPiece raw = + absl::string_view raw = StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1); string error; - if (!tensorflow::str_util::CUnescape(raw, &str_val_, &error)) { + if (!absl::CUnescape(raw, &str_val_, &error)) { LOG(ERROR) << "Failed unescaping string: " << raw << ". 
error: " << error; return TokKind::kError; } @@ -408,8 +441,6 @@ string TokKindToString(TokKind kind) { return "kRparen"; case TokKind::kArrow: return "kArrow"; - case TokKind::kComment: - return "kComment"; case TokKind::kw_HloModule: return "kw_HloModule"; case TokKind::kw_ENTRY: @@ -453,5 +484,4 @@ string TokKindToString(TokKind kind) { } } -} // namespace tools } // namespace xla diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h similarity index 81% rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.h rename to tensorflow/compiler/xla/service/hlo_lexer.h index 27880b9b8afbfa58abfedc3b2cecd5236b78a6d6..3e2f8bcd52f9043f161197756a2060b28dded1d9 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -13,26 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ #include -#include "tensorflow/compiler/xla/tools/parser/hlo_token.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_token.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/types.h" namespace xla { -namespace tools { // Lexer for the HloModule::ToString() format text. +// +// This class is meant to be used by hlo_parser.cc. You shouldn't need to use +// it directly. 
class HloLexer { public: - explicit HloLexer(tensorflow::StringPiece buf) : buf_(buf) { + explicit HloLexer(absl::string_view buf) : buf_(buf) { current_ptr_ = buf_.begin(); } @@ -57,7 +59,7 @@ class HloLexer { CHECK(GetKind() == TokKind::kShape); return shape_val_; } - int64 GetInt64Val() const { + tensorflow::int64 GetInt64Val() const { CHECK(GetKind() == TokKind::kInt); return int64_val_; } @@ -75,7 +77,7 @@ class HloLexer { std::pair GetLineAndColumn(LocTy location) const; // Returns the whole line given the location. - tensorflow::StringPiece GetLine(LocTy loc) const; + absl::string_view GetLine(LocTy loc) const; private: // Returns the current character. If it's neither the end of input buffer nor @@ -87,8 +89,8 @@ class HloLexer { // Creates StringPiece with the given begin and end. Exits if the begin > end, // or it's out of the range of the current buffer. - tensorflow::StringPiece StringPieceFromPointers(const char* begin, - const char* end) const; + absl::string_view StringPieceFromPointers(const char* begin, + const char* end) const; tensorflow::RegexpStringPiece RegexpStringPieceFromPointers( const char* begin, const char* end) const; @@ -103,18 +105,17 @@ class HloLexer { TokKind LexShape(); TokKind LexConstant(); TokKind LexNumberOrPattern(); - TokKind LexComment(); TokKind LexString(); - const tensorflow::StringPiece buf_; + const absl::string_view buf_; const char* current_ptr_; // Information about the current token. 
- const char* token_start_; + const char* token_start_ = nullptr; TokKind current_kind_; string str_val_; Shape shape_val_; - int64 int64_val_; + tensorflow::int64 int64_val_; double decimal_val_; struct LineNoCacheTy { @@ -125,7 +126,6 @@ class HloLexer { mutable LineNoCacheTy line_no_cache_{nullptr, 0}; }; -} // namespace tools } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc index 43c41ece6efc4f9e8ca74f16e0f63d29abc4de4e..3a1dd471c626ae9497cfcca62c30736bcdbb2b38 100644 --- a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc @@ -17,8 +17,9 @@ limitations under the License. #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -29,17 +30,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" namespace xla { +namespace { using Worklist = std::deque; using Workset = std::unordered_set; -namespace { - void AddToWorklist(const HloInstruction* instruction, Worklist* worklist, Workset* workset) { if (workset->count(instruction) == 0) { @@ -296,7 +294,7 @@ StatusOr> HloLivenessAnalysis::Run( VLOG(1) << "HloLivenessAnalysis::Run on module " << module.name(); XLA_VLOG_LINES(2, module.ToString()); - auto liveness_analysis = WrapUnique(new HloLivenessAnalysis(module)); + auto liveness_analysis = absl::WrapUnique(new HloLivenessAnalysis(module)); liveness_analysis->RunAnalysis(); diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc index 8e2e2c7627ba6ac9e5078446056917a07436cbd7..01b625c29ca2823b2a2490b30a9d4d5128b4c22e 100644 --- a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc @@ -15,15 +15,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -59,7 +59,7 @@ class HloLivenessAnalysisTest : public HloTestBase { // Test that add instruction at entry root is live at all output shape indices. TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -75,7 +75,7 @@ TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) { // Test that a dead add instruction is marked as dead by analysis. TEST_F(HloLivenessAnalysisTest, DeadAdd) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -94,7 +94,7 @@ TEST_F(HloLivenessAnalysisTest, DeadAdd) { // Test that all output shape indices of entry root tuple (and defining // instruction in its output) are marked live. TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -113,7 +113,7 @@ TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) { // Tests that all outputs of nested tuple and entry root (and defining // instruction values appearing in its output) are marked live. 
TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(1) @@ -140,7 +140,7 @@ TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) { // Tests that GTE at entry root of Tuple instruction only propgates liveness // to the live elements in tuple. TEST_F(HloLivenessAnalysisTest, GteOfTuple) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -162,7 +162,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfTuple) { // Tests that GTE at entry root of nested Tuple instruction only propgates // liveness to the live elements in tuple. TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -199,7 +199,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) { // Tests that GTE of GTE (at entry root) of nested Tuple instruction only // propgates liveness to the live elements in tuple. TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleModule ENTRY SimpleComputation { constant.1 = s32[] constant(0) @@ -240,7 +240,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) { // Test that live/dead while tuple elements are marked live/dead correctly. TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[3]{0}) parameter(0) @@ -291,7 +291,7 @@ TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) { // Tests that a tuple element live in while.cond computation, propagates // liveness to while.body.root/while.result/while.operand (where it is unused). 
TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[3]{0}) parameter(0) @@ -345,7 +345,7 @@ TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) { // Tests that a use of while.result{0} propagates liveness to // while.body.param{1} to while.body.root{1}, and then to while.body.param{2}. TEST_F(HloLivenessAnalysisTest, WhileWithLiveTupleElements) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[], s32[]) parameter(0) diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index 7e4b8834357d39099f76450b849d6b5624e4e3b4..5269cad94d35be3dd1c009588bbe422ff1533364 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -15,15 +15,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace testing { -using ::tensorflow::str_util::Join; - bool HloMatcher::MatchAndExplain( const HloInstruction* instruction, ::testing::MatchResultListener* listener) const { @@ -210,8 +208,8 @@ bool HloDotWithContractingDimsMatcher::MatchAndExplain( dim_nums.lhs_contracting_dimensions(0) != lhs_contracting_dim_) { *listener << instruction->ToString() << " has wrong lhs_contracting_dimensions (got {" - << Join(dim_nums.lhs_contracting_dimensions(), ",") << "} want {" - << lhs_contracting_dim_ << "})"; + << absl::StrJoin(dim_nums.lhs_contracting_dimensions(), ",") + << "} want {" << lhs_contracting_dim_ << "})"; return false; } @@ -219,8 +217,8 @@ bool HloDotWithContractingDimsMatcher::MatchAndExplain( dim_nums.rhs_contracting_dimensions(0) != rhs_contracting_dim_) { *listener << instruction->ToString() << " has wrong rhs_contracting_dimensions (got {" - << Join(dim_nums.rhs_contracting_dimensions(), ",") << "} want {" - << rhs_contracting_dim_ << "})"; + << absl::StrJoin(dim_nums.rhs_contracting_dimensions(), ",") + << "} want {" << rhs_contracting_dim_ << "})"; return false; } diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index dfefad363445eb585b8f6692fb4582dd436d4f9d..5502e565b6dfbaca6cfa2101950fb0a68c89771f 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -16,10 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" -#include "tensorflow/core/lib/gtl/optional.h" namespace xla { namespace testing { @@ -120,8 +120,7 @@ class HloShapeAndLayoutMatcher class HloShardingMatcher : public ::testing::MatcherInterface { public: - explicit HloShardingMatcher( - const tensorflow::gtl::optional& sharding) + explicit HloShardingMatcher(const absl::optional& sharding) : sharding_(sharding) {} bool MatchAndExplain(const HloInstruction* instruction, @@ -129,7 +128,7 @@ class HloShardingMatcher void DescribeTo(std::ostream* os) const override; private: - tensorflow::gtl::optional sharding_; + absl::optional sharding_; }; // Matches a Dot HLO instruction with specific LHS and RHS contracting @@ -187,7 +186,9 @@ HLO_MATCHER(Exp); HLO_MATCHER(Floor); HLO_MATCHER(Fusion); HLO_MATCHER(Ge); +HLO_MATCHER(AfterAll); HLO_MATCHER(Gt); +HLO_MATCHER(Iota); HLO_MATCHER(Infeed); HLO_MATCHER(IsFinite); HLO_MATCHER(Le); @@ -195,6 +196,7 @@ HLO_MATCHER(Log); HLO_MATCHER(And); HLO_MATCHER(Not); HLO_MATCHER(Or); +HLO_MATCHER(Xor); HLO_MATCHER(Lt); HLO_MATCHER(Map); HLO_MATCHER(Maximum); @@ -229,6 +231,7 @@ HLO_MATCHER(Tanh); HLO_MATCHER(Trace); HLO_MATCHER(Transpose); HLO_MATCHER(Tuple); +HLO_MATCHER(TupleSelect); HLO_MATCHER(While); // The special cases below let you check additional information about the @@ -304,7 +307,7 @@ inline ::testing::Matcher Shape( return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape)); } inline ::testing::Matcher Shape( - tensorflow::StringPiece shape) { + absl::string_view shape) { return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher( ShapeUtil::ParseShapeString(shape).ValueOrDie())); } @@ 
-314,7 +317,7 @@ inline ::testing::Matcher ShapeWithLayout( new ::xla::testing::HloShapeAndLayoutMatcher(shape)); } inline ::testing::Matcher ShapeWithLayout( - tensorflow::StringPiece shape) { + absl::string_view shape) { return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher( ShapeUtil::ParseShapeString(shape).ValueOrDie())); } @@ -327,14 +330,14 @@ inline ::testing::Matcher Sharding( } // Matcher for Sharding from sharding string inline ::testing::Matcher Sharding( - tensorflow::StringPiece sharding) { + absl::string_view sharding) { return ::testing::MakeMatcher(new ::xla::testing::HloShardingMatcher( - xla::tools::ParseSharding(sharding).ValueOrDie())); + ParseSharding(sharding).ValueOrDie())); } // Verifies that no HloSharding is set for an HLO instruction. inline ::testing::Matcher NoSharding() { return ::testing::MakeMatcher( - new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt)); + new ::xla::testing::HloShardingMatcher(absl::nullopt)); } inline ::testing::Matcher Dot( diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 1d10e3c4fe751cd985a4f3b274f84101896502e1..7961aece541faeb66875885b380158756c503250 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" namespace op = xla::testing::opcode_matchers; @@ -74,8 +76,10 @@ TEST(HloMatchersTest, Test) { } TEST(HloMatchersTest, CustomCallMatcher) { - auto c1 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); - auto c2 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + auto c1 = + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3})); + auto c2 = + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3})); auto call = HloInstruction::CreateCustomCall( ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target"); @@ -153,9 +157,8 @@ TEST(HloMatchersTest, ShardingMatcher) { Array assignment({2}); assignment.SetValues({0, 1}); auto sharding = HloSharding::Tuple( - tuple_shape, - {HloSharding::Tile(ShapeUtil::MakeShape(F32, {5}), assignment), - HloSharding::AssignDevice(1), HloSharding::Replicate()}); + tuple_shape, {HloSharding::Tile(assignment), HloSharding::AssignDevice(1), + HloSharding::Replicate()}); p2->set_sharding(sharding); EXPECT_THAT(p0.get(), op::NoSharding()); @@ -168,8 +171,7 @@ TEST(HloMatchersTest, ShardingMatcher) { EXPECT_THAT( p2.get(), - op::Sharding( - "{{f32[5] devices=[2]0,1}, {maximal device=1}, {replicated}}")); + op::Sharding("{{devices=[2]0,1}, {maximal device=1}, {replicated}}")); EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))), "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: " @@ -194,7 +196,7 @@ ENTRY DotOperationFusion_TransposeFusion { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Dot(op::Parameter(0), 
op::Parameter(1), diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index e63424c2dfb6c7b9e71e4cede896a8f6609fea62..3a1bc4e328b89d75efde7e7afeb0e52ceed4d8f9 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -22,25 +22,17 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/types.h" namespace xla { -HloModule::HloModule(const string& name, - const VersionedComputationHandle& entry_computation_handle, - const HloModuleConfig& config) - : name_(NameUniquer::GetSanitizedName(name)), - config_(config), - has_entry_computation_handle_(true), - entry_computation_handle_(entry_computation_handle), - unique_id_(next_unique_module_id_++) {} - HloModule::HloModule(const string& name, const HloModuleConfig& config) : name_(NameUniquer::GetSanitizedName(name)), config_(config), @@ -67,7 +59,7 @@ HloComputation* HloModule::AddComputationInternal( // If the module configuration has no entry layout computation set, create a // default one based on the program shape. 
- if (!config_.has_host_entry_computation_layout()) { + if (!config_.has_entry_computation_layout()) { config_.SetDefaultComputationLayout( entry_computation_->ComputeProgramShape()); } @@ -234,21 +226,17 @@ HloModuleProto HloModule::ToProto() const { /* static */ StatusOr> HloModule::CreateFromProto( - const HloModuleProto& proto, const HloModuleConfig& module_config, - const VersionedComputationHandle& entry_computation_handle) { + const HloModuleProto& proto, const HloModuleConfig& module_config) { // The ProgramShape in the passed in module config must match the shapes of // the entry parameters and root. TF_RET_CHECK(proto.has_program_shape()) << "No program shape found in the proto"; const auto& expected_program_shape = proto.program_shape(); - TF_RET_CHECK( - expected_program_shape.parameters_size() == - module_config.device_entry_computation_layout().parameter_count()); + TF_RET_CHECK(expected_program_shape.parameters_size() == + module_config.entry_computation_layout().parameter_count()); for (int i = 0; i < expected_program_shape.parameters_size(); ++i) { const Shape& parameter_shape = - module_config.device_entry_computation_layout() - .parameter_layout(i) - .shape(); + module_config.entry_computation_layout().parameter_layout(i).shape(); TF_RET_CHECK(ShapeUtil::Compatible(expected_program_shape.parameters(i), parameter_shape)) << "HloModuleConfig has different shape for parameter " << i @@ -258,7 +246,7 @@ StatusOr> HloModule::CreateFromProto( << ", actual: " << ShapeUtil::HumanStringWithLayout(parameter_shape); } const Shape& result_shape = - module_config.device_entry_computation_layout().result_layout().shape(); + module_config.entry_computation_layout().result_layout().shape(); TF_RET_CHECK( ShapeUtil::Compatible(expected_program_shape.result(), result_shape)) << "HloModuleConfig has different result shape than the HLO module. 
" @@ -287,8 +275,7 @@ StatusOr> HloModule::CreateFromProto( } TF_RET_CHECK(entry != nullptr); - auto module = MakeUnique(proto.name(), entry_computation_handle, - module_config); + auto module = absl::make_unique(proto.name(), module_config); // Sort the computations in the proto id's order. std::sort(computations.begin(), computations.end(), @@ -338,7 +325,7 @@ StatusOr HloModule::CreateModuleConfigFromProto( // The module config is constructed with default layouts regardless of what is // passed in via the ProgramShape. Set the layouts to the appropriate values. ComputationLayout* entry_layout = - module_config.mutable_host_entry_computation_layout(); + module_config.mutable_entry_computation_layout(); for (int64 i = 0; i < entry_layout->parameter_count(); ++i) { TF_RETURN_IF_ERROR( entry_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( @@ -346,9 +333,6 @@ StatusOr HloModule::CreateModuleConfigFromProto( } TF_RETURN_IF_ERROR(entry_layout->mutable_result_layout()->CopyLayoutFromShape( program_shape.result())); - *module_config.mutable_device_entry_computation_layout() = - module_config.host_entry_computation_layout(); - return module_config; } @@ -369,7 +353,7 @@ bool IsUsedOutsideSubcomputation( } // anonymous namespace HloInstruction* HloModule::OutlineExpressionFromComputation( - tensorflow::gtl::ArraySlice instructions_to_outline, + absl::Span instructions_to_outline, const string& outlined_computation_name, HloComputation* computation) { auto builder = HloComputation::Builder(outlined_computation_name); @@ -401,7 +385,7 @@ HloInstruction* HloModule::OutlineExpressionFromComputation( // as a parameter in the new function. 
arguments.push_back(old_operand); *operand_slot = builder.AddInstruction(HloInstruction::CreateParameter( - parameter_count, old_operand->shape(), "")); + parameter_count, old_operand->shape(), "p")); ++parameter_count; } TF_CHECK_OK( @@ -426,7 +410,7 @@ HloInstruction* HloModule::OutlineExpressionFromComputation( string error_message = "The subcomputation to outline has multiple outputs:\n"; for (HloInstruction* output : outputs) { - tensorflow::strings::StrAppend(&error_message, output->ToString(), "\n"); + absl::StrAppend(&error_message, output->ToString(), "\n"); } LOG(FATAL) << error_message; } @@ -462,7 +446,7 @@ int64 HloModule::instruction_count() const { return n; } -std::list HloModule::MakeComputationPostOrder() const { +std::vector HloModule::MakeComputationPostOrder() const { // First determine all root computations by building a set of nonroot // computations (computations which are called by an instruction in the // module). @@ -480,7 +464,7 @@ std::list HloModule::MakeComputationPostOrder() const { // order. This prevents duplication as an embedded computation may be called // from two different root computations. 
std::set added_computations; - std::list post_order; + std::vector post_order; for (auto& computation : computations_) { if (nonroot_computations.count(computation.get()) == 0) { for (HloComputation* embedded_computation : @@ -524,9 +508,7 @@ std::vector HloModule::MakeNonfusionComputations() const { std::unique_ptr HloModule::Clone(const string& suffix) const { VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n"; - auto module = MakeUnique(name_ + "-" + suffix, config_); - module->entry_computation_handle_ = entry_computation_handle_; - module->has_entry_computation_handle_ = has_entry_computation_handle_; + auto module = absl::make_unique(name_ + "-" + suffix, config_); HloCloneContext context(module.get(), suffix); auto cloned_computation = entry_computation_->Clone(suffix, &context); @@ -554,12 +536,12 @@ uint64 HloModule::RandomNew64() const { return rng_(); } -HloComputation* HloModule::GetComputationWithName( - tensorflow::StringPiece name) { - auto it = c_find_if(computations(), [&](HloComputation* computation) { - return computation->name() == name; - }); - return it == computations().end() ? nullptr : *it; +HloComputation* HloModule::GetComputationWithName(absl::string_view name) { + auto computations_in_module = computations(); + auto it = absl::c_find_if( + computations_in_module, + [&](HloComputation* computation) { return computation->name() == name; }); + return it == computations_in_module.end() ? nullptr : *it; } /* static */ std::atomic HloModule::next_unique_module_id_(0); diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index c93c74d34a95cfbb3d0d334fb1c1f40a5aad69e9..3c3371426b7a6a054053fe6761f87c3b5a097699 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -24,6 +24,8 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/iterator_util.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" @@ -31,10 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/iterator_range.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" @@ -57,10 +56,6 @@ namespace xla { // attached to. class HloModule { public: - HloModule(const string& name, - const VersionedComputationHandle& entry_computation_handle, - const HloModuleConfig& config); - // Constructor without a versioned computation handle. This constructor should // only be used for HloModules used outside of the XLA service (eg // tests). The versioned handle is used by the service in the compilation @@ -110,24 +105,19 @@ class HloModule { return entry_computation_; } - ComputationLayout* mutable_host_entry_computation_layout() { - return config_.mutable_host_entry_computation_layout(); - } - - const ComputationLayout& host_entry_computation_layout() const { - return config_.host_entry_computation_layout(); + // Creates the ComputationLayout which describes the current status of the HLO + // module entry computation. 
+ ComputationLayout compute_computation_layout() const { + return ComputationLayout(entry_computation()->ComputeProgramShape(), + /*ignore_layouts=*/false); } - ComputationLayout* mutable_device_entry_computation_layout() { - return config_.mutable_device_entry_computation_layout(); + ComputationLayout* mutable_entry_computation_layout() { + return config_.mutable_entry_computation_layout(); } - const ComputationLayout& device_entry_computation_layout() const { - return config_.device_entry_computation_layout(); - } - - const VersionedComputationHandle& entry_computation_handle() const { - return entry_computation_handle_; + const ComputationLayout& entry_computation_layout() const { + return config_.entry_computation_layout(); } // Gets the computations in this module. @@ -152,7 +142,7 @@ class HloModule { // Returns the computation in this module that has the name `name`. Returns // null if there is no such computation. - HloComputation* GetComputationWithName(tensorflow::StringPiece name); + HloComputation* GetComputationWithName(absl::string_view name); // Gets the number of computations in this module. int64 computation_count() const { return computations_.size(); } @@ -163,7 +153,7 @@ class HloModule { // Compute and return a post order of all computations in the module. The sort // is defined like so: if computation A has an instruction which calls // computation B, then A will appear after B in the sort. - std::list MakeComputationPostOrder() const; + std::vector MakeComputationPostOrder() const; // Gets the computations in this module which aren't for fusion nodes. // @@ -188,9 +178,7 @@ class HloModule { // Convert an HloModule to or from a proto. 
HloModuleProto ToProto() const; static StatusOr> CreateFromProto( - const HloModuleProto& proto, const HloModuleConfig& module_config, - const VersionedComputationHandle& entry_computation_handle = - VersionedComputationHandle()); + const HloModuleProto& proto, const HloModuleConfig& module_config); // Creates and returns an HloModuleConfig with an appropriate program shape // for the HLO module in the given proto. @@ -204,7 +192,7 @@ class HloModule { // order (root of outlined instructions last). TODO(jingyue): takes a set of // instructions and topologically sorts them. HloInstruction* OutlineExpressionFromComputation( - tensorflow::gtl::ArraySlice instructions_to_outline, + absl::Span instructions_to_outline, const string& outlined_computation_name, HloComputation* computation); // Returns a randomly generated uint64. @@ -264,10 +252,6 @@ class HloModule { mutable std::mt19937_64 rng_{42}; mutable tensorflow::mutex rng_mutex_; - // Versioned handle of the entry computation of the module. - bool has_entry_computation_handle_ = false; - VersionedComputationHandle entry_computation_handle_; - // Unique name generator for computation and instruction names, which are // unique per module. NameUniquer computation_name_uniquer_{/*separator=*/"."}; diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc index dae5578a3158fecb8219e518841dec1020b2ca98..9bfa3a5f45c8e810f9ea7d6bdcd72b90254d15b9 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.cc +++ b/tensorflow/compiler/xla/service/hlo_module_config.cc @@ -18,46 +18,36 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { -using tensorflow::strings::StrAppend; +using absl::StrAppend; -HloModuleConfig::HloModuleConfig() {} - -HloModuleConfig::HloModuleConfig(const ProgramShape& program_shape) - : host_entry_computation_layout_(program_shape), - device_entry_computation_layout_(program_shape) {} +HloModuleConfig::HloModuleConfig(const ProgramShape& program_shape, + bool ignore_layouts) + : entry_computation_layout_( + ComputationLayout(program_shape, ignore_layouts)) {} void HloModuleConfig::SetDefaultComputationLayout( const ProgramShape& program_shape) { - host_entry_computation_layout_ = ComputationLayout(program_shape); - device_entry_computation_layout_ = ComputationLayout(program_shape); + entry_computation_layout_ = ComputationLayout(program_shape); } string HloModuleConfig::compilation_cache_key() const { - string key = - tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled()); + string key = absl::StrCat("profiling=", hlo_profiling_enabled()); StrAppend(&key, "::("); std::vector params; for (const ShapeLayout& param_layout : - host_entry_computation_layout_->parameter_layouts()) { - params.push_back(param_layout.shape().DebugString()); - } - StrAppend(&key, tensorflow::str_util::Join(params, ", "), ") => ", - host_entry_computation_layout_->result_shape().SerializeAsString()); - for (const ShapeLayout& param_layout : - device_entry_computation_layout_->parameter_layouts()) { + entry_computation_layout_->parameter_layouts()) { params.push_back(param_layout.shape().DebugString()); } - StrAppend( - &key, tensorflow::str_util::Join(params, ", "), ") => ", - 
device_entry_computation_layout_->result_shape().SerializeAsString()); + StrAppend(&key, absl::StrJoin(params, ", "), ") => ", + entry_computation_layout_->result_shape().SerializeAsString()); if (seed() != 0) { // TODO(b/32083678): force recompilation to reset global state. static std::atomic counter{0}; diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index cdb0b29a2399b387bc617262032e9083ba079625..3f1e1cc73eeb9debe5eb6278ab192fdf9b8cc10f 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -18,11 +18,11 @@ limitations under the License. #include +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/optional.h" namespace xla { @@ -37,48 +37,34 @@ class HloModuleConfig { // ComputationLayout. The default ctor creates it without -- in this case // accessing entry_computation_layout will CHECK-fail. The ctor accepting a // ProgramShape creates a computation layout using this shape. - HloModuleConfig(); - explicit HloModuleConfig(const ProgramShape& program_shape); + // The layouts in the ProgramShape will be reset to default unless + // ignore_layouts is set to false. + HloModuleConfig() = default; - // Checks if this config has an entry computation layout already. - bool has_host_entry_computation_layout() const { - return host_entry_computation_layout_.has_value(); - } + explicit HloModuleConfig(const ProgramShape& program_shape, + bool ignore_layouts = true); - bool has_device_entry_computation_layout() const { - return device_entry_computation_layout_.has_value(); + // Checks if this config has an entry computation layout already. 
+ bool has_entry_computation_layout() const { + return entry_computation_layout_.has_value(); } // Sets the entry computation layout for this config. If the entry computation // layout already exists, it is silently replaced. void SetDefaultComputationLayout(const ProgramShape& program_shape); - // Returns a constant reference to the on-host layout of the entry - // computation. Assumes the layout was set. - const ComputationLayout& host_entry_computation_layout() const { - CHECK(host_entry_computation_layout_.has_value()); - return *host_entry_computation_layout_; - } - - // Returns a mutable pointer to the layout of the on-host entry computation. + // Returns a constant reference to the layout of the entry computation. // Assumes the layout was set. - ComputationLayout* mutable_host_entry_computation_layout() { - CHECK(host_entry_computation_layout_.has_value()); - return &(*host_entry_computation_layout_); + const ComputationLayout& entry_computation_layout() const { + CHECK(entry_computation_layout_.has_value()); + return *entry_computation_layout_; } - // Returns a constant reference to the on-device layout of the entry - // computation. Assumes the layout was set. - const ComputationLayout& device_entry_computation_layout() const { - CHECK(device_entry_computation_layout_.has_value()); - return *device_entry_computation_layout_; - } - - // Returns a mutable pointer to the layout of the on-device entry computation. + // Returns a mutable pointer to the layout of the entry computation. // Assumes the layout was set. - ComputationLayout* mutable_device_entry_computation_layout() { - CHECK(device_entry_computation_layout_.has_value()); - return &(*device_entry_computation_layout_); + ComputationLayout* mutable_entry_computation_layout() { + CHECK(entry_computation_layout_.has_value()); + return &(*entry_computation_layout_); } // Returns whether to enable HLO-level profiling. 
@@ -86,15 +72,6 @@ class HloModuleConfig { return debug_options_.xla_hlo_profile(); } - // Sets/returns whether this is a "host module". Host modules are used to - // record the data- and control-flow dependencies of host side computation - // that communicates with compiled code. They are used for analysis and - // scheduling purposes, but no code is generated. - bool is_host_module() const { return is_host_module_; } - void set_is_host_module(bool is_host_module) { - is_host_module_ = is_host_module; - } - // Sets/returns the module seed set during execution. void set_seed(uint64 seed) { seed_ = seed; } uint64 seed() const { return seed_; } @@ -127,8 +104,7 @@ class HloModuleConfig { private: // If you add new members, be sure to update compilation_cache_key. - tensorflow::gtl::optional host_entry_computation_layout_; - tensorflow::gtl::optional device_entry_computation_layout_; + absl::optional entry_computation_layout_; // Whether this is a 'host module'. bool is_host_module_ = false; diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.h b/tensorflow/compiler/xla/service/hlo_module_dce.h index 29024085c1038961ef2b3721de1ce0e8a55ccf45..12ca2340a6ccaa50780e81168c755c1fec3aa1be 100644 --- a/tensorflow/compiler/xla/service/hlo_module_dce.h +++ b/tensorflow/compiler/xla/service/hlo_module_dce.h @@ -31,7 +31,7 @@ namespace xla { class HloModuleDCE : public HloPassInterface { public: ~HloModuleDCE() override {} - tensorflow::StringPiece name() const override { return "hlo-module-dce"; } + absl::string_view name() const override { return "hlo-module-dce"; } // Run the pass on the given module. Returns whether the module was changed // (instructions were removed). 
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc index 53b7d0ed3964ca8a2c3bb73c62015a1c7dbfe487..363862e4905fc13a4ef07aeaac255259fc6b86ba 100644 --- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc @@ -19,11 +19,11 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/types.h" @@ -73,7 +73,7 @@ class HloModuleDceTest : public HloTestBase { // Tests that a while with all outputs live is unmodified. TEST_F(HloModuleDceTest, WhileWithLiveOutputs) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[3]{0}) parameter(0) @@ -110,7 +110,7 @@ TEST_F(HloModuleDceTest, WhileWithLiveOutputs) { // Tests a while loop with one unused output (which is used in the while loop // body by an instruction with side-effects: rng) is unmodified. 
TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], f32[]) parameter(0) @@ -150,7 +150,7 @@ TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) { // Tests that a while loop with one dead tuple element at {1} has its while // loop body modified to make that tuple element pass-through the while body. TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[3]{0}) parameter(0) @@ -193,7 +193,7 @@ TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) { // dead in while.body{1} and at while.result{1}) propgates liveness of this // tuple element to while.body{1} and at while.result{1}. TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body { loop_var.1 = (s32[], s32[]) parameter(0) @@ -235,7 +235,7 @@ TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) { // Tests that HloModuleDCE can remove a dead tuple element at index {1} between // two dependent while loops. TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body0 { loop_var.1 = (s32[], s32[3]{0}) parameter(0) @@ -303,7 +303,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) { // Tests that HloModuleDCE can remove a dead tuple element at while.1{0} and // while.2{1}, between two dependent while loops. 
TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule SimpleLoop SimpleLoop.body0 { loop_var.1 = (s32[3]{0}, s32[]) parameter(0) diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc index 7d706b5fd0094e53486cd5f276e4db1590c6e3fa..9c01862a4b7024826c3f701b795819abe945d07f 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc @@ -19,7 +19,10 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -57,7 +60,7 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const { /* static */ StatusOr> HloModuleGroupMetadata::Build(const std::vector& modules) { - auto metadata = MakeUnique(modules); + auto metadata = absl::make_unique(modules); TF_RETURN_IF_ERROR(metadata->Build()); return std::move(metadata); } @@ -75,10 +78,23 @@ Status HloModuleGroupMetadata::Build() { if (tracked == nullptr) { return Status::OK(); } - // Add the parent computation of this channel instruction and its peer - // computation (both must be while computations) as companions. 
+ + std::vector peers; if (IsChannelInstruction(hlo)) { - HloComputation* peer_computation = PeerComputation(hlo); + peers.push_back(PeerComputation(hlo)); + } else if (hlo->IsCrossModuleAllReduce()) { + for (HloInstruction* instr : GetAllReduceGroup(*hlo->all_reduce_id())) { + if (instr == hlo) { + continue; + } + peers.push_back(instr->parent()); + } + } + + // Add the parent computation of this channel (or all-reduce) instruction + // and its peer computation(s) (both must be while computations) as + // companions. + for (HloComputation* peer_computation : peers) { const TrackedInstruction* peer_tracked = GetTrackedInstruction(peer_computation); TF_RET_CHECK(peer_tracked != nullptr) @@ -113,6 +129,17 @@ Status HloModuleGroupMetadata::Build() { } } TF_RETURN_IF_ERROR(VerifyCompanionSets()); + if (VLOG_IS_ON(4)) { + DumpCollectedStats(); + } + + for (HloModule* module : modules_) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr points_to_analysis, + TuplePointsToAnalysis::Run(module)); + points_to_analyses_[module] = std::move(points_to_analysis); + } + return Status::OK(); } @@ -124,9 +151,14 @@ Status HloModuleGroupMetadata::VerifyCompanionSets() const { for (HloInstruction* instruction : *companions) { // Go through all the communicating instructions (send, recv) of the given // companion, and record their device. + auto it = tracked_instructions_comms_.find(instruction); + if (it == tracked_instructions_comms_.end()) { + // Companions can be added even if they have no communicating + // instructions, if they are parent of companions. 
+ continue; + } std::unordered_set comm_devices; - for (HloInstruction* comm_instruction : - tracked_instructions_comms_.at(instruction)) { + for (HloInstruction* comm_instruction : it->second) { auto device = GetInstructionDevice(*comm_instruction); TF_RET_CHECK(device) << "Instruction " << comm_instruction->ToString() << " does not have a device"; @@ -140,7 +172,7 @@ Status HloModuleGroupMetadata::VerifyCompanionSets() const { ss << " " << hlo->name() << std::endl; } ss << "has multiple instructions on the same device"; - return FailedPrecondition("%s", ss.str().c_str()); + return FailedPrecondition("%s", ss.str()); } } } @@ -154,8 +186,12 @@ bool HloModuleGroupMetadata::IsChannelInstruction( case HloOpcode::kSend: case HloOpcode::kRecv: case HloOpcode::kSendDone: - case HloOpcode::kRecvDone: - return true; + case HloOpcode::kRecvDone: { + const HloSendRecvInstruction* send_recv_instr = + DynCast(instruction); + CHECK(send_recv_instr != nullptr); + return !send_recv_instr->is_host_transfer(); + } default: return false; } @@ -167,7 +203,8 @@ bool HloModuleGroupMetadata::IsCompanionInstruction(HloInstruction* hlo) const { bool HloModuleGroupMetadata::InstructionCommunicates( HloInstruction* hlo) const { - return IsChannelInstruction(hlo) || IsCompanionInstruction(hlo); + return IsChannelInstruction(hlo) || IsCompanionInstruction(hlo) || + hlo->IsCrossModuleAllReduce(); } const HloModuleGroupMetadata::Channel& HloModuleGroupMetadata::GetChannel( @@ -176,6 +213,10 @@ const HloModuleGroupMetadata::Channel& HloModuleGroupMetadata::GetChannel( return channels_[channel_id_map_.at(channel_id)]; } +bool HloModuleGroupMetadata::HasChannel(int64 channel_id) const { + return channel_id_map_.find(channel_id) != channel_id_map_.end(); +} + HloComputation* HloModuleGroupMetadata::PeerComputation( const HloInstruction* instruction) const { CHECK(IsChannelInstruction(instruction)); @@ -192,6 +233,13 @@ HloComputation* HloModuleGroupMetadata::PeerComputation( } } +const 
std::vector& HloModuleGroupMetadata::GetAllReduceGroup( + int64 all_reduce_id) const { + auto it = all_reduce_map_.find(all_reduce_id); + CHECK(it != all_reduce_map_.end()); + return it->second; +} + std::vector HloModuleGroupMetadata::GetCompanionsPath(const HloInstruction* hlo) const { std::vector path; @@ -232,21 +280,24 @@ int64 HloModuleGroupMetadata::GetModuleId(const HloModule* module) const { LOG(FATAL) << "unknown module"; } -tensorflow::gtl::optional HloModuleGroupMetadata::GetInstructionDevice( +absl::optional HloModuleGroupMetadata::GetInstructionDevice( const HloInstruction& instruction) const { // The module group metadata can be created in both "single module, multiple // devices" and "multiple modules, no explicit devices" fashions. // The API returns an optional even though the current implementation always // returns a device, to account for cases where we cannot guess a device. // In such cases the VerifyChannelInstructions() will return proper errors. - tensorflow::gtl::optional device = - instruction.sharding_unique_device(); + absl::optional device = instruction.sharding_unique_device(); if (!device) { device = GetModuleId(instruction.parent()->parent()); } return device; } +int64 HloModuleGroupMetadata::GetDeviceModulesCount() const { + return modules_.size(); +} + Status HloModuleGroupMetadata::RecordInstructions() { const auto visitor = [this](HloInstruction* hlo) -> Status { if (hlo->opcode() == HloOpcode::kWhile) { @@ -263,10 +314,27 @@ Status HloModuleGroupMetadata::RecordInstructions() { tracked_instructions_[hlo->to_apply()] = TrackedInstruction(hlo, ComputationKind::kCallFunction); } + + // Group cross module all-reduce instructions by the all_reduce id. 
+ if (hlo->IsCrossModuleAllReduce()) { + TF_RET_CHECK(channel_id_map_.find(*hlo->all_reduce_id()) == + channel_id_map_.end()) + << "all_reduce_id " << *hlo->all_reduce_id() + << " is already used by a send/recv instruction"; + all_reduce_map_[*hlo->all_reduce_id()].push_back(hlo); + max_channel_id_ = std::max(max_channel_id_, *hlo->all_reduce_id()); + return Status::OK(); + } + if (!IsChannelInstruction(hlo)) { return Status::OK(); } + TF_RET_CHECK(all_reduce_map_.find(hlo->channel_id()) == + all_reduce_map_.end()) + << "channel id " << hlo->channel_id() + << " is already used by an all-reduce instruction"; + // Add a new channel if needed. if (channel_id_map_.find(hlo->channel_id()) == channel_id_map_.end()) { channels_.emplace_back(); @@ -308,6 +376,8 @@ Status HloModuleGroupMetadata::RecordInstructions() { TF_RETURN_IF_ERROR(computation->Accept(visitor)); } } + VLOG(2) << "Created " << channels_.size() << " channels"; + VLOG(2) << "Created " << all_reduce_map_.size() << " all-reduce groups"; return Status::OK(); } @@ -322,7 +392,7 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1, if (!ContainsKey(companion_set_index_, instruction1) && !ContainsKey(companion_set_index_, instruction2)) { companion_sets_.push_back( - tensorflow::MakeUnique>()); + absl::make_unique>()); auto companion_set = companion_sets_.back().get(); companion_set->insert(instruction1); companion_set->insert(instruction2); @@ -350,23 +420,24 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1, Status HloModuleGroupMetadata::VerifyChannelInstructions() { for (const Channel& channel : channels_) { if (channel.send == nullptr) { - return FailedPrecondition("missing send for id : %lld", channel.id); + return FailedPrecondition("missing send for id : %d", channel.id); } if (channel.recv == nullptr) { - return FailedPrecondition("missing recv for id : %lld", channel.id); + return FailedPrecondition("missing recv for id : %d", channel.id); } if 
(channel.send_done == nullptr) { - return FailedPrecondition("missing send-done for id : %lld", channel.id); + return FailedPrecondition("missing send-done for id : %d", channel.id); } if (channel.recv_done == nullptr) { - return FailedPrecondition("missing recv-done for id : %lld", channel.id); + return FailedPrecondition("missing recv-done for id : %d", channel.id); } } // Check if the shapes match for each channel. for (const Channel& channel : channels_) { const Shape& send_shape = channel.send->operand(0)->shape(); - const Shape& recv_shape = channel.recv_done->shape(); + const Shape& recv_shape = + ShapeUtil::GetTupleElementShape(channel.recv_done->shape(), 0); if (!ShapeUtil::Compatible(send_shape, recv_shape)) { return FailedPrecondition("send/recv shapes do not match"); } @@ -374,33 +445,33 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { auto send_done_device = GetInstructionDevice(*channel.send_done); if (!send_device) { return FailedPrecondition("send instruction must have a device: %s", - channel.send->ToString().c_str()); + channel.send->ToString()); } if (!send_done_device) { return FailedPrecondition("send_done instruction must have a device: %s", - channel.send_done->ToString().c_str()); + channel.send_done->ToString()); } if (*send_device != *send_done_device) { return FailedPrecondition( - "send and send-done (channel=%lld) must be on the same device: %lld " - "vs. %lld", + "send and send-done (channel=%d) must be on the same device: %d " + "vs. 
%d", channel.id, *send_device, *send_done_device); } auto recv_device = GetInstructionDevice(*channel.recv); auto recv_done_device = GetInstructionDevice(*channel.recv_done); if (!recv_done_device) { return FailedPrecondition("recv_done instruction must have a device: %s", - channel.recv_done->ToString().c_str()); + channel.recv_done->ToString()); } if (*recv_device != *recv_done_device) { return FailedPrecondition( - "recv and recv-done (channel=%lld) must be on the same device: %lld " - "vs. %lld", + "recv and recv-done (channel=%d) must be on the same device: %d " + "vs. %d", channel.id, *recv_device, *recv_done_device); } if (*send_device == *recv_device) { return FailedPrecondition( - "send and recv (channel=%lld) must be on different devices: %lld", + "send and recv (channel=%d) must be on different devices: %d", channel.id, *send_device); } } @@ -421,7 +492,7 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { !CheckCompanionPathsCompatibility( path, GetCompanionsPath(channel.recv_done))) { return FailedPrecondition( - "Nest companion paths do not match for channel %lld", channel.id); + "Nest companion paths do not match for channel %d", channel.id); } } return Status::OK(); @@ -438,4 +509,36 @@ Status HloModuleGroupMetadata::CheckCommunicatingInstruction( return FailedPrecondition("channel is used in disallowed computation"); } +void HloModuleGroupMetadata::DumpCollectedStats() const { + std::map, int64> communication_histogram; + for (auto& channel : channels_) { + auto from_device = GetInstructionDevice(*channel.send); + auto to_device = GetInstructionDevice(*channel.recv); + LOG(INFO) << "Channel " << channel.id << ": from_device=" << *from_device + << " to_device=" << *to_device << " send=" << channel.send->name() + << " send_done=" << channel.send_done->name() + << " recv=" << channel.recv->name() + << " recv_done=" << channel.recv_done->name(); + communication_histogram[std::pair(*from_device, + *to_device)] += 1; + } + for (auto& 
fromto_count : communication_histogram) { + LOG(INFO) << "From " << fromto_count.first.first << " to " + << fromto_count.first.second << ": " << fromto_count.second; + } + for (auto& companion_set : companion_sets_) { + LOG(INFO) << "Companion set:"; + for (HloInstruction* instruction : *companion_set) { + LOG(INFO) << " " << instruction->name(); + } + } + for (auto& instruction_comm : tracked_instructions_comms_) { + LOG(INFO) << "Communicating instruction " << instruction_comm.first->name(); + for (HloInstruction* instruction : instruction_comm.second) { + auto device = GetInstructionDevice(*instruction); + LOG(INFO) << " " << instruction->name() << " on device " << *device; + } + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h index 5f5bf274798b9e515721226a1866a77a5596ba39..768b0c7eb3695715de5cef7dad1ed5a110561605 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h @@ -22,14 +22,15 @@ limitations under the License. #include #include +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatmap.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -92,7 +93,7 @@ class HloModuleGroupMetadata { ComputationKind kind_ = ComputationKind::kInvalid; }; - // Represents a channel and the 4 instructions that form the channel. + // Represents a channel and the instructions that form the channel. 
struct Channel { int64 id = -1; HloInstruction* send = nullptr; @@ -118,13 +119,20 @@ class HloModuleGroupMetadata { // comment above on companion instructions. bool IsCompanionInstruction(HloInstruction* hlo) const; - // Returns true if the instruction is either a channel instruction or a - // companion instruction. + // Returns true if the instruction is either a channel instruction, a + // cross-module all-reduce instruction, or a companion instruction. bool InstructionCommunicates(HloInstruction* hlo) const; // Returns the Channel instance for the given channel id. const Channel& GetChannel(int64 channel_id) const; + // Returns if the given channel id exists in metadata. + bool HasChannel(int64 channel_id) const; + + // Returns the all-reduce instructions with the same all_reduce_id. + const std::vector& GetAllReduceGroup( + int64 all_reduce_id) const; + // Returns the computation that contains the peer channel instructions for // the given instruction. // @@ -152,14 +160,17 @@ class HloModuleGroupMetadata { // Retrieves the device an instruction is assigned to. Either from the // sharding information, or from the ordinal of the module the instruction // is in. - tensorflow::gtl::optional GetInstructionDevice( + absl::optional GetInstructionDevice( const HloInstruction& instruction) const; + // Returns the number of modules for devices (excluding the host module). + int64 GetDeviceModulesCount() const; + // Returns the companion instructions for the given instruction. // // Precondition: IsCompanionWhile(instruction) is true. const std::unordered_set& Companions( - HloInstruction* instruction) const { + const HloInstruction* instruction) const { CHECK_EQ(companion_set_index_.count(instruction), 1); return companion_set(companion_set_index_.at(instruction)); } @@ -184,13 +195,18 @@ class HloModuleGroupMetadata { // Returns all channels in the module group. 
const std::vector& channels() const { return channels_; } - // Returns the maximum channel id used in the module group. + // Returns the maximum channel id or all_reduce_id used in the module group. int64 max_channel_id() const { return max_channel_id_; } + TuplePointsToAnalysis* points_to_analysis(HloModule* module) const { + return points_to_analyses_.at(module).get(); + } + private: Status Build(); - // Record all channel instructions and While instructions. + // Record all channel instructions, cross-module AllReduce instructions, and + // While/Conditional/Call instructions. Status RecordInstructions(); // Verifies the given HloModules are well-formed and follow the specification, @@ -227,12 +243,15 @@ class HloModuleGroupMetadata { return it != tracked_instructions_.end() ? &it->second : nullptr; } + // Dump all the collected module group statistics to the logs. + void DumpCollectedStats() const; + // List of all companion instructions sets in the module. std::vector>> companion_sets_; // Map from each companion while instruction to the index into companion_set_. - tensorflow::gtl::FlatMap companion_set_index_; + tensorflow::gtl::FlatMap companion_set_index_; // Map from computation to the instruction using it (a kWhile, kConditional). tensorflow::gtl::FlatMap @@ -249,11 +268,17 @@ class HloModuleGroupMetadata { // Map from channel ids to the index in channels_. tensorflow::gtl::FlatMap channel_id_map_; + // Map from all-reduce ids to the all reduce instructions. + tensorflow::gtl::FlatMap> all_reduce_map_; + // The maximum channel id used in the module group. int64 max_channel_id_ = -1; // The modules that this metadata was built from. 
const std::vector& modules_; + + tensorflow::gtl::FlatMap> + points_to_analyses_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc index 5a0d1e264eb5095ff53721416ebcf4842a063f97..d83ee714905252e36f38438e81002a4d6ba7dafa 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc @@ -22,14 +22,17 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -37,31 +40,50 @@ namespace xla { std::vector HloModuleGroupUtil::GlobalPredecessors( HloInstruction* instruction) { - std::vector predecessors; - - // Adds to the unique predecessors list and also add companion instructions - // if the given predecessor has those. + std::vector + predecessors; // Use a vector to avoid non-determinism. + tensorflow::gtl::FlatSet unique; + + // Adds to the unique predecessors list; if the predecessors is a companion + // instruction, also add companion instructions; if the predecessors is a + // cross-module all-reduce, also add the all-reduce instructions in the same + // group. 
auto add_unique_predecessor = [&](HloInstruction* predecessor) { - if (std::find(predecessors.begin(), predecessors.end(), predecessor) != - predecessors.end()) { + if (unique.find(predecessor) != unique.end()) { return; } - if (!metadata_.IsCompanionInstruction(predecessor)) { - predecessors.push_back(predecessor); + if (metadata_.IsCompanionInstruction(predecessor)) { + for (HloInstruction* instr : metadata_.Companions(predecessor)) { + if (unique.insert(instr).second) { + predecessors.push_back(instr); + } + } return; } - for (HloInstruction* companion : metadata_.Companions(predecessor)) { - predecessors.push_back(companion); + if (predecessor->IsCrossModuleAllReduce()) { + for (HloInstruction* instr : + metadata_.GetAllReduceGroup(*predecessor->all_reduce_id())) { + if (unique.insert(instr).second) { + predecessors.push_back(instr); + } + } + return; } + unique.insert(predecessor); + predecessors.push_back(predecessor); }; - // If the given instruction is a companion instruction, we need to find the - // predecessors of all of its companion instructions. + // predecessors of all of its companion instructions. If the instruction is an + // all-reduce, we need to find the predecessors of all the peer all-reduce + // instructions. std::vector instruction_group; if (metadata_.IsCompanionInstruction(instruction)) { for (HloInstruction* companion : metadata_.Companions(instruction)) { instruction_group.push_back(companion); } + } else if (instruction->IsCrossModuleAllReduce()) { + instruction_group = + metadata_.GetAllReduceGroup(*instruction->all_reduce_id()); } else { instruction_group.push_back(instruction); } @@ -74,12 +96,14 @@ std::vector HloModuleGroupUtil::GlobalPredecessors( add_unique_predecessor(control_predecessor); } } - if (instruction->opcode() == HloOpcode::kRecvDone) { + if (instruction->opcode() == HloOpcode::kRecvDone && + !DynCast(instruction)->is_host_transfer()) { // Send is a remote predecessor of RecvDone. 
HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send; add_unique_predecessor(send); } - if (instruction->opcode() == HloOpcode::kSend) { + if (instruction->opcode() == HloOpcode::kSend && + !DynCast(instruction)->is_host_transfer()) { // Recv is a remote predecessor of Send. HloInstruction* recv_done = metadata_.GetChannel(instruction->channel_id()).recv_done; @@ -93,31 +117,51 @@ std::vector HloModuleGroupUtil::GlobalPredecessors( std::vector HloModuleGroupUtil::GlobalSuccessors( HloInstruction* instruction) { - std::vector successors; - - // Adds to the unique successors list and also add companion instructions - // if the given successor has those. + std::vector + successors; // Use a vector to avoid non-determinism. + tensorflow::gtl::FlatSet unique; + + // Adds to the unique successors list; if the successor is a companion + // instruction, also add companion instructions; if the successor is a + // cross-module all-reduce, also add the all-reduce instructions in the same + // group. 
auto add_unique_successor = [&](HloInstruction* successor) { - if (std::find(successors.begin(), successors.end(), successor) != - successors.end()) { + if (unique.find(successor) != unique.end()) { return; } - if (!metadata_.IsCompanionInstruction(successor)) { - successors.push_back(successor); + if (metadata_.IsCompanionInstruction(successor)) { + for (HloInstruction* instr : metadata_.Companions(successor)) { + if (unique.insert(instr).second) { + successors.push_back(instr); + } + } return; } - for (HloInstruction* companion : metadata_.Companions(successor)) { - successors.push_back(companion); + if (successor->IsCrossModuleAllReduce()) { + for (HloInstruction* instr : + metadata_.GetAllReduceGroup(*successor->all_reduce_id())) { + if (unique.insert(instr).second) { + successors.push_back(instr); + } + } + return; } + unique.insert(successor); + successors.push_back(successor); }; // If the given instruction is a companion instruction, we need to find the - // successors of all of its companion instructions. + // successors of all of its companion instructions. If the instruction is an + // all-reduce, we need to find the successors of all its peer all-reduce + // instructions. std::vector instruction_group; if (metadata_.IsCompanionInstruction(instruction)) { for (HloInstruction* companion : metadata_.Companions(instruction)) { instruction_group.push_back(companion); } + } else if (instruction->IsCrossModuleAllReduce()) { + instruction_group = + metadata_.GetAllReduceGroup(*instruction->all_reduce_id()); } else { instruction_group.push_back(instruction); } @@ -130,14 +174,16 @@ std::vector HloModuleGroupUtil::GlobalSuccessors( add_unique_successor(control_successor); } } - if (instruction->opcode() == HloOpcode::kRecv) { + if (instruction->opcode() == HloOpcode::kRecv && + !DynCast(instruction)->is_host_transfer()) { // Send is a remote successor of Recv. 
const HloInstruction* recv_done = instruction->users().front(); CHECK(recv_done->opcode() == HloOpcode::kRecvDone); HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send; add_unique_successor(send); } - if (instruction->opcode() == HloOpcode::kSend) { + if (instruction->opcode() == HloOpcode::kSend && + !DynCast(instruction)->is_host_transfer()) { // RecvDone is a remote successor of Send. HloInstruction* recv_done = metadata_.GetChannel(instruction->channel_id()).recv_done; @@ -147,7 +193,7 @@ std::vector HloModuleGroupUtil::GlobalSuccessors( } std::vector HloModuleGroupUtil::RootInstructions( - tensorflow::gtl::ArraySlice computations) { + absl::Span computations) { std::vector roots; for (HloComputation* computation : computations) { for (HloInstruction* instruction : computation->instructions()) { @@ -170,15 +216,17 @@ Status HloModuleGroupUtil::VisitTopologicalOrder( HloInstruction* hlo = stack.top(); // Find the instruction group of the currently visited instruction. The - // instruction group represents all companion instructions of the - // current instruction, and are considered to be a single entity for the - // purpose of the traversal (i.e., they must always be in the same visit - // state). + // instruction group represents all companion instructions of the current + // instruction, or all the all-reduce instructions that belong to the same + // group, or are considered to be a single entity for the purpose of the + // traversal (i.e., they must always be in the same visit state). 
std::vector instruction_group; if (metadata_.IsCompanionInstruction(hlo)) { for (HloInstruction* companion : metadata_.Companions(hlo)) { instruction_group.push_back(companion); } + } else if (hlo->IsCrossModuleAllReduce()) { + instruction_group = metadata_.GetAllReduceGroup(*hlo->all_reduce_id()); } else { instruction_group.push_back(hlo); } @@ -222,8 +270,8 @@ Status HloModuleGroupUtil::VisitTopologicalOrder( string cyclic_instructions; for (const auto& state : *visit_state) { if (state.second == VisitState::kVisiting) { - tensorflow::strings::StrAppend(&cyclic_instructions, - state.first->ToString(), "\n"); + absl::StrAppend(&cyclic_instructions, state.first->ToString(), + "\n"); } } // TODO(b/64305524): Improve the error message to print out the @@ -234,7 +282,7 @@ Status HloModuleGroupUtil::VisitTopologicalOrder( "following nodes. Note that the order of the nodes is arbitrary " "and that the list may include nodes that are not part of the " "cycle.\n%s", - predecessor->ToString().c_str(), cyclic_instructions.c_str()); + predecessor->ToString(), cyclic_instructions); } stack.push(predecessor); } @@ -245,7 +293,7 @@ Status HloModuleGroupUtil::VisitTopologicalOrder( } Status HloModuleGroupUtil::VerifyComputations( - tensorflow::gtl::ArraySlice computations) { + absl::Span computations) { auto visit_function = [&](HloInstruction* instruction, const std::vector& instruction_group) { @@ -276,8 +324,8 @@ Status HloModuleGroupUtil::VerifyComputations( StatusOr> HloModuleGroupUtil::ComputeReachability( - tensorflow::gtl::ArraySlice computations) { - std::list post_order; + absl::Span computations) { + std::vector post_order; auto visit_function = [&](HloInstruction* instruction, const std::vector& instruction_group) { @@ -290,9 +338,9 @@ HloModuleGroupUtil::ComputeReachability( TF_RETURN_IF_ERROR( VisitTopologicalOrder(&visit_states, visit_function, root)); } - auto reachability = MakeUnique(post_order); + auto reachability = absl::make_unique(post_order); for 
(HloInstruction* hlo : post_order) { - reachability->SetReachabilityToUnion(GlobalPredecessors(hlo), hlo); + reachability->FastSetReachabilityToUnion(GlobalPredecessors(hlo), hlo); } return std::move(reachability); } diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h index c25ca1aff50b288f3ac3885cbed53e7ba9768430..309c23045d1e0dd91e2f245d00c51d9bf9961bf5 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_util.h +++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_group_metadata.h" @@ -27,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" namespace xla { @@ -56,7 +56,7 @@ class HloModuleGroupUtil { // Returns the root instructions of the computations. std::vector RootInstructions( - tensorflow::gtl::ArraySlice computations); + absl::Span computations); // Visit state of each instruction during DFS traversal. enum VisitState { @@ -93,15 +93,14 @@ class HloModuleGroupUtil { HloInstruction* root); // Verifies that the computations are well-formed (e.g., no cycles). - Status VerifyComputations( - tensorflow::gtl::ArraySlice computations); + Status VerifyComputations(absl::Span computations); // Below Reachability utils resemble those in HloComputation, except that // they can handle instructions across multiple computations. // // Creates the reachability map for the instructions in the computations. 
StatusOr> ComputeReachability( - tensorflow::gtl::ArraySlice computations); + absl::Span computations); // Updates the reachability of the given instruction, taking the global // predeccessorss and successors into account. diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index 7f28a804bfec9c2f1bbb5fa08f7dd4e68be14d35..4bc1bacd7ddd6573e75eb5e2b38b24ff5899d330 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -15,16 +15,16 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -38,13 +38,13 @@ class HloModuleTest : public HloTestBase { std::unique_ptr CreateConstantComputation() { auto builder = HloComputation::Builder("Constant"); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); return builder.Build(); } // Creates a computation which calls the given zero-parameter computations. 
std::unique_ptr CreateCallComputation( - tensorflow::gtl::ArraySlice computations) { + absl::Span computations) { auto builder = HloComputation::Builder("Call"); for (auto computation : computations) { builder.AddInstruction( @@ -122,7 +122,7 @@ TEST_F(HloModuleTest, CloneHasFusion) { { auto b = HloComputation::Builder("Entry"); auto input = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); b.AddInstruction( HloInstruction::CreateFusion(r0f32_, HloInstruction::FusionKind::kInput, /*operands=*/{input}, fused_computation)); @@ -173,7 +173,7 @@ TEST_F(HloModuleTest, LargeConstantToString) { auto builder = HloComputation::Builder("Constant"); std::vector values(16, 42.0); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1(values))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1(values))); module->AddEntryComputation(builder.Build()); EXPECT_EQ( diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc index d1eaf357855205f1e9867e86f3042b96b6beff97..2d4e38589fe4693e73c46d6c82e51cb0a8388f85 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode.cc @@ -39,7 +39,7 @@ StatusOr StringToHloOpcode(const string& opcode_name) { }); auto it = opcode_map->find(opcode_name); if (it == opcode_map->end()) { - return InvalidArgument("Unknown opcode: %s", opcode_name.c_str()); + return InvalidArgument("Unknown opcode: %s", opcode_name); } return it->second; } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 1fe06ee0c0d14255b8358fb998bfd8d0b029506f..e6bfb8025d4bfeba1d334d1f946e33841a2da092 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -47,6 +47,7 @@ namespace xla { #define HLO_OPCODE_LIST(V) \ V(kAbs, "abs") \ V(kAdd, "add") \ + 
V(kAllToAll, "all-to-all") \ V(kAtan2, "atan2") \ V(kBatchNormGrad, "batch-norm-grad") \ V(kBatchNormInference, "batch-norm-inference") \ @@ -57,6 +58,7 @@ namespace xla { V(kCall, "call", kHloOpcodeIsVariadic) \ V(kCeil, "ceil") \ V(kClamp, "clamp") \ + V(kCollectivePermute, "collective-permute") \ V(kClz, "count-leading-zeros") \ V(kComplex, "complex") \ V(kConcatenate, "concatenate", kHloOpcodeIsVariadic) \ @@ -81,11 +83,12 @@ namespace xla { V(kFusion, "fusion", kHloOpcodeIsVariadic) \ V(kGather, "gather") \ V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \ + V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kGetTupleElement, "get-tuple-element") \ V(kGt, "greater-than", kHloOpcodeIsComparison) \ - V(kHostCompute, "host-compute") \ V(kImag, "imag") \ V(kInfeed, "infeed") \ + V(kIota, "iota") \ V(kIsFinite, "is-finite") \ V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison) \ V(kLog, "log") \ @@ -93,6 +96,7 @@ namespace xla { V(kAnd, "and") \ V(kNot, "not") \ V(kOr, "or") \ + V(kXor, "xor") \ V(kLt, "less-than", kHloOpcodeIsComparison) \ V(kMap, "map", kHloOpcodeIsVariadic) \ V(kMaximum, "maximum") \ @@ -115,6 +119,7 @@ namespace xla { V(kReverse, "reverse") \ V(kRng, "rng") \ V(kRoundNearestAfz, "round-nearest-afz") \ + V(kScatter, "scatter") \ V(kSelect, "select") \ V(kSelectAndScatter, "select-and-scatter") \ V(kSend, "send") \ @@ -131,6 +136,7 @@ namespace xla { V(kTrace, "trace") \ V(kTranspose, "transpose") \ V(kTuple, "tuple", kHloOpcodeIsVariadic) \ + V(kTupleSelect, "tuple-select") \ V(kWhile, "while") enum class HloOpcode { @@ -150,7 +156,7 @@ enum HloOpcodeProperty { // Returns a string representation of the opcode. string HloOpcodeString(HloOpcode opcode); -// Returns a string representation of the opcode. +// Retrieves the opcode enum by name if the opcode exists. 
StatusOr StringToHloOpcode(const string& opcode_name); inline std::ostream& operator<<(std::ostream& os, HloOpcode opcode) { diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc index cd2ce5c69f030c65b889d67e082a3677b8739ddb..6f3f83f63a05fafaa3f3ddcff8a7cac7cb7b06d5 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc @@ -58,6 +58,7 @@ TEST(HloOpcodeTest, OpcodeProperties) { case HloOpcode::kConcatenate: case HloOpcode::kFusion: case HloOpcode::kMap: + case HloOpcode::kAfterAll: case HloOpcode::kTuple: EXPECT_TRUE(HloOpcodeIsVariadic(opcode)); break; diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index dcd4725fe78e8b9b5d14437e964cb5aaf1664117..0581d5c40425d332d89cc92ca6c6b0b10dd8fcf1 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -25,8 +27,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -232,6 +232,11 @@ bool HloOrdering::UseIsBeforeValueDefinition( << " and def is in FALSE computation"; return true; } + if (value.defining_instruction() == use.instruction) { + VLOG(4) << " use is conditional " << use << " and def is " + << value.ToShortString(); + return true; + } } VLOG(4) << " use is not before value"; @@ -249,6 +254,10 @@ bool HloOrdering::LiveRangeStrictlyBefore( } // All uses of 'a' must be before 'b' is defined. for (const HloUse& use : a.uses()) { + if (dataflow.DoesNotUseOperandBuffer(a.instruction(), a.index(), + use.instruction)) { + continue; + } if (!UseIsBeforeValueDefinition(use, b, dataflow)) { VLOG(4) << "use of " << a << " (" << use << ") not before " << b << " is defined"; @@ -297,22 +306,20 @@ string PredecessorHloOrdering::ToStringHelper(const string& name) const { std::vector pieces; pieces.push_back(name); for (auto* computation : module_->MakeNonfusionComputations()) { - pieces.push_back(tensorflow::strings::Printf("computation %s:", - computation->name().c_str())); + pieces.push_back(absl::StrFormat("computation %s:", computation->name())); const auto all = computation->MakeInstructionPostOrder(); for (auto instruction : all) { - pieces.push_back(tensorflow::strings::Printf( - " %s predecessors:", instruction->name().c_str())); + pieces.push_back( + absl::StrFormat(" %s predecessors:", instruction->name())); for (auto predecessor : all) { if (predecessors_.at(computation) ->IsReachable(predecessor, instruction)) { - pieces.push_back( - tensorflow::strings::Printf(" %s", predecessor->name().c_str())); + pieces.push_back(absl::StrFormat(" %s", predecessor->name())); } } } } - return tensorflow::str_util::Join(pieces, "\n"); + 
return absl::StrJoin(pieces, "\n"); } DependencyHloOrdering::DependencyHloOrdering(const HloModule* module) @@ -363,8 +370,8 @@ string SequentialHloOrdering::ToString() const { std::vector pieces; pieces.push_back("SequentialHloOrdering"); for (auto* computation : module_->computations()) { - pieces.push_back(tensorflow::strings::Printf("computation %s order:", - computation->name().c_str())); + pieces.push_back( + absl::StrFormat("computation %s order:", computation->name())); // Gather all instructions in the module sequence for this computation and // sort them by their position. std::vector instructions; @@ -379,11 +386,10 @@ string SequentialHloOrdering::ToString() const { return order_position_.at(a) < order_position_.at(b); }); for (auto instruction : instructions) { - pieces.push_back( - tensorflow::strings::Printf(" %s", instruction->name().c_str())); + pieces.push_back(absl::StrFormat(" %s", instruction->name())); } } - return tensorflow::str_util::Join(pieces, "\n"); + return absl::StrJoin(pieces, "\n"); } std::ostream& operator<<( diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h index ee526d8dd7f7e81b3a846741d3e452935f486bd2..985f3fa64d8767b0c0063ee900f7d11c3b7f6d4a 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.h +++ b/tensorflow/compiler/xla/service/hlo_ordering.h @@ -183,6 +183,10 @@ class DependencyHloOrdering : public PredecessorHloOrdering { // interference is reduced relative to DependencyHloOrdering. class SequentialHloOrdering : public HloOrdering { public: + // TODO(dimvar): HloModuleSequence is not a good name because it sounds like + // a sequence of modules, instead of a map of schedules for all computations + // in a module. We should change it at some point. + // // A sequence of instructions for each computation in the module. 
using HloModuleSequence = tensorflow::gtl::FlatMap(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); HloComputation* computation_c = module->AddEmbeddedComputation(builder_c.Build()); @@ -145,7 +145,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto xla_while = builder.AddInstruction( HloInstruction::CreateWhile(scalar_shape, condition, body, constant)); module->AddEntryComputation(builder.Build()); @@ -208,7 +208,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto xla_while = builder.AddInstruction( HloInstruction::CreateWhile(scalar_shape, condition, body, constant)); auto add = builder.AddInstruction(HloInstruction::CreateBinary( @@ -310,7 +310,7 @@ ENTRY while.v11 { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); DependencyHloOrdering ordering(module.get()); ordering.ToString(); // Shouldn't crash. } @@ -347,7 +347,7 @@ ENTRY root { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN(auto dataflow, HloDataflowAnalysis::Run(*module, /*ssa_form=*/true)); DependencyHloOrdering ordering(module.get()); diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea8e6a239a22335b644369a78791029c36315560 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -0,0 +1,3260 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_parser.h" + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/map_util.h" + +namespace xla { + +namespace { + +using absl::nullopt; +using absl::optional; +using absl::StrAppend; +using absl::StrCat; +using absl::StrFormat; +using absl::StrJoin; + +const double kF16max = 65504; + +// Parser for the HloModule::ToString() format text. +class HloParser { + public: + using LocTy = HloLexer::LocTy; + + explicit HloParser(absl::string_view str, const HloModuleConfig& config) + : lexer_(str), config_(config) {} + + // Runs the parser. Returns false if an error occurred. + bool Run(); + + // Returns the parsed HloModule. 
+ std::unique_ptr ConsumeHloModule() { return std::move(module_); } + + // Returns the error information. + string GetError() const { return StrJoin(error_, "\n"); } + + // Stand alone parsing utils for various aggregate data types. + StatusOr ParseShardingOnly(); + StatusOr ParseWindowOnly(); + StatusOr ParseConvolutionDimensionNumbersOnly(); + StatusOr ParsePaddingConfigOnly(); + + // Stand-alone parsing utility for a single instruction worth of text. + Status ParseSingleInstruction(HloComputation::Builder* builder, + string* root_name); + + private: + // Locates an instruction with the given name in the instruction_pool_ or + // returns nullptr. + // + // If the missing_instruction_hook_ is registered and a "shape" is provided, + // the hook will be called and may satisfy the request for the given + // instruction. This is useful when we reify parameters as they're resolved; + // i.e. for ParseSingleInstruction. + std::pair* FindInstruction( + const string& name, const optional& shape = nullopt); + + // ParseXXX returns false if an error occurred. + bool ParseHloModule(); + bool ParseComputations(); + bool ParseComputation(HloComputation** entry_computation); + bool ParseInstructionList(HloComputation::Builder* builder, + string* root_name); + bool ParseInstruction(HloComputation::Builder* builder, string* root_name); + bool ParseControlPredecessors(HloInstruction* instruction); + bool ParseLiteral(std::unique_ptr* literal, const Shape& shape); + bool ParseTupleLiteral(std::unique_ptr* literal, const Shape& shape); + bool ParseNonTupleLiteral(std::unique_ptr* literal, + const Shape& shape); + bool ParseDenseLiteral(std::unique_ptr* literal, const Shape& shape); + bool ParseSparseLiteral(std::unique_ptr* literal, + const Shape& shape); + template + bool ParseSparseLiteralHelper(std::unique_ptr* literal, + const Shape& shape); + + // Sets the sub-value of literal at the given index to the given value. The + // literal's shape must have the default layout. 
+ bool SetValueInLiteral(tensorflow::int64 value, + tensorflow::int64 linear_index, Literal* literal); + bool SetValueInLiteral(double value, tensorflow::int64 linear_index, + Literal* literal); + bool SetValueInLiteral(bool value, tensorflow::int64 linear_index, + Literal* literal); + template + bool SetValueInLiteralHelper(ParsedElemT value, + tensorflow::int64 linear_index, + Literal* literal); + + bool ParseOperands(std::vector* operands); + // Fills parsed operands into 'operands' and expects a certain number of + // operands. + bool ParseOperands(std::vector* operands, + const int expected_size); + + // Describes the start, limit, and stride on every dimension of the operand + // being sliced. + struct SliceRanges { + std::vector starts; + std::vector limits; + std::vector strides; + }; + + // The data parsed for the kDomain instruction. + struct DomainData { + std::unique_ptr entry_metadata; + std::unique_ptr exit_metadata; + }; + + // Types of attributes. + enum class AttrTy { + kBool, + kInt64, + kInt32, + kFloat, + kString, + kBracedInt64List, + kBracedInt64ListList, + kHloComputation, + kFftType, + kWindow, + kConvolutionDimensionNumbers, + kSharding, + kInstructionList, + kSliceRanges, + kPaddingConfig, + kMetadata, + kFusionKind, + kDistribution, + kDomain, + kPrecisionList, + }; + + struct AttrConfig { + bool required; // whether it's required or optional + AttrTy attr_type; // what type it is + void* result; // where to store the parsed result. + }; + + // attributes ::= (',' attribute)* + // + // Parses attributes given names and configs of the attributes. Each parsed + // result is passed back through the result pointer in corresponding + // AttrConfig. Note that the result pointer must point to a optional typed + // variable which outlives this function. Returns false on error. You should + // not use the any of the results if this function failed. 
+ // + // Example usage: + // + // std::unordered_map attrs; + // optional foo; + // attrs["foo"] = {/*required=*/false, AttrTy::kInt64, &foo}; + // optional bar; + // attrs["bar"] = {/*required=*/true, AttrTy::kWindow, &bar}; + // if (!ParseAttributes(attrs)) { + // return false; // Do not use 'foo' 'bar' if failed. + // } + // // Do something with 'bar'. + // if (foo) { // If attr foo is seen, do something with 'foo'. } + // + bool ParseAttributes(const std::unordered_map& attrs); + + // sub_attributes ::= '{' (','? attribute)* '}' + // + // Usage is the same as ParseAttributes. See immediately above. + bool ParseSubAttributes(const std::unordered_map& attrs); + + // Parses one attribute. If it has already been seen, return error. Returns + // true and adds to seen_attrs on success. + // + // Do not call this except in ParseAttributes or ParseSubAttributes. + bool ParseAttributeHelper(const std::unordered_map& attrs, + std::unordered_set* seen_attrs); + + // Parses a name and finds the corresponding hlo computation. + bool ParseComputationName(HloComputation** value); + // Parses a list of names and finds the corresponding hlo instructions. + bool ParseInstructionNames(std::vector* instructions); + // Pass expect_outer_curlies == true when parsing a Window in the context of a + // larger computation. Pass false when parsing a stand-alone Window string. + bool ParseWindow(Window* window, bool expect_outer_curlies); + bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums); + bool ParsePaddingConfig(PaddingConfig* padding); + bool ParseMetadata(OpMetadata* metadata); + bool ParseSharding(OpSharding* sharding); + bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed); + + // Parses the metadata behind a kDOmain instruction. + bool ParseDomain(DomainData* domain); + + // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3. 
+ bool ParseDxD(const string& name, std::vector* result); + // Parses window's pad sub-attriute, e.g., pad=0_0x3x3. + bool ParseWindowPad(std::vector>* pad); + + bool ParseSliceRanges(SliceRanges* result); + bool ParsePrecisionList(std::vector* result); + bool ParseInt64List(const TokKind start, const TokKind end, + const TokKind delim, + std::vector* result); + // 'parse_and_add_item' is an lambda to parse an element in the list and add + // the parsed element to the result. It's supposed to capture the result. + bool ParseList(const TokKind start, const TokKind end, const TokKind delim, + const std::function& parse_and_add_item); + + bool ParseParamListToShape(Shape* shape, LocTy* shape_loc); + bool ParseParamList(); + bool ParseName(string* result); + bool ParseAttributeName(string* result); + bool ParseString(string* result); + bool ParseShape(Shape* result); + bool ParseOpcode(HloOpcode* result); + bool ParseFftType(FftType* result); + bool ParseFusionKind(HloInstruction::FusionKind* result); + bool ParseRandomDistribution(RandomDistribution* result); + bool ParsePrecision(PrecisionConfigProto::Precision* result); + bool ParseInt64(tensorflow::int64* result); + bool ParseDouble(double* result); + bool ParseBool(bool* result); + bool ParseToken(TokKind kind, const string& msg); + + // Returns true if the current token is the beginning of a shape. + bool CanBeShape(); + // Returns true if the current token is the beginning of a + // param_list_to_shape. + bool CanBeParamListToShape(); + + // Logs the current parsing line and the given message. Always returns false. + bool TokenError(absl::string_view msg); + bool Error(LocTy loc, absl::string_view msg); + + // If the current token is 'kind', eats it (i.e. lexes the next token) and + // returns true. + bool EatIfPresent(TokKind kind); + // Parses a shape, and returns true if the result is compatible with the given + // shape. 
+ bool EatShapeAndCheckCompatible(const Shape& shape); + + // Adds the instruction to the pool. Returns false and emits an error if the + // instruction already exists. + bool AddInstruction(const string& name, HloInstruction* instruction, + LocTy name_loc); + // Adds the computation to the pool. Returns false and emits an error if the + // computation already exists. + bool AddComputation(const string& name, HloComputation* computation, + LocTy name_loc); + + // The map from the instruction/computation name to the + // instruction/computation itself and it's location. This does not own the + // pointers. + std::unordered_map> + instruction_pool_; + std::unordered_map> + computation_pool_; + + HloLexer lexer_; + std::unique_ptr module_; + std::vector> computations_; + const HloModuleConfig config_; + std::vector error_; + + // Function that gets invoked when we try to resolve an instruction + // instruction_pool_ but fail to do so. + std::function*(string, + const optional&)> + missing_instruction_hook_; +}; + +bool SplitToInt64s(absl::string_view s, char delim, std::vector* out) { + for (const auto& split : absl::StrSplit(s, delim)) { + int64 val; + if (!absl::SimpleAtoi(split, &val)) { + return false; + } + out->push_back(val); + } + return true; +} + +// Creates replica groups from the provided nested array. groups[i] represents +// the replica ids for group 'i'. 
+std::vector CreateReplicaGroups( + absl::Span> groups) { + std::vector replica_groups; + absl::c_transform(groups, std::back_inserter(replica_groups), + [](const std::vector& ids) { + ReplicaGroup group; + *group.mutable_replica_ids() = {ids.begin(), ids.end()}; + return group; + }); + return replica_groups; +} + +bool HloParser::Error(LocTy loc, absl::string_view msg) { + auto line_col = lexer_.GetLineAndColumn(loc); + const unsigned line = line_col.first; + const unsigned col = line_col.second; + std::vector error_lines; + error_lines.push_back( + StrCat("was parsing ", line, ":", col, ": error: ", msg)); + error_lines.emplace_back(lexer_.GetLine(loc)); + error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^")); + + error_.push_back(StrJoin(error_lines, "\n")); + VLOG(1) << "Error: " << error_.back(); + return false; +} + +bool HloParser::TokenError(absl::string_view msg) { + return Error(lexer_.GetLoc(), msg); +} + +bool HloParser::Run() { + lexer_.Lex(); + return ParseHloModule(); +} + +std::pair* HloParser::FindInstruction( + const string& name, const optional& shape) { + std::pair* instr = + tensorflow::gtl::FindOrNull(instruction_pool_, name); + // Potentially call the missing instruction hook. 
+ if (instr == nullptr && missing_instruction_hook_ != nullptr) { + return missing_instruction_hook_(name, shape); + } + return instr; +} + +// ::= 'HloModule' name computations +bool HloParser::ParseHloModule() { + if (lexer_.GetKind() != TokKind::kw_HloModule) { + return TokenError("expects HloModule"); + } + // Eat 'HloModule' + lexer_.Lex(); + + string name; + if (!ParseName(&name)) { + return false; + } + + module_ = absl::make_unique(name, config_); + + return ParseComputations(); +} + +// computations ::= (computation)+ +bool HloParser::ParseComputations() { + HloComputation* entry_computation = nullptr; + do { + if (!ParseComputation(&entry_computation)) { + return false; + } + } while (lexer_.GetKind() != TokKind::kEof); + + for (int i = 0; i < computations_.size(); i++) { + // If entry_computation is not nullptr, it means the computation it pointed + // to is marked with "ENTRY"; otherwise, no computation is marked with + // "ENTRY", and we use the last computation as the entry computation. We + // add the non-entry computations as embedded computations to the module. + if ((entry_computation != nullptr && + computations_[i].get() != entry_computation) || + (entry_computation == nullptr && i != computations_.size() - 1)) { + module_->AddEmbeddedComputation(std::move(computations_[i])); + continue; + } + auto computation = + module_->AddEntryComputation(std::move(computations_[i])); + // The parameters and result layouts were set to default layout. Here we + // set the layouts to what the hlo text says. 
+ for (int p = 0; p < computation->num_parameters(); p++) { + const Shape& param_shape = computation->parameter_instruction(p)->shape(); + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_parameter_layout(p) + ->CopyLayoutFromShape(param_shape)); + } + const Shape& result_shape = computation->root_instruction()->shape(); + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_result_layout() + ->CopyLayoutFromShape(result_shape)); + } + return true; +} + +// computation ::= ('ENTRY')? name (param_list_to_shape)? instruction_list +bool HloParser::ParseComputation(HloComputation** entry_computation) { + LocTy maybe_entry_loc = lexer_.GetLoc(); + const bool is_entry_computation = EatIfPresent(TokKind::kw_ENTRY); + + string name; + LocTy name_loc = lexer_.GetLoc(); + if (!ParseName(&name)) { + return false; + } + auto builder = absl::make_unique(name); + + LocTy shape_loc = nullptr; + Shape shape; + if (CanBeParamListToShape() && !ParseParamListToShape(&shape, &shape_loc)) { + return false; + } + + string root_name; + if (!ParseInstructionList(builder.get(), &root_name)) { + return false; + } + + std::pair* root_node = FindInstruction(root_name); + // This means some instruction was marked as ROOT but we didn't find it in the + // pool, which should not happen. + if (!root_name.empty() && root_node == nullptr) { + LOG(FATAL) << "instruction " << root_name + << " was marked as ROOT but the parser has not seen it before"; + } + + HloInstruction* root = root_node == nullptr ? nullptr : root_node->first; + // Now root can be either an existing instruction or a nullptr. If it's a + // nullptr, the implementation of Builder will set the last instruction as + // root instruction. 
+ computations_.emplace_back(builder->Build(root)); + HloComputation* computation = computations_.back().get(); + + if (!root) { + root = computation->root_instruction(); + } else { + CHECK_EQ(root, computation->root_instruction()); + } + + // If param_list_to_shape was present, check compatibility. + if (shape_loc != nullptr && !ShapeUtil::Compatible(root->shape(), shape)) { + return Error( + shape_loc, + StrCat("Shape of computation ", name, ", ", + ShapeUtil::HumanString(shape), + ", is not compatible with that of its root instruction ", + root_name, ", ", ShapeUtil::HumanString(root->shape()))); + } + + if (is_entry_computation) { + if (*entry_computation != nullptr) { + return Error(maybe_entry_loc, "expects only one ENTRY"); + } + *entry_computation = computation; + } + instruction_pool_.clear(); + + return AddComputation(name, computation, name_loc); +} + +// instruction_list ::= '{' instruction_list1 '}' +// instruction_list1 ::= (instruction)+ +bool HloParser::ParseInstructionList(HloComputation::Builder* builder, + string* root_name) { + if (!ParseToken(TokKind::kLbrace, + "expects '{' at the beginning of instruction list.")) { + return false; + } + do { + if (!ParseInstruction(builder, root_name)) { + return false; + } + } while (lexer_.GetKind() != TokKind::kRbrace); + return ParseToken(TokKind::kRbrace, + "expects '}' at the end of instruction list."); +} + +// instruction ::= ('ROOT')? 
name '=' shape opcode operands (attribute)* +bool HloParser::ParseInstruction(HloComputation::Builder* builder, + string* root_name) { + string name; + Shape shape; + HloOpcode opcode; + std::vector operands; + + LocTy maybe_root_loc = lexer_.GetLoc(); + bool is_root = EatIfPresent(TokKind::kw_ROOT); + + const LocTy name_loc = lexer_.GetLoc(); + if (!ParseName(&name) || + !ParseToken(TokKind::kEqual, "expects '=' in instruction") || + !ParseShape(&shape) || !ParseOpcode(&opcode)) { + return false; + } + + if (is_root) { + if (!root_name->empty()) { + return Error(maybe_root_loc, "one computation should have only one ROOT"); + } + *root_name = name; + } + + // Add optional attributes. + std::unordered_map attrs; + optional sharding; + attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding}; + optional> predecessors; + attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList, + &predecessors}; + optional metadata; + attrs["metadata"] = {/*required=*/false, AttrTy::kMetadata, &metadata}; + + optional backend_config; + attrs["backend_config"] = {/*required=*/false, AttrTy::kString, + &backend_config}; + + optional> operand_precision; + attrs["operand_precision"] = {/*required=*/false, AttrTy::kPrecisionList, + &operand_precision}; + + HloInstruction* instruction; + switch (opcode) { + case HloOpcode::kParameter: { + tensorflow::int64 parameter_number; + if (!ParseToken(TokKind::kLparen, + "expects '(' before parameter number") || + !ParseInt64(¶meter_number) || + !ParseToken(TokKind::kRparen, "expects ')' after parameter number") || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateParameter(parameter_number, shape, name)); + break; + } + case HloOpcode::kConstant: { + std::unique_ptr literal; + if (!ParseToken(TokKind::kLparen, + "expects '(' before constant literal") || + !ParseLiteral(&literal, shape) || + !ParseToken(TokKind::kRparen, "expects ')' after constant 
literal") || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + break; + } + case HloOpcode::kIota: { + optional iota_dimension; + attrs["iota_dimension"] = {/*required=*/true, AttrTy::kInt64, + &iota_dimension}; + if (!ParseOperands(&operands, /*expected_size=*/0) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateIota(shape, *iota_dimension)); + break; + } + // Unary ops. + case HloOpcode::kAbs: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kBitcast: + case HloOpcode::kCeil: + case HloOpcode::kClz: + case HloOpcode::kCopy: + case HloOpcode::kCos: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kFloor: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kNot: + case HloOpcode::kNegate: + case HloOpcode::kReal: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kTanh: { + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateUnary(shape, opcode, operands[0])); + break; + } + // Binary ops. 
+ case HloOpcode::kAdd: + case HloOpcode::kDivide: + case HloOpcode::kMultiply: + case HloOpcode::kSubtract: + case HloOpcode::kAtan2: + case HloOpcode::kComplex: + case HloOpcode::kEq: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kLe: + case HloOpcode::kLt: + case HloOpcode::kNe: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kAnd: + case HloOpcode::kOr: + case HloOpcode::kXor: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: { + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateBinary( + shape, opcode, operands[0], operands[1])); + break; + } + // Ternary ops. + case HloOpcode::kClamp: + case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: { + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateTernary( + shape, opcode, operands[0], operands[1], operands[2])); + break; + } + // Other supported ops. 
+ case HloOpcode::kConvert: { + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateConvert(shape, operands[0])); + break; + } + case HloOpcode::kBitcastConvert: { + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateBitcastConvert(shape, operands[0])); + break; + } + case HloOpcode::kCrossReplicaSum: { + optional>> tmp_groups; + optional to_apply; + optional> replica_group_ids; + optional barrier; + optional all_reduce_id; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &to_apply}; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + attrs["barrier"] = {/*required=*/false, AttrTy::kString, &barrier}; + attrs["all_reduce_id"] = {/*required=*/false, AttrTy::kInt64, + &all_reduce_id}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + replica_groups = CreateReplicaGroups(*tmp_groups); + } + instruction = + builder->AddInstruction(HloInstruction::CreateCrossReplicaSum( + shape, operands, *to_apply, replica_groups, + barrier ? 
*barrier : "", all_reduce_id)); + break; + } + case HloOpcode::kAllToAll: { + optional>> tmp_groups; + optional barrier; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + replica_groups = CreateReplicaGroups(*tmp_groups); + } + instruction = builder->AddInstruction( + HloInstruction::CreateAllToAll(shape, operands, replica_groups)); + break; + } + case HloOpcode::kCollectivePermute: { + optional>> source_targets; + attrs["source_target_pairs"] = { + /*required=*/true, AttrTy::kBracedInt64ListList, &source_targets}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + std::vector> pairs(source_targets->size()); + for (int i = 0; i < pairs.size(); i++) { + if ((*source_targets)[i].size() != 2) { + return TokenError( + "expects 'source_target_pairs=' to be a list of pairs"); + } + pairs[i].first = (*source_targets)[i][0]; + pairs[i].second = (*source_targets)[i][1]; + } + instruction = builder->AddInstruction( + HloInstruction::CreateCollectivePermute(shape, operands[0], pairs)); + break; + } + case HloOpcode::kReshape: { + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateReshape(shape, operands[0])); + break; + } + case HloOpcode::kAfterAll: { + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + if (operands.empty()) { + instruction = builder->AddInstruction(HloInstruction::CreateToken()); + } else { + instruction = + builder->AddInstruction(HloInstruction::CreateAfterAll(operands)); + } + break; + } + case HloOpcode::kSort: { + auto loc = lexer_.GetLoc(); + + optional> dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + if 
(!ParseOperands(&operands) || !ParseAttributes(attrs) || + dimensions->size() != 1) { + return false; + } + switch (operands.size()) { + case 1: + instruction = builder->AddInstruction(HloInstruction::CreateSort( + shape, dimensions->at(0), /*keys=*/operands[0])); + break; + case 2: + instruction = builder->AddInstruction(HloInstruction::CreateSort( + shape, dimensions->at(0), + /*keys=*/operands[0], /*values=*/operands[1])); + break; + default: + return Error(loc, StrCat("expects either 1 or 2 operands, but has ", + operands.size(), " operands")); + } + break; + } + case HloOpcode::kTuple: { + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateTuple(operands)); + break; + } + case HloOpcode::kWhile: { + optional condition; + optional body; + attrs["condition"] = {/*required=*/true, AttrTy::kHloComputation, + &condition}; + attrs["body"] = {/*required=*/true, AttrTy::kHloComputation, &body}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateWhile( + shape, *condition, *body, /*init=*/operands[0])); + break; + } + case HloOpcode::kRecv: { + optional channel_id; + // If the is_host_transfer attribute is not present then default to false. + optional is_host_transfer = false; + attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; + attrs["is_host_transfer"] = {/*required=*/false, AttrTy::kBool, + &is_host_transfer}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + // If the is_host_transfer attribute is not present then default to false. 
+ instruction = builder->AddInstruction(HloInstruction::CreateRecv( + shape.tuple_shapes(0), operands[0], *channel_id, *is_host_transfer)); + break; + } + case HloOpcode::kRecvDone: { + optional channel_id; + // If the is_host_transfer attribute is not present then default to false. + optional is_host_transfer = false; + attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; + attrs["is_host_transfer"] = {/*required=*/false, AttrTy::kBool, + &is_host_transfer}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + if (channel_id != operands[0]->channel_id()) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateRecvDone(operands[0], *is_host_transfer)); + break; + } + case HloOpcode::kSend: { + optional channel_id; + // If the is_host_transfer attribute is not present then default to false. + optional is_host_transfer = false; + attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; + attrs["is_host_transfer"] = {/*required=*/false, AttrTy::kBool, + &is_host_transfer}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateSend( + operands[0], operands[1], *channel_id, *is_host_transfer)); + break; + } + case HloOpcode::kSendDone: { + optional channel_id; + // If the is_host_transfer attribute is not present then default to false. 
+ optional is_host_transfer = false; + attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; + attrs["is_host_transfer"] = {/*required=*/false, AttrTy::kBool, + &is_host_transfer}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + if (channel_id != operands[0]->channel_id()) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateSendDone(operands[0], *is_host_transfer)); + break; + } + case HloOpcode::kGetTupleElement: { + optional index; + attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateGetTupleElement(shape, operands[0], *index)); + break; + } + case HloOpcode::kCall: { + optional to_apply; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &to_apply}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateCall(shape, operands, *to_apply)); + break; + } + case HloOpcode::kReduceWindow: { + optional reduce_computation; + optional window; + attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &reduce_computation}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + if (!window) { + window.emplace(); + } + instruction = builder->AddInstruction(HloInstruction::CreateReduceWindow( + shape, /*operand=*/operands[0], /*init_value=*/operands[1], *window, + *reduce_computation)); + break; + } + case HloOpcode::kConvolution: { + optional window; + optional dnums; + optional feature_group_count; + attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; + attrs["dim_labels"] = {/*required=*/true, + AttrTy::kConvolutionDimensionNumbers, 
&dnums}; + attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64, + &feature_group_count}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + if (!window) { + window.emplace(); + } + if (!feature_group_count) { + feature_group_count = 1; + } + instruction = builder->AddInstruction(HloInstruction::CreateConvolve( + shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums, + feature_group_count.value())); + break; + } + case HloOpcode::kFft: { + optional fft_type; + optional> fft_length; + attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type}; + attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List, + &fft_length}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateFft( + shape, operands[0], *fft_type, *fft_length)); + break; + } + case HloOpcode::kBroadcast: { + optional> broadcast_dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &broadcast_dimensions}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateBroadcast( + shape, operands[0], *broadcast_dimensions)); + break; + } + case HloOpcode::kConcatenate: { + optional> dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs) || + dimensions->size() != 1) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateConcatenate( + shape, operands, dimensions->at(0))); + break; + } + case HloOpcode::kMap: { + optional to_apply; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &to_apply}; + optional> dimensions; + attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List, + &dimensions}; + if 
(!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateMap(shape, operands, *to_apply)); + break; + } + case HloOpcode::kReduce: { + auto loc = lexer_.GetLoc(); + + optional reduce_computation; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &reduce_computation}; + optional> dimensions_to_reduce; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions_to_reduce}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + if (operands.size() % 2) { + return Error(loc, StrCat("expects an even number of operands, but has ", + operands.size(), " operands")); + } + instruction = builder->AddInstruction(HloInstruction::CreateReduce( + shape, /*operands=*/ + absl::Span(operands).subspan( + 0, operands.size() / 2), + /*init_values=*/ + absl::Span(operands).subspan( + operands.size() / 2, operands.size()), + *dimensions_to_reduce, *reduce_computation)); + break; + } + case HloOpcode::kReverse: { + optional> dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateReverse(shape, operands[0], *dimensions)); + break; + } + case HloOpcode::kSelectAndScatter: { + optional select; + attrs["select"] = {/*required=*/true, AttrTy::kHloComputation, &select}; + optional scatter; + attrs["scatter"] = {/*required=*/true, AttrTy::kHloComputation, &scatter}; + optional window; + attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + if (!window) { + window.emplace(); + } + instruction = + builder->AddInstruction(HloInstruction::CreateSelectAndScatter( + shape, /*operand=*/operands[0], *select, *window, + 
/*source=*/operands[1], /*init_value=*/operands[2], *scatter)); + break; + } + case HloOpcode::kSlice: { + optional slice_ranges; + attrs["slice"] = {/*required=*/true, AttrTy::kSliceRanges, &slice_ranges}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateSlice( + shape, operands[0], slice_ranges->starts, slice_ranges->limits, + slice_ranges->strides)); + break; + } + case HloOpcode::kDynamicSlice: { + optional> dynamic_slice_sizes; + attrs["dynamic_slice_sizes"] = { + /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice( + shape, /*operand=*/operands[0], /*start_indices=*/operands[1], + *dynamic_slice_sizes)); + break; + } + case HloOpcode::kDynamicUpdateSlice: { + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + shape, /*operand=*/operands[0], /*update=*/operands[1], + /*start_indices=*/operands[2])); + break; + } + case HloOpcode::kTranspose: { + optional> dimensions; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateTranspose(shape, operands[0], *dimensions)); + break; + } + case HloOpcode::kBatchNormTraining: { + optional epsilon; + attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; + optional feature_index; + attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, + &feature_index}; + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + 
instruction = + builder->AddInstruction(HloInstruction::CreateBatchNormTraining( + shape, /*operand=*/operands[0], /*scale=*/operands[1], + /*offset=*/operands[2], *epsilon, *feature_index)); + break; + } + case HloOpcode::kBatchNormInference: { + optional epsilon; + attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; + optional feature_index; + attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, + &feature_index}; + if (!ParseOperands(&operands, /*expected_size=*/5) || + !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateBatchNormInference( + shape, /*operand=*/operands[0], /*scale=*/operands[1], + /*offset=*/operands[2], /*mean=*/operands[3], + /*variance=*/operands[4], *epsilon, *feature_index)); + break; + } + case HloOpcode::kBatchNormGrad: { + optional epsilon; + attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; + optional feature_index; + attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, + &feature_index}; + if (!ParseOperands(&operands, /*expected_size=*/5) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateBatchNormGrad( + shape, /*operand=*/operands[0], /*scale=*/operands[1], + /*mean=*/operands[2], /*variance=*/operands[3], + /*grad_output=*/operands[4], *epsilon, *feature_index)); + break; + } + case HloOpcode::kPad: { + optional padding; + attrs["padding"] = {/*required=*/true, AttrTy::kPaddingConfig, &padding}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreatePad( + shape, operands[0], /*padding_value=*/operands[1], *padding)); + break; + } + case HloOpcode::kFusion: { + optional fusion_computation; + attrs["calls"] = {/*required=*/true, AttrTy::kHloComputation, + &fusion_computation}; + optional fusion_kind; + attrs["kind"] = {/*required=*/true, 
AttrTy::kFusionKind, &fusion_kind}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateFusion( + shape, *fusion_kind, operands, *fusion_computation)); + break; + } + case HloOpcode::kInfeed: { + optional config; + attrs["infeed_config"] = {/*required=*/false, AttrTy::kString, &config}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + // We need to know the infeed data shape to construct the infeed + // instruction. This is the zero-th element of the tuple-shaped output of + // the infeed instruction. ShapeUtil::GetTupleElementShape will check fail + // if the shape is not a non-empty tuple, so add guard so an error message + // can be emitted instead of a check fail + if (!ShapeUtil::IsTuple(shape) && !ShapeUtil::IsEmptyTuple(shape)) { + return Error(lexer_.GetLoc(), + "infeed must have a non-empty tuple shape"); + } + instruction = builder->AddInstruction(HloInstruction::CreateInfeed( + ShapeUtil::GetTupleElementShape(shape, 0), operands[0], + config ? *config : "")); + break; + } + case HloOpcode::kOutfeed: { + optional config; + attrs["outfeed_config"] = {/*required=*/false, AttrTy::kString, &config}; + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateOutfeed(operands[0]->shape(), operands[0], + operands[1], config ? 
*config : "")); + break; + } + case HloOpcode::kRng: { + optional distribution; + attrs["distribution"] = {/*required=*/true, AttrTy::kDistribution, + &distribution}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateRng(shape, *distribution, operands)); + break; + } + case HloOpcode::kReducePrecision: { + optional exponent_bits; + optional mantissa_bits; + attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64, + &exponent_bits}; + attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64, + &mantissa_bits}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateReducePrecision( + shape, operands[0], static_cast(*exponent_bits), + static_cast(*mantissa_bits))); + break; + } + case HloOpcode::kConditional: { + optional true_computation; + optional false_computation; + attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation, + &true_computation}; + attrs["false_computation"] = {/*required=*/true, AttrTy::kHloComputation, + &false_computation}; + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateConditional( + shape, /*pred=*/operands[0], + /*true_computation_arg=*/operands[1], *true_computation, + /*false_computation_arg=*/operands[2], *false_computation)); + break; + } + case HloOpcode::kCustomCall: { + optional custom_call_target; + optional window; + optional dnums; + attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString, + &custom_call_target}; + attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; + attrs["dim_labels"] = {/*required=*/false, + AttrTy::kConvolutionDimensionNumbers, &dnums}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = 
builder->AddInstruction(HloInstruction::CreateCustomCall( + shape, operands, *custom_call_target)); + if (window.has_value()) { + instruction->set_window(*window); + } + if (dnums.has_value()) { + instruction->set_convolution_dimension_numbers(*dnums); + } + break; + } + case HloOpcode::kDot: { + optional> lhs_contracting_dims; + attrs["lhs_contracting_dims"] = { + /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims}; + optional> rhs_contracting_dims; + attrs["rhs_contracting_dims"] = { + /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims}; + optional> lhs_batch_dims; + attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, + &lhs_batch_dims}; + optional> rhs_batch_dims; + attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, + &rhs_batch_dims}; + + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + + DotDimensionNumbers dnum; + if (lhs_contracting_dims) { + *dnum.mutable_lhs_contracting_dimensions() = { + lhs_contracting_dims->begin(), lhs_contracting_dims->end()}; + } + if (rhs_contracting_dims) { + *dnum.mutable_rhs_contracting_dimensions() = { + rhs_contracting_dims->begin(), rhs_contracting_dims->end()}; + } + if (lhs_batch_dims) { + *dnum.mutable_lhs_batch_dimensions() = {lhs_batch_dims->begin(), + lhs_batch_dims->end()}; + } + if (rhs_batch_dims) { + *dnum.mutable_rhs_batch_dimensions() = {rhs_batch_dims->begin(), + rhs_batch_dims->end()}; + } + + instruction = builder->AddInstruction( + HloInstruction::CreateDot(shape, operands[0], operands[1], dnum)); + break; + } + case HloOpcode::kGather: { + optional> offset_dims; + attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List, + &offset_dims}; + optional> collapsed_slice_dims; + attrs["collapsed_slice_dims"] = { + /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims}; + optional> start_index_map; + attrs["start_index_map"] = {/*required=*/true, 
AttrTy::kBracedInt64List, + &start_index_map}; + optional index_vector_dim; + attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64, + &index_vector_dim}; + optional> slice_sizes; + attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List, + &slice_sizes}; + + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + + GatherDimensionNumbers dim_numbers = + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/*offset_dims, + /*collapsed_slice_dims=*/*collapsed_slice_dims, + /*start_index_map=*/*start_index_map, + /*index_vector_dim=*/*index_vector_dim); + + instruction = builder->AddInstruction(HloInstruction::CreateGather( + shape, /*operand=*/operands[0], /*start_indices=*/operands[1], + dim_numbers, *slice_sizes)); + break; + } + case HloOpcode::kScatter: { + optional> update_window_dims; + attrs["update_window_dims"] = { + /*required=*/true, AttrTy::kBracedInt64List, &update_window_dims}; + optional> inserted_window_dims; + attrs["inserted_window_dims"] = { + /*required=*/true, AttrTy::kBracedInt64List, &inserted_window_dims}; + optional> scatter_dims_to_operand_dims; + attrs["scatter_dims_to_operand_dims"] = {/*required=*/true, + AttrTy::kBracedInt64List, + &scatter_dims_to_operand_dims}; + optional index_vector_dim; + attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64, + &index_vector_dim}; + + optional update_computation; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &update_computation}; + + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + + ScatterDimensionNumbers dim_numbers = + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/*update_window_dims, + /*inserted_window_dims=*/*inserted_window_dims, + /*scatter_dims_to_operand_dims=*/*scatter_dims_to_operand_dims, + /*index_vector_dim=*/*index_vector_dim); + + instruction = 
builder->AddInstruction(HloInstruction::CreateScatter( + shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], + /*updates=*/operands[2], *update_computation, dim_numbers)); + break; + } + case HloOpcode::kDomain: { + DomainData domain; + attrs["domain"] = {/*required=*/true, AttrTy::kDomain, &domain}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateDomain( + shape, operands[0], std::move(domain.exit_metadata), + std::move(domain.entry_metadata))); + break; + } + case HloOpcode::kTrace: + return TokenError(StrCat("parsing not yet implemented for op: ", + HloOpcodeString(opcode))); + } + + instruction->SetAndSanitizeName(name); + if (instruction->name() != name) { + return Error(name_loc, + StrCat("illegal instruction name: ", name, + "; suggest renaming to: ", instruction->name())); + } + + // Add shared attributes like metadata to the instruction, if they were seen. 
+ if (sharding) { + instruction->set_sharding( + HloSharding::FromProto(sharding.value()).ValueOrDie()); + } + if (predecessors) { + for (auto* pre : *predecessors) { + Status status = pre->AddControlDependencyTo(instruction); + if (!status.ok()) { + return Error(name_loc, StrCat("error adding control dependency for: ", + name, " status: ", status.ToString())); + } + } + } + if (metadata) { + instruction->set_metadata(*metadata); + } + if (backend_config) { + instruction->set_raw_backend_config_string(std::move(*backend_config)); + } + if (operand_precision) { + PrecisionConfigProto precision_config; + *precision_config.mutable_operand_precision() = {operand_precision->begin(), + operand_precision->end()}; + instruction->set_precision_config(precision_config); + } + return AddInstruction(name, instruction, name_loc); +} // NOLINT(readability/fn_size) + +// ::= '{' (single_sharding | tuple_sharding) '}' +// +// tuple_sharding ::= single_sharding* (',' single_sharding)* +bool HloParser::ParseSharding(OpSharding* sharding) { + // A single sharding starts with '{' and is not followed by '{'. + // A tuple sharding starts with '{' and is followed by '{', or is '{''}' for + // an empty tuple. + if (!ParseToken(TokKind::kLbrace, + "expected '{' to start sharding attribute")) { + return false; + } + + if (lexer_.GetKind() != TokKind::kLbrace && + lexer_.GetKind() != TokKind::kRbrace) { + return ParseSingleSharding(sharding, /*lbrace_pre_lexed=*/true); + } + + // Tuple sharding. + // Allow empty tuple shardings. + if (lexer_.GetKind() != TokKind::kRbrace) { + do { + if (!ParseSingleSharding(sharding->add_tuple_shardings(), + /*lbrace_pre_lexed=*/false)) { + return false; + } + } while (EatIfPresent(TokKind::kComma)); + } + sharding->set_type(OpSharding::Type::OpSharding_Type_TUPLE); + + return ParseToken(TokKind::kRbrace, "expected '}' to end sharding attribute"); +} + +// ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape? +// ('devices=' ('[' dims ']')* device_list)? 
'}' +// dims ::= int_list device_list ::= int_list +bool HloParser::ParseSingleSharding(OpSharding* sharding, + bool lbrace_pre_lexed) { + if (!lbrace_pre_lexed && + !ParseToken(TokKind::kLbrace, + "expected '{' to start sharding attribute")) { + return false; + } + + LocTy loc = lexer_.GetLoc(); + bool maximal = false; + bool replicated = false; + std::vector devices; + std::vector tile_assignment_dimensions; + while (lexer_.GetKind() != TokKind::kRbrace) { + switch (lexer_.GetKind()) { + case TokKind::kw_maximal: + maximal = true; + lexer_.Lex(); + break; + case TokKind::kw_replicated: + replicated = true; + lexer_.Lex(); + break; + case TokKind::kAttributeName: { + if (lexer_.GetStrVal() == "device") { + if (lexer_.Lex() != TokKind::kInt) { + return TokenError("device= attribute must be an integer"); + } + devices = {lexer_.GetInt64Val()}; + lexer_.Lex(); + } else if (lexer_.GetStrVal() == "devices") { + lexer_.Lex(); + if (!ParseToken(TokKind::kLsquare, + "expected '[' to start sharding devices shape")) { + return false; + } + + do { + tensorflow::int64 dim; + if (!ParseInt64(&dim)) { + return false; + } + tile_assignment_dimensions.push_back(dim); + } while (EatIfPresent(TokKind::kComma)); + + if (!ParseToken(TokKind::kRsquare, + "expected ']' to start sharding devices shape")) { + return false; + } + do { + tensorflow::int64 device; + if (!ParseInt64(&device)) { + return false; + } + devices.push_back(device); + } while (EatIfPresent(TokKind::kComma)); + } else { + return TokenError( + "unknown attribute in sharding: expected device= or devices="); + } + break; + } + case TokKind::kShape: + // TODO(b/112302613): Left here for backward compatibility to ignore the + // removed tile shape data. 
+ lexer_.Lex(); + break; + case TokKind::kRbrace: + break; + default: + return TokenError("unexpected token"); + } + } + + if (replicated) { + if (!devices.empty()) { + return Error(loc, + "replicated shardings should not have any devices assigned"); + } + sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED); + } else if (maximal) { + if (devices.size() != 1) { + return Error(loc, + "maximal shardings should have exactly one device assigned"); + } + sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL); + sharding->add_tile_assignment_devices(devices[0]); + } else { + if (devices.size() <= 1) { + return Error( + loc, "non-maximal shardings must have more than one device assigned"); + } + if (tile_assignment_dimensions.empty()) { + return Error( + loc, + "non-maximal shardings must have a tile assignment list including " + "dimensions"); + } + sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER); + for (tensorflow::int64 dim : tile_assignment_dimensions) { + sharding->add_tile_assignment_dimensions(dim); + } + for (tensorflow::int64 device : devices) { + sharding->add_tile_assignment_devices(device); + } + } + + lexer_.Lex(); + return true; +} + +// domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ',' +// 'exit=' exit_sharding '}' +bool HloParser::ParseDomain(DomainData* domain) { + std::unordered_map attrs; + optional kind; + optional entry_sharding; + optional exit_sharding; + attrs["kind"] = {/*required=*/true, AttrTy::kString, &kind}; + attrs["entry"] = {/*required=*/true, AttrTy::kSharding, &entry_sharding}; + attrs["exit"] = {/*required=*/true, AttrTy::kSharding, &exit_sharding}; + if (!ParseSubAttributes(attrs)) { + return false; + } + if (*kind == ShardingMetadata::KindName()) { + auto entry_sharding_ptr = absl::make_unique( + HloSharding::FromProto(*entry_sharding).ValueOrDie()); + auto exit_sharding_ptr = absl::make_unique( + HloSharding::FromProto(*exit_sharding).ValueOrDie()); + domain->entry_metadata = + 
absl::make_unique(std::move(entry_sharding_ptr)); + domain->exit_metadata = + absl::make_unique(std::move(exit_sharding_ptr)); + } else { + return TokenError(StrCat("unsupported domain kind: ", *kind)); + } + return true; +} + +// '{' name+ '}' +bool HloParser::ParseInstructionNames( + std::vector* instructions) { + if (!ParseToken(TokKind::kLbrace, + "expects '{' at the beginning of instruction name list")) { + return false; + } + LocTy loc = lexer_.GetLoc(); + do { + string name; + if (!ParseName(&name)) { + return Error(loc, "expects a instruction name"); + } + std::pair* instr = FindInstruction(name); + if (!instr) { + return TokenError(StrFormat("instruction '%s' is not defined", name)); + } + instructions->push_back(instr->first); + } while (EatIfPresent(TokKind::kComma)); + + return ParseToken(TokKind::kRbrace, + "expects '}' at the end of instruction name list"); +} + +bool HloParser::SetValueInLiteral(tensorflow::int64 value, + tensorflow::int64 linear_index, + Literal* literal) { + const Shape& shape = literal->shape(); + switch (shape.element_type()) { + case S8: + return SetValueInLiteralHelper(value, linear_index, + literal); + case S16: + return SetValueInLiteralHelper(value, linear_index, + literal); + case S32: + return SetValueInLiteralHelper(value, linear_index, + literal); + case S64: + return SetValueInLiteralHelper(value, linear_index, + literal); + case U8: + return SetValueInLiteralHelper(value, linear_index, + literal); + case U16: + return SetValueInLiteralHelper(value, linear_index, + literal); + case U32: + return SetValueInLiteralHelper(value, linear_index, + literal); + case U64: + return SetValueInLiteralHelper(value, linear_index, + literal); + default: + LOG(FATAL) << "unknown integral primitive type " + << PrimitiveType_Name(shape.element_type()); + } +} + +bool HloParser::SetValueInLiteral(double value, tensorflow::int64 linear_index, + Literal* literal) { + const Shape& shape = literal->shape(); + switch (shape.element_type()) { + 
case F16: + return SetValueInLiteralHelper(value, linear_index, literal); + case BF16: + return SetValueInLiteralHelper(value, linear_index, + literal); + case F32: + return SetValueInLiteralHelper(value, linear_index, literal); + case F64: + return SetValueInLiteralHelper(value, linear_index, literal); + default: + LOG(FATAL) << "unknown floating point primitive type " + << PrimitiveType_Name(shape.element_type()); + } +} + +bool HloParser::SetValueInLiteral(bool value, tensorflow::int64 linear_index, + Literal* literal) { + const Shape& shape = literal->shape(); + switch (shape.element_type()) { + case PRED: + return SetValueInLiteralHelper(value, linear_index, literal); + default: + LOG(FATAL) << PrimitiveType_Name(shape.element_type()) + << " is not PRED type"; + } +} + +template +bool HloParser::SetValueInLiteralHelper(ParsedElemT value, + tensorflow::int64 linear_index, + Literal* literal) { + // Check that linear_index is in range. + if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) { + return TokenError( + StrCat("trys to set value ", value, " to a literal in shape ", + ShapeUtil::HumanString(literal->shape()), " at linear index ", + linear_index, ", but the index is out of range")); + } + + if (std::isnan(value) || + (std::numeric_limits::has_infinity && + (std::numeric_limits::infinity() == value || + -std::numeric_limits::infinity() == value))) { + // Skip range checking for non-finite value. 
+ } else if (literal->shape().element_type() == F16 || + literal->shape().element_type() == BF16) { + if (value > kF16max || value < -kF16max) { + return TokenError(StrCat( + "value ", value, " is out of range for literal's primitive type ", + PrimitiveType_Name(literal->shape().element_type()))); + } + } else if (std::is_unsigned::value) { + CHECK((std::is_same::value || + std::is_same::value)) + << "Unimplemented checking for ParsedElemT"; + + ParsedElemT upper_bound; + if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) { + upper_bound = std::numeric_limits::max(); + } else { + upper_bound = + static_cast(std::numeric_limits::max()); + } + if (value > upper_bound || value < 0) { + // Value is out of range for LiteralNativeT. + return TokenError(StrCat( + "value ", value, " is out of range for literal's primitive type ", + PrimitiveType_Name(literal->shape().element_type()))); + } + } else if (value > static_cast( + std::numeric_limits::max()) || + value < static_cast( + std::numeric_limits::lowest())) { + // Value is out of range for LiteralNativeT. + return TokenError(StrCat( + "value ", value, " is out of range for literal's primitive type ", + PrimitiveType_Name(literal->shape().element_type()))); + } + + literal->data().at(linear_index) = + static_cast(value); + return true; +} + +bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) { + Shape new_shape; + if (!ParseShape(&new_shape)) { + return TokenError(StrCat("expects shape ", ShapeUtil::HumanString(shape))); + } + if (!ShapeUtil::Compatible(shape, new_shape)) { + return TokenError(StrCat( + "expects shape ", ShapeUtil::HumanString(shape), + ", but sees a different shape: ", ShapeUtil::HumanString(new_shape))); + } + return true; +} + +// literal +// ::= tuple +// ::= non_tuple +bool HloParser::ParseLiteral(std::unique_ptr* literal, + const Shape& shape) { + return ShapeUtil::IsTuple(shape) ? 
ParseTupleLiteral(literal, shape) + : ParseNonTupleLiteral(literal, shape); +} + +// tuple +// ::= shape '(' literal_list ')' +// literal_list +// ::= /*empty*/ +// ::= literal (',' literal)* +bool HloParser::ParseTupleLiteral(std::unique_ptr* literal, + const Shape& shape) { + if (!EatShapeAndCheckCompatible(shape)) { + return TokenError(StrCat("expects tuple constant in shape ", + ShapeUtil::HumanString(shape))); + } + if (!ParseToken(TokKind::kLparen, "expects '(' in front of tuple elements")) { + return false; + } + std::vector> elements( + ShapeUtil::TupleElementCount(shape)); + + if (lexer_.GetKind() == TokKind::kRparen) { + // empty + } else { + // literal, (',' literal)* + for (int i = 0; i < elements.size(); i++) { + if (i > 0) { + ParseToken(TokKind::kComma, "exepcts ',' to separate tuple elements"); + } + if (!ParseLiteral(&elements[i], + ShapeUtil::GetTupleElementShape(shape, i))) { + return TokenError(StrCat("expects the ", i, "th element")); + } + } + } + *literal = LiteralUtil::MakeTupleOwned(std::move(elements)); + return ParseToken(TokKind::kRparen, + StrCat("expects ')' at the end of the tuple with ", + ShapeUtil::TupleElementCount(shape), "elements")); +} + +// non_tuple +// ::= rank01 +// ::= rank2345 +// rank2345 ::= shape sparse_or_nested_array +bool HloParser::ParseNonTupleLiteral(std::unique_ptr* literal, + const Shape& shape) { + if (LayoutUtil::IsSparseArray(shape)) { + return ParseSparseLiteral(literal, shape); + } + + CHECK(LayoutUtil::IsDenseArray(shape)); + return ParseDenseLiteral(literal, shape); +} + +bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, + const Shape& shape) { + const tensorflow::int64 rank = ShapeUtil::Rank(shape); + if (rank > 1 && !EatShapeAndCheckCompatible(shape)) { + return false; + } + + // Create a literal with the given shape in default layout. 
+ *literal = LiteralUtil::CreateFromDimensions( + shape.element_type(), AsInt64Slice(shape.dimensions())); + tensorflow::int64 nest_level = 0; + tensorflow::int64 linear_index = 0; + // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for + // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}}, + // when we are parsing the 2nd '{' (right before '1'), we are seeing a + // sub-array of the dimension 0, so elems_seen_per_dim[0]++. When we are at + // the first '}' (right after '3'), it means the sub-array ends, and the + // sub-array is supposed to contain exactly 3 elements, so check if + // elems_seen_per_dim[1] is 3. + std::vector elems_seen_per_dim(rank); + auto get_index_str = [&elems_seen_per_dim](int dim) -> string { + std::vector elems_seen_until_dim( + elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim); + return StrCat("[", + StrJoin(elems_seen_until_dim, ",", + [](string* out, const tensorflow::int64& num_elems) { + StrAppend(out, num_elems - 1); + }), + "]"); + }; + do { + switch (lexer_.GetKind()) { + default: + return TokenError("unexpected token type in a literal"); + case TokKind::kLbrace: { + nest_level++; + if (nest_level > rank) { + return TokenError(absl::StrFormat( + "expects nested array in rank %d, but sees larger", rank)); + } + if (nest_level > 1) { + elems_seen_per_dim[nest_level - 2]++; + if (elems_seen_per_dim[nest_level - 2] > + shape.dimensions(nest_level - 2)) { + return TokenError(absl::StrFormat( + "expects %d elements in the %sth element, but sees more", + shape.dimensions(nest_level - 2), + get_index_str(nest_level - 2))); + } + } + lexer_.Lex(); + break; + } + case TokKind::kRbrace: { + nest_level--; + if (elems_seen_per_dim[nest_level] != shape.dimensions(nest_level)) { + return TokenError(absl::StrFormat( + "expects %d elements in the %sth element, but sees %d", + shape.dimensions(nest_level), get_index_str(nest_level), + elems_seen_per_dim[nest_level])); + } + 
elems_seen_per_dim[nest_level] = 0; + lexer_.Lex(); + break; + } + case TokKind::kComma: + // Skip. + lexer_.Lex(); + break; + case TokKind::kw_true: + case TokKind::kw_false: + case TokKind::kInt: + case TokKind::kDecimal: + case TokKind::kw_nan: + case TokKind::kw_inf: + case TokKind::kNegInf: { + if (rank > 0) { + if (nest_level != rank) { + return TokenError( + absl::StrFormat("expects nested array in rank %d, but sees %d", + rank, nest_level)); + } + elems_seen_per_dim[rank - 1]++; + if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) { + return TokenError(absl::StrFormat( + "expects %d elements on the minor-most dimension, but " + "sees more", + shape.dimensions(rank - 1))); + } + } + if (lexer_.GetKind() == TokKind::kw_true || + lexer_.GetKind() == TokKind::kw_false) { + // TODO(congliu): bool type literals with rank >= 1 are actually + // printed in a compact form instead of "true" or "false". Fix that. + if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true, + linear_index++, literal->get())) { + return false; + } + lexer_.Lex(); + } else if (primitive_util::IsIntegralType(shape.element_type())) { + LocTy loc = lexer_.GetLoc(); + tensorflow::int64 value; + if (!ParseInt64(&value)) { + return Error(loc, StrCat("expects integer for primitive type: ", + PrimitiveType_Name(shape.element_type()))); + } + if (!SetValueInLiteral(value, linear_index++, literal->get())) { + return false; + } + } else if (primitive_util::IsFloatingPointType(shape.element_type())) { + LocTy loc = lexer_.GetLoc(); + double value; + if (!ParseDouble(&value)) { + return Error( + loc, StrCat("expect floating point value for primitive type: ", + PrimitiveType_Name(shape.element_type()))); + } + if (!SetValueInLiteral(value, linear_index++, literal->get())) { + return false; + } + } else { + return TokenError(StrCat("unsupported primitive type ", + PrimitiveType_Name(shape.element_type()))); + } + break; + } + } // end of switch + } while (nest_level > 0); + + *literal = 
(*literal)->Relayout(shape.layout()); + return true; +} + +bool HloParser::ParseSparseLiteral(std::unique_ptr* literal, + const Shape& shape) { + if (!EatShapeAndCheckCompatible(shape)) { + return false; + } + + switch (shape.element_type()) { + case PRED: + return ParseSparseLiteralHelper(literal, shape); + case S8: + return ParseSparseLiteralHelper(literal, shape); + case S16: + return ParseSparseLiteralHelper(literal, shape); + case S32: + return ParseSparseLiteralHelper(literal, shape); + case S64: + return ParseSparseLiteralHelper(literal, shape); + case U8: + return ParseSparseLiteralHelper(literal, shape); + case U16: + return ParseSparseLiteralHelper(literal, shape); + case U32: + return ParseSparseLiteralHelper(literal, shape); + case U64: + return ParseSparseLiteralHelper(literal, shape); + case F16: + return ParseSparseLiteralHelper(literal, shape); + case F32: + return ParseSparseLiteralHelper(literal, shape); + case BF16: + return ParseSparseLiteralHelper(literal, shape); + case F64: + return ParseSparseLiteralHelper(literal, shape); + default: + return Error(lexer_.GetLoc(), + StrCat("invalid primitive type for sparse literal: ", + PrimitiveType_Name(shape.element_type()))); + } +} + +template +bool HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, + const Shape& shape) { + std::vector index; + + tensorflow::int64 rank = ShapeUtil::Rank(shape); + + *literal = absl::make_unique(shape); + + if (!ParseToken(TokKind::kLbrace, + "expects '{' at the beginning of a sparse literal")) { + return false; + } + + for (;;) { + if (lexer_.GetKind() == TokKind::kRbrace) { + lexer_.Lex(); + break; + } + + LocTy index_loc = lexer_.GetLoc(); + index.clear(); + if (lexer_.GetKind() == TokKind::kInt) { + tensorflow::int64 single_index = lexer_.GetInt64Val(); + lexer_.Lex(); + if (rank != 1) { + return Error( + index_loc, + StrCat("invalid single-dimensional index for shape with rank ", + rank, ": ", single_index)); + } + index.push_back(single_index); + } 
else { + if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma, + &index)) { + return false; + } + if (index.size() != rank) { + return Error( + index_loc, + StrCat("invalid multi-dimension index for shape with rank ", rank, + ": [", StrJoin(index, ", "), "]")); + } + } + if (!ParseToken(TokKind::kColon, + "expects ':' after after the sparse array index and before " + "the sparse array value")) { + return false; + } + LocTy value_loc = lexer_.GetLoc(); + LiteralNativeT value; + if (lexer_.GetKind() == TokKind::kw_true || + lexer_.GetKind() == TokKind::kw_false) { + value = static_cast(lexer_.GetKind() == TokKind::kw_true); + lexer_.Lex(); + } else if (primitive_util::IsIntegralType(shape.element_type())) { + tensorflow::int64 value_s64; + if (!ParseInt64(&value_s64)) { + return Error(value_loc, + StrCat("expects integer for primitive type: ", + PrimitiveType_Name(shape.element_type()))); + } + value = static_cast(value_s64); + } else if (primitive_util::IsFloatingPointType(shape.element_type())) { + double value_f64; + if (!ParseDouble(&value_f64)) { + return Error(value_loc, + StrCat("expects floating point value for primitive type: ", + PrimitiveType_Name(shape.element_type()))); + } + value = static_cast(value_f64); + } else { + LOG(FATAL) << "Unexpected element type: " + << PrimitiveType_Name(shape.element_type()); + } + if (lexer_.GetKind() != TokKind::kRbrace && + !ParseToken(TokKind::kComma, + "expects ',' separator between sparse array elements")) { + return false; + } + + if ((*literal)->sparse_element_count() + 1 == + LayoutUtil::MaxSparseElements(shape.layout())) { + return Error( + lexer_.GetLoc(), + StrCat("number of sparse elements exceeds maximum for layout: ", + ShapeUtil::HumanStringWithLayout(shape))); + } + + (*literal)->AppendSparseElement(index, value); + } + + (*literal)->SortSparseElements(); + return true; +} + +// operands ::= '(' operands1 ')' +// operands1 +// ::= /*empty*/ +// ::= operand (, operand)* +// operand ::= 
(shape)? name +bool HloParser::ParseOperands(std::vector* operands) { + CHECK(operands != nullptr); + if (!ParseToken(TokKind::kLparen, + "expects '(' at the beginning of operands")) { + return false; + } + if (lexer_.GetKind() == TokKind::kRparen) { + // empty + } else { + do { + LocTy loc = lexer_.GetLoc(); + string name; + optional shape; + if (CanBeShape()) { + shape.emplace(); + if (!ParseShape(&shape.value())) { + return false; + } + } + if (!ParseName(&name)) { + return false; + } + std::pair* instruction = + FindInstruction(name, shape); + if (instruction == nullptr) { + return Error(loc, StrCat("instruction does not exist: ", name)); + } + operands->push_back(instruction->first); + } while (EatIfPresent(TokKind::kComma)); + } + return ParseToken(TokKind::kRparen, "expects ')' at the end of operands"); +} + +bool HloParser::ParseOperands(std::vector* operands, + const int expected_size) { + CHECK(operands != nullptr); + LocTy loc = lexer_.GetLoc(); + if (!ParseOperands(operands)) { + return false; + } + if (expected_size != operands->size()) { + return Error(loc, StrCat("expects ", expected_size, " operands, but has ", + operands->size(), " operands")); + } + return true; +} + +// sub_attributes ::= '{' (','? attribute)* '}' +bool HloParser::ParseSubAttributes( + const std::unordered_map& attrs) { + LocTy loc = lexer_.GetLoc(); + if (!ParseToken(TokKind::kLbrace, "expects '{' to start sub attributes")) { + return false; + } + std::unordered_set seen_attrs; + if (lexer_.GetKind() == TokKind::kRbrace) { + // empty + } else { + do { + EatIfPresent(TokKind::kComma); + if (!ParseAttributeHelper(attrs, &seen_attrs)) { + return false; + } + } while (lexer_.GetKind() != TokKind::kRbrace); + } + // Check that all required attrs were seen. 
+ for (const auto& attr_it : attrs) { + if (attr_it.second.required && + seen_attrs.find(attr_it.first) == seen_attrs.end()) { + return Error(loc, StrFormat("sub-attribute %s is expected but not seen", + attr_it.first)); + } + } + return ParseToken(TokKind::kRbrace, "expects '}' to end sub attributes"); +} + +// attributes ::= (',' attribute)* +bool HloParser::ParseAttributes( + const std::unordered_map& attrs) { + LocTy loc = lexer_.GetLoc(); + std::unordered_set seen_attrs; + while (EatIfPresent(TokKind::kComma)) { + if (!ParseAttributeHelper(attrs, &seen_attrs)) { + return false; + } + } + // Check that all required attrs were seen. + for (const auto& attr_it : attrs) { + if (attr_it.second.required && + seen_attrs.find(attr_it.first) == seen_attrs.end()) { + return Error(loc, StrFormat("attribute %s is expected but not seen", + attr_it.first)); + } + } + return true; +} + +bool HloParser::ParseAttributeHelper( + const std::unordered_map& attrs, + std::unordered_set* seen_attrs) { + LocTy loc = lexer_.GetLoc(); + string name; + if (!ParseAttributeName(&name)) { + return Error(loc, "error parsing attributes"); + } + VLOG(1) << "Parsing attribute " << name; + if (!seen_attrs->insert(name).second) { + return Error(loc, StrFormat("attribute %s already exists", name)); + } + auto attr_it = attrs.find(name); + if (attr_it == attrs.end()) { + string allowed_attrs; + if (attrs.empty()) { + allowed_attrs = "No attributes are allowed here."; + } else { + allowed_attrs = StrCat( + "Allowed attributes: ", + StrJoin(attrs, ", ", + [&](string* out, const std::pair& kv) { + StrAppend(out, kv.first); + })); + } + return Error(loc, StrFormat("unexpected attribute \"%s\". 
%s", name, + allowed_attrs)); + } + AttrTy attr_type = attr_it->second.attr_type; + void* attr_out_ptr = attr_it->second.result; + bool success = [&] { + LocTy attr_loc = lexer_.GetLoc(); + switch (attr_type) { + case AttrTy::kBool: { + bool result; + if (!ParseBool(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kInt64: { + tensorflow::int64 result; + if (!ParseInt64(&result)) { + return false; + } + static_cast*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kInt32: { + tensorflow::int64 result; + if (!ParseInt64(&result)) { + return false; + } + if (result != static_cast(result)) { + return Error(attr_loc, "value out of range for int32"); + } + static_cast*>(attr_out_ptr) + ->emplace(static_cast(result)); + return true; + } + case AttrTy::kFloat: { + double result; + if (!ParseDouble(&result)) { + return false; + } + if (result > std::numeric_limits::max() || + result < std::numeric_limits::lowest()) { + return Error(attr_loc, "value out of range for float"); + } + static_cast*>(attr_out_ptr) + ->emplace(static_cast(result)); + return true; + } + case AttrTy::kHloComputation: { + HloComputation* result; + if (!ParseComputationName(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kFftType: { + FftType result; + if (!ParseFftType(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kWindow: { + Window result; + if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kConvolutionDimensionNumbers: { + ConvolutionDimensionNumbers result; + if (!ParseConvolutionDimensionNumbers(&result)) { + return false; + } + static_cast*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kSharding: { + OpSharding sharding; + if 
(!ParseSharding(&sharding)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(sharding); + return true; + } + case AttrTy::kInstructionList: { + std::vector result; + if (!ParseInstructionNames(&result)) { + return false; + } + static_cast>*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kFusionKind: { + HloInstruction::FusionKind result; + if (!ParseFusionKind(&result)) { + return false; + } + static_cast*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kBracedInt64List: { + std::vector result; + if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, + &result)) { + return false; + } + static_cast>*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kBracedInt64ListList: { + std::vector> result; + auto parse_and_add_item = [&]() { + std::vector item; + if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, + TokKind::kComma, &item)) { + return false; + } + result.push_back(item); + return true; + }; + if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, + parse_and_add_item)) { + return false; + } + static_cast>>*>( + attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kSliceRanges: { + SliceRanges result; + if (!ParseSliceRanges(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kPaddingConfig: { + PaddingConfig result; + if (!ParsePaddingConfig(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kString: { + string result; + if (!ParseString(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kMetadata: { + OpMetadata result; + if (!ParseMetadata(&result)) { + return false; + } + static_cast*>(attr_out_ptr)->emplace(result); + return true; + } + case AttrTy::kDistribution: { + RandomDistribution result; + if (!ParseRandomDistribution(&result)) { + return 
false; + } + static_cast*>(attr_out_ptr) + ->emplace(result); + return true; + } + case AttrTy::kDomain: { + return ParseDomain(static_cast(attr_out_ptr)); + } + case AttrTy::kPrecisionList: { + std::vector result; + if (!ParsePrecisionList(&result)) { + return false; + } + static_cast>*>( + attr_out_ptr) + ->emplace(result); + return true; + } + } + }(); + if (!success) { + return Error(loc, StrFormat("error parsing attribute %s", name)); + } + return true; +} + +bool HloParser::ParseComputationName(HloComputation** value) { + string name; + LocTy loc = lexer_.GetLoc(); + if (!ParseName(&name)) { + return Error(loc, "expects computation name"); + } + std::pair* computation = + tensorflow::gtl::FindOrNull(computation_pool_, name); + if (computation == nullptr) { + return Error(loc, StrCat("computation does not exist: ", name)); + } + *value = computation->first; + return true; +} + +// ::= '{' size stride? pad? lhs_dilate? rhs_dilate? '}' +// The subattributes can appear in any order. 'size=' is required, others are +// optional. +bool HloParser::ParseWindow(Window* window, bool expect_outer_curlies) { + LocTy loc = lexer_.GetLoc(); + if (expect_outer_curlies && + !ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) { + return false; + } + + std::vector size; + std::vector stride; + std::vector> pad; + std::vector lhs_dilate; + std::vector rhs_dilate; + std::vector rhs_reversal; + const auto end_token = + expect_outer_curlies ? 
TokKind::kRbrace : TokKind::kEof; + while (lexer_.GetKind() != end_token) { + LocTy attr_loc = lexer_.GetLoc(); + string field_name; + if (!ParseAttributeName(&field_name)) { + return Error(attr_loc, "expects sub-attributes in window"); + } + bool ok = [&] { + if (field_name == "size") { + return ParseDxD("size", &size); + } + if (field_name == "stride") { + return ParseDxD("stride", &stride); + } + if (field_name == "lhs_dilate") { + return ParseDxD("lhs_dilate", &lhs_dilate); + } + if (field_name == "rhs_dilate") { + return ParseDxD("rls_dilate", &rhs_dilate); + } + if (field_name == "pad") { + return ParseWindowPad(&pad); + } + if (field_name == "rhs_reversal") { + return ParseDxD("rhs_reversal", &rhs_reversal); + } + return Error(attr_loc, StrCat("unexpected attribute name: ", field_name)); + }(); + if (!ok) { + return false; + } + } + + if (size.empty()) { + return Error(loc, + "sub-attribute 'size=' is required in the window attribute"); + } + if (!stride.empty() && stride.size() != size.size()) { + return Error(loc, "expects 'stride=' has the same size as 'size='"); + } + if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) { + return Error(loc, "expects 'lhs_dilate=' has the same size as 'size='"); + } + if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) { + return Error(loc, "expects 'rhs_dilate=' has the same size as 'size='"); + } + if (!pad.empty() && pad.size() != size.size()) { + return Error(loc, "expects 'pad=' has the same size as 'size='"); + } + + for (int i = 0; i < size.size(); i++) { + window->add_dimensions()->set_size(size[i]); + if (!pad.empty()) { + window->mutable_dimensions(i)->set_padding_low(pad[i][0]); + window->mutable_dimensions(i)->set_padding_high(pad[i][1]); + } + // If some field is not present, it has the default value. + window->mutable_dimensions(i)->set_stride(stride.empty() ? 1 : stride[i]); + window->mutable_dimensions(i)->set_base_dilation( + lhs_dilate.empty() ? 
1 : lhs_dilate[i]); + window->mutable_dimensions(i)->set_window_dilation( + rhs_dilate.empty() ? 1 : rhs_dilate[i]); + window->mutable_dimensions(i)->set_window_reversal( + rhs_reversal.empty() ? false : (rhs_reversal[i] == 1)); + } + return !expect_outer_curlies || + ParseToken(TokKind::kRbrace, "expected '}' to end window attribute"); +} + +// This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString. +// The string looks like "dim_labels=0bf_0io->0bf". +bool HloParser::ParseConvolutionDimensionNumbers( + ConvolutionDimensionNumbers* dnums) { + if (lexer_.GetKind() != TokKind::kDimLabels) { + return TokenError("expects dim labels pattern, e.g., 'bf0_0io->0bf'"); + } + string str = lexer_.GetStrVal(); + + // The str is expected to have 3 items, lhs, rhs, out, and it must look like + // lhs_rhs->out, that is, the first separator is "_" and the second is "->". + std::vector split1 = absl::StrSplit(str, "_"); + if (split1.size() != 2) { + LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees " + << str; + } + std::vector split2 = absl::StrSplit(split1[1], "->"); + if (split2.size() != 2) { + LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees " + << str; + } + absl::string_view lhs = split1[0]; + absl::string_view rhs = split2[0]; + absl::string_view out = split2[1]; + + const tensorflow::int64 rank = lhs.length(); + if (rank != rhs.length() || rank != out.length()) { + return TokenError( + "convolution lhs, rhs, and output must have the same rank"); + } + if (rank < 2) { + return TokenError("convolution rank must >=2"); + } + + auto is_unique = [](string str) -> bool { + std::sort(str.begin(), str.end()); + return std::unique(str.begin(), str.end()) == str.end(); + }; + + // lhs + { + if (!is_unique(string(lhs))) { + return TokenError( + StrCat("expects unique lhs dimension numbers, but sees ", lhs)); + } + for (int i = 0; i < rank - 2; i++) { + dnums->add_input_spatial_dimensions(-1); + } + for (int i = 0; i < rank; 
i++) { + char c = lhs[i]; + if (c == 'b') { + dnums->set_input_batch_dimension(i); + } else if (c == 'f') { + dnums->set_input_feature_dimension(i); + } else if (c < '0' + rank && c >= '0') { + dnums->set_input_spatial_dimensions(c - '0', i); + } else { + return TokenError( + StrFormat("expects [0-%dbf] in lhs dimension numbers", rank - 1)); + } + } + } + // rhs + { + if (!is_unique(string(rhs))) { + return TokenError( + StrCat("expects unique rhs dimension numbers, but sees ", rhs)); + } + for (int i = 0; i < rank - 2; i++) { + dnums->add_kernel_spatial_dimensions(-1); + } + for (int i = 0; i < rank; i++) { + char c = rhs[i]; + if (c == 'i') { + dnums->set_kernel_input_feature_dimension(i); + } else if (c == 'o') { + dnums->set_kernel_output_feature_dimension(i); + } else if (c < '0' + rank && c >= '0') { + dnums->set_kernel_spatial_dimensions(c - '0', i); + } else { + return TokenError( + StrFormat("expects [0-%dio] in rhs dimension numbers", rank - 1)); + } + } + } + // output + { + if (!is_unique(string(out))) { + return TokenError( + StrCat("expects unique output dimension numbers, but sees ", out)); + } + for (int i = 0; i < rank - 2; i++) { + dnums->add_output_spatial_dimensions(-1); + } + for (int i = 0; i < rank; i++) { + char c = out[i]; + if (c == 'b') { + dnums->set_output_batch_dimension(i); + } else if (c == 'f') { + dnums->set_output_feature_dimension(i); + } else if (c < '0' + rank && c >= '0') { + dnums->set_output_spatial_dimensions(c - '0', i); + } else { + return TokenError(StrFormat( + "expects [0-%dbf] in output dimension numbers", rank - 1)); + } + } + } + + lexer_.Lex(); + return true; +} + +// ::= '{' ranges '}' +// ::= /*empty*/ +// ::= range (',' range)* +// range ::= '[' start ':' limit (':' stride)? ']' +// +// The slice ranges are printed as: +// +// {[dim0_start:dim0_limit:dim0stride], [dim1_start:dim1_limit], ...} +// +// This function extracts the starts, limits, and strides as 3 vectors to the +// result. 
If stride is not present, stride is 1. For example, if the slice +// ranges is printed as: +// +// {[2:3:4], [5:6:7], [8:9]} +// +// The parsed result will be: +// +// {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}} +// +bool HloParser::ParseSliceRanges(SliceRanges* result) { + if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) { + return false; + } + std::vector> ranges; + if (lexer_.GetKind() == TokKind::kRbrace) { + // empty + return ParseToken(TokKind::kRbrace, "expects '}' to end ranges"); + } + do { + LocTy loc = lexer_.GetLoc(); + ranges.emplace_back(); + if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon, + &ranges.back())) { + return false; + } + const auto& range = ranges.back(); + if (range.size() != 2 && range.size() != 3) { + return Error(loc, + StrFormat("expects [start:limit:step] or [start:limit], " + "but sees %d elements.", + range.size())); + } + } while (EatIfPresent(TokKind::kComma)); + + for (const auto& range : ranges) { + result->starts.push_back(range[0]); + result->limits.push_back(range[1]); + result->strides.push_back(range.size() == 3 ? 
range[2] : 1); + } + return ParseToken(TokKind::kRbrace, "expects '}' to end ranges"); +} + +// precisionlist ::= start precision_elements end +// precision_elements +// ::= /*empty*/ +// ::= precision_val (delim precision_val)* +bool HloParser::ParsePrecisionList( + std::vector* result) { + auto parse_and_add_item = [&]() { + PrecisionConfigProto::Precision item; + if (!ParsePrecision(&item)) { + return false; + } + result->push_back(item); + return true; + }; + return ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, + parse_and_add_item); +} + +// int64list ::= start int64_elements end +// int64_elements +// ::= /*empty*/ +// ::= int64_val (delim int64_val)* +bool HloParser::ParseInt64List(const TokKind start, const TokKind end, + const TokKind delim, + std::vector* result) { + if (!ParseToken(start, StrCat("expects an int64 list starting with ", + TokKindToString(start)))) { + return false; + } + if (lexer_.GetKind() == end) { + // empty + } else { + do { + tensorflow::int64 i; + if (!ParseInt64(&i)) { + return false; + } + result->push_back(i); + } while (EatIfPresent(delim)); + } + return ParseToken( + end, StrCat("expects an int64 list to end with ", TokKindToString(end))); +} + +bool HloParser::ParseList(const TokKind start, const TokKind end, + const TokKind delim, + const std::function& parse_and_add_item) { + if (!ParseToken(start, StrCat("expects a list starting with ", + TokKindToString(start)))) { + return false; + } + if (lexer_.GetKind() == end) { + // empty + } else { + do { + if (!parse_and_add_item()) { + return false; + } + } while (EatIfPresent(delim)); + } + return ParseToken( + end, StrCat("expects a list to end with ", TokKindToString(end))); +} + +// param_list_to_shape ::= param_list '->' shape +bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) { + if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) { + return false; + } + *shape_loc = lexer_.GetLoc(); + return ParseShape(shape); +} + 
+bool HloParser::CanBeParamListToShape() { + return lexer_.GetKind() == TokKind::kLparen; +} + +// param_list ::= '(' param_list1 ')' +// param_list1 +// ::= /*empty*/ +// ::= param (',' param)* +// param ::= name shape +bool HloParser::ParseParamList() { + if (!ParseToken(TokKind::kLparen, + "expects '(' at the beginning of param list")) { + return false; + } + + if (lexer_.GetKind() == TokKind::kRparen) { + // empty + } else { + do { + Shape shape; + string name; + if (!ParseName(&name) || !ParseShape(&shape)) { + return false; + } + } while (EatIfPresent(TokKind::kComma)); + } + return ParseToken(TokKind::kRparen, "expects ')' at the end of param list"); +} + +// shape ::= shape_val_ +// shape ::= '(' tuple_elements ')' +// tuple_elements +// ::= /*empty*/ +// ::= shape (',' shape)* +bool HloParser::ParseShape(Shape* result) { + if (EatIfPresent(TokKind::kLparen)) { // Tuple + std::vector shapes; + if (lexer_.GetKind() == TokKind::kRparen) { + /*empty*/ + } else { + // shape (',' shape)* + do { + shapes.emplace_back(); + if (!ParseShape(&shapes.back())) { + return false; + } + } while (EatIfPresent(TokKind::kComma)); + } + *result = ShapeUtil::MakeTupleShape(shapes); + return ParseToken(TokKind::kRparen, "expects ')' at the end of tuple."); + } + + if (lexer_.GetKind() != TokKind::kShape) { + return TokenError("expects shape"); + } + *result = lexer_.GetShapeVal(); + lexer_.Lex(); + return true; +} + +bool HloParser::CanBeShape() { + // A non-tuple shape starts with a kShape token; a tuple shape starts with + // '('. 
+ return lexer_.GetKind() == TokKind::kShape || + lexer_.GetKind() == TokKind::kLparen; +} + +bool HloParser::ParseName(string* result) { + VLOG(1) << "ParseName"; + if (lexer_.GetKind() != TokKind::kIdent && + lexer_.GetKind() != TokKind::kName) { + return TokenError("expects name"); + } + *result = lexer_.GetStrVal(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseAttributeName(string* result) { + if (lexer_.GetKind() != TokKind::kAttributeName) { + return TokenError("expects attribute name"); + } + *result = lexer_.GetStrVal(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseString(string* result) { + VLOG(1) << "ParseString"; + if (lexer_.GetKind() != TokKind::kString) { + return TokenError("expects string"); + } + *result = lexer_.GetStrVal(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseDxD(const string& name, + std::vector* result) { + LocTy loc = lexer_.GetLoc(); + if (!result->empty()) { + return Error(loc, StrFormat("sub-attribute '%s=' already exists", name)); + } + // 1D + if (lexer_.GetKind() == TokKind::kInt) { + tensorflow::int64 number; + if (!ParseInt64(&number)) { + return Error(loc, StrFormat("expects sub-attribute '%s=i'", name)); + } + result->push_back(number); + return true; + } + // 2D or higher. 
+ if (lexer_.GetKind() == TokKind::kDxD) { + string str = lexer_.GetStrVal(); + if (!SplitToInt64s(str, 'x', result)) { + return Error(loc, StrFormat("expects sub-attribute '%s=ixj...'", name)); + } + lexer_.Lex(); + return true; + } + return TokenError("expects token type kInt or kDxD"); +} + +bool HloParser::ParseWindowPad( + std::vector>* pad) { + LocTy loc = lexer_.GetLoc(); + if (!pad->empty()) { + return Error(loc, "sub-attribute 'pad=' already exists"); + } + if (lexer_.GetKind() != TokKind::kPad) { + return TokenError("expects window pad pattern, e.g., '0_0x3_3'"); + } + string str = lexer_.GetStrVal(); + for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) { + std::vector low_high; + if (!SplitToInt64s(padding_dim_str, '_', &low_high) || + low_high.size() != 2) { + return Error(loc, + "expects padding_low and padding_high separated by '_'"); + } + pad->push_back(low_high); + } + lexer_.Lex(); + return true; +} + +// This is the inverse xla::ToString(PaddingConfig). The padding config string +// looks like "0_0_0x3_3_1". The string is first separated by 'x', each +// substring represents one PaddingConfigDimension. The substring is 3 (or 2) +// numbers joined by '_'. +bool HloParser::ParsePaddingConfig(PaddingConfig* padding) { + if (lexer_.GetKind() != TokKind::kPad) { + return TokenError("expects padding config, e.g., '0_0_0x3_3_1'"); + } + LocTy loc = lexer_.GetLoc(); + string str = lexer_.GetStrVal(); + for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) { + std::vector padding_dim; + if (!SplitToInt64s(padding_dim_str, '_', &padding_dim) || + (padding_dim.size() != 2 && padding_dim.size() != 3)) { + return Error(loc, + "expects padding config pattern like 'low_high_interior' or " + "'low_high'"); + } + auto* dim = padding->add_dimensions(); + dim->set_edge_padding_low(padding_dim[0]); + dim->set_edge_padding_high(padding_dim[1]); + dim->set_interior_padding(padding_dim.size() == 3 ? 
padding_dim[2] : 0); + } + lexer_.Lex(); + return true; +} + +// '{' metadata_string '}' +bool HloParser::ParseMetadata(OpMetadata* metadata) { + std::unordered_map attrs; + optional op_type; + optional op_name; + optional source_file; + optional source_line; + attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type}; + attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name}; + attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file}; + attrs["source_line"] = {/*required=*/false, AttrTy::kInt32, &source_line}; + if (!ParseSubAttributes(attrs)) { + return false; + } + if (op_type) { + metadata->set_op_type(*op_type); + } + if (op_name) { + metadata->set_op_name(*op_name); + } + if (source_file) { + metadata->set_source_file(*source_file); + } + if (source_line) { + metadata->set_source_line(*source_line); + } + return true; +} + +bool HloParser::ParseOpcode(HloOpcode* result) { + VLOG(1) << "ParseOpcode"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects opcode"); + } + string val = lexer_.GetStrVal(); + auto status_or_result = StringToHloOpcode(val); + if (!status_or_result.ok()) { + return TokenError(StrFormat("expects opcode but sees: %s, error: %s", val, + status_or_result.status().error_message())); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseFftType(FftType* result) { + VLOG(1) << "ParseFftType"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects fft type"); + } + string val = lexer_.GetStrVal(); + if (!FftType_Parse(val, result) || !FftType_IsValid(*result)) { + return TokenError(StrFormat("expects fft type but sees: %s", val)); + } + lexer_.Lex(); + return true; +} + +bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) { + VLOG(1) << "ParseFusionKind"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects fusion kind"); + } + string val = lexer_.GetStrVal(); + auto 
status_or_result = StringToFusionKind(val); + if (!status_or_result.ok()) { + return TokenError(StrFormat("expects fusion kind but sees: %s, error: %s", + val, + status_or_result.status().error_message())); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseRandomDistribution(RandomDistribution* result) { + VLOG(1) << "ParseRandomDistribution"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects random distribution"); + } + string val = lexer_.GetStrVal(); + auto status_or_result = StringToRandomDistribution(val); + if (!status_or_result.ok()) { + return TokenError( + StrFormat("expects random distribution but sees: %s, error: %s", val, + status_or_result.status().error_message())); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParsePrecision(PrecisionConfigProto::Precision* result) { + VLOG(1) << "ParsePrecision"; + if (lexer_.GetKind() != TokKind::kIdent) { + return TokenError("expects random distribution"); + } + string val = lexer_.GetStrVal(); + auto status_or_result = StringToPrecision(val); + if (!status_or_result.ok()) { + return TokenError(StrFormat("expects precision but sees: %s, error: %s", + val, + status_or_result.status().error_message())); + } + *result = status_or_result.ValueOrDie(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseInt64(tensorflow::int64* result) { + VLOG(1) << "ParseInt64"; + if (lexer_.GetKind() != TokKind::kInt) { + return TokenError("expects integer"); + } + *result = lexer_.GetInt64Val(); + lexer_.Lex(); + return true; +} + +bool HloParser::ParseDouble(double* result) { + switch (lexer_.GetKind()) { + case TokKind::kDecimal: + *result = lexer_.GetDecimalVal(); + break; + case TokKind::kInt: + *result = static_cast(lexer_.GetInt64Val()); + break; + case TokKind::kw_nan: + *result = std::numeric_limits::quiet_NaN(); + break; + case TokKind::kw_inf: + *result = 
std::numeric_limits::infinity(); + break; + case TokKind::kNegInf: + *result = -std::numeric_limits::infinity(); + break; + default: + return TokenError("expects decimal or integer"); + } + lexer_.Lex(); + return true; +} + +bool HloParser::ParseBool(bool* result) { + if (lexer_.GetKind() != TokKind::kw_true && + lexer_.GetKind() != TokKind::kw_false) { + return TokenError("expects true or false"); + } + *result = lexer_.GetKind() == TokKind::kw_true; + lexer_.Lex(); + return true; +} + +bool HloParser::ParseToken(TokKind kind, const string& msg) { + VLOG(1) << "ParseToken " << TokKindToString(kind) << " " << msg; + if (lexer_.GetKind() != kind) { + return TokenError(msg); + } + lexer_.Lex(); + return true; +} + +bool HloParser::EatIfPresent(TokKind kind) { + if (lexer_.GetKind() != kind) { + return false; + } + lexer_.Lex(); + return true; +} + +bool HloParser::AddInstruction(const string& name, HloInstruction* instruction, + LocTy name_loc) { + auto result = instruction_pool_.insert({name, {instruction, name_loc}}); + if (!result.second) { + Error(name_loc, StrCat("instruction already exists: ", name)); + return Error(/*loc=*/result.first->second.second, + "instruction previously defined here"); + } + return true; +} + +bool HloParser::AddComputation(const string& name, HloComputation* computation, + LocTy name_loc) { + auto result = computation_pool_.insert({name, {computation, name_loc}}); + if (!result.second) { + Error(name_loc, StrCat("computation already exists: ", name)); + return Error(/*loc=*/result.first->second.second, + "computation previously defined here"); + } + return true; +} + +StatusOr HloParser::ParseShardingOnly() { + lexer_.Lex(); + OpSharding op_sharding; + if (!ParseSharding(&op_sharding)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument("Syntax error:\nExtra content after sharding"); + } + return HloSharding::FromProto(op_sharding); +} + +StatusOr 
HloParser::ParseWindowOnly() { + lexer_.Lex(); + Window window; + if (!ParseWindow(&window, /*expect_outer_curlies=*/false)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument("Syntax error:\nExtra content after window"); + } + return window; +} + +StatusOr +HloParser::ParseConvolutionDimensionNumbersOnly() { + lexer_.Lex(); + ConvolutionDimensionNumbers dnums; + if (!ParseConvolutionDimensionNumbers(&dnums)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument( + "Syntax error:\nExtra content after convolution dnums"); + } + return dnums; +} + +StatusOr HloParser::ParsePaddingConfigOnly() { + lexer_.Lex(); + PaddingConfig padding_config; + if (!ParsePaddingConfig(&padding_config)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument("Syntax error:\nExtra content after PaddingConfig"); + } + return padding_config; +} + +Status HloParser::ParseSingleInstruction(HloComputation::Builder* builder, + string* root_name) { + TF_RET_CHECK(missing_instruction_hook_ == nullptr); + + // The missing instruction hook we register creates the shaped instruction on + // the fly as a parameter and returns it. + int64 parameter_count = 0; + missing_instruction_hook_ = + [this, builder, ¶meter_count]( + string name, + const optional& shape) -> std::pair* { + if (!shape.has_value()) { + Error(lexer_.GetLoc(), + StrCat("Operand ", name, + " had no shape in HLO text; cannot create parameter for " + "single-instruction module.")); + return nullptr; + } + HloInstruction* parameter = builder->AddInstruction( + HloInstruction::CreateParameter(parameter_count++, *shape, name)); + instruction_pool_[name] = {parameter, lexer_.GetLoc()}; + return tensorflow::gtl::FindOrNull(instruction_pool_, name); + }; + + // Prime the lexer. 
+ lexer_.Lex(); + + // Parse the instruction with the registered hook. + if (!ParseInstruction(builder, root_name)) { + return InvalidArgument("Syntax error:\n%s", GetError()); + } + return Status::OK(); +} + +} // namespace + +StatusOr> ParseHloString( + absl::string_view str, const HloModuleConfig& config) { + HloParser parser(str, config); + if (!parser.Run()) { + return InvalidArgument("Syntax error:\n%s", parser.GetError()); + } + return parser.ConsumeHloModule(); +} + +StatusOr> ParseHloString(absl::string_view str) { + HloModuleConfig config; + return ParseHloString(str, config); +} + +StatusOr> ParseHloOpToModule( + absl::string_view str, absl::string_view name) { + HloModuleConfig config; + HloParser parser(str, config); + auto builder = absl::make_unique(string(name)); + string root_name; + TF_RETURN_IF_ERROR(parser.ParseSingleInstruction(builder.get(), &root_name)); + std::unique_ptr computation = builder->Build(); + auto module = absl::make_unique(string(name), config); + module->AddEntryComputation(std::move(computation)); + return std::move(module); +} + +StatusOr ParseSharding(absl::string_view str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParseShardingOnly(); +} + +StatusOr ParseWindow(absl::string_view str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParseWindowOnly(); +} + +StatusOr ParseConvolutionDimensionNumbers( + absl::string_view str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParseConvolutionDimensionNumbersOnly(); +} + +StatusOr ParsePaddingConfig(absl::string_view str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParsePaddingConfigOnly(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..1882a184da8f09a9626daf7a2bbc531cb6ba6138 --- /dev/null +++ 
b/tensorflow/compiler/xla/service/hlo_parser.h @@ -0,0 +1,67 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ + +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_lexer.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// For details about the syntax accepted by this parser, see +// g3doc/hlo_parser.md. + +// The api of the hlo parser. Given a string in the HloModule::ToString() +// format, parses the string and creates a HloModule with the given config. +StatusOr> ParseHloString( + absl::string_view str, const HloModuleConfig& config); + +// Parses the text for a single HLO operation into an HLO module with a function +// that runs that operation (with the same parameters) as its entry computation. +StatusOr> ParseHloOpToModule( + absl::string_view str, absl::string_view name = "single_op"); + +// The api of the hlo parser. 
Given a string in the HloModule::ToString() +// format, parses the string and creates a HloModule with default config. +StatusOr> ParseHloString(absl::string_view str); + +// Parses the result of HloSharding::ToString(), e.g. "{replicated}". +StatusOr ParseSharding(absl::string_view str); + +// Parses the result of window_util::ToString(const Window&). +StatusOr ParseWindow(absl::string_view str); + +// Parses the result of ConvolutionDimensionNumbersToString(), e.g. +// "b0f_0io->b0f". +StatusOr ParseConvolutionDimensionNumbers( + absl::string_view str); + +// ParseHloString sharding from str. str is supposed to contain the body of the +// sharding, i.e. just the rhs of the "sharding={...}" attribute string. +StatusOr ParseSharding(absl::string_view str); + +// Parses the result of PaddingConfigToString(), e.g. "0_0x1_1". +StatusOr ParsePaddingConfig(absl::string_view str); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..759789437c12d489ee607638e736dfd6a6e1dda1 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -0,0 +1,1779 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_parser.h" + +#include +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +namespace op = ::xla::testing::opcode_matchers; +using absl::string_view; + +struct TestData { + string test_name; + string module_string; +}; + +string TestDataToString(const ::testing::TestParamInfo& data) { + return data.param.test_name; +} + +// For each string below, we check that: +// - we parse it to an HloModule successfully, and +// - the stringification of the resulting HloModule is equal to our original +// string. +std::vector CreateTestCases() { + // clang-format off + return std::vector({ +// ax + y +{ +"AxpyParam", +R"(HloModule axpy_module + +ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { + %alpha = f32[] parameter(0) + %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={} + %x = f32[2,4]{1,0} parameter(1) + %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x) + %y = f32[2,4]{1,0} parameter(2) + ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y) +} + +)" +}, +// pred constant +{ +"ConstantPred", +R"(HloModule constant_pred_module + +ENTRY %constant_pred () -> pred[] { + ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}, backend_config="foo\" bar" +} + +)" +}, +// s32 constant +{ +"ConstantS32", +R"(HloModule constant_s32_module + +ENTRY %constant_s32 () -> s32[] { + ROOT %constant = s32[] constant(-42) +} + +)" +}, +// f32 constant, but the value is not a decimal and there is a 
backend +// configuration +{ +"ConstantF32", +R"(HloModule ConstantF32_module + +ENTRY %ConstantF32.v4 () -> f32[] { + ROOT %constant = f32[] constant(42), backend_config="this is a configuration" +} + +)" +}, +// f32 constant, rank 1 empty array. +{ +"ConstantF32R1Empty", +R"(HloModule ConstantF32Empty_module + +ENTRY %ConstantF32Empty.v4 () -> f32[0] { + ROOT %constant = f32[0]{0} constant({}) +} + +)" +}, +// f32 constant, rank 4 empty array. +{ +"ConstantF32R4Empty", +R"(HloModule ConstantF32R4Empty_module + +ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] { + ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } }) +} + +)" +}, +// constant 4D +{ +"Constant4D", +R"(HloModule Small_3x2x1x1_module + +ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] { + ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } }) +} + +)" +}, +// non-finite constants: nan, inf, -inf +{ +"ConstantNonFinite", +R"(HloModule IsFiniteR1F32s_module + +ENTRY %IsFiniteR1F32s.v2 () -> pred[6] { + %constant = f32[6]{0} constant({nan, 7, nan, -1, inf, -inf}) + ROOT %is-finite = pred[6]{0} is-finite(f32[6]{0} %constant) +} + +)" +}, +// constant f16 +{ +"ConstantF16", +R"(HloModule ConstantF16_module + +ENTRY %ConstantF16.v4 () -> f16[] { + ROOT %constant = f16[] constant(500) +} + +)" +}, +// bf16 +{ +"BF16", +R"(HloModule BF16 + +ENTRY %BF16.v4 () -> bf16[] { + ROOT %constant = bf16[] constant(500) +} + +)" +}, +// constant + constant +{ +"AddConstants", +R"(HloModule add_constants_module + +ENTRY %add_constants () -> f32[] { + %constant = f32[] constant(3.14) + ROOT %add = f32[] add(f32[] %constant, f32[] %constant) +} + +)" +}, +// tuple constant +{ +"TupleConstant", +R"(HloModule TupleConstant_module + +ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) { + ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) 
constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} )) +} + +)" +}, +// v1 > v2 ? v1 : v2 +{ +"SelectR1F32", +R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module + +ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] { + %v1 = f32[4]{0} parameter(0), sharding={maximal device=1} + %v2 = f32[4]{0} parameter(1), sharding={maximal device=1} + %greater-than = pred[4]{0} greater-than(f32[4]{0} %v1, f32[4]{0} %v2), sharding={replicated} + ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={} +} + +)" +}, +// empty tuple +{ +"EmptyTupleCreate", +R"(HloModule EmptyTupleCreate_module + +ENTRY %EmptyTupleCreate.v1 () -> () { + ROOT %tuple = () tuple() +} + +)" +}, +// tuple +{ +"TupleCreate", +R"(HloModule TupleCreate_module + +ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) { + %v1 = f32[] parameter(0) + %v2 = f32[3]{0} parameter(1) + %v3 = f32[2,3]{1,0} parameter(2) + ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3) +} + +)" +}, +{ +"ShardedTupleCreate", +R"(HloModule ShardedTupleCreate_module + +ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) { + %v1 = f32[] parameter(0) + %v2 = f32[3]{0} parameter(1) + %v3 = f32[2,3]{1,0} parameter(2) + ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}} +} + +)" +}, +{ +"DomainParsing", +R"(HloModule DomainParsing_module + +ENTRY %DomainParsing (v1: f32[]) -> f32[] { + %v1 = f32[] parameter(0) + ROOT %dom = f32[] domain(f32[] %v1), domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} +} + +)" +}, +// int32 result = 0; +// while (result < 5) { result = result + 1; } +{ +"WhileWithScalarS32Result", +R"(HloModule WhileWithScalarS32Result_module + +%body.v3 (prev.1: s32[]) -> 
s32[] { + %constant = s32[] constant(1) + %prev.1 = s32[] parameter(0) + ROOT %add = s32[] add(s32[] %constant, s32[] %prev.1) +} + +%condition.v3 (prev.2: s32[]) -> pred[] { + %constant.1 = s32[] constant(5) + %prev.2 = s32[] parameter(0) + ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %prev.2) +} + +ENTRY %WhileWithScalarS32Result.v2 () -> s32[] { + %constant.2 = s32[] constant(0) + ROOT %while = s32[] while(s32[] %constant.2), condition=%condition.v3, body=%body.v3 +} + +)" +}, +// send and recv +{ +"SendRecv", +R"(HloModule TwoSendRecvBothWayRecvFist_module + +ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) { + %token = token[] after-all() + %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, sharding={maximal device=1} + ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, sharding={maximal device=1} + %constant = f32[] constant(2.1), sharding={maximal device=0} + %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv} + %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, sharding={maximal device=0} +} + +)" +}, +{ +"SendRecvWithHostTransfer", +R"(HloModule HostTransferSendRecv_module + +ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) { + %token = token[] after-all() + %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, is_host_transfer=true + ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, is_host_transfer=true + %constant = f32[] constant(2.1), sharding={maximal device=0} + %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, is_host_transfer=true + %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, is_host_transfer=true +} + +)" +}, +// get-tuple-element +{ +"GetTupleElement", +R"(HloModule GetTupleElement_module + +ENTRY 
%GetTupleElement.v4 () -> s32[2,3] { + %constant = f32[3]{0} constant({1, 2, 3}) + %constant.1 = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 4, 5, 6 } }) + %tuple = (f32[3]{0}, s32[2,3]{1,0}) tuple(f32[3]{0} %constant, s32[2,3]{1,0} %constant.1) + ROOT %get-tuple-element = s32[2,3]{1,0} get-tuple-element((f32[3]{0}, s32[2,3]{1,0}) %tuple), index=1, sharding={maximal device=0} +} + +)" +}, +// call +{ +"Call", +R"(HloModule CallR0F32IdentityScalar_module + +%Identity.v1 (x: f32[]) -> f32[] { + ROOT %x = f32[] parameter(0) +} + +ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] { + %constant = f32[] constant(42) + ROOT %call = f32[] call(f32[] %constant), to_apply=%Identity.v1 +} + +)" +}, +// reduce window +{ +"ReduceWindow", +R"(HloModule R4UnitWindow_module + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] { + %operand = f32[13,12,8,15]{0,3,2,1} parameter(0) + %constant = f32[] constant(0) + ROOT %reduce-window = f32[13,3,8,15]{0,3,2,1} reduce-window(f32[13,12,8,15]{0,3,2,1} %operand, f32[] %constant), window={size=1x1x7x1 stride=1x4x1x1 pad=0_0x0_0x3_3x0_0}, to_apply=%add_F32.v3 +} + +)" +}, +// reduce window on scalar +{ +"ReduceWindowScalar", +R"(HloModule reduce_window_scalar + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %R4UnitWindowScalar () -> f32[] { + %constant = f32[] constant(42) + %constant.1 = f32[] constant(1) + ROOT %reduce-window = f32[] reduce-window(f32[] %constant, f32[] %constant.1), to_apply=%add_F32.v3 +} + +)" +}, +// convolution +{ +"Convolution", +R"(HloModule Convolve1D1Window_0_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { + %input = f32[1,2,1]{2,1,0} parameter(0) + %copy = f32[1,2,1]{2,0,1} 
copy(f32[1,2,1]{2,1,0} %input) + %filter = f32[1,1,1]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=1, operand_precision={high,default} +} + +)" +}, +// convolution rank 2 +{ +"ConvolutionR2", +R"(HloModule ConvolveR2_module + +ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] { + %input = f32[1,2]{1,0} parameter(0) + %filter = f32[1,1]{1,0} parameter(1) + ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf, feature_group_count=1 +} + +)" +}, +// convolution backward +{ +"ConvolutionBackward", +R"(HloModule ConvolveBackward_module + +ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] { + %input = f32[128,7,7,512]{0,3,2,1} parameter(0) + %filter = f32[3,3,512,512]{3,2,1,0} parameter(1) + ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f, feature_group_count=1 +} + +)" +}, +// reverse(constant) +{ +"Reverse4D", +R"(HloModule Reverse4DFloatArrayOnDim01_module + +ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] { + %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } }) + ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1} +} + +)" +}, +// concat +{ +"Concat", +R"(HloModule Concat2x3With2x5_module + +ENTRY %Concat2x3With2x5.v3 () -> 
f32[2,8] { + %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } }) + %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } }) + ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1} +} + +)" +}, +// select and scatter +{ +"SelectAndScatter", +R"(HloModule R4F32OverlapSmall_module + +%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs) +} + +%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { + %lhs.1 = f32[] parameter(0) + %rhs.1 = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1) +} + +ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] { + %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } }) + %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } }) + %constant.2 = f32[] constant(0) + ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3 +} + +)" +}, +// select and scatter on scalar +{ +"SelectAndScatterScalar", +R"(HloModule select_and_scatter_scalar + +%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT 
%greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs) +} + +%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { + %lhs.1 = f32[] parameter(0) + %rhs.1 = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1) +} + +ENTRY %SelectAndScatterScalar () -> f32[] { + %constant = f32[] constant(42) + %constant.1 = f32[] constant(1) + %constant.2 = f32[] constant(2) + ROOT %select-and-scatter = f32[] select-and-scatter(f32[] %constant, f32[] %constant.1, f32[] %constant.2), select=%ge_F32.v3, scatter=%add_F32.v3 +} + +)" +}, +// slice +{ +"Slice", +R"(HloModule slice_module + +ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] { + %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0) + ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3:1], [0:3:1], [0:4:2], [0:4:1]} +} + +)" +}, +// slice, no stride +{ +"SliceNoStride", +R"(HloModule Slice3x3x3_To_1x3x3_F32_module + +ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] { + %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } }) + ROOT %slice = f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]} +} + +)" +}, +// slice R0 +{ +"SliceR0", +R"(HloModule SliceR0_module + +ENTRY %SliceR0.v2 () -> s32[] { + %constant = s32[] constant(1) + ROOT %slice = s32[] slice(s32[] %constant), slice={} +} + +)" +}, +// transpose +{ +"Transpose", +R"(HloModule Transpose_module + +ENTRY %Transpose.v2 () -> s32[1,2,3] { + %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } }) + ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2} +} + +)" +}, +// Dynamic slice +{ +"DynamicSlice", +R"(HloModule DynamicSlice_module + +ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] { + %original_parameter = 
s32[2,2,258]{2,1,0} parameter(0) + %constant = s32[1]{0} constant({0}) + %start_index = s32[1]{0} parameter(1) + %concatenate = s32[3]{0} concatenate(s32[1]{0} %constant, s32[1]{0} %constant, s32[1]{0} %start_index), dimensions={0} + ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258} +} + +)" +}, +// Dynamic update slice +{ +"DynamicUpdateSlice", +R"(HloModule DynamicUpdateSlice_module + +ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] { + %input = s32[1,1,25,1]{3,2,1,0} parameter(0) + %update = s32[1,1,2,1]{3,2,1,0} parameter(1) + %start_indices = s32[4]{0} parameter(2) + ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices) +} + +)" +}, +// batch norm training +{ +"BatchNormTraining", +R"(HloModule BasicTraining_module + +ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) { + %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } }) + %constant.1 = f32[2]{0} constant({2, 3}) + %constant.2 = f32[2]{0} constant({1, 2}) + ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3 +} + +)" +}, +// batch norm inference +{ +"BatchNormInference", +R"(HloModule BatchNormInference_module + +ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] { + %input = f32[2,2,2,2]{3,2,1,0} parameter(0) + %offset = f32[2]{0} parameter(1) + %scale = f32[2]{0} parameter(2) + %mean = f32[2]{0} parameter(3) + %variance = f32[2]{0} parameter(4) + ROOT %batch-norm-inference = 
f32[2,2,2,2]{3,2,1,0} batch-norm-inference(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %offset, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance), epsilon=0.001, feature_index=0 +} + +)" +}, +// batch norm grad +{ +"BatchNormGrad", +R"(HloModule BatchNormGrad_module + +ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) { + %input = f32[2,2,2,2]{3,2,1,0} parameter(0) + %scale = f32[2]{0} parameter(1) + %mean = f32[2]{0} parameter(2) + %variance = f32[2]{0} parameter(3) + %grad_output = f32[2,2,2,2]{3,2,1,0} parameter(4) + ROOT %batch-norm-grad = (f32[2,2,2,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-grad(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance, f32[2,2,2,2]{3,2,1,0} %grad_output), epsilon=0.001, feature_index=0 +} + +)" +}, +// fft +{ +"Fft", +R"(HloModule Fft_module + +ENTRY %Fft (input: c64[8,32]) -> c64[8,32] { + %input = c64[8,32]{1,0} parameter(0) + ROOT %fft = c64[8,32]{1,0} fft(c64[8,32]{1,0} %input), fft_type=FFT, fft_length={32} +} + +)" +}, +// ifft +{ +"Ifft2d", +R"(HloModule Ifft2d_module + +ENTRY %Ifft2d (input: c64[5,8,32]) -> c64[5,8,32] { + %input = c64[5,8,32]{2,1,0} parameter(0) + ROOT %fft = c64[5,8,32]{2,1,0} fft(c64[5,8,32]{2,1,0} %input), fft_type=IFFT, fft_length={8,32} +} + +)" +}, +// rfft2d +{ +"Rfft2d", +R"(HloModule Rfft2d_module + +ENTRY %Rfft2d (input: f32[5,64,32]) -> c64[5,64,17] { + %input = f32[5,64,32]{2,1,0} parameter(0) + ROOT %fft = c64[5,64,17]{2,1,0} fft(f32[5,64,32]{2,1,0} %input), fft_type=RFFT, fft_length={64,32} +} + +)" +}, +// irfft3d +{ +"Irfft3d", +R"(HloModule Irfft3d_module + +ENTRY %Irfft3d (input: c64[5,64,128,33]) -> f32[5,64,128,64] { + %input = c64[5,64,128,33]{3,2,1,0} parameter(0) + ROOT %fft = f32[5,64,128,64]{3,2,1,0} fft(c64[5,64,128,33]{3,2,1,0} %input), fft_type=IRFFT, fft_length={64,128,64} +} + +)" +}, +// pad +{ +"Pad", +R"(HloModule Pad1DS3Array_module 
+ +ENTRY %Pad1DS3Array.v3 () -> f32[8] { + %constant = f32[3]{0} constant({1, 2, 3}) + %constant.1 = f32[] constant(0.1) + ROOT %pad = f32[8]{0} pad(f32[3]{0} %constant, f32[] %constant.1), padding=3_1 +} + +)" +}, +// pad has interior +{ +"PadHasInterior", +R"(HloModule PadHasInterior_module + +ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] { + %input = f32[1,25,7,7]{3,2,1,0} parameter(0) + %constant = f32[] constant(-5.123) + ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0 +} + +)" +}, +// Negative padding +{ +"PadHasNegativePadding", +R"(HloModule PadHasNegativePadding_module + +ENTRY %PadHasNegativePadding (input: f32[1,25,7,7,10]) -> f32[1,15,6,3,29] { + %input = f32[1,25,7,7,10]{4,3,2,1,0} parameter(0) + %constant = f32[] constant(-5.123) + ROOT %pad = f32[1,15,6,3,29]{4,3,2,1,0} pad(f32[1,25,7,7,10]{4,3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_-10_0x0_-1_0x-2_-2_0x-1_-1_3 +} + +)" +}, +// fusion +{ +"Fusion", +R"(HloModule fusion_module + +%fused_computation (constant.param_0: f32[3,2,1,1], constant.1.param_1: f32[2]) -> f32[3,2,1,1] { + %constant.param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0) + %constant.1.param_1 = f32[2]{0} parameter(1) + %broadcast = f32[3,2,1,1]{3,2,1,0} broadcast(f32[2]{0} %constant.1.param_1), dimensions={1} + ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %constant.param_0, f32[3,2,1,1]{3,2,1,0} %broadcast) +} + +ENTRY %fusion.v3 () -> f32[3,2,1,1] { + %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } }) + %constant.1 = f32[2]{0} constant({3.14, 4.25}) + ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation +} + +)" +}, +{ +"Sparse", +R"(HloModule sparse_f32 + +ENTRY 
%sparse () -> f32[2,3,4] { + ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3}) +} + +)" +}, +{ +"SparseEmpty", +R"(HloModule sparse_f32_empty + +ENTRY %sparse_f32_empty () -> f32[2,3,4] { + ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{}) +} + +)" +}, +{ +"SparseR1", +R"(HloModule sparse_f32_r1 + +ENTRY %sparse_f32_r1 () -> f32[9] { + ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6}) +} + +)" +}, +{ +"gather", +R"(HloModule StringifyGather + +ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26} +} + +)" +}, +{ +"scatter", +R"(HloModule StringifyScatter + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7,5], updates: f32[10,9,8,7,30,29,28,27,26]) -> f32[50,49,48,47,46] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, to_apply=%add_F32.v3 +} + +)" +}, +{ + 
"ConstantUnsignedNoUnderflow", + R"(HloModule ConstantUnsignedNoUnderflow_module + +ENTRY %ConstantUnsignedNoUnderflow () -> u64[] { + ROOT %constant = u64[] constant(1) +} + +)" +}, + +{ + "ConstantUnsignedNoOverflow", + R"(HloModule ConstantUnsignedNoOverflow_module + +ENTRY %ConstantUnsignedNoOverflow () -> u64[] { + ROOT %constant = u64[] constant(9223372036854775807) +} + +)" +}, + }); + // clang-format on +} + +std::vector CreateShortTestCases() { + // clang-format off + return std::vector({ +// map +{ +"Map", +R"(HloModule MapBinaryAdder_module + +add_F32.v3 { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY MapBinaryAdder.v3 { + param0 = f32[4]{0} parameter(0) + param1 = f32[4]{0} parameter(1) + ROOT map = f32[4]{0} map(param0, param1), dimensions={0}, to_apply=add_F32.v3 +} + +)" +}, +// reduce +{ +"Reduce", +R"(HloModule ReduceR3ToR2_module + +add_F32.v3 { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY ReduceR3ToR2.v3 { + input = f32[8,16,256]{2,1,0} parameter(0) + constant = f32[] constant(0) + ROOT reduce = f32[8,16]{1,0} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 +} + +)" +}, +// tuple reduce +{ +"TupleReduce", +R"(HloModule TupleReduce + +max_argmax { + value = f32[] parameter(2) + prev_max = f32[] parameter(0) + is_next_larger = pred[] greater-than-or-equal-to(value, prev_max) + max = f32[] select(is_next_larger, value, prev_max) + index = s32[] parameter(3) + prev_argmax = s32[] parameter(1) + argmax = s32[] select(is_next_larger, index, prev_argmax) + ROOT pair = (f32[], s32[]) tuple(max, argmax) +} + +ENTRY reduce_entry { + values = f32[1024]{0} parameter(0) + indices = f32[1024]{0} parameter(1) + init_value = f32[] constant(-inf) + init_index = s32[] constant(-1) + ROOT result = (f32[], s32[]) reduce(values, indices, init_value, init_index), dimensions={0}, to_apply=max_argmax +} + +)" +}, +// infeed/outfeed +{ +"InfeedOutfeed", 
+R"(HloModule outfeed_module + +ENTRY InfeedToOutfeed { + token = token[] after-all() + infeed = ((u32[3]{0}, pred[]), token[]) infeed(token) + infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0 + outfeed = token[] outfeed(infeed.data, token) + ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token) + infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0 + infeed.1.token = token[] get-tuple-element(infeed.1), index=1 + outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token) +} + +)" +}, +// Rng +{ +"Rng", +R"(HloModule rng_module + +ENTRY Rng { + constant = f32[] constant(0) + constant.1 = f32[] constant(1) + ROOT rng = f32[8]{0} rng(constant, constant.1), distribution=rng_uniform +} + +)" +}, +// Reduce precision +{ +"ReducePrevison", +R"(HloModule reduce_precision + +ENTRY ReducePrecision { + constant = f32[1]{0} constant({3.14159}) + ROOT reduce-precision = f32[1]{0} reduce-precision(constant), exponent_bits=8, mantissa_bits=10 +} + +)" +}, +// Sort (Key) +{ +"SortKey", +R"(HloModule sort + +ENTRY Sort { + x = f32[1024]{0} parameter(0) + ROOT sorted = f32[1024]{0} sort(x), dimensions={0} +} + +)" +}, +// Sort (Key, Value) +{ +"SortKeyValue", +R"(HloModule sort + +ENTRY Sort { + keys = f32[1024]{0} parameter(0) + values = s32[1024]{0} parameter(1) + ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0} +} + +)" +}, +// R2 Sort (Key) +{ +"SortKeyR2", +R"(HloModule sort + +ENTRY Sort { + x = f32[1024,16]{0,1} parameter(0) + ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0} +} + +)" +}, +// R2 Sort (Key, Value) +{ +"SortKeyValueR2", +R"(HloModule sort + +ENTRY Sort { + keys = f32[1024,16]{0,1} parameter(0) + values = s32[1024,16]{0,1} parameter(1) + ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0} +} + +)" +}, +// Conditional +{ +"Conditional", +R"(HloModule conditional + +Negate { + x = f32[] parameter(0) + ROOT negate = f32[] negate(x) +} + +Identity { 
+ y = f32[] parameter(0) + ROOT copy = f32[] copy(y) +} + +ENTRY Parameters1.v4 { + constant = pred[] constant(true) + constant.1 = f32[] constant(56) + constant.2 = f32[] constant(12) + ROOT conditional = f32[] conditional(constant, constant.1, constant.2), true_computation=Negate, false_computation=Identity +} + +)" +}, +// CustomCall +{ +"CustomCall", +R"(HloModule custom_call + +ENTRY CustomCall { + constant = f32[1]{0} constant({12345}) + ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar" +} + +)" +}, +// Variables with non-default names +{ +"NonDefaultNames", +R"(HloModule add_constants_module + +ENTRY add_constants { + foo = f32[] constant(3.14) + ROOT bar = f32[] add(foo, foo) +} + +)" +}, +{ +"Dot", +R"(HloModule dot + +ENTRY dot { + a = f32[2,10]{1,0} parameter(0) + b = f32[10,3]{1,0} parameter(1) + ROOT dot = f32[2,3]{1,0} dot(a, b), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={0} +} + +)" +}, +{ +"gather", +R"(HloModule gather + +ENTRY Gather { + input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26} +} + +)" +}, +// cross-replica-sum +{ +"CrossReplicaSum", +R"(HloModule CRS + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CRS { + input = f32[8]{0} parameter(0) + ROOT crs = f32[8]{0} cross-replica-sum(input), replica_groups={}, to_apply=add +} + +)" +}, +// cross-replica-sum with subgroups +{ +"CrossReplicaSumWithSubgroups", +R"(HloModule CRS_Subgroups + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CrossReplicaSumWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT 
cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add +} + +)" +}, +// all-to-all +{ +"AllToAll", +R"(HloModule AllToAll + +ENTRY AllToAll { + input = f32[128,32]{0,1} parameter(0) + ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={} +} + +)" +}, +// all-to-all with subgroups +{ +"AllToAllWithSubgroups", +R"(HloModule AllToAllWithSubgroups + +ENTRY AllToAllWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={{1,2},{3,0}} +} + +)" +}, +// collective-permute +{ +"CollectivePermute", +R"(HloModule CollectivePermute + +ENTRY CollectivePermute { + input = f32[128,32]{0,1} parameter(0) + ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}} +} + +)" +}, +// Iota +{ +"Iota", +R"(HloModule iota + +ENTRY Iota { + ROOT iota = f32[100]{0} iota(), iota_dimension=0 +} + +)" +}, +// custom-call with window and dim_labels +{ +"CustomCallWithWindowAndDimLabels", +R"(HloModule CustomCallWithWindowAndDimLabels + +ENTRY Computation { + ROOT r = f32[100]{0} custom-call(), window={size=2x2}, dim_labels=b01f_01io->b01f, custom_call_target="target" +} + +)" +} + }); + // clang-format on +} + +class HloParserTest : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + static void ExpectHasSubstr(string_view s, string_view expected) { + EXPECT_TRUE(absl::StrContains(s, expected)) + << "'" << s << "' does not contain '" << expected << "'"; + } + + // Expects "ToString(ParseHloString(string)) == string", that is, parses the + // string, asserts that it succeeded, stringifies the parsed module, and + // checks that it equals the original string.
+ void ExpectEqual() { + const string& original = GetParam().module_string; + auto result = ParseHloString(original); + TF_ASSERT_OK(result.status()); + EXPECT_EQ(original, result.ValueOrDie()->ToString( + HloPrintOptions().set_print_large_constants(true))); + } +}; + +class HloParserShortTest : public HloParserTest { + protected: + void ExpectEqualShort() { + const string& original = GetParam().module_string; + auto result = ParseHloString(original); + TF_ASSERT_OK(result.status()); + EXPECT_EQ(original, + result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable())); + } +}; + +TEST_P(HloParserTest, Run) { ExpectEqual(); } + +TEST_P(HloParserShortTest, Run) { ExpectEqualShort(); } + +INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest, + ::testing::ValuesIn(CreateTestCases()), + TestDataToString); + +INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest, + ::testing::ValuesIn(CreateShortTestCases()), + TestDataToString); + +TEST_F(HloParserTest, Empty) { + const string original = ""; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, Garbage) { + const string original = "HloModule thi$ str1ng makes# N0 sen$e @all!*&^%$"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, WrongOpcode) { + const string original = R"(HloModule wrong_opcode: + +ENTRY %blabla (x: f32[], y: f32[]) -> f32[] { + %x = f32[]{} parameter(0) + %y = f32[]{} parameter(1) + %le = pred[]{} le(f32[]{} %x, f32[]{} %y) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, WrongShape) { + const string original = R"(HloModule wrong_opcode: + +ENTRY %blabla (x: g32[]) -> g32[] { + %x = g32[]{} parameter(0) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, WrongOperandsSize) { + const 
string original = R"(HloModule wrong_opcode: + +ENTRY %blabla (x: f32[]) -> pred[] { + %x = f32[]{} parameter(0) + %eq = pred[]{} equal-to(f32[]{} %x) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, OperandNotFound) { + const string original = R"(HloModule operand_not_found: +ENTRY %blabla (x: f32[]) -> pred[] { + %x = f32[]{} parameter(0) + %eq = pred[]{} equal-to(f32[]{} %x, f32[]{} %y) +} +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, MoreConstants) { + const string original = R"(HloModule SelectScalarS32True_module + +ENTRY %SelectScalarS32True.v4 () -> s32[] { + %constant.2 = pred[] constant(true) + %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,3]1,2,3,4} + %constant = s32[] constant(42) + %select = s32[] select(pred[] %constant.2, s32[] %constant.1, s32[] %constant) +} + +)"; + auto result = ParseHloString(original); + TF_EXPECT_OK(result.status()); + // Constant instructions have no name. The string will be parsed successfully + // but the constant names will not be exactly the same. 
+} + +TEST_F(HloParserTest, ConfigurationField) { + const string original = R"(HloModule AModule +ENTRY %configuration_test() -> s32[] { + %constant = s32[] constant(42), backend_config="foo bar" +})"; + auto result = ParseHloString(original); + TF_ASSERT_OK(result.status()); + EXPECT_EQ("foo bar", result.ValueOrDie() + ->entry_computation() + ->root_instruction() + ->raw_backend_config_string()); +} + +TEST_F(HloParserTest, LiteralDimensionsMismatch_1) { + const string original = R"(HloModule some_2_module + +ENTRY %some_2 () -> f32[2] { + ROOT %constant = f32[2]{0} constant({1,{2}}) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "expects nested array in rank 1, but sees larger"); +} + +TEST_F(HloParserTest, LiteralDimensionsMismatch_2) { + const string original = R"(HloModule some_2x3_module + +ENTRY %some_2x3 () -> f32[2,3] { + ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6}) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "expects nested array in rank 2, but sees 1"); +} + +TEST_F(HloParserTest, LiteralDimensionsMismatch_3) { + const string original = R"(HloModule some_2x3x2_module + +ENTRY %some_2x3x2 () -> f32[2,3,2] { + ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}}) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "expects 3 elements in the [0]th element"); +} + +TEST_F(HloParserTest, ConstantF16Overflow) { + const string original = + R"(HloModule ConstantF16Overflow_module + +ENTRY %ConstantF16Overflow.v4 () -> f16[] { + ROOT %constant = f16[] constant(-65505) +} + +)"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + 
ExpectHasSubstr(result.status().error_message(), + "is out of range for literal's primitive type F16"); +} + +TEST_F(HloParserTest, ConstantUnsignedUnderflow) { + const string original = R"( + HloModule ConstantUnsignedUnderflow_module + ENTRY %ConstantUnsignedUnderflow () -> u64[] { + ROOT %constant = u64[] constant(-1) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "is out of range for literal's primitive type U64"); +} + +TEST_F(HloParserTest, ConstantUnsignedOverflow) { + const string original = R"( + HloModule ConstantUnsignedOverflow_module + ENTRY %ConstantUnsignedOverflow () -> u32[] { + ROOT %constant = u32[] constant(4294967296) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "is out of range for literal's primitive type U32"); +} + +TEST_F(HloParserTest, ConstantUnsignedInt64Overflow) { + const string original = R"( + HloModule ConstantUnsignedOverflow_module + ENTRY %ConstantUnsignedOverflow () -> u64[] { + ROOT %constant = u64[] constant(9223372036854775808) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + +TEST_F(HloParserTest, ConstantWithExp) { + const string original = R"(HloModule ConstantWithExp_module + +ENTRY %ConstantWithExp.v4 () -> f32[] { + %constant.1 = f32[] constant(3e+2) +} + +)"; + auto result = ParseHloString(original); + TF_EXPECT_OK(result.status()); + // The string will be parsed successfully but the output strings are not + // exactly the same, because "3e2" is parsed into value 300 and will be + // printed as "300". 
+} + +TEST_F(HloParserTest, AttibutesAnyOrder) { + const string original = R"(HloModule any_order_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { + %input = f32[1,2,1]{2,1,0} parameter(0) + %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) + %filter = f32[1,1,1]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), feature_group_count=1, sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2} +} + +)"; + TF_EXPECT_OK(ParseHloString(original).status()); +} + +TEST_F(HloParserTest, InvalidDimLabels) { + string prefix = R"(HloModule invalid_dim_labels_module + +ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { + %input = f32[1,2,1]{2,1,0} parameter(0) + %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) + %filter = f32[1,1,1]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1} )"; + string suffix = R"( +} + +)"; + + ExpectHasSubstr( + ParseHloString(absl::StrCat(prefix, ",dim_labels=00_01_10", suffix)) + .status() + .error_message(), + "expects dim labels pattern"); + + ExpectHasSubstr( + ParseHloString(absl::StrCat(prefix, ",dim_labels=010_1100->010", suffix)) + .status() + .error_message(), + "must have the same rank"); +} + +TEST_F(HloParserTest, UnexpectedAttribute) { + const string original = R"(HloModule unexpected_attr_module + +ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { + %token = token[] after-all() + %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15 + %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15 + ROOT %constant = f32[] constant(2.1) + %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, calls=%recv + %send-done = token[] send-done((f32[], u32[], token[]) %send), 
channel_id=16 +} + +)"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "unexpected attribute \"calls\""); +} + +TEST_F(HloParserTest, MissingAttribute) { + const string original = R"(HloModule missing_attr_module + +ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { + %token = token[] after-all() + %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15 + %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15 + ROOT %constant = f32[] constant(-2.1) + %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token) + %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16 +} + +)"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "attribute channel_id is expected but not seen"); +} + +TEST_F(HloParserTest, PredecessorUndefined) { + const string original = R"(HloModule pre_not_found_module + +ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { + %token = token[] after-all() + %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15 + %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15 + ROOT %constant = f32[] constant(2.1) + %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, control-predecessors={%done} + %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16 +} + +)"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "'done' is not defined"); +} + +TEST_F(HloParserTest, SliceAllowOmitStride1) { + const string original = R"(HloModule slice_module + +ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] { + %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0) + ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3], [0:3], [0:4:2], [0:4]} +} + +)"; + TF_EXPECT_OK(ParseHloString(original).status()); +} + +TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) { + const string original = R"(HloModule window_pad_module + +ENTRY 
%Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { + %input = f32[1,2,1]{2,1,0} parameter(0) + %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) + %filter = f32[1,1,1]{2,1,0} parameter(1) + ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), dim_labels=b0f_0io->b0f, window={pad=1_1_0 size=1} +} + +)"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "expects padding_low and padding_high separated by '_'"); +} + +TEST_F(HloParserTest, CommaBetweenSubAttributes) { + const string original = R"(HloModule test_comma_module + +ENTRY %test_comma.v4 () -> f32[] { + ROOT %constant = f32[] constant(-4.2), metadata={source_line=5, op_type="::const"} +} + +)"; + TF_EXPECT_OK(ParseHloString(original).status()); +} + +TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) { + const string original = R"(HloModule custom_call: + +ENTRY %CustomCall () -> f32[1] { + %constant = f32[1]{0} constant({12345}) + ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar" +})"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "Shape of computation CustomCall, f32[1], is not compatible " + "with that of its root instruction foo, f32[1,2,3]"); +} + +TEST_F(HloParserTest, EntryComputationWithLayout) { + const string original = R"(HloModule layout: +add_F32.v3 { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] { + input = f32[8,16,256]{0,1,2} parameter(0) + constant = f32[] constant(0) + ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 +})"; + + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); + auto program_layout = module.ValueOrDie()->entry_computation_layout(); + ASSERT_EQ(program_layout.parameter_count(), 1); + auto param_layout = 
program_layout.parameter_layout(0).layout(); + auto result_layout = program_layout.result_layout().layout(); + EXPECT_TRUE( + LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}), param_layout)) + << "actual layout of parameter(0) is " + << LayoutUtil::HumanString(param_layout); + EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1}), result_layout)) + << "actual layout of result is " + << LayoutUtil::HumanString(result_layout); +} + +TEST_F(HloParserTest, NoEntry) { + const string original = R"(HloModule no_entry: +c1 { + const1 = f32[1]{0} constant({12345}) +} +c2 { + const2 = f32[1]{0} constant({67890}) +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); + EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2"); +} + +TEST_F(HloParserTest, NoRoot) { + const string original = R"(HloModule no_root: +ENTRY consts { + first = f32[1]{0} constant({12345}) + last = f32[1]{0} constant({67890}) +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); + EXPECT_EQ( + module.ValueOrDie()->entry_computation()->root_instruction()->name(), + "last"); +} + +TEST_F(HloParserTest, Comments) { + const string original = R"(/* module description. 
*/ +HloModule comments: + +ENTRY /*comment*/ c1 { + /* blah */ + ROOT const1 = /*foo*/f32[1]{0} constant({12345 /*bar*/}) + /* comment */ +} + +/* something else */ + +)"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, MultilineComments) { + const string original = R"(HloModule multiline_comment: +ENTRY c1 { + /* + ROOT foo = f32[1]{0} constant({12345}) + */ + ROOT const1 = f32[1]{0} constant({12345}) +/* +a +b +c +d + +*/ +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, UnterminatedComment) { + const string original = R"(HloModule unterminated_comment: +ENTRY c1 { +/* unterminated + ROOT const1 = f32[1]{0} constant({12345}) +})"; + // Verify that the error message points to the beginning of the unterminated + // comment. + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "/* unterminated\n^"); +} + +TEST_F(HloParserTest, SlashSlashComments) { + const string original = R"(HloModule slash_slash_comment: +// Garbage +ENTRY c1 { + // Foo bar + ROOT const1 = f32[1]{0} constant({12345}) // Something else +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, SlashSlashCommentMsDosEolFormat) { + const string original = + "HloModule slash_slash_comment:\r\n// Garbage\r\nENTRY c1 {\r\n// Foo " + "bar\r\nROOT const1 = f32[1]{0} constant({12345}) // Something else\r\n}"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, SlashSlashCommentMacEolFormat) { + const string original = + "HloModule slash_slash_comment:\r// Garbage\rENTRY c1 {\r// Foo " + "bar\rROOT const1 = f32[1]{0} constant({12345}) // Something else\r}"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, MultipleEntries) { + const string original = R"(HloModule multiple_entries: +ENTRY c1 { + const1 = f32[1]{0} 
constant({12345}) +} +ENTRY c2 { + const2 = f32[1]{0} constant({67890}) +})"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "expects only one ENTRY"); +} + +TEST_F(HloParserTest, MultipleRoots) { + const string original = R"(HloModule multiple_roots: +ENTRY consts { + ROOT const1 = f32[1]{0} constant({12345}) + ROOT const2 = f32[1]{0} constant({12345}) +})"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "one computation should have only one ROOT"); +} + +TEST_F(HloParserTest, ComputationExists) { + const string original = R"(HloModule comp_exists +comp { + const1 = f32[1]{0} constant({12345}) +} +comp { + const2 = f32[1]{0} constant({67890}) +})"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + R"(was parsing 2:1: error: computation previously defined here +comp { +^)"); +} + +TEST_F(HloParserTest, CrossComputationLookup) { + const string original = R"(HloModule cross_computation_lookup: +tcalla (a: (s32[], s32[])) -> (s32[], s32[]) { + ROOT aparam = (s32[], s32[]) parameter(0) +} + +tcallb (b: (s32[], s32[])) -> s32[] { + rparam = (s32[], s32[]) parameter(0) + ROOT gte0 = s32[] get-tuple-element(aparam), index=0 +} + +ENTRY entry { + param = (s32[], s32[]) parameter(0) + call0 = (s32[], s32[]) call(param), to_apply=tcalla + ROOT call1 = s32[] call(param), to_apply=tcallb +})"; + ExpectHasSubstr( + ParseHloString(original).status().error_message(), + "was parsing 8:39: error: instruction does not exist: aparam"); +} + +TEST_F(HloParserTest, ParseSharding) { + const string original = "{maximal device=42}"; + TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original)); + EXPECT_EQ(sharding.ToString(), original); +} + +TEST_F(HloParserTest, ParseWindow) { + Window original = window_util::MakeWindow({1, 2, 3}); + TF_ASSERT_OK_AND_ASSIGN(Window parsed, + ParseWindow(window_util::ToString(original))) + EXPECT_EQ(window_util::ToString(original), window_util::ToString(parsed)); +} + 
+TEST_F(HloParserTest, ParseConvolutionDimensionNumbers) { + const string original = "b0f_0io->b0f"; + TF_ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums, + ParseConvolutionDimensionNumbers(original)); + EXPECT_EQ(original, ConvolutionDimensionNumbersToString(dnums)); +} + +TEST_F(HloParserTest, ParsePaddingConfigNoInteriorPadding) { + const string original = "0_1x2_3"; + TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original)); + EXPECT_EQ(original, PaddingConfigToString(dnums)); +} + +TEST_F(HloParserTest, ParsePaddingConfigInteriorPadding) { + const string original = "0_1_0x2_3_4"; + TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original)); + EXPECT_EQ(original, PaddingConfigToString(dnums)); +} + +TEST_F(HloParserTest, ParsePaddingConfigInteriorPaddingImplicitZeroDim) { + TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig("0_1x2_3_4")); + // The extra "_0" gets added to the canonical string because the other dim has + // interior padding. 
+ EXPECT_EQ("0_1_0x2_3_4", PaddingConfigToString(dnums)); +} + +TEST_F(HloParserTest, NontupleInfeed) { + const string original = R"(HloModule nontuple_infeed: +ENTRY nontuple_infeed { + token = token[] after-all() + ROOT infeed = pred[] infeed(token) +})"; + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "infeed must have a non-empty tuple shape"); +} + +TEST(HloParserSingleOpTest, SingleOp) { + const string text = + "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, " + "f32[2,4]{1,0} %x)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text)); + const HloComputation* computation = module->entry_computation(); + ASSERT_NE(computation, nullptr); + EXPECT_THAT(computation->root_instruction(), + op::Multiply(op::Parameter(0), op::Parameter(1))); +} + +TEST(HloParserSingleOpTest, SingleOpNoShapesProducesError) { + const string text = "%multiply = f32[2,4]{1,0} multiply(%broadcast, %x)"; + StatusOr> module = ParseHloOpToModule(text); + ASSERT_TRUE(!module.status().ok()); + LOG(INFO) << "Status: " << module.status(); + EXPECT_THAT( + module.status().ToString(), + ::testing::HasSubstr("Operand broadcast had no shape in HLO text")); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h index b3d0a07add39968c6310392ea01daeab8a7dd9af..791b1a97b0b82edf19ff1588fd8d5d996ac0fef4 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_fix.h +++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PASS_FIX_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PASS_FIX_H_ +#include + #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,9 +36,19 @@ class HloPassFix : public Pass { StatusOr Run(HloModule* module) override { bool changed = false; bool changed_this_iteration = true; + int64 iteration_count = 0; + int64 limit = + std::max(static_cast(1000), module->instruction_count()); while (changed_this_iteration) { TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module)); changed |= changed_this_iteration; + ++iteration_count; + if (iteration_count == limit) { + LOG(ERROR) + << "Unexpectedly high number of iterations in HLO passes (" + << iteration_count + << ")\nIf compilation hangs here, please file a bug with XLA."; + } } return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_pass_interface.h b/tensorflow/compiler/xla/service/hlo_pass_interface.h index 0cddf8fb8f7589739d1233fa4974ff703211a137..f1ad0f9b0148cb3d5f938e7f5d220d6cb82ea98d 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_interface.h +++ b/tensorflow/compiler/xla/service/hlo_pass_interface.h @@ -29,7 +29,7 @@ namespace xla { class HloPassInterface { public: virtual ~HloPassInterface() = default; - virtual tensorflow::StringPiece name() const = 0; + virtual absl::string_view name() const = 0; // Run the pass on the given HLO module. Return whether it modified the // module. diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index d8f1ab916b5c5c500c2d8dcd8605be083f95862a..6e4ed0de626688c0d836d6bc9c619245db8d61dd 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -17,22 +17,23 @@ limitations under the License. 
#include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; - namespace xla { - namespace { + +using absl::StrAppend; +using absl::StrCat; + void DumpModuleGraph(const HloModule& module, const string& message) { hlo_graph_dumper::MaybeDumpHloModule(module, message); VLOG(3) << "HLO " << message << ":"; @@ -48,9 +49,9 @@ void DumpModuleProto(const HloModule& module, const string& dump_to, tensorflow::mutex_lock lock(mu); const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++; - const string mod_name = SanitizeFileName(tensorflow::strings::Printf( - "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number, - pipeline_name.c_str(), pass_name.c_str())); + const string mod_name = SanitizeFileName( + absl::StrFormat("module_%04d.%04d.%s.after_%s", module.unique_id(), + pass_number, pipeline_name, pass_name)); TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module), dump_to, mod_name)); @@ -68,7 +69,7 @@ StatusOr HloPassPipeline::Run(HloModule* module) { repeated_field.end()); if (!disabled_passes.empty()) { VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: " - << tensorflow::str_util::Join(disabled_passes, ", "); + << absl::StrJoin(disabled_passes, ", "); } auto run_invariant_checkers = [this, @@ -90,7 +91,7 @@ StatusOr HloPassPipeline::Run(HloModule* module) { return Status::OK(); }; - string prefix = std::string(name()) + ": pipeline start"; + 
string prefix = StrCat(name(), ": pipeline start"); bool changed = false; string message; TF_RETURN_IF_ERROR( @@ -98,12 +99,12 @@ StatusOr HloPassPipeline::Run(HloModule* module) { const string xla_dump_per_pass_hlo_proto_to = module->config().debug_options().xla_dump_per_pass_hlo_proto_to(); if (!xla_dump_per_pass_hlo_proto_to.empty()) { - DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, - std::string(name()), "pipeline_start"); + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, string(name()), + "pipeline_start"); } for (auto& pass : passes_) { - if (disabled_passes.count(std::string(pass->name())) > 0) { + if (disabled_passes.count(string(pass->name())) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() << ", disabled by --xla_disable_hlo_passes"; continue; @@ -120,8 +121,8 @@ StatusOr HloPassPipeline::Run(HloModule* module) { TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("after running pass: ", pass->name()))); if (!xla_dump_per_pass_hlo_proto_to.empty()) { - DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, - std::string(name()), std::string(pass->name())); + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, string(name()), + string(pass->name())); } changed |= changed_this_pass; diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h index a42d7e59fed2d838dfe3cb7f99e6b946edfdb0b4..1d41a4dac1d8e2f392be0e4e856ead36a5b71d68 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h @@ -21,7 +21,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,7 +34,7 @@ namespace xla { class HloPassPipeline : public HloPassInterface { public: explicit HloPassPipeline(const string& name) : name_(name) {} - tensorflow::StringPiece name() const override { return name_; } + absl::string_view name() const override { return name_; } // Add a pass to the pipeline. It should be called with the arguments for the // pass constructor: diff --git a/tensorflow/compiler/xla/service/hlo_proto_util_test.cc b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc index b9cca138703c8fa61aadf69dd7304a215a9f4be2..c3cacd7ce6b1ea3ad7cf84e898f274ae12622ac5 100644 --- a/tensorflow/compiler/xla/service/hlo_proto_util_test.cc +++ b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc index d45038f1f4a2e4aa19234eec93fdc9a068a902e1..2a07b6fcbc243d955e136ccdf097c8155a115845 100644 --- a/tensorflow/compiler/xla/service/hlo_query.cc +++ b/tensorflow/compiler/xla/service/hlo_query.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_query.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -61,7 +61,7 @@ bool AllOperandsAreConstants(const HloInstruction& instruction) { } HloInstruction* GetMatchingOperand( - std::function matcher, + const std::function& matcher, HloInstruction* instruction) { for (HloInstruction* op : instruction->operands()) { if (matcher(op)) { @@ -72,7 +72,7 @@ HloInstruction* GetMatchingOperand( } bool MatchBinaryInstructionOperand( - std::function matcher, + const std::function& matcher, HloInstruction* instruction, HloInstruction** matching_operand, HloInstruction** other_operand) { CHECK_EQ(instruction->operand_count(), 2); diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h index c79347bbf9d6146943b7b787f713369cb37fadee..c0826a6aee1f693484207a86ec258c6604d92318 100644 --- a/tensorflow/compiler/xla/service/hlo_query.h +++ b/tensorflow/compiler/xla/service/hlo_query.h @@ -45,7 +45,7 @@ bool IsScalarConstant(const HloInstruction* instruction); // multiple matching operands, then the first matching operand is returned. If // there are no matching operands then nullptr is returned. HloInstruction* GetMatchingOperand( - std::function matcher, + const std::function& matcher, HloInstruction* instruction); // Returns whether a binary instruction has a matching operand. Sets @@ -53,7 +53,7 @@ HloInstruction* GetMatchingOperand( // other_operand. Note: in the case where both operands match, the first operand // of the instruction is returned. 
bool MatchBinaryInstructionOperand( - std::function matcher, + const std::function& matcher, HloInstruction* instruction, HloInstruction** matching_operand, HloInstruction** other_operand); diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc index 4738e46f8aeb96a4c25d04b3246bd21f644fe3ea..961930f0a888e90f86e4354fa1373a303af8ec2f 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability.cc @@ -18,7 +18,7 @@ limitations under the License. namespace xla { HloReachabilityMap::HloReachabilityMap( - const std::list& instructions) + absl::Span instructions) : size_(instructions.size()) { bit_vectors_.reserve(size_); for (const HloInstruction* hlo : instructions) { @@ -29,7 +29,7 @@ HloReachabilityMap::HloReachabilityMap( } bool HloReachabilityMap::SetReachabilityToUnion( - tensorflow::gtl::ArraySlice inputs, + absl::Span inputs, const HloInstruction* instruction) { BitVector& bit_vector = GetBitVector(instruction); tmp_bit_vector_ = bit_vector; @@ -38,13 +38,13 @@ bool HloReachabilityMap::SetReachabilityToUnion( } void HloReachabilityMap::FastSetReachabilityToUnion( - tensorflow::gtl::ArraySlice inputs, + absl::Span inputs, const HloInstruction* instruction) { SetReachabilityToUnionHelper(inputs, instruction, &GetBitVector(instruction)); } void HloReachabilityMap::SetReachabilityToUnionHelper( - tensorflow::gtl::ArraySlice inputs, + absl::Span inputs, const HloInstruction* instruction, BitVector* bit_vector) { // If instruction is part of inputs, don't reset the bit_vector. 
if (std::find(inputs.begin(), inputs.end(), instruction) == inputs.end()) { diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 69bb2b3cee6dafe058c45b4e74e93401bea2cfc9..b66a2aa4bd2b00a88cdbfa6b41c9123bb370aa87 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -19,10 +19,10 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/types.h" @@ -41,7 +41,8 @@ class HloReachabilityMap { public: // Sets up a graph with no edges and where the nodes correspond to the given // instructions. - explicit HloReachabilityMap(const std::list& instructions); + explicit HloReachabilityMap( + absl::Span instructions); // Set the reachability set of 'instruction' to the union of the reachability // sets of 'inputs'. Upon return, IsReachable(x, instruction) where @@ -53,13 +54,12 @@ class HloReachabilityMap { // vector in the internal graph of this HloReachabilityMap for the given // instruction and does not transitively update any other part of the // adjacency matrix. - bool SetReachabilityToUnion( - tensorflow::gtl::ArraySlice inputs, - const HloInstruction* instruction); + bool SetReachabilityToUnion(absl::Span inputs, + const HloInstruction* instruction); // As above, but faster because it does not check if the reachability changed. void FastSetReachabilityToUnion( - tensorflow::gtl::ArraySlice inputs, + absl::Span inputs, const HloInstruction* instruction); // Sets entry so that IsReachable(a, b) will return true @@ -140,7 +140,7 @@ class HloReachabilityMap { // Helper for SetReachabilityToUnion/FastSetReachabilityToUnion. 
void SetReachabilityToUnionHelper( - tensorflow::gtl::ArraySlice inputs, + absl::Span inputs, const HloInstruction* instruction, BitVector* bit_vector); // Return the index of the given instruction. The value is used to index into diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc index 657a9ee83d29e72b95660325f9139f44159d6508..585c95972b0e01abc14543205af71b4b0c0bdf3c 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc @@ -39,15 +39,15 @@ TEST_F(HloReachabilityTest, Reachability) { */ auto builder = HloComputation::Builder(TestName()); auto a = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto b = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto c = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto d = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto e = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); builder.Build(); HloReachabilityMap reachability({a, b, c, d, e}); diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 39b85de0f12024f5e20ddd37618987c6d06bc307..c9629926eae5132f683a353a430a724a66ef3d60 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -20,9 +20,14 @@ limitations under the License. 
#include #include +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/buffer_value.h" +#include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" @@ -36,17 +41,13 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" -using ::tensorflow::strings::HumanReadableNumBytes; - namespace xla { - namespace { +using ::tensorflow::strings::HumanReadableNumBytes; + // Potential optimizations: // . TODO(b/35244891): Avoid N^2 behavior by keeping a priority queue // of candidates. @@ -71,9 +72,23 @@ bool IsRematerializable(const HloInstruction* instruction) { } } +// Checks whether an instruction can be rematerialized, by looking up the +// cache before, and eventually calling the IsRematerializable() API. +bool CanBeRematerialized( + const HloInstruction* instruction, + tensorflow::gtl::FlatMap* remat_able) { + auto it = remat_able->find(instruction); + if (it != remat_able->end()) { + return it->second; + } + bool rematerializable = IsRematerializable(instruction); + (*remat_able)[instruction] = rematerializable; + return rematerializable; +} + // Type holding a unique identifier for each Buffer object. 
using BufferId = int64; -using BufferIdList = tensorflow::gtl::InlinedVector; +using BufferIdList = absl::InlinedVector; // We wrap HloInstruction* with an Item that holds auxiliary // per-instruction state. @@ -108,7 +123,7 @@ struct Item { int64 position; }; -using ItemList = tensorflow::gtl::InlinedVector; +using ItemList = absl::InlinedVector; // Class which maintains an ordered list of instructions with fast insertion // before arbitrary elements. @@ -187,15 +202,14 @@ class InstructionList { // On object construction this ordinal is precisely the instruction's index // in the list. Later, instructions inserted via InsertBefore receive // duplicate values. However, monotonicity is preserved. - void InsertBeforeInstructions( - Item* to_insert, tensorflow::gtl::ArraySlice before_instructions) { + void InsertBeforeInstructions(Item* to_insert, + absl::Span before_instructions) { VLOG(3) << "InsertBeforeInstructions: " << to_insert->instruction->name() << " before {" - << tensorflow::str_util::Join(before_instructions, ", ", - [](string* out, Item* item) { - tensorflow::strings::StrAppend( - out, item->instruction->name()); - }) + << absl::StrJoin(before_instructions, ", ", + [](string* out, Item* item) { + absl::StrAppend(out, item->instruction->name()); + }) << "}"; // Find the minimal position number of any instruction in @@ -378,10 +392,9 @@ class MemoryUsageTracker { int64 unfinished_user_count; string ToString() const { - return tensorflow::strings::StrCat( - "Buffer ", id, " (defined by ", - defining_instruction->instruction->name(), ", size ", size, - " bytes)"); + return absl::StrCat("Buffer ", id, " (defined by ", + defining_instruction->instruction->name(), ", size ", + size, " bytes)"); } }; @@ -725,29 +738,27 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item, } string MemoryUsageTracker::ToString() const { - string output = tensorflow::strings::StrCat("MemoryUsageTracker for ", - computation_->name(), "\n"); - 
tensorflow::strings::StrAppend( - &output, "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (", - memory_usage(), " bytes)"); + string output = + absl::StrCat("MemoryUsageTracker for ", computation_->name(), "\n"); + absl::StrAppend(&output, + "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (", + memory_usage(), " bytes)"); for (auto* item = instruction_list_.first(); item != nullptr; item = instruction_list_.next(item)) { const HloInstruction* instruction = item->instruction; string inprogress = item == in_progress_item_ ? " in-progress" : ""; string placed = item->placed ? " placed" : ""; - tensorflow::strings::StrAppend(&output, " ", instruction->name(), - inprogress, placed, "\n Defines:\n"); + absl::StrAppend(&output, " ", instruction->name(), inprogress, placed, + "\n Defines:\n"); for (BufferId buffer_id : item->buffers_defined) { const Buffer& buffer = buffers_[buffer_id]; string live = IsCurrentlyLive(buffer_id) ? " live" : ""; - tensorflow::strings::StrAppend(&output, " ", buffer.ToString(), live, - ", ", buffer.unfinished_user_count, - " unfinished uses\n"); + absl::StrAppend(&output, " ", buffer.ToString(), live, ", ", + buffer.unfinished_user_count, " unfinished uses\n"); } - tensorflow::strings::StrAppend(&output, " Uses:\n"); + absl::StrAppend(&output, " Uses:\n"); for (BufferId buffer_id : item->buffers_used) { - tensorflow::strings::StrAppend(&output, " ", - buffers_[buffer_id].ToString(), "\n"); + absl::StrAppend(&output, " ", buffers_[buffer_id].ToString(), "\n"); } } return output; @@ -765,10 +776,9 @@ bool MemoryUsageTracker::Check() const { CHECK(elements_are_unique(defined_buffers)) << "Instruction " << instruction->name() << " does not have unique defined buffers: " - << tensorflow::str_util::Join( + << absl::StrJoin( defined_buffers, ", ", [this](string* out, BufferId buffer_id) { - tensorflow::strings::StrAppend( - out, buffers_.at(buffer_id).ToString()); + absl::StrAppend(out, buffers_.at(buffer_id).ToString()); }); 
for (const Buffer& buffer : buffers_) { @@ -788,10 +798,9 @@ bool MemoryUsageTracker::Check() const { CHECK(elements_are_unique(used_buffers)) << "Instruction " << instruction->name() << " does not have unique used buffers: " - << tensorflow::str_util::Join( + << absl::StrJoin( used_buffers, ", ", [this](string* out, BufferId buffer_id) { - tensorflow::strings::StrAppend( - out, buffers_.at(buffer_id).ToString()); + absl::StrAppend(out, buffers_.at(buffer_id).ToString()); }); } for (const Buffer& buffer : buffers_) { @@ -843,9 +852,10 @@ int64 RematerializationCost(const HloInstruction* instruction, // candidate which reduce memory use at the program point of the current // instruction as indicated by memory_tracker. nullptr is returned if no // candidate can be found. -Item* PickRematerializationCandidate(const MemoryUsageTracker& memory_tracker, - const InstructionList& instruction_list, - int64 memory_limit_bytes) { +Item* PickRematerializationCandidate( + const MemoryUsageTracker& memory_tracker, + const InstructionList& instruction_list, int64 memory_limit_bytes, + tensorflow::gtl::FlatMap* remat_able) { Item* best_item = nullptr; int64 best_cost = 0; @@ -869,8 +879,7 @@ Item* PickRematerializationCandidate(const MemoryUsageTracker& memory_tracker, << " is excluded from rematerialization"; continue; } - - if (!IsRematerializable(candidate)) { + if (!CanBeRematerialized(candidate, remat_able)) { VLOG(5) << "candidate " << candidate->name() << " not viable: is not rematerializable"; continue; @@ -974,6 +983,9 @@ StatusOr HloRematerialization::RematerializeComputation( // blacklist. tensorflow::gtl::FlatSet remat_move_instructions; + // The map from instructions to their rematerializable status. + tensorflow::gtl::FlatMap remat_able; + // The peak memory of the computation at any point in the instruction // sequence. 
int64 peak_memory = memory_tracker.memory_usage(); @@ -1011,7 +1023,7 @@ StatusOr HloRematerialization::RematerializeComputation( << ", limit is " << HumanReadableNumBytes(memory_limit_bytes); Item* best_item = PickRematerializationCandidate( - memory_tracker, instruction_list, memory_limit_bytes); + memory_tracker, instruction_list, memory_limit_bytes, &remat_able); if (best_item == nullptr) { VLOG(3) << "Unable to find rematerialization candidate at program " @@ -1184,12 +1196,56 @@ StatusOr HloRematerialization::RematerializeComputation( StatusOr HloRematerialization::Run( HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit_bytes, RematerializationSizes* sizes) { + int64 memory_limit_bytes, RematerializationSizes* sizes, + CopyInsertion* copy_insertion) { // The sequence is constructed entirely by this method. TF_RET_CHECK(sequence->empty()); VLOG(1) << "HloRematerialization() with memory limit of " << HumanReadableNumBytes(memory_limit_bytes); + XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString()); + + // Create initial sequence of HLO instructions. + TF_ASSIGN_OR_RETURN(*sequence, ScheduleComputationsInModule( + *module, + [this](const BufferValue& buffer) { + return size_function_(buffer.shape()); + }, + scheduler_algorithm_)); + if (copy_insertion) { + // We run a separate pass of copy elision here because the sequential + // ordering from the HLO schedule allows for more copies to be eliminated. + // TODO(b/80249101): Instead of a separate copy elision pass, use the + // ordering from the HLO schedule directly for copy insertion. + + // First create a copy of the schedule which contains HloInstruction unique + // ids instead of HloInstruction*. This is necessary for updating the + // schedule below. + // TODO(b/113175018): Remove this when the HLO schedule is self-contained + // and can update itself. 
+ tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(*sequence); + + SequentialHloOrdering ordering(module, *sequence); + TF_RETURN_IF_ERROR( + copy_insertion->RemoveUnnecessaryCopies(ordering, module)); + + // RemoveUnnecessaryCopies only considers interference when determining + // whether it is legal to remove a copy. However, copies in the graph may be + // necessary for other reason such as preventing a constant from being live + // out of the graph. So run AddSpecialCaseCopies to re-insert these copies. + // TODO(b/80249101): Break copy insertion into several passes and run each + // one once in the regular HLO pipeline. + TF_RETURN_IF_ERROR(copy_insertion->AddSpecialCaseCopies(module)); + + // The passes above can add and remove copies, update the schedule to + // account for these transformations. Newly added instructions will be + // placed ASAP in the schedule. + TF_RETURN_IF_ERROR(UpdateSchedule(*module, id_sequence, sequence)); + + TF_DCHECK_OK(copy_insertion->VerifyNoLiveRangeInterference( + SequentialHloOrdering(module, *sequence), module)); + } TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module)); @@ -1211,14 +1267,6 @@ StatusOr HloRematerialization::Run( << HumanReadableNumBytes(module_output_size) << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes); - XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString()); - // Create initial sequence of HLO instructions. - TF_ASSIGN_OR_RETURN(*sequence, CreateMemoryMinimizingSequence( - *module, - [this](const BufferValue& buffer) { - return size_function_(buffer.shape()); - }, - scheduler_algorithm_)); // Compute peak memory usage of all computations in the module called in a // sequential context. 
call_graph_ = CallGraph::Build(module); @@ -1305,12 +1353,11 @@ StatusOr HloRematerialization::Run( XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString()); if (current_peak_memory > memory_limit_bytes) { - LOG(WARNING) << tensorflow::strings::Printf( - "Can't reduce memory use below %s (%lld bytes) by rematerialization; " - "only reduced to %s (%lld bytes)", - HumanReadableNumBytes(memory_limit_bytes).c_str(), memory_limit_bytes, - HumanReadableNumBytes(current_peak_memory).c_str(), - current_peak_memory); + LOG(WARNING) << absl::StrFormat( + "Can't reduce memory use below %s (%d bytes) by rematerialization; " + "only reduced to %s (%d bytes)", + HumanReadableNumBytes(memory_limit_bytes), memory_limit_bytes, + HumanReadableNumBytes(current_peak_memory), current_peak_memory); } return changed; @@ -1321,9 +1368,10 @@ StatusOr HloRematerialization::Run( int64 memory_limit_bytes, HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm, SequentialHloOrdering::HloModuleSequence* sequence, - RematerializationSizes* sizes) { + RematerializationSizes* sizes, CopyInsertion* copy_insertion) { HloRematerialization remat(scheduler_algorithm, size_function); - return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes); + return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes, + copy_insertion); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 2ee2dd0571ae8c6604e4ca722351fd48a913bda5..2ec004350ad88ff31ece90ec419d90a55b965166 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -17,6 +17,7 @@ #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include 
"tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -57,6 +58,13 @@ class HloRematerialization { // sizes: Optional outparam that indicates the peak memory usage of the HLO // module before/after rematerialization. // + // copy_insertion: If non-null, run copy elision after scheduling. This + // pass is used to eliminate copies that were inserted by copy insertion + // before HLO scheduling. + // + // TODO(b/80249101): Remove the 'run_copy_elision' parameter when copy + // insertion is integrated with HLO scheduling. + // // Returns whether any instructions were rematerialized. If memory use is // already below the given limit then no instructions are rematerialized and // false is returned. @@ -68,7 +76,7 @@ class HloRematerialization { const ShapeSizeFunction& size_function, int64 memory_limit_bytes, HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm, SequentialHloOrdering::HloModuleSequence* sequence, - RematerializationSizes* sizes = nullptr); + RematerializationSizes* sizes, CopyInsertion* copy_insertion = nullptr); protected: HloRematerialization(MemorySchedulerAlgorithm scheduler_algorithm, @@ -83,7 +91,8 @@ class HloRematerialization { // contains the memory-minimizing order in which to emit the HLO instructions. StatusOr Run(HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit, RematerializationSizes* sizes); + int64 memory_limit, RematerializationSizes* sizes, + CopyInsertion* copy_insertion); // Rematerializes instructions within the given computation. 
'order' is the // order in which the computation's instructions will be emitted in the diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc index 83de54f3fa56ee660b79d8c366dbc0b52f9fde87..ac8c97d380953764b66135ad1c5fcee0d481c004 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { @@ -40,7 +41,8 @@ class HloRematerializationTest : public HloTestBase { // Creates and returns a computation which can benefit from // rematerialization. The computation looks like: // - // F32[] %param = {...} + // F32[1] %param = {...} + // F32[] %reshape = reshape(F32[], param) // F32[1024] %bcast = broadcast(%param) // F32[1024] %negate = negate(%bcast) // F32[2048] %concat_1 = concat({%negate, %negate}) @@ -57,9 +59,11 @@ class HloRematerializationTest : public HloTestBase { const string& suffix = "") { auto builder = HloComputation::Builder(TestName() + suffix); auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); + HloInstruction::CreateParameter(0, vec1_shape_, "param")); + auto reshape = builder.AddInstruction( + HloInstruction::CreateReshape(scalar_shape_, param)); auto bcast = builder.AddInstruction( - HloInstruction::CreateBroadcast(vec1024_shape_, param, {})); + HloInstruction::CreateBroadcast(vec1024_shape_, reshape, {})); auto negate = builder.AddInstruction( HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, bcast)); auto concat_1 = builder.AddInstruction(HloInstruction::CreateConcatenate( @@ -100,9 +104,11 @@ class HloRematerializationTest : public HloTestBase { 
const string& suffix = "") { auto builder = HloComputation::Builder(TestName() + suffix); auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); + HloInstruction::CreateParameter(0, vec1_shape_, "param")); + auto reshape = builder.AddInstruction( + HloInstruction::CreateReshape(scalar_shape_, param)); auto bcast = builder.AddInstruction( - HloInstruction::CreateBroadcast(vec1024_shape_, param, {})); + HloInstruction::CreateBroadcast(vec1024_shape_, reshape, {})); auto slice_1 = builder.AddInstruction( HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0}, /*limit_indices=*/{1}, @@ -126,7 +132,7 @@ class HloRematerializationTest : public HloTestBase { builder.AddInstruction( HloInstruction::CreateParameter(0, vec1_shape_, "param")); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); return builder.Build(); } @@ -135,6 +141,15 @@ class HloRematerializationTest : public HloTestBase { return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); } + StatusOr RunHloRematerialization( + int64 memory_limit_bytes, HloModule* module, + SequentialHloOrdering::HloModuleSequence* sequence) { + TF_EXPECT_OK(verifier().Run(module).status()); + return HloRematerialization::RematerializeAndSchedule( + ByteSizeOf, memory_limit_bytes, module, DefaultMemoryScheduler, + sequence, /*sizes=*/nullptr); + } + // Various shapes used in the canned computations. const Shape scalar_shape_ = ShapeUtil::MakeShape(xla::F32, {}); const Shape vec1_shape_ = ShapeUtil::MakeShape(xla::F32, {1}); @@ -158,11 +173,9 @@ TEST_F(HloRematerializationTest, SingleComputation) { SequentialHloOrdering::HloModuleSequence sequence; // Computation requires 16KB without rematerialization, but uses only 12KB // with rematerialization so pick a memory limit between these values (14KB). 
- TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/14 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/14 * 1024, + module.get(), &sequence)); EXPECT_TRUE(changed); // Root should not have changed. @@ -188,18 +201,16 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) { HloComputation* computation = module->AddEntryComputation(MakeRematerializableComputation()); - EXPECT_EQ(computation->instruction_count(), 7); + EXPECT_EQ(computation->instruction_count(), 8); SequentialHloOrdering::HloModuleSequence sequence; - TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/20 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/20 * 1024, + module.get(), &sequence)); // No instructions should have been materialized. 
EXPECT_FALSE(changed); - EXPECT_EQ(computation->instruction_count(), 7); + EXPECT_EQ(computation->instruction_count(), 8); } // Test rematerialization of a computation which calls another computation via a @@ -215,7 +226,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, vec1_shape_, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); HloComputation* while_cond = module->AddEmbeddedComputation(cond_builder.Build()); @@ -225,23 +236,21 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) { module->AddEntryComputation(MakeRematerializableWhileComputation( while_cond, /*while_body=*/body_computation)); - EXPECT_EQ(entry_computation->instruction_count(), 6); - EXPECT_EQ(body_computation->instruction_count(), 7); + EXPECT_EQ(entry_computation->instruction_count(), 7); + EXPECT_EQ(body_computation->instruction_count(), 8); // The body computation uses 16KB and the entry computation uses 2KB at the // while so the peak memory use of the module is 18KB. Set the memory limit a // bit lower (17KB) to force rematerialization of the entry computation. SequentialHloOrdering::HloModuleSequence sequence; - TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/17 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/17 * 1024, + module.get(), &sequence)); EXPECT_TRUE(changed); // Only the entry computation should have a rematerialized instruction added. 
- EXPECT_EQ(entry_computation->instruction_count(), 7); - EXPECT_EQ(body_computation->instruction_count(), 7); + EXPECT_EQ(entry_computation->instruction_count(), 8); + EXPECT_EQ(body_computation->instruction_count(), 8); } // Test rematerialization of a computation which calls another computation via a @@ -254,7 +263,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, vec1_shape_, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); HloComputation* while_cond = module->AddEmbeddedComputation(cond_builder.Build()); @@ -264,20 +273,18 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) { module->AddEntryComputation(MakeRematerializableWhileComputation( while_cond, /*while_body=*/body_computation)); - EXPECT_EQ(entry_computation->instruction_count(), 6); - EXPECT_EQ(body_computation->instruction_count(), 7); + EXPECT_EQ(entry_computation->instruction_count(), 7); + EXPECT_EQ(body_computation->instruction_count(), 8); SequentialHloOrdering::HloModuleSequence sequence; - TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/15 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/15 * 1024, + module.get(), &sequence)); EXPECT_TRUE(changed); - // Both computations should have a rematerialized instruction added. - EXPECT_EQ(entry_computation->instruction_count(), 7); - EXPECT_EQ(body_computation->instruction_count(), 8); + // Both computations should have rematerialized instructions added. + EXPECT_EQ(entry_computation->instruction_count(), 9); + EXPECT_EQ(body_computation->instruction_count(), 9); } // Test rematerialization of a doubly nested computation. 
All computations @@ -289,7 +296,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) { cond_builder.AddInstruction( HloInstruction::CreateParameter(0, vec1_shape_, "param")); cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); HloComputation* while_cond = module->AddEmbeddedComputation(cond_builder.Build()); @@ -303,24 +310,22 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) { module->AddEntryComputation(MakeRematerializableWhileComputation( while_cond, /*while_body=*/middle_computation)); - EXPECT_EQ(entry_computation->instruction_count(), 6); - EXPECT_EQ(middle_computation->instruction_count(), 6); - EXPECT_EQ(inner_computation->instruction_count(), 7); + EXPECT_EQ(entry_computation->instruction_count(), 7); + EXPECT_EQ(middle_computation->instruction_count(), 7); + EXPECT_EQ(inner_computation->instruction_count(), 8); // If all computations are maximally rematerialized then peak memory usage is // ~12K so pick something slightly larger. SequentialHloOrdering::HloModuleSequence sequence; - TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/13 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/13 * 1024, + module.get(), &sequence)); EXPECT_TRUE(changed); - // All computations should have a rematerialized instruction added. - EXPECT_EQ(entry_computation->instruction_count(), 7); - EXPECT_EQ(middle_computation->instruction_count(), 7); - EXPECT_EQ(inner_computation->instruction_count(), 8); + // All computations should have rematerialized instructions added. 
+ EXPECT_EQ(entry_computation->instruction_count(), 9); + EXPECT_EQ(middle_computation->instruction_count(), 9); + EXPECT_EQ(inner_computation->instruction_count(), 9); } TEST_F(HloRematerializationTest, RngNotRematerialized) { @@ -382,10 +387,9 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) { // parameter and output) and 20KB (peak memory possible with // rematerialization). TF_ASSERT_OK_AND_ASSIGN( - bool changed, HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, + bool changed, RunHloRematerialization( /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), - module.get(), DefaultMemoryScheduler, &sequence)); + module.get(), &sequence)); EXPECT_TRUE(changed); // The rng should not have been rematerialized. EXPECT_EQ(count_rngs(entry_computation), 1); @@ -476,11 +480,9 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) { // Pick a memory limit some where between 24KB (initial peak memory including // parameter and output) and 20KB (peak memory possible with // rematerialization). - TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/22 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/22 * 1024, + module.get(), &sequence)); EXPECT_TRUE(changed); // The broadcast should have been rematerialized 3 times. @@ -573,11 +575,9 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) { // Pick a memory limit some where between 24KB (initial peak memory including // parameter and output) and 20KB (peak memory possible with // rematerialization). 
- TF_ASSERT_OK_AND_ASSIGN(bool changed, - HloRematerialization::RematerializeAndSchedule( - ByteSizeOf, - /*memory_limit_bytes=*/22 * 1024, module.get(), - DefaultMemoryScheduler, &sequence)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/22 * 1024, + module.get(), &sequence)); // Rematerialization should only occur if the rematerializable instruction has // no indirect uses. if (indirectly_used) { diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 31e13da0c07f31be121e8b13f790de0b63a39f6c..66ac1f66fd035074c69d070821a951fd0e357289 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -19,12 +19,12 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -32,11 +32,11 @@ limitations under the License. 
namespace xla { /*static*/ StatusOr> -HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string, +HloRunner::CreateModuleFromString(const absl::string_view hlo_string, const DebugOptions& debug_options) { HloModuleConfig config; config.set_debug_options(debug_options); - return tools::Parse(hlo_string, config); + return ParseHloString(hlo_string, config); } namespace { @@ -80,7 +80,7 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename, filename, &hlo_string)); HloModuleConfig config; config.set_debug_options(debug_options); - return tools::Parse(hlo_string, config); + return ParseHloString(hlo_string, config); } HloRunner::HloRunner(se::Platform* platform) { @@ -98,13 +98,15 @@ StatusOr HloRunner::TransferLiteralToDevice( backend().transfer_manager()->AllocateScopedShapedBuffer( literal.shape(), backend().memory_allocator(), backend().default_device_ordinal())); + TF_ASSIGN_OR_RETURN( + auto stream, backend().BorrowStream(backend().default_stream_executor())); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - backend().default_stream_executor(), literal, buffer)); + stream.get(), literal, buffer)); return std::move(buffer); } StatusOr> HloRunner::TransferLiteralsToDevice( - const tensorflow::gtl::ArraySlice literals) { + const absl::Span literals) { std::vector buffers; for (const Literal* literal : literals) { CHECK(literal != nullptr); @@ -116,7 +118,7 @@ StatusOr> HloRunner::TransferLiteralsToDevice( } StatusOr> HloRunner::TransferLiteralsToDevice( - const tensorflow::gtl::ArraySlice> literals) { + const absl::Span> literals) { std::vector literal_pointers; literal_pointers.reserve(literals.size()); for (const auto& literal : literals) { @@ -127,14 +129,16 @@ StatusOr> HloRunner::TransferLiteralsToDevice( StatusOr> HloRunner::TransferLiteralFromDevice( const ShapedBuffer& buffer) { - return backend().transfer_manager()->TransferLiteralFromDevice( - backend().default_stream_executor(), buffer); + 
TF_ASSIGN_OR_RETURN( + auto stream, backend().BorrowStream(backend().default_stream_executor())); + return backend().transfer_manager()->TransferLiteralFromDevice(stream.get(), + buffer); } StatusOr> HloRunner::Execute( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - bool run_hlo_passes, ExecutionProfile* profile) { + const absl::Span arguments, bool run_hlo_passes, + ExecutionProfile* profile) { TF_ASSIGN_OR_RETURN(std::vector argument_buffers, TransferLiteralsToDevice(arguments)); TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, @@ -148,7 +152,7 @@ StatusOr> HloRunner::Execute( StatusOr> HloRunner::Execute( std::unique_ptr module, - const tensorflow::gtl::ArraySlice> arguments, + const absl::Span> arguments, bool run_hlo_passes, ExecutionProfile* profile) { // Construct a vector of plain pointers for the arguments. std::vector argument_pointers; @@ -165,8 +169,8 @@ StatusOr> HloRunner::Execute( StatusOr HloRunner::ExecuteWithDeviceBuffers( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - bool run_hlo_passes, ExecutionProfile* profile) { + const absl::Span arguments, bool run_hlo_passes, + ExecutionProfile* profile) { // Get service run options. 
se::Stream stream(backend().default_stream_executor()); stream.Init(); @@ -176,14 +180,18 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( TF_ASSIGN_OR_RETURN(std::unique_ptr executable, CreateExecutable(std::move(module), run_hlo_passes)); - return executable->ExecuteOnStreamWrapper(&service_run_options, - /*profile=*/profile, arguments); + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer retval, + executable->ExecuteOnStreamWrapper(&service_run_options, + /*profile=*/profile, arguments)); + TF_RETURN_IF_ERROR(stream.BlockHostUntilDone()); + return std::move(retval); } StatusOr HloRunner::ExecuteWithDeviceBuffers( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - bool run_hlo_passes, ExecutionProfile* profile) { + const absl::Span arguments, bool run_hlo_passes, + ExecutionProfile* profile) { std::vector argument_pointers; argument_pointers.reserve(arguments.size()); for (const auto& argument : arguments) { @@ -218,14 +226,13 @@ StatusOr>> HloRunner::ExecuteReplicated( // no arguments. 
std::vector argument_buffer_ptrs( options.num_replicas * options.arguments.size() + 1); - std::vector> - argument_buffer_slices; + std::vector> argument_buffer_slices; int64 index = 0; for (int64 i = 0; i < options.num_replicas; ++i) { int64 device = device_assignment(i, 0); TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device)); - streams.push_back(MakeUnique(executor)); + streams.push_back(absl::make_unique(executor)); streams.back()->Init(); service_run_options.emplace_back(GetServiceRunOptionsForDevice( device, streams.back().get(), &device_assignment)); @@ -237,7 +244,7 @@ StatusOr>> HloRunner::ExecuteReplicated( backend().transfer_manager()->AllocateScopedShapedBuffer( argument->shape(), backend().memory_allocator(), device)); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - executor, *argument, argument_buffer)); + streams.back().get(), *argument, argument_buffer)); argument_buffers.push_back(std::move(argument_buffer)); argument_buffer_ptrs[index++] = &argument_buffers.back(); } @@ -252,7 +259,7 @@ StatusOr>> HloRunner::ExecuteReplicated( num_threads += options.num_replicas; } if (num_threads > 0) { - pool = MakeUnique( + pool = absl::make_unique( tensorflow::Env::Default(), "infeed_outfeed", /*num_threads=*/num_threads); } @@ -283,7 +290,7 @@ StatusOr>> HloRunner::ExecuteReplicated( VLOG(1) << "Starting outfeed on device " << device; for (int64 step = 1; options.infeed_steps < 0 || step <= options.infeed_steps; ++step) { - auto literal = MakeUnique(); + auto literal = absl::make_unique(); TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed( executor, options.outfeed_shape, literal.get())); if (options.outfeed_values != nullptr) { @@ -305,9 +312,10 @@ StatusOr>> HloRunner::ExecuteReplicated( std::vector> exec_results; for (int64 i = 0; i < options.num_replicas; ++i) { + TF_RETURN_IF_ERROR(streams[i]->BlockHostUntilDone()); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, 
backend().transfer_manager()->TransferLiteralFromDevice( - streams[i]->parent(), results[i])); + streams[i].get(), results[i])); exec_results.push_back(std::move(literal)); } return std::move(exec_results); diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 65537f07f56e74b7fe2c2f9792af21efc7229573..76d8b92bed484381a59d7f54e0a75bb7e75649ee 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_placer.h" @@ -33,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -87,8 +87,7 @@ class HloRunner { // Converts an HloModule from the given hlo textual IR string (in // HloModule::ToString format). static StatusOr> CreateModuleFromString( - const tensorflow::StringPiece hlo_string, - const DebugOptions& debug_options); + const absl::string_view hlo_string, const DebugOptions& debug_options); // Reads the proto file in xla.HloProto format, creates and returns the // HloModule. @@ -105,9 +104,9 @@ class HloRunner { // Transfers data between the host and device. StatusOr TransferLiteralToDevice(const Literal& literal); StatusOr> TransferLiteralsToDevice( - const tensorflow::gtl::ArraySlice literals); + const absl::Span literals); StatusOr> TransferLiteralsToDevice( - const tensorflow::gtl::ArraySlice> literals); + const absl::Span> literals); StatusOr> TransferLiteralFromDevice( const ShapedBuffer& buffer); @@ -118,24 +117,24 @@ class HloRunner { // optimization. 
StatusOr> Execute( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); StatusOr> Execute( std::unique_ptr module, - const tensorflow::gtl::ArraySlice> arguments, + const absl::Span> arguments, bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); // As Execute(), but accepts and returns device buffers instead of host // buffers. StatusOr ExecuteWithDeviceBuffers( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); StatusOr ExecuteWithDeviceBuffers( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, + const absl::Span arguments, bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); // Executes a given HLO module into a set of replicas, and returns a map diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc index 68b2cde83a2eb479d9ba71fc6eab9ac9ab1c8267..0fc3b268c059802a3882ad5032a9fe5da28cbf23 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include +#include #include #include @@ -28,39 +29,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/logging.h" -using ::tensorflow::strings::HumanReadableNumBytes; - namespace xla { - -StatusOr MinimumMemoryForSequence( - const SequentialHloOrdering::HloModuleSequence& module_sequence, - const LogicalBuffer::SizeFunction& size_function) { - if (module_sequence.empty()) { - return 0; - } - - const HloModule* module = module_sequence.begin()->first->parent(); - TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, - TuplePointsToAnalysis::Run(module)); - - // The absolute minimum memory required for a given sequence of instructions - // is determined by the sequence of Alloc and Free calls on a simulated heap, - // ignoring fragmentation. We run the heap simulation on the whole module, - // rather than summing each computation, since it gives us a better lower - // bound, by minimizing the liveness of sub-computations. - TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique(), *module, - module_sequence, *points_to_analysis, size_function)); - return result.heap_size; -} - namespace { +using ::tensorflow::strings::HumanReadableNumBytes; + // Class implementing a list scheduler of HLO instructions which produces a // sequence which minimizes memory usage by preferring to schedule the node that // frees bigger buffer and defines smaller outputs. 
@@ -398,7 +374,7 @@ int64 SumLogicalBufferSizes( return size; } -StatusOr> CreateMemoryMinimizingSequence( +StatusOr> ScheduleComputationHelper( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, @@ -416,30 +392,15 @@ StatusOr> CreateMemoryMinimizingSequence( } // namespace -StatusOr MinimumMemoryForComputation( - const HloComputation& computation, - const std::vector& sequence, - const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { - TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique(), computation, - sequence, points_to_analysis, size_function)); - return result.heap_size; -} - StatusOr> DFSMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const tensorflow::gtl::FlatMap& memory_by_computation) { - // This ordering is based on DFS post-order, with a heuristic to decide which - // operand to visit first. The heuristic is based on 'extra_users', which is - // simply users-1 for each instruction. By subtracting 1, we're saying that - // instructions with no users or a single user don't count; instructions with - // lots of fan-out will be visited earlier. + // These variables are a hack to prevent overflows. int64 cumulative_total_size = 0; + int64 total_hlos = computation.parent()->NumUniqueInstructionIds(); tensorflow::gtl::FlatMap extra_users; tensorflow::gtl::FlatMap total_sizes; for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) { @@ -448,6 +409,11 @@ StatusOr> DFSMemoryScheduler( total_sizes[hlo] = 0; continue; } + // This ordering is based on DFS post-order, with a heuristic to decide + // which operand to visit first. The heuristic is based on 'extra_users', + // which is simply users-1 for each instruction. 
By subtracting 1, we're + // saying that instructions with no users or a single user don't count; + // instructions with lots of fan-out will be visited earlier. extra_users[hlo] = hlo->users().empty() ? 0 : hlo->users().size() - 1; int64 logical_buffer_size = SumLogicalBufferSizes( points_to_analysis.GetBuffersDefinedByInstruction(hlo), size_function); @@ -463,10 +429,13 @@ StatusOr> DFSMemoryScheduler( // lead to it. But computation is a DAG, so we are double-counting nodes, // which can lead to overflows for large programs. // cumulative_total_size caps the size to prevent overflows. + // Same for total_hlos: it prevents overflows on very large and branchy + // models, where the number of paths is exponential to the number of nodes. // NOTE(dimvar): this is quite ugly and should be changed. It's unclear // why we care about transitive sizes; when scheduling a node, its input // and output buffers should be all that matters, not its "history". total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size); + extra_users[hlo] = std::min(extra_users[hlo], total_hlos); } CHECK_EQ(extra_users.size(), computation.instruction_count()); CHECK_EQ(total_sizes.size(), computation.instruction_count()); @@ -533,29 +502,29 @@ StatusOr> DefaultMemoryScheduler( std::vector list_sequence, ListMemoryScheduler(computation, points_to_analysis, size_function, memory_by_computation)); - TF_ASSIGN_OR_RETURN( - const int64 list_memory, - MinimumMemoryForComputation(computation, list_sequence, - points_to_analysis, size_function)); + TF_ASSIGN_OR_RETURN(const int64 list_memory, + HeapSimulator::MinimumMemoryForComputation( + computation, list_sequence, points_to_analysis, + size_function, &memory_by_computation)); VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory); TF_ASSIGN_OR_RETURN(std::vector dfs_sequence, DFSMemoryScheduler(computation, points_to_analysis, size_function, memory_by_computation)); - TF_ASSIGN_OR_RETURN( - const int64 dfs_memory, - 
MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis, - size_function)); + TF_ASSIGN_OR_RETURN(const int64 dfs_memory, + HeapSimulator::MinimumMemoryForComputation( + computation, dfs_sequence, points_to_analysis, + size_function, &memory_by_computation)); VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory); TF_ASSIGN_OR_RETURN( std::vector post_order_sequence, PostOrderMemoryScheduler(computation, points_to_analysis, size_function, memory_by_computation)); - TF_ASSIGN_OR_RETURN( - const int64 post_order_memory, - MinimumMemoryForComputation(computation, post_order_sequence, - points_to_analysis, size_function)); + TF_ASSIGN_OR_RETURN(const int64 post_order_memory, + HeapSimulator::MinimumMemoryForComputation( + computation, post_order_sequence, points_to_analysis, + size_function, &memory_by_computation)); VLOG(2) << "Min-memory post order sequence: " << HumanReadableNumBytes(post_order_memory); @@ -576,10 +545,9 @@ StatusOr> DefaultMemoryScheduler( } } -StatusOr -CreateMemoryMinimizingSequence(const HloModule& module, - const LogicalBuffer::SizeFunction& size_function, - const MemorySchedulerAlgorithm& algorithm) { +StatusOr ScheduleComputationsInModule( + const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + const MemorySchedulerAlgorithm& algorithm) { SequentialHloOrdering::HloModuleSequence sequence; TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(&module)); @@ -587,28 +555,213 @@ CreateMemoryMinimizingSequence(const HloModule& module, for (const auto* computation : module.MakeComputationPostOrder()) { if (!computation->IsFusionComputation()) { TF_ASSIGN_OR_RETURN(auto one_computation_sequence, - CreateMemoryMinimizingSequence( + ScheduleComputationHelper( *computation, *points_to_analysis, size_function, algorithm, memory_by_computation)); memory_by_computation[computation] = - MinimumMemoryForComputation(*computation, one_computation_sequence, - 
*points_to_analysis, size_function) + HeapSimulator::MinimumMemoryForComputation( + *computation, one_computation_sequence, *points_to_analysis, + size_function, &memory_by_computation) .ValueOrDie(); sequence[computation] = std::move(one_computation_sequence); } } + VLOG(1) << "Module schedule:\n" << sequence; return sequence; } -StatusOr> CreateMemoryMinimizingSequence( +StatusOr> ScheduleOneComputation( const HloComputation& computation, const LogicalBuffer::SizeFunction& size_function) { CHECK(!computation.IsFusionComputation()); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(computation.parent())); tensorflow::gtl::FlatMap empty_map; - return CreateMemoryMinimizingSequence(computation, *points_to_analysis, - size_function, nullptr, empty_map); + return ScheduleComputationHelper(computation, *points_to_analysis, + size_function, nullptr, empty_map); +} + +tensorflow::gtl::FlatMap> +ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence) { + tensorflow::gtl::FlatMap> id_sequence; + for (const auto& computation_sequence : sequence) { + for (const HloInstruction* instruction : computation_sequence.second) { + id_sequence[computation_sequence.first].push_back( + instruction->unique_id()); + } + } + return id_sequence; +} + +Status UpdateSchedule( + const HloModule& module, + const tensorflow::gtl::FlatMap>& + id_sequence, + SequentialHloOrdering::HloModuleSequence* sequence) { + // Map from unique ID to HloInstruction pointer for instructions in the + // module. + tensorflow::gtl::FlatMap id_to_instruction; + // Set of all HloInstructions in the schedule. 
+ tensorflow::gtl::FlatSet ids_in_schedule; + std::vector nonfusion_computations = + module.MakeNonfusionComputations(); + for (const HloComputation* computation : nonfusion_computations) { + for (const HloInstruction* instruction : computation->instructions()) { + TF_RET_CHECK( + id_to_instruction.insert({instruction->unique_id(), instruction}) + .second); + } + for (int id : id_sequence.at(computation)) { + ids_in_schedule.insert(id); + } + } + + // Map from HloInstruction X to newly added instructions (instruction is in + // module, but not in schedule) which use X. If an instruction is not in the + // map, then it has no users which are newly added instructions. + tensorflow::gtl::FlatMap> + new_instruction_uses; + + // For each newly added instruction, this is the count of the instruction's + // operands that have not yet been scheduled. When this value reaches zero, + // then the instruction may be placed in the schedule. + tensorflow::gtl::FlatMap + unscheduled_operand_count; + // For each computation, this is the set of newly added instructions which + // have no operands. These must be handled specially and are added to the + // beginning of the schedule. + tensorflow::gtl::FlatMap> + new_zero_operand_instructions; + for (const HloComputation* computation : nonfusion_computations) { + new_zero_operand_instructions[computation] = {}; + for (const HloInstruction* instruction : computation->instructions()) { + if (ids_in_schedule.count(instruction->unique_id()) == 0) { + // This is a newly added instruction which is not in the schedule. 
+ for (const HloInstruction* operand : instruction->operands()) { + new_instruction_uses[operand].push_back(instruction); + } + if (instruction->operands().empty()) { + new_zero_operand_instructions[computation].push_back(instruction); + } + unscheduled_operand_count[instruction] = instruction->operand_count(); + } + } + } + + // Update the schedule with the newly added instructions, and remove any + // instructions no longer in the graph. + for (const HloComputation* computation : nonfusion_computations) { + std::vector old_computation_sequence = + std::move(sequence->at(computation)); + sequence->at(computation).clear(); + + // Create a worklist of newly added instructions which are ready to be added + // to the schedule. Initialize worklist with those that have zero operands. + std::queue worklist; + for (const HloInstruction* instruction : + new_zero_operand_instructions.at(computation)) { + worklist.push(instruction); + } + + // Lambda which schedules all instructions on the worklist. + auto schedule_worklist = [&]() { + while (!worklist.empty()) { + const HloInstruction* instruction = worklist.front(); + worklist.pop(); + sequence->at(computation).push_back(instruction); + std::vector* new_users = + tensorflow::gtl::FindOrNull(new_instruction_uses, instruction); + if (new_users != nullptr) { + // This just-scheduled instruction has users which are newly added to + // the module. Update the number of unscheduled operands and push the + // newly added instruction to the worklist if it is ready to + // schedule. 
+ for (const HloInstruction* new_user : *new_users) { + unscheduled_operand_count.at(new_user)--; + CHECK_GE(unscheduled_operand_count.at(new_user), 0); + if (unscheduled_operand_count.at(new_user) == 0) { + worklist.push(new_user); + } + } + } + } + }; + + schedule_worklist(); + for (int id : id_sequence.at(computation)) { + auto it = id_to_instruction.find(id); + if (it == id_to_instruction.end()) { + // This instruction in the schedule is no longer in the module. + continue; + } + const HloInstruction* instruction = it->second; + worklist.push(instruction); + schedule_worklist(); + } + } + + TF_RETURN_IF_ERROR(VerifySchedule(module, *sequence)); + return Status::OK(); +} + +Status VerifySchedule( + const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& sequence) { + VLOG(2) << "VerifySchedule()"; + XLA_VLOG_LINES(2, module.ToString()); + VLOG(2) << sequence; + + // Verify the set of computations in the sequence is exactly the set of + // computations in the module. + std::vector nonfusion_computations = + module.MakeNonfusionComputations(); + TF_RET_CHECK(nonfusion_computations.size() == sequence.size()); + tensorflow::gtl::FlatSet computations_in_module( + module.computations().begin(), module.computations().end()); + for (const auto& computation_sequence : sequence) { + TF_RET_CHECK(computations_in_module.count(computation_sequence.first) == 1); + } + + // For each computation verify the set of instructions is the same and that + // each dependency and control edge is honored. 
+ for (const HloComputation* computation : nonfusion_computations) { + tensorflow::gtl::FlatMap instruction_position; + int pos = 0; + for (const HloInstruction* instruction : sequence.at(computation)) { + TF_RET_CHECK(instruction_position.insert({instruction, pos}).second) + << "Instruction " << instruction->name() + << " appears more than once in the schedule"; + pos++; + } + + TF_RET_CHECK(instruction_position.size() == + computation->instruction_count()); + for (const HloInstruction* instruction : computation->instructions()) { + TF_RET_CHECK(instruction_position.count(instruction) == 1) + << "Instruction " << instruction->name() << " is not in schedule"; + } + + for (const HloInstruction* instruction : computation->instructions()) { + for (const HloInstruction* operand : instruction->operands()) { + TF_RET_CHECK(instruction_position.at(operand) < + instruction_position.at(instruction)) + << "Instruction " << instruction->name() + << " is not scheduled after its operand " << operand->name(); + } + + for (const HloInstruction* pred : instruction->control_predecessors()) { + TF_RET_CHECK(instruction_position.at(pred) < + instruction_position.at(instruction)) + << "Instruction " << instruction->name() + << " is not scheduled after its control predecessor " + << pred->name(); + } + } + } + + return Status::OK(); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h index 49b927eefd24f4e26df781dd8d2b977bedba2b80..d06b8d9a5cdef82380bd68ae0991a3957db80f48 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.h +++ b/tensorflow/compiler/xla/service/hlo_scheduling.h @@ -28,20 +28,6 @@ limitations under the License. namespace xla { -// Returns the minimum memory required to compute the given module sequence, -// assuming no fragmentation. 
-StatusOr MinimumMemoryForSequence( - const SequentialHloOrdering::HloModuleSequence& module_sequence, - const LogicalBuffer::SizeFunction& size_function); - -// Returns the minimum memory required to compute the given computation, -// assuming no fragmentation. -StatusOr MinimumMemoryForComputation( - const HloComputation& computation, - const std::vector& sequence, - const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function); - // A memory scheduler computes an execution sequence for the HLO instructions in // 'computation' that minimizes peak memory, given a points-to analysis result // that describes buffer aliasing, together with a target-specific size function @@ -89,17 +75,53 @@ StatusOr> DefaultMemoryScheduler( // Returns an HloModuleSequence which seeks to minimize the memory required for // the computation. size_function is the function returning the number of bytes // required for a LogicalBuffer. -StatusOr -CreateMemoryMinimizingSequence(const HloModule& module, - const LogicalBuffer::SizeFunction& size_function, - const MemorySchedulerAlgorithm& algorithm = {}); +StatusOr ScheduleComputationsInModule( + const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + const MemorySchedulerAlgorithm& algorithm = {}); -// Overload of above that computes the sequence for a single computation. +// Computes the schedule for a single computation. // Currently only used by the GPU backend. -StatusOr> CreateMemoryMinimizingSequence( +StatusOr> ScheduleOneComputation( const HloComputation& computation, const LogicalBuffer::SizeFunction& size_function); +// Transforms the given schedule such that it is (again) a valid schedule for +// the module. This is used to update a schedule after the HLO module has been +// transformed in some way. In general, the only transformations to the module +// for which a schedule can be updated is the addition or removal of +// instructions to/from the module. 
Updating the schedule after new dependencies +// between existing instructions in the module is not supported and may result +// in an error status returned. +// +// Instructions in the module which also exist in the given schedule will remain +// in the same order in the updated schedule. Instructions which exist in the +// module but not in the given schedule will be placed as early as possible in +// the updated schedule. +// +// 'id_sequence' is a mirror of the given schedule 'sequence' but with +// HloInstruction ids rather than HloInstruction pointers. This should be +// constructed using ComputeIdSchedule below after the schedule is constructed +// but before the HLO module is transformed. +Status UpdateSchedule( + const HloModule& module, + const tensorflow::gtl::FlatMap>& + id_sequence, + SequentialHloOrdering::HloModuleSequence* sequence); + +// Constructs a copy of the given schedule but with HloInstruction unique ids +// rather than HloInstruction pointers. This is necessary for updating a +// schedule as HloInstruction points in the schedule may become invalid if +// instructions are removed from the module. Used by UpdateSchedule above.. +// TODO(b/113175018): Remove this function when HLO schedule is its own class. +tensorflow::gtl::FlatMap> +ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence); + +// Verifies that the given schedule is valid for the given module. Specifically, +// the schedule contains exactly the instructions in the module and every +// dependency in the module is satisfied in the schedule. 
+Status VerifySchedule(const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& sequence); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_ diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc index 0bc930f9ea450a6544ad261257c5bff24791ddb1..d49d09d459758840ce0f9f0b05e3c033da3337f8 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc @@ -18,78 +18,22 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { -class MinimumMemoryForSequenceTest : public HloTestBase {}; - -TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { - auto module = CreateNewModule(); - const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); - const Shape tuple_shape = - ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); - - auto cond_builder = HloComputation::Builder("WhileCond"); - // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) - HloInstruction* cond_param = cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); - HloInstruction* cond_iter = cond_builder.AddInstruction( - 
HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0)); - HloInstruction* cond_data = cond_builder.AddInstruction( - HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); - // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) - HloInstruction* cond_lt = cond_builder.AddInstruction( - HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), - HloOpcode::kLt, cond_iter, cond_data)); - HloComputation* cond_computation = - module->AddEmbeddedComputation(cond_builder.Build()); - - auto body_builder = HloComputation::Builder("WhileBody"); - // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) - HloInstruction* body_param = body_builder.AddInstruction( - HloInstruction::CreateParameter(0, tuple_shape, "body_param")); - HloComputation* body_computation = - module->AddEmbeddedComputation(body_builder.Build()); - - auto builder = HloComputation::Builder(TestName()); - // Entry params: 8 bytes (4 bytes per param), TOTAL=8 - HloInstruction* iter = builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape, "param_iter")); - HloInstruction* data = builder.AddInstruction( - HloInstruction::CreateParameter(1, scalar_shape, "param_data")); - // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24 - HloInstruction* tuple = - builder.AddInstruction(HloInstruction::CreateTuple({iter, data})); - // While: 8 bytes (4 bytes per element), TOTAL=32 - // Both cond and body use a max of 24 bytes, TOTAL=56 - HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( - tuple_shape, cond_computation, body_computation, tuple)); - HloComputation* entry_computation = - module->AddEntryComputation(builder.Build()); - - auto size_fn = [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); - }; - - SequentialHloOrdering::HloModuleSequence module_sequence; - module_sequence[cond_computation] = {cond_param, cond_iter, cond_data, - cond_lt}; - module_sequence[body_computation] 
= {body_param}; - module_sequence[entry_computation] = {iter, data, tuple, while_op}; - EXPECT_EQ(56, - MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie()); -} - class HloSchedulingTest : public HloTestBase {}; TEST_F(HloSchedulingTest, LastUseScheduledFirst) { @@ -124,7 +68,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { TF_ASSERT_OK_AND_ASSIGN( SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence(*module, [](const BufferValue& buffer) { + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); // Verify that all instructions are in the sequence. @@ -158,14 +102,14 @@ ENTRY root { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; TF_ASSERT_OK_AND_ASSIGN( SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence(*module, size_fn, ListMemoryScheduler)); + ScheduleComputationsInModule(*module, size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. 
EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.at(module->entry_computation()).size()); @@ -203,7 +147,7 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { // ROOT %subtract = f32[4]{0} subtract( // f32[4]{0} %body_param, f32[1,4]{1,0} %constant.1) // } - // %SubcomputationsNotAccounted () -> f32[2,4] { + // %ListAccountsForSubcomputations () -> f32[2,4] { // %constant.3 = f32[2,4]{1,0} constant( // f32[2,4] { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } }) // %transpose = f32[2,4]{1,0} transpose( @@ -226,8 +170,9 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { auto cond_builder = HloComputation::Builder("WhileCond"); HloInstruction* cond_param = cond_builder.AddInstruction( HloInstruction::CreateParameter(0, r1f32, "cond_param")); - HloInstruction* zero_vector = cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({{0, 0, 0, 0}}))); + HloInstruction* zero_vector = + cond_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{0, 0, 0, 0}}))); cond_builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector)); auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build()); @@ -237,16 +182,18 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { auto body_builder = HloComputation::Builder("WhileBody"); HloInstruction* body_param = body_builder.AddInstruction( HloInstruction::CreateParameter(0, r1f32, "body_param")); - HloInstruction* one_vector = body_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({{1, 1, 1, 1}}))); + HloInstruction* one_vector = + body_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 1, 1, 1}}))); body_builder.AddInstruction(HloInstruction::CreateBinary( r1f32, HloOpcode::kSubtract, body_param, one_vector)); auto body_computation = module->AddEmbeddedComputation(body_builder.Build()); // transpose(matrix) + 
bcast(while) auto builder = HloComputation::Builder(TestName()); - HloInstruction* while_init = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({{1, 1, 1, 1}}))); + HloInstruction* while_init = + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 1, 1, 1}}))); // Creates 16 bytes, ignoring subcomputations HloInstruction* while_loop = builder.AddInstruction(HloInstruction::CreateWhile( @@ -257,7 +204,7 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { HloInstruction::CreateBroadcast(r2f32, while_loop, {0})); HloInstruction* matrix = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2( + HloInstruction::CreateConstant(LiteralUtil::CreateR2( {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}}))); // Creates 32 bytes HloInstruction* transpose = builder.AddInstruction( @@ -269,16 +216,16 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { module->AddEntryComputation(builder.Build()); - TF_ASSERT_OK_AND_ASSIGN(SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence( - *module, - [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf(buffer.shape()); - }, - ListMemoryScheduler)); + auto size_fn = [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + }; + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. - EXPECT_EQ(module->entry_computation()->instruction_count(), - sequence.at(module->entry_computation()).size()); + auto entry_computation = module->entry_computation(); + EXPECT_EQ(entry_computation->instruction_count(), + sequence.at(entry_computation).size()); SequentialHloOrdering ordering(module.get(), sequence); // This schedule is an example of List's greedy heuristics being suboptimal. 
// The while_loop is more expensive than transpose, so it would have been @@ -287,6 +234,24 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { EXPECT_TRUE(ordering.ExecutesBefore(transpose, bcast)); EXPECT_TRUE(ordering.ExecutesBefore(bcast, add)); EXPECT_TRUE(ordering.ExecutesBefore(transpose, add)); + + tensorflow::gtl::FlatMap memory_by_computation; + memory_by_computation[cond_computation] = 17; + memory_by_computation[body_computation] = 16; + std::unique_ptr points_to_analysis = + TuplePointsToAnalysis::Run(module.get()).ValueOrDie(); + + // HeapSimulator doesn't account for subcomputations + EXPECT_EQ(80, HeapSimulator::MinimumMemoryForComputation( + *entry_computation, sequence.at(entry_computation), + *points_to_analysis, size_fn) + .ValueOrDie()); + // HeapSimulator accounts for subcomputations. The output buffer is aliased, + // so we don't double count. + EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation( + *entry_computation, sequence.at(entry_computation), + *points_to_analysis, size_fn, &memory_by_computation) + .ValueOrDie()); } TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { @@ -297,14 +262,14 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { // Wrap lit in abs because constants are considered free by // IgnoreInstruction, and it skews the accounting. 
auto lit = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1, 1, 1, 1, 1, 1}))); + LiteralUtil::CreateR1({1, 1, 1, 1, 1, 1}))); auto abs_const = builder.AddInstruction( HloInstruction::CreateUnary(r1f32, HloOpcode::kAbs, lit)); auto abs_abs1 = builder.AddInstruction( HloInstruction::CreateUnary(r1f32, HloOpcode::kAbs, abs_const)); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple( - tensorflow::gtl::ArraySlice({abs_abs1}))); + absl::Span({abs_abs1}))); auto tuple_elm = builder.AddInstruction( HloInstruction::CreateGetTupleElement(r1f32, tuple, 0)); @@ -318,12 +283,12 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { module->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN( SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence(*module, - [&TUPLE_SIZE](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf( - buffer.shape(), TUPLE_SIZE); - }, - ListMemoryScheduler)); + ScheduleComputationsInModule(*module, + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf( + buffer.shape(), TUPLE_SIZE); + }, + ListMemoryScheduler)); // Verify that all instructions are in the sequence. 
EXPECT_EQ(module->entry_computation()->instruction_count(), @@ -340,11 +305,11 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { HloComputation::Builder builder(TestName()); auto c1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1, 1, 1, 1, 1}))); + LiteralUtil::CreateR1({1, 1, 1, 1, 1}))); auto c2 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1, 2, 3, 4, 5}))); + LiteralUtil::CreateR1({1, 2, 3, 4, 5}))); auto c3 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({0, 2, 4, 6, 8}))); + LiteralUtil::CreateR1({0, 2, 4, 6, 8}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, c1, c2)); @@ -368,7 +333,7 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { {tuple, mul, add}, HloInstruction::FusionKind::kLoop); TF_ASSERT_OK_AND_ASSIGN(SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence( + ScheduleComputationsInModule( *module, [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), 2); @@ -384,5 +349,319 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { EXPECT_TRUE(ordering.ExecutesBefore(exp, fusion)); } +TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { + auto module = CreateNewModule(); + const Shape r1f32 = ShapeUtil::MakeShape(F32, {4}); + + // param != 0 + // Needs 17 bytes + auto cond_builder = HloComputation::Builder("WhileCond"); + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, r1f32, "cond_param")); + HloInstruction* zero_vector = + cond_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{0, 0, 0, 0}}))); + cond_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector)); + auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build()); + + // param - 1 + 
// Needs 16 bytes + auto body_builder = HloComputation::Builder("WhileBody"); + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, r1f32, "body_param")); + HloInstruction* one_vector = + body_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 1, 1, 1}}))); + body_builder.AddInstruction(HloInstruction::CreateBinary( + r1f32, HloOpcode::kSubtract, body_param, one_vector)); + auto body_computation = module->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + HloInstruction* while_init = + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1, 1, 1, 1}}))); + // Creates 16 bytes, ignoring subcomputations + builder.AddInstruction(HloInstruction::CreateWhile( + r1f32, cond_computation, body_computation, while_init)); + + module->AddEntryComputation(builder.Build()); + + auto size_fn = [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + }; + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, size_fn, ListMemoryScheduler)); + // Verify that all instructions are in the sequence. + auto entry_computation = module->entry_computation(); + EXPECT_EQ(entry_computation->instruction_count(), + sequence.at(entry_computation).size()); + + tensorflow::gtl::FlatMap memory_by_computation; + memory_by_computation[cond_computation] = 17; + memory_by_computation[body_computation] = 16; + std::unique_ptr points_to_analysis = + TuplePointsToAnalysis::Run(module.get()).ValueOrDie(); + + // HeapSimulator doesn't account for subcomputations + EXPECT_EQ(16, HeapSimulator::MinimumMemoryForComputation( + *entry_computation, sequence.at(entry_computation), + *points_to_analysis, size_fn) + .ValueOrDie()); + // HeapSimulator accounts for subcomputations. Cond is the largest one. + // The output buffer of the while is aliased. 
+ EXPECT_EQ(17, HeapSimulator::MinimumMemoryForComputation( + *entry_computation, sequence.at(entry_computation), + *points_to_analysis, size_fn, &memory_by_computation) + .ValueOrDie()); +} + +TEST_F(HloSchedulingTest, UpdateScheduleUnchangedModule) { + // Updating the schedule of an unchanged HLO module should not affect the + // schedule at all. + const string module_str = R"( +HloModule UpdateScheduleUnchanged + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + std::vector entry_schedule = sequence.begin()->second; + + EXPECT_EQ(entry_schedule.size(), 6); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(entry_schedule, sequence.begin()->second); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithNewInstructions) { + // Add some additional instructions to a module and verify the schedule can be + // updated. 
+ const string module_str = R"( +HloModule UpdateScheduleWithNewInstructions + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + HloComputation* entry = module->entry_computation(); + const Shape shape = entry->root_instruction()->shape(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + HloInstruction* sub = entry->AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, constant, entry->root_instruction())); + entry->set_root_instruction(sub); + + auto in_schedule = [&](const HloInstruction* hlo) { + return std::find(sequence.at(entry).begin(), sequence.at(entry).end(), + hlo) != sequence.at(entry).end(); + }; + + EXPECT_EQ(sequence.at(entry).size(), 6); + EXPECT_FALSE(in_schedule(constant)); + EXPECT_FALSE(in_schedule(sub)); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 8); + EXPECT_TRUE(in_schedule(constant)); + EXPECT_TRUE(in_schedule(sub)); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithAddedAndDeletedInstruction) { + // Add and delete some instructions from a module and verify that the schedule + // can be updated successfully. 
+ const string module_str = R"( +HloModule UpdateScheduleWithAddedAndDeletedInstruction + +ENTRY main { + a = f32[] parameter(0) + b = f32[] parameter(1) + c = f32[] constant(42.0) + sum = f32[] add(a, b) + neg = f32[] negate(c) + ROOT root = f32[] multiply(sum, neg) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + // Set the entry root to some expression containing just a parameter and a + // constant. + HloComputation* entry = module->entry_computation(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); + HloInstruction* new_root = entry->AddInstruction( + HloInstruction::CreateBinary(constant->shape(), HloOpcode::kSubtract, + constant, entry->parameter_instruction(0))); + entry->set_root_instruction(new_root); + + // DCE should remove everything but the parameters and the newly added code. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(entry).size(), 6); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 4); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithCompletelyReplacedModule) { + // Completely replace a module with an entirely new set of instructions and + // verify that the schedule can be updated successfully. 
+ const string module_str = R"( +HloModule UpdateScheduleWithCompletelyReplacedModule + +ENTRY main { + a = f32[] constant(42.0) + b = f32[] constant(123.0) + ROOT sum = f32[] add(a, b) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + // Replace the entry computation with the negation of a constant. + HloComputation* entry = module->entry_computation(); + HloInstruction* constant = entry->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + HloInstruction* new_root = entry->AddInstruction(HloInstruction::CreateUnary( + constant->shape(), HloOpcode::kNegate, constant)); + entry->set_root_instruction(new_root); + + // DCE the old instructions. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(entry).size(), 3); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(entry).size(), 2); +} + +TEST_F(HloSchedulingTest, UpdateScheduleWithMultipleComputations) { + // Create changes to more than one computation in an HLO module and verify + // that the schedule can be updated. 
+ const string module_str = R"( +HloModule UpdateScheduleWithMultipleComputations + +%Body (param.1: (s32[], token[])) -> (s32[], token[]) { + %param.1 = (s32[], token[]) parameter(0) + %get-tuple-element.1 = s32[] get-tuple-element((s32[], token[]) %param.1), index=0 + %constant.1 = s32[] constant(1) + %add = s32[] add(s32[] %get-tuple-element.1, s32[] %constant.1) + %get-tuple-element.2 = token[] get-tuple-element((s32[], token[]) %param.1), index=1 + %after-all = token[] after-all(token[] %get-tuple-element.2) + ROOT %tuple = (s32[], token[]) tuple(s32[] %add, token[] %after-all) +} + +%Cond (param: (s32[], token[])) -> pred[] { + %param = (s32[], token[]) parameter(0) + %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0 + %constant = s32[] constant(42) + ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant) +} + +ENTRY %WhileLoop () -> s32[] { + %zero = s32[] constant(0) + %init_token = token[] after-all() + %init_tuple = (s32[], token[]) tuple(s32[] %zero, token[] %init_token) + %while = (s32[], token[]) while((s32[], token[]) %init_tuple), condition=%Cond, body=%Body + ROOT %root = s32[] get-tuple-element((s32[], token[]) %while), index=0 +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + ScheduleComputationsInModule(*module, [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), + /*pointer_size=*/sizeof(void*)); + })); + tensorflow::gtl::FlatMap> + id_sequence = ComputeIdSchedule(sequence); + + const HloInstruction* xla_while = + module->entry_computation()->root_instruction()->operand(0); + HloComputation* body = xla_while->while_body(); + HloComputation* cond = xla_while->while_condition(); + + // Negate the root of the cond. 
+ cond->set_root_instruction(cond->AddInstruction( + HloInstruction::CreateUnary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kNot, cond->root_instruction()))); + + // Replace the body with a computation which just passes through its + // parameter. + body->set_root_instruction(body->parameter_instruction(0)); + + // DCE the dead code in the body. + HloDCE dce; + TF_ASSERT_OK(dce.Run(module.get()).status()); + + EXPECT_EQ(sequence.at(body).size(), 7); + EXPECT_EQ(sequence.at(cond).size(), 4); + + TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence)); + TF_ASSERT_OK(VerifySchedule(*module, sequence)); + + EXPECT_EQ(sequence.at(body).size(), 1); + EXPECT_EQ(sequence.at(cond).size(), 5); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 58224ef870096a774d5892b9aa12c38f5ff511bd..de7e6b53d4d2aa88e2213248370b4da82bdeadeb 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -15,13 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrCat; +using absl::StrCat; +using absl::StrJoin; HloSharding HloSharding::AssignDevice(int64 device_id) { return HloSharding(device_id); @@ -31,12 +32,54 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) { CHECK_EQ(1, ShapeUtil::Rank(input_shape)); CHECK_GT(num_tiles, 1); std::vector dimensions(1, num_tiles); - Shape tile_shape = input_shape; - auto& tile_dimension = (*tile_shape.mutable_dimensions())[0]; - tile_dimension = CeilOfRatio(static_cast(tile_dimension), num_tiles); Array assignment(dimensions); std::iota(assignment.begin(), assignment.end(), 0); - return HloSharding(tile_shape, assignment); + return HloSharding(assignment); +} + +HloSharding HloSharding::Tuple(const ShapeTree& sub_shardings) { + std::vector flattened_list; + flattened_list.reserve(sub_shardings.leaf_count()); + for (const auto& index_to_sharding : sub_shardings.leaves()) { + flattened_list.push_back(index_to_sharding.second); + } + if (flattened_list.empty()) { + // Empty tuple sharding ends up having no leaves, but we want to allow + // empty tuple HLO instruction results to have sharding, so we fetch the + // root ({}) sharding value from the ShapeTree. + // A ShapeTree created with ShapeTree(shape, init) will have + // init as value at its root. 
+ flattened_list.push_back(sub_shardings.element(ShapeIndex({}))); + } + return HloSharding(flattened_list); +} + +HloSharding HloSharding::Tuple(const Shape& tuple_shape, + absl::Span shardings) { + CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape); + for (auto& sharding : shardings) { + CHECK(!sharding.IsTuple()) << sharding.ToString(); + } + std::vector flattened_list(shardings.begin(), shardings.end()); + CHECK_EQ(flattened_list.size(), RequiredLeaves(tuple_shape)) + << "Flat list has " << flattened_list.size() << ", required " + << RequiredLeaves(tuple_shape); + return HloSharding(flattened_list); +} + +HloSharding HloSharding::SingleTuple(const Shape& tuple_shape, + const HloSharding& sharding) { + CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape); + CHECK(!sharding.IsTuple()) << sharding.ToString(); + int64 leaf_count = RequiredLeaves(tuple_shape); + std::vector flattened_list; + flattened_list.resize(leaf_count, sharding); + return HloSharding(flattened_list); +} + +HloSharding HloSharding::Single(const Shape& shape, + const HloSharding& sharding) { + return ShapeUtil::IsTuple(shape) ? 
SingleTuple(shape, sharding) : sharding; } string HloSharding::ToString() const { @@ -46,7 +89,7 @@ string HloSharding::ToString() const { for (const HloSharding& element : tuple_elements_) { parts.push_back(element.ToString()); } - return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}"); + return StrCat("{", absl::StrJoin(parts, ", "), "}"); } if (replicated_) { @@ -55,9 +98,8 @@ string HloSharding::ToString() const { return StrCat( "{maximal device=", static_cast(*tile_assignment_.begin()), "}"); } else { - return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ", "devices=[", - Join(tile_assignment_.dimensions(), ","), "]", - Join(tile_assignment_, ","), "}"); + return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), + "]", StrJoin(tile_assignment_, ","), "}"); } } @@ -72,12 +114,34 @@ bool HloSharding::UsesDevice(int64 device) const { std::find(devices.begin(), devices.end(), device) != devices.end(); } +std::map HloSharding::UsedDevices(int64* count) const { + int64 element_count = 1; + std::map device_map; + if (IsTuple()) { + for (auto& tuple_element_sharding : tuple_elements()) { + auto unique_device = tuple_element_sharding.UniqueDevice(); + if (unique_device) { + device_map[*unique_device] += 1; + } + } + element_count = tuple_elements().size(); + } else { + auto unique_device = UniqueDevice(); + if (unique_device) { + device_map[*unique_device] += 1; + } + } + if (count != nullptr) { + *count = element_count; + } + return device_map; +} + std::vector HloSharding::TileIndexForDevice(int64 device) const { - CHECK(!ShapeUtil::IsTuple(tile_shape_)); CHECK(!maximal_); CHECK(!IsTuple()); std::vector ret_index; - tile_assignment_.Each([&](tensorflow::gtl::ArraySlice index, int64 d) { + tile_assignment_.Each([&](absl::Span index, int64 d) { if (d == device) { ret_index = {index.begin(), index.end()}; } @@ -86,95 +150,121 @@ std::vector HloSharding::TileIndexForDevice(int64 device) const { return ret_index; } -int64 
HloSharding::DeviceForTileIndex( - tensorflow::gtl::ArraySlice index) const { +int64 HloSharding::DeviceForTileIndex(absl::Span index) const { CHECK(!replicated_); CHECK(!IsTuple()); if (maximal_) { return *tile_assignment_.begin(); } - CHECK_EQ(ShapeUtil::Rank(tile_shape_), tile_assignment_.dimensions().size()); return tile_assignment_(index); } -std::vector HloSharding::TileOffsetForDevice(int64 device) const { +std::vector HloSharding::TileOffsetForDevice(const Shape& shape, + int64 device) const { CHECK(!IsTuple()); - std::vector index = TileIndexForDevice(device); if (maximal_) { - // Index will always be all zeroes if we're maximal, and tile_shape_ is not - // valid. - return index; + return std::vector(shape.dimensions_size(), 0); } + + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { - index[i] *= tile_shape_.dimensions(i); + const int64 shape_dim = shape.dimensions(i); + index[i] = std::min( + index[i] * CeilOfRatio(shape_dim, tile_assignment_.dim(i)), shape_dim); } return index; } -std::vector HloSharding::TileLimitForDevice(int64 device) const { +std::vector HloSharding::TileLimitForDevice(const Shape& shape, + int64 device) const { CHECK(!IsTuple()); - CHECK(!maximal_); // Maximal shardings do not have a valid tile shape. 
+ if (maximal_) { + return std::vector(shape.dimensions().begin(), + shape.dimensions().end()); + } + + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { - index[i] = (index[i] + 1) * tile_shape_.dimensions(i); + const int64 shape_dim = shape.dimensions(i); + index[i] = std::min( + (index[i] + 1) * CeilOfRatio(shape_dim, tile_assignment_.dim(i)), + shape_dim); } return index; } +int64 HloSharding::RequiredLeaves(const Shape& shape) { + // Empty tuples have no leaf nodes as far as ShapeUtil and ShapeTree are + // concerned, but they do have a single tuple_elements_ entry since we want + // to allow empty tuple results to have sharding. + return ShapeUtil::IsEmptyTuple(shape) ? 1 : ShapeUtil::GetLeafCount(shape); +} + +Status HloSharding::CheckLeafCount(const Shape& shape) const { + int64 shape_leaves = RequiredLeaves(shape); + TF_RET_CHECK(shape_leaves == tuple_elements_.size()) + << "Shape " << ShapeUtil::HumanString(shape) << " has " << shape_leaves + << " leaf nodes while this sharding has " << tuple_elements_.size(); + return Status::OK(); +} + StatusOr> HloSharding::AsShapeTree( const Shape& shape) const { if (IsTuple()) { ShapeTree result(shape, HloSharding::Replicate()); - int64 num_leaves = result.leaf_count(); - TF_RET_CHECK(num_leaves == tuple_elements_.size()) - << "Shape " << ShapeUtil::HumanString(shape) << " has " << num_leaves - << " leaf nodes while this sharding has " << tuple_elements_.size(); + TF_RETURN_IF_ERROR(CheckLeafCount(shape)); auto it = tuple_elements_.begin(); for (auto& index_to_sharding : result.leaves()) { index_to_sharding.second = *it++; } + if (ShapeUtil::IsEmptyTuple(shape)) { + // Empty tuples have no leaves, but we want to assign them a sharding + // anyway, so we use the root element sharding. 
+ *result.mutable_element(ShapeIndex({})) = *it; + } return std::move(result); } else { return ShapeTree(shape, *this); } } -StatusOr HloSharding::UniqueDevice() const { +StatusOr HloSharding::GetTupleSharding(const Shape& shape) const { + if (IsTuple()) { + TF_RETURN_IF_ERROR(CheckLeafCount(shape)); + return *this; + } + return Tuple(ShapeTree(shape, *this)); +} + +absl::optional HloSharding::UniqueDevice() const { if (IsTuple()) { if (tuple_elements_.empty()) { - return tensorflow::errors::InvalidArgument( - "UniqueDevice() called on empty tuple"); + return absl::nullopt; } - std::vector> results; - std::transform(tuple_elements_.begin(), tuple_elements_.end(), - std::back_inserter(results), - [](const HloSharding& s) { return s.UniqueDevice(); }); - if (std::all_of(results.begin(), results.end(), - [&](const StatusOr& s) { - return s.ok() && results[0].ok() && - s.ValueOrDie() == results[0].ValueOrDie(); - })) { - return results[0]; - } else { - return tensorflow::errors::InvalidArgument( - "Tuple did not contain a unique device"); + absl::optional unique_device; + for (auto& tuple_sharding : tuple_elements_) { + auto device = tuple_sharding.UniqueDevice(); + if (!device || (unique_device && *device != *unique_device)) { + return absl::nullopt; + } + unique_device = device; } + return unique_device; } - if (!replicated_ && maximal_ && !IsTuple()) { + if (!replicated_ && maximal_) { return static_cast(*tile_assignment_.begin()); } - return tensorflow::errors::InvalidArgument( - "UniqueDevice() called on sharding that executes on multiple devices"); + return absl::nullopt; } -bool HloSharding::HasUniqueDevice() const { - if (IsTuple()) { - return UniqueDevice().status().ok(); - } else { - return !IsReplicated() && IsTileMaximal(); - } +int64 HloSharding::GetUniqueDevice() const { + auto device = UniqueDevice(); + CHECK(device) << "Sharding does not have a unique device: " << *this; + return *device; } Status HloSharding::ValidateTuple(const Shape& shape, int64 
num_devices) const { @@ -182,28 +272,12 @@ Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const { return tensorflow::errors::InvalidArgument( StrCat("Sharding is tuple-shaped but validation shape is not.")); } - // The easiest way to get the number of elements in a nested tuple is just to - // create a shape tree. We could call GetAsShapeTree, but that will try and - // apply our tuple_shardings_ to the shape tree, and that might cause a crash - // at this point as we haven't validated them. - ShapeTree bool_shape_tree(shape, false); - int64 num_leaves = - std::distance(bool_shape_tree.leaf_begin(), bool_shape_tree.leaf_end()); - if (num_leaves != tuple_elements_.size()) { - return tensorflow::errors::InvalidArgument( - StrCat("Validation tuple shape has ", num_leaves, - " leaf elements, but this sharding contains ", - tuple_elements_.size(), " elements.")); - } + TF_RETURN_IF_ERROR(CheckLeafCount(shape)); // Now we've validated the number of tuple elements, it's safe to request a // shape tree. ShapeTree shape_tree = GetAsShapeTree(shape); for (const auto& index_to_sharding : shape_tree.leaves()) { - if (index_to_sharding.first.empty()) { - // An empty tuple has a ShapeTree with a single leaf at the empty index. - continue; - } Status status = index_to_sharding.second.ValidateNonTuple( ShapeUtil::GetSubshape(shape, index_to_sharding.first), num_devices); if (!status.ok()) { @@ -243,7 +317,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, Status status = Status::OK(); std::set seen_cores; tile_assignment_.Each( - [&](tensorflow::gtl::ArraySlice indices, int32 core) { + [&](absl::Span indices, int32 core) { // Don't overwrite a bad status, so we report the first error. if (status.ok()) { if (core >= num_devices) { @@ -264,11 +338,12 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, return Status::OK(); } - // The tile rank must be the same as the input rank. 
- if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) { + // The tile assignment tensor must have the same rank as the input. + if (ShapeUtil::Rank(shape) != tile_assignment_.num_dimensions()) { return tensorflow::errors::InvalidArgument( - "Tile rank is different to the input rank. sharding=", ToString(), - ", input_shape=", ShapeUtil::HumanString(shape)); + "Number of tile assignment dimensions is different to the input rank. " + "sharding=", + ToString(), ", input_shape=", ShapeUtil::HumanString(shape)); } // The correct constructor have to be used to create tile maximal shardings. @@ -278,20 +353,6 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, "sharding was intended, use HloSharding::Replicated(). If a device " "placement was intended, use HloSharding::AssignDevice()"); } - - // The tile assignment tensor must contain enough element to cover the full - // shape with tiles of the specified size. - for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) { - int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i); - if (shape.dimensions(i) > total_tile_size) { - return tensorflow::errors::InvalidArgument( - StrCat("Tile assignment tensor has too few element to cover the full " - "shape. 
Dimension ", - i, ", shape ", shape.dimensions(i), ", total size ", - total_tile_size)); - } - } - return Status::OK(); } @@ -321,7 +382,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, proto.tile_assignment_dimensions().end())); std::copy(proto.tile_assignment_devices().begin(), proto.tile_assignment_devices().end(), tile_assignment.begin()); - return HloSharding(proto.tile_shape(), tile_assignment); + return HloSharding(tile_assignment); } OpSharding HloSharding::ToProto() const { @@ -335,7 +396,6 @@ OpSharding HloSharding::ToProto() const { return result; } - *result.mutable_tile_shape() = tile_shape_; for (int64 dim : tile_assignment_.dimensions()) { result.add_tile_assignment_dimensions(dim); } @@ -352,41 +412,68 @@ OpSharding HloSharding::ToProto() const { return result; } -HloSharding HloSharding::TransformShardedTileShape( - const Shape& new_shape, - const std::function& transform) const { - CHECK(!IsTuple()); +Shape HloSharding::TileShape(const Shape& shape) const { if (IsTileMaximal()) { - return *this; + return shape; } - CHECK_EQ(ShapeUtil::Rank(new_shape), ShapeUtil::Rank(tile_shape())); - Shape new_tile_shape; - new_tile_shape.set_element_type(tile_shape().element_type()); - for (int64 i = 0; i < ShapeUtil::Rank(new_shape); ++i) { - int64 dim; - if (tile_assignment().dim(i) == 1) { - dim = new_shape.dimensions(i); - } else if (transform) { - dim = transform(i, tile_shape().dimensions(i)); - } else { - dim = tile_shape().dimensions(i); - } - new_tile_shape.add_dimensions(dim); + Shape result_shape = shape; + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + (*result_shape.mutable_dimensions())[i] = + CeilOfRatio(shape.dimensions(i), tile_assignment_.dim(i)); } - TF_CHECK_OK( - LayoutUtil::CopyLayoutBetweenShapes(tile_shape_, &new_tile_shape)); - return HloSharding::Tile(new_tile_shape, tile_assignment()); + return result_shape; } HloSharding HloSharding::GetSubSharding(const Shape& shape, const ShapeIndex& index) const { 
CHECK(IsTuple()); + int64 sharding_index = 0; + const Shape* sub_shape = &shape; + for (int64 idx : index) { + for (int64 i = 0; i < idx; ++i) { + sharding_index += + ShapeUtil::GetLeafCount(ShapeUtil::GetSubshape(*sub_shape, {i})); + } + sub_shape = &ShapeUtil::GetSubshape(*sub_shape, {idx}); + } + if (ShapeUtil::IsTuple(*sub_shape)) { + auto begin_it = tuple_elements_.begin() + sharding_index; + std::vector sub_shardings( + begin_it, begin_it + ShapeUtil::GetLeafCount(*sub_shape)); + return HloSharding::Tuple(*sub_shape, sub_shardings); + } else { + return tuple_elements_[sharding_index]; + } +} + +absl::optional HloSharding::ExtractSingleSharding() const { + if (!IsTuple()) { + return *this; + } + for (int64 i = 1; i < tuple_elements_.size(); ++i) { + if (tuple_elements_[0] != tuple_elements_[i]) { + return absl::nullopt; + } + } + return tuple_elements_.front(); +} - Shape sub_shape = ShapeUtil::GetSubshape(shape, index); - ShapeTree sub_shape_tree(sub_shape, Replicate()); - sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {}); - return ShapeUtil::IsTuple(sub_shape) ? Tuple(sub_shape_tree) - : sub_shape_tree.element(ShapeIndex({})); +size_t HloSharding::Hash() const { + if (tuple_) { + size_t h = 0; + for (const auto& element : tuple_elements_) { + h = tensorflow::Hash64Combine(h, element.Hash()); + } + return h; + } + if (replicated_) { + return 0; + } + size_t h = 0; + for (uint32 v : tile_assignment_) { + h = tensorflow::Hash64Combine(h, std::hash{}(v)); + } + return h; } std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) { diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h index f4a0fb626f2c3e417c020cbfa2f7168359a47788..9775505f8608ced3e33abe376f4922cc6a972726 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.h +++ b/tensorflow/compiler/xla/service/hlo_sharding.h @@ -19,14 +19,16 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_H_ +#include #include +#include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -46,22 +48,10 @@ class HloSharding { // the input shape (one tile) assigned to a single device. static HloSharding AssignDevice(int64 device_id); - // Creates a new sharding which splits a shape into tiles each with shape - // `tile_shape`. Each tile is assigned to one device, which is specified by - // `tile_assignment`. Any tensor not a multiple of the tile size in any - // dimension is implicitly padded to the tile size. - // - // e.g. Tile({2, 2}, {0, 1}) on a tensor of shape {3, 2} would look like: - // 2 1 padding - // <------><-> - // +----+----+ - // | 0 | 1 | - // +----+----+ - // - // Split into two tiles, one of which is implicitly padded by one. - static HloSharding Tile(const Shape& tile_shape, - const Array& tile_assignment) { - return HloSharding(tile_shape, tile_assignment); + // Creates a new sharding which splits a shape into tiles amongst the devices + // specified by `tile_assignment`. + static HloSharding Tile(const Array& tile_assignment) { + return HloSharding(tile_assignment); } // Creates a new sharding which splits a one-dimensional input shape into @@ -70,26 +60,22 @@ class HloSharding { // Creates a new sharding for a tuple type. The given ShapeTree must have // elements for every leaf shape contained in the tuple. 
- static HloSharding Tuple(const ShapeTree& sub_shardings) { - std::vector flattened_list; - flattened_list.reserve( - std::distance(sub_shardings.leaf_begin(), sub_shardings.leaf_end())); - for (const auto& index_to_sharding : sub_shardings.leaves()) { - flattened_list.push_back(index_to_sharding.second); - } - return HloSharding(flattened_list); - } + static HloSharding Tuple(const ShapeTree& sub_shardings); - // Creates a new sharding for a tuple type. The requested tuple shape must not - // be nested. For nested tuples, use the ShapeTree overload. + // Creates a new sharding for a tuple type. The number of elements in + // shardings must match the number of leaf nodes in tuple_shape. For + // empty tuples, the shardings array must have one element. static HloSharding Tuple(const Shape& tuple_shape, - tensorflow::gtl::ArraySlice shardings) { - CHECK(ShapeUtil::IsTuple(tuple_shape)); - CHECK(!ShapeUtil::IsNestedTuple(tuple_shape)); - std::vector flattened_list(shardings.begin(), shardings.end()); - CHECK_EQ(flattened_list.size(), ShapeUtil::TupleElementCount(tuple_shape)); - return HloSharding(flattened_list); - } + absl::Span shardings); + + // Creates a new sharding for a tuple type, with a single input sharding + // repeated on each leaf. + static HloSharding SingleTuple(const Shape& tuple_shape, + const HloSharding& sharding); + + // If shape is an array, returns sharding, otherwise returns the tuple shaped + // sharding with all the leaf nodes having the same input sharding. + static HloSharding Single(const Shape& shape, const HloSharding& sharding); // Create a new sharding from a protobuf OpSharding. static StatusOr FromProto(const OpSharding& proto); @@ -131,6 +117,14 @@ class HloSharding { // Returns true if the sharding defines an operation on the given device. bool UsesDevice(int64 device) const; + // Retrieves an histogram of the devices used by the sharding. The returned + // map has the device number as key, and the occurrence count as value. 
+ // If a sharding does not have a device, it will not be incuded in the + // histogram. The count argument, if not nullptr, will receive the total + // number of elements this sharding is made of (one for array, N leaves for + // tuples). + std::map UsedDevices(int64* count) const; + // Returns the tile that should be executed on the given device. // REQUIRES: !IsTuple() std::vector TileIndexForDevice(int64 device) const; @@ -138,26 +132,32 @@ class HloSharding { // Returns the device that should execute the given tile. // It is an error to call this if is_replicated() is true. // REQUIRES: !IsTuple() - int64 DeviceForTileIndex(tensorflow::gtl::ArraySlice index) const; + int64 DeviceForTileIndex(absl::Span index) const; - // Given a device ID, returns the offset within the input space of the + // Given a device ID, returns the offset within the specified shape of the // tile that should be executed on the given core. This returns the lower // extent of the tile in the input space. // REQUIRES: !IsTuple() - std::vector TileOffsetForDevice(int64 device) const; + std::vector TileOffsetForDevice(const Shape& shape, + int64 device) const; - // Given a device ID, returns the limit within the input space of the + // Given a device ID, returns the limit within the specified shape of the // tile that should be executed on the given core. This returns the upper // extent of the tile in the input space. // REQUIRES: !IsTuple() - std::vector TileLimitForDevice(int64 device) const; + std::vector TileLimitForDevice(const Shape& shape, int64 device) const; + + // Returns the single device this op operates on. If the sharding does not + // span a single device, the return value will be empty. + // In order for a sharding to span a single device, every leaf sharding must + // be maximal and not replicated, and the used device must match. + absl::optional UniqueDevice() const; - // Returns the single device this op operates on. 
- // REQUIRES: !IsTuple&& !Replicated() && IsTileMaximal() - StatusOr UniqueDevice() const; + // Retrieves the unique device or fails with a CHECK. + int64 GetUniqueDevice() const; // Returns true if this op only uses a single device. - bool HasUniqueDevice() const; + bool HasUniqueDevice() const { return UniqueDevice().has_value(); } // Returns the ShapeTree containing the shardings for each element of this // tuple, if IsTuple, or a ShapeTree with a single element containing this @@ -172,34 +172,26 @@ class HloSharding { // REQUIRES: IsTuple() HloSharding GetSubSharding(const Shape& shape, const ShapeIndex& index) const; + // If the current sharding is a tuple sharding, return itself as result. + // Otherwise returns a tuple sharding for the input shape, with all the leaves + // having this object sharding. + StatusOr GetTupleSharding(const Shape& shape) const; + + // Extracts the sharding that is common within the current sharding. + // If the current sharding is not a tuple sharding, the current sharding will + // be returned. If it is a tuple, and all the tuple elements are common, the + // common element will be returned. Otherwise the optional will contain no + // value. 
+ absl::optional ExtractSingleSharding() const; + bool operator==(const HloSharding& other) const { return replicated_ == other.replicated_ && maximal_ == other.maximal_ && - ShapeUtil::Compatible(tile_shape_, other.tile_shape_) && tile_assignment_ == other.tile_assignment_ && tuple_elements_ == other.tuple_elements_; } bool operator!=(const HloSharding& other) const { return !(*this == other); } - size_t Hash() const { - if (!tuple_) { - size_t h = 0; - for (const auto& element : tuple_elements_) { - h = tensorflow::Hash64Combine(h, element.Hash()); - } - return h; - } - if (replicated_) { - return 0; - } - size_t h = 0; - for (uint32 v : tile_assignment_) { - h = tensorflow::Hash64Combine(h, std::hash{}(v)); - } - for (uint32 v : tile_shape_.dimensions()) { - h = tensorflow::Hash64Combine(h, std::hash{}(v)); - } - return h; - } + size_t Hash() const; struct Hasher { size_t operator()(const HloSharding& sharding) const { @@ -207,9 +199,6 @@ class HloSharding { } }; - // Gets the tile shape. - // REQUIRES: !IsTileMaximal() && !IsTuple() - const Shape& tile_shape() const { return tile_shape_; } // Gets the tile assignment tensor. // REQUIRES: !IsReplicated() && !IsTuple() const Array& tile_assignment() const { return tile_assignment_; } @@ -221,58 +210,59 @@ class HloSharding { return tuple_elements_; } - // Return a new sharding that can apply to the given new shape. - // If this sharding is tile-maximal, the returned sharding will be the same as - // this sharding. If this sharding is not tile-maximal, the returned - // sharding's tile size will differ: - // - Non-sharded dimensions will be adapted to be the same as `new_shape`; - // tile_dimension(i) = new_shape.dimensions(i); - // - Sharded dimensions will be kept the same unless `transform` is supplied - // in which case tile_dimension(i) = transform(i, tile_dimension(i)); - // REQUIRES: !IsTuple(). 
- HloSharding TransformShardedTileShape( - const Shape& new_shape, - const std::function& transform = nullptr) const; + // Gets the tile shape. + // REQUIRES: !IsTuple() + Shape TileShape(const Shape& shape) const; private: HloSharding() : replicated_(true), maximal_(true), tuple_(false), - tile_shape_(), tile_assignment_({0}) {} + // device_id values: + // -2: magic number to mean unassigned device, used by spatial partitioning + // -1: the id of the host + // 0 or positive: the id of a device + // NOTE(dimvar): -1 is needed for outside compilation. It can be removed once + // we have fully switched to the side-effect tokens. explicit HloSharding(int64 device_id) : replicated_(false), maximal_(true), tuple_(false), - tile_shape_(), tile_assignment_({1}, device_id) {} - HloSharding(const Shape& tile_shape, const Array& tile_assignment) + explicit HloSharding(const Array& tile_assignment) : replicated_(false), maximal_(false), tuple_(false), - tile_shape_(tile_shape), tile_assignment_(tile_assignment) {} - HloSharding(const std::vector& tuple_shardings) + explicit HloSharding(const std::vector& tuple_shardings) : replicated_(false), maximal_(false), tuple_(true), tile_assignment_({0}), tuple_elements_(tuple_shardings) {} + // Checks that the number of elements in tuple_elements_ is consistent with + // the tuple shape passes as argument. + Status CheckLeafCount(const Shape& shape) const; + // Internal helper to validate a tuple sharding. Status ValidateTuple(const Shape& shape, int64 num_devices) const; + // Internal helper to validate a non-tuple (leaf) sharding. Status ValidateNonTuple(const Shape& shape, int64 num_devices) const; + // Returns the number of tuple_elements_ entries to fit the shape. + static int64 RequiredLeaves(const Shape& shape); + bool replicated_; bool maximal_; bool tuple_; - Shape tile_shape_; Array tile_assignment_; - // Only non-empty when tuple_ is true, but because empty tuples are allowed - // may also be empty even then. 
This is a flattened list of all the leaf - // shardings in a tuple shape, by pre-order walk (ShapeTree iterator order). + // Only non-empty when tuple_ is true. If a tuple is empty then one entry is + // present for the root. This is a flattened list of all the leaf shardings in + // a tuple shape, by pre-order walk (ShapeTree iterator order). std::vector tuple_elements_; }; diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc index 82cff2a4b7146c2d454feb2d90673d419ca1a54d..34cba6136ff3fe95529f3bcf594db7776c8bfd0a 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -23,6 +24,23 @@ namespace xla { namespace { +// AssignmentKind and kUnassignedDevice are used during tuple domain sharding +// propagation in order to distinguish among three cases: +// kUnassigned: no assignment has occurred +// kAssigned: at least an assignment has occurred +// kConflict: no assignment has occurred because of conflicting propagations, +// which occurs when multiple users of an instruction have different +// shardings. +enum class AssignmentKind { kUnassigned, kAssigned, kConflict }; + +// kUnassignedDevice can only be assigned to tuple leaf shardings to indicate +// absence of sharding information for that particular sub-sharding during +// sharding propagation. It is used to be able to express tuple shardings with +// partial information. At the end of the propagation the sharding of +// tuple-shaped instructions using kUnassignedDevice's is cleared. +// TODO(b/112883246): Centralized enum of reserved devices. 
+constexpr int64 kUnassignedDevice = -2; + struct PassThrough { PassThrough(HloInstruction* user, HloInstruction* operand) : user(user), operand(operand) {} @@ -31,32 +49,22 @@ struct PassThrough { HloInstruction* operand = nullptr; }; -void SetDeviceSharding(HloInstruction* instruction, int64 device) { - VLOG(4) << " " << instruction->name() << " to device " << device; - instruction->set_device_sharding(device); -} - -tensorflow::gtl::optional ShardingUniqueDevice( - const HloSharding& sharding) { - if (sharding.IsTileMaximal()) { - auto device = sharding.UniqueDevice(); - if (device.ok()) { - return device.ValueOrDie(); - } - } - return tensorflow::gtl::optional(); +void SetSingleSharding(HloInstruction* instruction, + const HloSharding& sharding) { + VLOG(4) << " " << instruction->name() << " to " << sharding; + instruction->set_single_sharding(sharding); } bool ShardingMatches(const HloSharding& sharding1, const HloSharding& sharding2) { - auto device1 = ShardingUniqueDevice(sharding1); - if (device1) { - auto device2 = ShardingUniqueDevice(sharding2); - if (device2) { - return *device1 == *device2; + auto single_sharding1 = sharding1.ExtractSingleSharding(); + if (single_sharding1) { + auto single_sharding2 = sharding2.ExtractSingleSharding(); + if (single_sharding2) { + return *single_sharding1 == single_sharding2; } } - // Anything which is not tile maximal with unique device, gets a full sharding + // Anything which is not unique across all elements, gets a full sharding // compare. 
return sharding1 == sharding2; } @@ -98,6 +106,12 @@ std::vector LocatePassThroughDomainLinks( VLOG(2) << " " << instruction->ToString(); } } + if (instruction == instruction->parent()->root_instruction()) { + pass_through.emplace_back(nullptr, instruction); + VLOG(2) << "Found passthrough domain link:"; + VLOG(2) << " "; + VLOG(2) << " " << instruction->ToString(); + } } return pass_through; } @@ -111,29 +125,37 @@ Status FixupPassThroughDomainLinks(const DomainMetadata::Domain& domain, HloInstruction::CreateGetTupleElement(pass_through.operand->shape(), tuple, 0)); gte->set_sharding(sharding); - TF_RETURN_IF_ERROR( - pass_through.operand->ReplaceUseWith(pass_through.user, gte)); + if (pass_through.user != nullptr) { + TF_RETURN_IF_ERROR( + pass_through.operand->ReplaceUseWith(pass_through.user, gte)); + } else { + pass_through.operand->parent()->set_root_instruction(gte); + } } return Status::OK(); } -std::unique_ptr CloneShardingForDomain( - const HloSharding& sharding) { - auto device = ShardingUniqueDevice(sharding); - if (!device) { - return MakeUnique(sharding); +// For tuple shardings if every element have the same sharsing then we want to +// treat them as single element sharsings to insert less domain separation as a +// domain can prevent some optimizations and we want to minimize that from +// happening. 
+std::shared_ptr CloneShardingForDomain( + std::shared_ptr sharding) { + auto single_sharding = sharding->ExtractSingleSharding(); + if (!single_sharding) { + return sharding; } - return MakeUnique(HloSharding::AssignDevice(*device)); + return std::make_shared(*single_sharding); } -Status ApplyDomainDeviceSharding(const DomainMetadata::Domain& domain, - int64 device) { - VLOG(4) << "Applying device " << device << " sharding"; +Status ApplyDomainSingleSharding(const DomainMetadata::Domain& domain, + const HloSharding& sharding) { + VLOG(4) << "Applying " << sharding << " sharding"; for (HloInstruction* instruction : domain.instructions) { // We only change instructions without sharding, since otherwise we might // mess up with eventual HLO passes which has knowledge of it. if (!instruction->has_sharding()) { - SetDeviceSharding(instruction, device); + SetSingleSharding(instruction, sharding); } else { VLOG(4) << " " << instruction->name() << " already has sharding " << instruction->sharding(); @@ -142,99 +164,174 @@ Status ApplyDomainDeviceSharding(const DomainMetadata::Domain& domain, return Status::OK(); } -// Retrieves the sharding of a tuple shaped instruction in form of a ShapeTree. -// If the instruction has no sharding, a ShapeTree with HloSharding::Replicate() -// sharding will be returned. -ShapeTree GetTupleSharding(HloInstruction* tuple) { - if (tuple->has_sharding()) { - return tuple->sharding().GetAsShapeTree(tuple->shape()); +// Return the ShapeTree of the user argument. The user argument +// is assumed to be a user of the instruction argument. +// If user is a tuple instruction, return the tuple subsharding corresponding to +// the operand matching the instruction argument, because that is the +// subsharding corresponding to instruction. 
+ShapeTree GetShardingTreeFromUser( + const HloInstruction& instruction, const HloInstruction& user) { + if (user.opcode() == HloOpcode::kTuple) { + return user.sharding() + .GetSubSharding(user.shape(), {user.operand_index(&instruction)}) + .GetAsShapeTree(instruction.shape()); + } + return user.sharding().GetAsShapeTree(user.shape()); +} + +// Assign rhs to lhs. If rhs is unassigned (assigned to kUnassignedDevice) +// then no assignment is made. Therefore kUnassignedDevice is never propagated. +// kConflict is returned if lhs is already assigned and rhs is assigned to a +// different device. +StatusOr AssignLeafSharding(HloSharding* lhs, + const HloSharding& rhs) { + TF_RET_CHECK(!lhs->IsTuple() && !rhs.IsTuple()); + if (rhs.UsesDevice(kUnassignedDevice)) { + return AssignmentKind::kUnassigned; + } + if (lhs->UsesDevice(kUnassignedDevice)) { + *lhs = rhs; + return AssignmentKind::kAssigned; } - return ShapeTree(tuple->shape(), HloSharding::Replicate()); + return lhs->UniqueDevice() != rhs.UniqueDevice() + ? AssignmentKind::kConflict + : AssignmentKind::kUnassigned; } -// Retrieves the sharding of operand, asked from a user instruction which is -// within domain. If operand is a kDomain, it means that sharding argument is -// the operand sharding, otherwise the operand's own sharding will be returned. -const HloSharding* GetOperandSharding(const HloInstruction* operand, +// Assigns the whole rhs tree to lhs_tree, starting at lhs_it. +// In case of conflicting assignment AssignmentKind::kConflict is returned. In +// this case lhs_tree is partially assigned, up to the conflicting leaf. It is +// up to the caller to discard the partial assignment in case of conflict. 
+StatusOr AssignTreeSharding( + ShapeTree* lhs_tree, ShapeTree::iterator lhs_it, + const ShapeTree& rhs_tree) { + AssignmentKind assigned = AssignmentKind::kUnassigned; + auto rhs_it = rhs_tree.begin(); + for (; lhs_it != lhs_tree->end() && rhs_it != rhs_tree.end(); + ++lhs_it, ++rhs_it) { + // TODO(b/112885211): Add ShapeTree::IsLeaf(const ShapeTreeIterator &it) + if (rhs_tree.IsLeaf(rhs_it->first)) { + TF_RET_CHECK(lhs_tree->IsLeaf(lhs_it->first)); + TF_ASSIGN_OR_RETURN(AssignmentKind sub_assigned, + AssignLeafSharding(&lhs_it->second, rhs_it->second)); + if (sub_assigned == AssignmentKind::kConflict) { + // In case of conflict we return conflict to the caller. At this point + // partial assignments to lhs_tree may have been made already. It is up + // to the caller to discard the partial assignment in case of conflict. + return AssignmentKind::kConflict; + } else if (sub_assigned == AssignmentKind::kAssigned) { + assigned = sub_assigned; + } + } + } + TF_RET_CHECK(rhs_it == rhs_tree.end()); + return assigned; +} + +StatusOr ApplyShardingFromUsers(HloInstruction* instruction, const DomainMetadata::Domain& domain, - const HloSharding& sharding) { - DCHECK_EQ(domain.reach_set.count(const_cast(operand)), 1); - // Here the user of operand is within the domain instruction set, and since it - // is user of operand, we need to look into the enter_domains set. If this is - // not a kDomain within the user domains set, then return the operand - // sharding, if any. - if (operand->opcode() != HloOpcode::kDomain || - domain.enter_domains.count(const_cast(operand)) == 0) { - return operand->has_sharding() ? &operand->sharding() : nullptr; + const HloSharding& domain_sharding) { + if (instruction->users().empty()) { + // No sharding from users, use domain_sharding, after checking + // compatibility. 
+ TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()) && + ShapeUtil::GetLeafCount(instruction->shape()) == + domain_sharding.tuple_elements().size()); + instruction->set_sharding(domain_sharding); + return true; + } + AssignmentKind assigned = AssignmentKind::kUnassigned; + // The sharding_tree leaves are initialized to kUnassignedDevice. Only Tuple + // subshardings can result in a final sharding assignment containing + // kUnassignedDevice leaves, in case some tuple indexes are not used, or are + // used by users that don't have a sharding. + // Non-tuple shardings are either assigned to a real sharding, or are not + // assigned at all. As such they will never get assigned to kUnassignedDevice. + // In any case, kUnassignedDevice is never propagated, from the implementation + // of AssignLeafSharding. + ShapeTree sharding_tree( + instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice)); + for (HloInstruction* user : instruction->users()) { + if (user->opcode() == HloOpcode::kDomain && + domain.exit_domains.count(const_cast(user)) > 0) { + // If a user is a domain and it is registered in the domain exits, then + // the instruction sharding is taken directly from the domain, and no + // further users need to be visited. + instruction->set_sharding(domain_sharding); + return true; + } + if (!user->has_sharding()) { + continue; + } + AssignmentKind sub_assigned = AssignmentKind::kUnassigned; + ShapeTree user_sharding_tree = + GetShardingTreeFromUser(*instruction, *user); + if (ShapeUtil::IsTuple(instruction->shape())) { + // For tuple-shaped instructions collect individual tuple subshardings + // from the uses, and then combine them into the tuple sharding. + // If the user is a GTE its sharding concerns only the subtree of + // sharding_tree at index user->tuple_index, otherwise the whole + // sharding_tree is affected. + ShapeTree::iterator sharding_tree_begin = + user->opcode() == HloOpcode::kGetTupleElement + ? 
sharding_tree.find({user->tuple_index()}) + : sharding_tree.begin(); + TF_ASSIGN_OR_RETURN( + sub_assigned, AssignTreeSharding(&sharding_tree, sharding_tree_begin, + user_sharding_tree)); + } else { + // Non-tuple shape: assign common users sharding. + TF_RET_CHECK(user_sharding_tree.leaf_count() == 1) + << "Expected non-tuple user sharding"; + TF_ASSIGN_OR_RETURN( + sub_assigned, + AssignTreeSharding(&sharding_tree, sharding_tree.begin(), + user_sharding_tree)); + } + + if (sub_assigned == AssignmentKind::kConflict) { + // In case of conflict we don't assign any sharding. + return false; + } else if (sub_assigned == AssignmentKind::kAssigned) { + assigned = sub_assigned; + } } - // At this point operand is a kDomain of the currently processed domain, so we - // can refer to sharding as the domain sharding. - return &sharding; + + if (assigned == AssignmentKind::kAssigned) { + if (ShapeUtil::IsTuple(instruction->shape())) { + instruction->set_sharding(HloSharding::Tuple(sharding_tree)); + } else { + TF_RET_CHECK(sharding_tree.leaf_count() == 1); + instruction->set_sharding(sharding_tree.leaf_begin()->second); + } + return true; + } + return false; } // Tries to propagate the sharding information into the instructions that are -// part of the domain, in a post order manner (operand propagate to user). +// part of the domain, in a reverse post order manner (users propoagate to +// instruction). StatusOr ApplyDomainShardingPass(const DomainMetadata::Domain& domain, - const HloSharding& sharding) { + const HloSharding& domain_sharding) { int64 assigned = 0; - for (HloInstruction* instruction : domain.instructions) { + // domain.instructions are ordered in a post-order manner. As we do + // user->operand propagation we process instructions in reverse order. In so + // doing we are guaranteed to process all users before their operands. 
+ for (auto it = domain.instructions.rbegin(); it != domain.instructions.rend(); + ++it) { + HloInstruction* instruction = *it; if (instruction->has_sharding()) { continue; } - if (instruction->opcode() == HloOpcode::kGetTupleElement) { - HloInstruction* tuple = instruction->mutable_operand(0); - const HloSharding* tuple_sharding = - GetOperandSharding(tuple, domain, sharding); - if (tuple_sharding != nullptr) { - TF_RET_CHECK(tuple_sharding->IsTuple()) << tuple->ToString(); - HloSharding sub_sharding = tuple_sharding->GetSubSharding( - tuple->shape(), {instruction->tuple_index()}); - VLOG(4) << " " << instruction->name() << " to sharding " - << sub_sharding; - instruction->set_sharding(sub_sharding); - ++assigned; - } - } else if (instruction->opcode() == HloOpcode::kTuple) { - int64 tuple_assigned = 0; - ShapeTree shape_tree = GetTupleSharding(instruction); - for (int64 i = 0; i < instruction->operand_count(); ++i) { - const HloSharding* operand_sharding = - GetOperandSharding(instruction->operand(i), domain, sharding); - if (operand_sharding != nullptr && - shape_tree.element({i}) != *operand_sharding) { - *shape_tree.mutable_element({i}) = *operand_sharding; - ++tuple_assigned; - } - } - if (tuple_assigned > 0) { - HloSharding tuple_sharding = HloSharding::Tuple(shape_tree); - VLOG(4) << " " << instruction->name() << " to sharding " - << tuple_sharding; - instruction->set_sharding(tuple_sharding); - ++assigned; - } - } else { - // If all the operand of the given instruction has the same single device - // assignment, assign that device to this instruction as well. 
- const HloSharding* common_sharding = nullptr; - for (const HloInstruction* operand : instruction->operands()) { - const HloSharding* operand_sharding = - GetOperandSharding(operand, domain, sharding); - if (operand_sharding != nullptr) { - if (common_sharding != nullptr && - *common_sharding != *operand_sharding) { - common_sharding = nullptr; - break; - } - common_sharding = operand_sharding; - } - } - if (common_sharding != nullptr) { - VLOG(4) << " " << instruction->name() << " to sharding " - << *common_sharding; - instruction->set_sharding(*common_sharding); - ++assigned; - } + // Take the sharding from the users. + TF_ASSIGN_OR_RETURN( + bool instruction_assigned, + ApplyShardingFromUsers(instruction, domain, domain_sharding)); + if (instruction_assigned) { + ++assigned; + VLOG(4) << " " << instruction->name() << " to sharding " + << instruction->sharding(); } } return assigned; @@ -242,91 +339,50 @@ StatusOr ApplyDomainShardingPass(const DomainMetadata::Domain& domain, Status ApplyDomainSharding(const DomainMetadata::Domain& domain, const HloSharding& sharding) { - auto device = ShardingUniqueDevice(sharding); - if (device) { - // Shortcut the simple case. We have a unique device sharding, so we call - // the ApplyDomainDeviceSharding() API which will apply array or tuple - // shaped device sharding to the domain instructions. - return ApplyDomainDeviceSharding(domain, *device); + // None of the external normalizers handled the domain sharding, try to see + // whether this is a single sharding first. + auto single_sharding = sharding.ExtractSingleSharding(); + if (single_sharding) { + // Shortcut the simple case. We have a unique sharding, so we call + // the ApplyDomainSingleSharding() API which will apply array or tuple + // shaped sharding to the domain instructions. 
+ return ApplyDomainSingleSharding(domain, *single_sharding); } VLOG(1) << "Assigning non-trivial sharding " << sharding; - for (;;) { - TF_ASSIGN_OR_RETURN(int64 assigned, - ApplyDomainShardingPass(domain, sharding)); - if (assigned == 0) { - break; - } - } + TF_RETURN_IF_ERROR(ApplyDomainShardingPass(domain, sharding).status()); + int64 unassigned = 0; for (HloInstruction* instruction : domain.instructions) { if (!instruction->has_sharding()) { LOG(WARNING) << "Unassigned instruction: " << instruction->ToString(); ++unassigned; + } else { + // Un-set sharding of tuples whose sub-sgardings are assigned to + // kUnassignedDevice. Indeed in case of doubt it is better to leave the + // entire tuple unassigned, and let the device placer decide for it. + if (instruction->sharding().UsesDevice(kUnassignedDevice)) { + TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape())) + << "Only tuples can have kUnassignedDevice sub shardings"; + instruction->clear_sharding(); + } } } // Should we error out if unassigned > 0? return Status::OK(); } -// Creates a kDomain instruction to be placed between instruction and operand. -// The kDomain instruction will be created only if the sharding differ between -// the instruction and the operand. -std::unique_ptr CreateDomain(HloInstruction* instruction, - HloInstruction* operand) { - const HloSharding* instruction_sharding = - instruction->has_sharding() ? &instruction->sharding() : nullptr; - const HloSharding* operand_sharding = - operand->has_sharding() ? &operand->sharding() : nullptr; - // No need for domain if they both have no sharding. - if (instruction_sharding == nullptr && operand_sharding == nullptr) { - return nullptr; - } - // No need for domain if they match. 
- if (instruction_sharding != nullptr && operand_sharding != nullptr && - ShardingMatches(*instruction_sharding, *operand_sharding)) { - return nullptr; - } - std::unique_ptr real_instruction_sharding; - std::unique_ptr real_operand_sharding; - if (instruction_sharding != nullptr) { - real_instruction_sharding = CloneShardingForDomain(*instruction_sharding); - } - if (operand_sharding != nullptr) { - real_operand_sharding = CloneShardingForDomain(*operand_sharding); - } - VLOG(3) << "Creating domain:"; - VLOG(3) << " Instruction: " << instruction->name(); - VLOG(3) << " Operand: " << operand->name(); - VLOG(3) << " User side sharding: " - << (real_instruction_sharding != nullptr - ? real_instruction_sharding->ToString() - : "None"); - VLOG(3) << " Operand side sharding: " - << (real_operand_sharding != nullptr - ? real_operand_sharding->ToString() - : "None"); - - std::unique_ptr operand_side_metadata = - MakeUnique(std::move(real_operand_sharding)); - std::unique_ptr user_side_metadata = - MakeUnique(std::move(real_instruction_sharding)); - return HloInstruction::CreateDomain(operand->shape(), operand, - std::move(operand_side_metadata), - std::move(user_side_metadata)); -} - -StatusOr> ExtractOriginalCommonSharding( - tensorflow::gtl::ArraySlice instructions) { +StatusOr> ExtractOriginalCommonSharding( + absl::Span instructions) { // If we are here, all the instructions being passed had the same sharding // (or no sharding), by the means of the ShardingMatches() API. // As such, no kDomain was inserted, and here we are asked to extract the // original common sharding. // All the instructions passed to this API are part of the same computation. 
- const HloSharding* sharding = nullptr; + std::shared_ptr sharding; for (HloInstruction* instruction : instructions) { if (instruction->has_sharding()) { if (sharding == nullptr) { - sharding = &instruction->sharding(); + sharding = instruction->sharding_ptr(); } else { TF_RET_CHECK(ShardingMatches(*sharding, instruction->sharding())) << "Sharding " << *sharding << " does not match the one in " @@ -335,10 +391,10 @@ StatusOr> ExtractOriginalCommonSharding( } } if (sharding == nullptr) { - return std::unique_ptr(); + return std::shared_ptr(); } VLOG(4) << "Extracted sharding is " << *sharding; - return CloneShardingForDomain(*sharding); + return CloneShardingForDomain(sharding); } } // namespace @@ -346,9 +402,9 @@ StatusOr> ExtractOriginalCommonSharding( std::unique_ptr ShardingMetadata::Clone() const { std::unique_ptr sharding; if (sharding_ != nullptr) { - sharding = MakeUnique(*sharding_); + sharding = absl::make_unique(*sharding_); } - return MakeUnique(std::move(sharding)); + return absl::make_unique(std::move(sharding)); } bool ShardingMetadata::Matches(const DomainMetadata& other) const { @@ -367,35 +423,112 @@ bool ShardingMetadata::Matches(const DomainMetadata& other) const { } string ShardingMetadata::ToString() const { - return sharding_ != nullptr ? sharding_->ToString() : "None"; + return sharding_ != nullptr ? 
sharding_->ToString() : "{}"; } -Status ShardingMetadata::NormalizeInstructions( - const DomainMetadata::Domain& domain) const { - if (sharding_ != nullptr) { - VLOG(4) << "Normalizing sharding to " << sharding_->ToString() << ":"; - TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding_)); - TF_RETURN_IF_ERROR(FixupPassThroughDomainLinks(domain, *sharding_)); +/*static*/ StatusOr +ShardingMetadata::ToShardingMetadata(const DomainMetadata* metadata) { + if (metadata->Kind() != ShardingMetadata::KindName()) { + return Status( + tensorflow::error::INVALID_ARGUMENT, + "ShardingMetadata normalizer called with incorrect domain metadata"); } - return Status::OK(); + return static_cast(metadata); } -Status NormalizeShardingDomain(const DomainMetadata::Domain& domain) { - TF_ASSIGN_OR_RETURN(std::unique_ptr sharding, - ExtractOriginalCommonSharding(domain.instructions)); - if (sharding != nullptr) { - VLOG(4) << "Normalizing sharding-less domain to " << sharding->ToString() - << ":"; - TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding)); +Status ShardingMetadata::NormalizeShardingDomain( + const DomainMetadata::Domain& domain, const DomainMetadata* metadata) { + if (metadata != nullptr) { + TF_ASSIGN_OR_RETURN(const auto& sharding_metadata, + ToShardingMetadata(metadata)); + const HloSharding* sharding = sharding_metadata->sharding(); + if (sharding != nullptr) { + VLOG(4) << "Normalizing sharding to " << sharding->ToString() << ":"; + TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding)); + TF_RETURN_IF_ERROR(FixupPassThroughDomainLinks(domain, *sharding)); + } } else { - VLOG(1) << "Unable to find common sharding"; + TF_ASSIGN_OR_RETURN(std::shared_ptr sharding, + ExtractOriginalCommonSharding(domain.instructions)); + if (sharding != nullptr) { + VLOG(4) << "Normalizing sharding-less domain to " << sharding->ToString(); + TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding)); + } else { + VLOG(1) << "Unable to find common sharding"; + } } return 
Status::OK(); } -std::unique_ptr CreateShardingDomain( - HloInstruction* instruction, HloInstruction* operand) { - return CreateDomain(instruction, operand); +// Creates a kDomain instruction to be placed between instruction and operand. +// The kDomain instruction will be created only if the sharding differ between +// the instruction and the operand. +HloInstruction* ShardingDomainCreator::operator()(HloInstruction* instruction, + HloInstruction* root, + HloInstruction* operand) { + auto instruction_sharding = instruction->sharding_ptr(); + auto root_sharding = root->sharding_ptr(); + // No need for domain if they both have no sharding. + if (instruction_sharding == nullptr && root_sharding == nullptr) { + return nullptr; + } + // No need for domain if they match. + if (instruction_sharding != nullptr && root_sharding != nullptr && + ShardingMatches(*instruction_sharding, *root_sharding)) { + return nullptr; + } + + if (instruction_sharding != nullptr) { + instruction_sharding = CloneShardingForDomain(instruction_sharding); + } + if (root_sharding != nullptr) { + root_sharding = CloneShardingForDomain(root_sharding); + } + + auto it = domain_cse_map_.find({operand, instruction_sharding}); + if (it != domain_cse_map_.end()) { + return it->second; + } + + VLOG(3) << "Creating domain:"; + VLOG(3) << " Instruction: " << instruction->name(); + VLOG(3) << " Operand: " << operand->name(); + VLOG(3) << " User side sharding: " + << (instruction_sharding != nullptr ? instruction_sharding->ToString() + : "None"); + VLOG(3) << " Operand side sharding: " + << (root_sharding != nullptr ? 
root_sharding->ToString() : "None"); + + HloInstruction* domain = + operand->parent()->AddInstruction(HloInstruction::CreateDomain( + operand->shape(), operand, + absl::make_unique(root_sharding), + absl::make_unique(instruction_sharding))); + domain_cse_map_.emplace(DomainCseMapKey{operand, instruction_sharding}, + domain); + return domain; +} + +bool ShardingDomainCreator::DomainCseMapKey::operator==( + const ShardingDomainCreator::DomainCseMapKey& other) const { + if (instruction != other.instruction) { + return false; + } + if (sharding == nullptr && other.sharding == nullptr) { + return true; + } + if (sharding == nullptr || other.sharding == nullptr) { + return false; + } + return *sharding == *other.sharding; +} + +size_t ShardingDomainCreator::DomainCseMapHasher::operator()( + const ShardingDomainCreator::DomainCseMapKey& key) const { + return tensorflow::Hash64Combine( + std::hash{}(key.instruction), + key.sharding ? key.sharding->Hash() + : static_cast(0x297814aaad196e6dULL)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h index ec162c34904ee2dfac3daeeee37133282a9c9698..cba5db927a056c760e1c4a291d96cfdbca818029 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h @@ -16,51 +16,72 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_ +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { // A DomainMetadata implementation that internally wraps a sharding attribute. 
class ShardingMetadata : public DomainMetadata { public: - explicit ShardingMetadata(std::unique_ptr sharding) + explicit ShardingMetadata(std::shared_ptr sharding) : sharding_(std::move(sharding)) {} std::unique_ptr Clone() const override; - tensorflow::StringPiece Kind() const override { return KindName(); } + absl::string_view Kind() const override { return KindName(); } bool Matches(const DomainMetadata& other) const override; string ToString() const override; - Status NormalizeInstructions( - const DomainMetadata::Domain& domain) const override; + const HloSharding* sharding() const { return sharding_.get(); } - static tensorflow::StringPiece KindName() { return "sharding"; } + static absl::string_view KindName() { return "sharding"; } + + static StatusOr ToShardingMetadata( + const DomainMetadata* metadata); + + // Apply the specified domain metadata onto the specified domain. If no + // metadata is specified then apply sharding heuristics and normalize the + // instructions whose sharding deviates from the one which is inferred as to + // be the original one. Policy wise, HLO passes are allowed to create new + // unassigned instructions, but if they do create assigned ones, they have to + // conform to the ones around. + static Status NormalizeShardingDomain(const DomainMetadata::Domain& domain, + const DomainMetadata* metadata); private: - std::unique_ptr sharding_; + std::shared_ptr sharding_; }; -// Within a set of instructions which had common sharding attributes before -// entring the HLO passes pipeline, apply sharding heuristics and normalize the -// instructions whose sharding deviates from the one which is inferred as to be -// the original one. -// Policy wise, HLO passes are allowed to create new unassigned instructions, -// but if they do create assigned ones, they have to conform to the ones around. 
-Status NormalizeShardingDomain(const DomainMetadata::Domain& domain); - -// Given an HLO graph edge between instruction and one of its operands, creates -// a ShardingMetadata based kDomain instruction if the sharding between -// instruction and operand changes. Returns nullptr if there is no need for a -// domain separation. -std::unique_ptr CreateShardingDomain( - HloInstruction* instruction, HloInstruction* operand); +// If the sharding between root and instruction changes then returns a +// ShardingMetadata based kDomain instruction what can be used to separate +// operand and instruction. +// Returns nullptr if there is no need for a domain separation. +class ShardingDomainCreator { + public: + HloInstruction* operator()(HloInstruction* instruction, HloInstruction* root, + HloInstruction* operand); + + private: + // Map from instruction and user sharding to domain users to CSE identical + // domains. + struct DomainCseMapKey { + const HloInstruction* instruction; + std::shared_ptr sharding; + + bool operator==(const DomainCseMapKey& other) const; + }; + struct DomainCseMapHasher { + size_t operator()(const DomainCseMapKey& key) const; + }; + std::unordered_map + domain_cse_map_; +}; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc index 94d1a3226b8512689e99666002b8701ba1a6a623..80634677e78e4a35dcb9bf7de018a88122c3c030 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc @@ -18,19 +18,19 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace { -Array MakeArray(tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice contents) { +Array MakeArray(absl::Span dimensions, + absl::Span contents) { Array a(dimensions); std::copy(contents.begin(), contents.end(), a.begin()); return a; @@ -39,7 +39,6 @@ Array MakeArray(tensorflow::gtl::ArraySlice dimensions, class HloShardingTest : public HloTestBase {}; TEST_F(HloShardingTest, Replicate) { - Shape tile_shape = ShapeUtil::MakeShape(U32, {4}); HloSharding sharding = HloSharding::Replicate(); EXPECT_TRUE(sharding.IsReplicated()); EXPECT_TRUE(sharding.IsTileMaximal()); @@ -51,7 +50,7 @@ TEST_F(HloShardingTest, Replicate) { EXPECT_IS_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4}), /*num_devices=*/2)); - EXPECT_IS_NOT_OK(sharding.UniqueDevice()); + EXPECT_FALSE(sharding.HasUniqueDevice()); } TEST_F(HloShardingTest, DevicePlacement) { @@ -60,7 +59,7 @@ TEST_F(HloShardingTest, DevicePlacement) { EXPECT_TRUE(sharding.IsTileMaximal()); EXPECT_FALSE(sharding.UsesDevice(0)); EXPECT_TRUE(sharding.UsesDevice(5)); - EXPECT_EQ(5, sharding.UniqueDevice().ValueOrDie()); + EXPECT_EQ(5, sharding.GetUniqueDevice()); HloSharding other = HloSharding::Replicate(); EXPECT_NE(other, sharding); @@ -79,37 +78,22 @@ TEST_F(HloShardingTest, DevicePlacement) { TEST_F(HloShardingTest, Tile) { { // Test should fail because of a duplicate tile assignment. 
- Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 0, 2, 3})); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 0, 2, 3})); EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {4, 6}), /*num_devices=*/4)); } { // Test should fail because of more devices used then `num_device`. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3})); EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}), /*num_devices=*/2)); } - { - // Test should fail because the total tiled size in dimension 0 is 4 but we - // have 6 elements along that dimensions. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); - EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {6, 3}), - /*num_devices=*/4)); - } - { // Test should pass. 
- Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); + Shape shape = ShapeUtil::MakeShape(U32, {4, 5}); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); EXPECT_IS_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {3, 5}), /*num_devices=*/5)); @@ -118,15 +102,26 @@ TEST_F(HloShardingTest, Tile) { EXPECT_EQ(2, sharding.DeviceForTileIndex({1, 0})); EXPECT_EQ(1, sharding.DeviceForTileIndex({1, 1})); - EXPECT_EQ(sharding.TileOffsetForDevice(0), (std::vector{0, 0})); - EXPECT_EQ(sharding.TileOffsetForDevice(3), (std::vector{0, 3})); - EXPECT_EQ(sharding.TileOffsetForDevice(2), (std::vector{2, 0})); - EXPECT_EQ(sharding.TileOffsetForDevice(1), (std::vector{2, 3})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 0), + (std::vector{0, 0})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 3), + (std::vector{0, 3})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 2), + (std::vector{2, 0})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 1), + (std::vector{2, 3})); - EXPECT_IS_NOT_OK(sharding.UniqueDevice()); + EXPECT_FALSE(sharding.HasUniqueDevice()); } } +// Tests that empty tuple is supported. 
+TEST_F(HloShardingTest, EmptySingleTuple) { + HloSharding sharding = HloSharding::SingleTuple(ShapeUtil::MakeTupleShape({}), + HloSharding::AssignDevice(0)); + EXPECT_TRUE(sharding.ExtractSingleSharding()); +} + TEST_F(HloShardingTest, NestedTuple) { // nested_tuple_shape = (f32[], (f32[3]), f32[4, 6]) Shape nested_tuple_shape = ShapeUtil::MakeTupleShape({ @@ -135,8 +130,7 @@ TEST_F(HloShardingTest, NestedTuple) { ShapeUtil::MakeShape(F32, {4, 6}), }); - HloSharding tiled_sharding = HloSharding::Tile( - ShapeUtil::MakeShape(F32, {4, 3}), Array({{0, 1}})); + HloSharding tiled_sharding = HloSharding::Tile(Array({{0, 1}})); OpSharding proto; proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE); *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto(); @@ -187,32 +181,11 @@ TEST_F(HloShardingTest, Hash) { } { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 2, 1})); + HloSharding sharding1 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); + HloSharding sharding2 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); EXPECT_TRUE(hash_compare_equal(sharding1, sharding2)); } - { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 2, 1})); - EXPECT_TRUE(hash_compare_equal(sharding1, sharding2)); - } - - { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 1, 2})); - EXPECT_FALSE(hash_compare_equal(sharding1, sharding2)); - } - HloSharding default_sharding = 
HloSharding::Replicate(); { ShapeTree shape_tree(ShapeUtil::MakeTupleShape({}), @@ -259,19 +232,6 @@ TEST_F(HloShardingTest, Hash) { } } -TEST_F(HloShardingTest, TransformShardedTileShapeTest) { - HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}), - Array4D({{{{0, 1}, {2, 3}}}})); - HloSharding result = sharding.TransformShardedTileShape( - ShapeUtil::MakeShape(F32, {13, 15, 17, 19}), - [](int dim, int value) { return dim * 111; }); - HloSharding expected = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {13, 15, 222, 333}), - Array4D({{{{0, 1}, {2, 3}}}})); - EXPECT_EQ(result, expected); -} - TEST_F(HloShardingTest, ToStringReplicatedTest) { HloSharding sharding = HloSharding::Replicate(); EXPECT_EQ(sharding.ToString(), "{replicated}"); @@ -284,9 +244,8 @@ TEST_F(HloShardingTest, ToStringAssignDeviceTest) { TEST_F(HloShardingTest, ToStringTiledTest) { HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(S32, {7, 11, 13}), - Array3D({{{2, 3}}, {{5, 7}}})); - EXPECT_EQ(sharding.ToString(), "{s32[7,11,13] devices=[2,1,2]2,3,5,7}"); + HloSharding::Tile(Array3D({{{2, 3}}, {{5, 7}}})); + EXPECT_EQ(sharding.ToString(), "{devices=[2,1,2]2,3,5,7}"); } TEST_F(HloShardingTest, ToStringTupleTest) { @@ -294,35 +253,33 @@ TEST_F(HloShardingTest, ToStringTupleTest) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5}), ShapeUtil::MakeShape(U32, {7, 25}), ShapeUtil::MakeShape(S32, {9, 11})}), - {HloSharding::Replicate(), - HloSharding::Tile(ShapeUtil::MakeShape(U32, {7, 13}), - Array2D({{3, 5}})), + {HloSharding::Replicate(), HloSharding::Tile(Array2D({{3, 5}})), HloSharding::AssignDevice(3)}); EXPECT_EQ(sharding.ToString(), - "{{replicated}, {u32[7,13] devices=[1,2]3,5}, {maximal device=3}}"); + "{{replicated}, {devices=[1,2]3,5}, {maximal device=3}}"); } TEST_F(HloShardingTest, OstreamTest) { HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}), - Array4D({{{{0, 1}, {2, 3}}}})); + 
HloSharding::Tile(Array4D({{{{0, 1}, {2, 3}}}})); std::ostringstream oss; oss << sharding; - EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}"); + EXPECT_EQ(oss.str(), "{devices=[1,1,2,2]0,1,2,3}"); } -TEST_F(HloShardingTest, Parse) { +TEST_F(HloShardingTest, ParseHloString) { auto check = [](const HloSharding& sharding) { TF_ASSERT_OK_AND_ASSIGN(auto parsed_sharding, - tools::ParseSharding(sharding.ToString())); + ParseSharding(sharding.ToString())); EXPECT_EQ(sharding, parsed_sharding); }; check(HloSharding::Replicate()); check(HloSharding::AssignDevice(2)); - check(HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}}))); - // Empty tuple. - check(HloSharding::Tuple(ShapeUtil::MakeTupleShape({}), {})); + check(HloSharding::Tile(Array4D({{{{0}, {1}}}}))); + // Empty tuple. One sharding is required for empty tuples, as we need to be + // able to assign sharding to them, even though they have no leaves. + check(HloSharding::Tuple(ShapeUtil::MakeTupleShape({}), + {HloSharding::Replicate()})); { // Non-nested tuple. auto tuple_shape = @@ -330,8 +287,7 @@ TEST_F(HloShardingTest, Parse) { ShapeUtil::MakeShape(F32, {3, 5, 7}), ShapeUtil::MakeShape(F32, {3, 7})}); check(HloSharding::Tuple( - tuple_shape, {HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}})), + tuple_shape, {HloSharding::Tile(Array4D({{{{0}, {1}}}})), HloSharding::Replicate(), HloSharding::AssignDevice(1)})); } { @@ -341,8 +297,7 @@ TEST_F(HloShardingTest, Parse) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5, 7}), ShapeUtil::MakeShape(F32, {3, 7})})}); std::vector leaf_shardings = { - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}})), + HloSharding::Tile(Array4D({{{{0}, {1}}}})), HloSharding::Replicate(), HloSharding::AssignDevice(1)}; ShapeTree sharding_tree(tuple_shape, HloSharding::Replicate()); // Assign leaf_shardings to sharding_tree leaves. 
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h b/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h index 2ef38821af632180714911c0ff22731fd559b915..d1cf644f8273e632e2952cca0da749616e9b6233 100644 --- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h +++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h @@ -24,7 +24,7 @@ namespace xla { // one arbitrarily to use and delete the others. class HloSubcomputationUnification : public HloPassInterface { public: - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "subcomputation-unification"; } diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc index 7b601f9a9578cfa6b293cf7f002255f7db8b1257..45c684d66752862eec301b8943d350804f070309 100644 --- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc +++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc @@ -75,7 +75,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) { module->AddEmbeddedComputation(CreateR0S32IdentityComputation()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); auto x = builder.AddInstruction( HloInstruction::CreateCall(r0s32_, {constant}, callee1)); auto y = builder.AddInstruction( @@ -112,9 +112,9 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) { module->AddEmbeddedComputation(CreateR0S32AdditionComputation()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(5))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(5))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3))); auto x = builder.AddInstruction( 
HloInstruction::CreateCall(r0s32_, {constant1, constant2}, callee1)); auto y = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc index 3dc733940fc89952bd5e75a9b28d9cbf356f8000..487653344976a10e18ba667085525ba1ecbb8612 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc @@ -14,35 +14,34 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" - -using ::tensorflow::GraphDef; -using ::tensorflow::NodeDef; -using ::tensorflow::TensorShapeProto; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; -using ::tensorflow::str_util::Join; namespace xla { namespace hlo_graph_dumper { namespace { +using absl::StrAppend; +using absl::StrCat; +using tensorflow::GraphDef; +using tensorflow::NodeDef; +using tensorflow::TensorShapeProto; + string GetOpDefName(const HloInstruction* instruction) { string name = StrCat("hlo-", HloOpcodeString(instruction->opcode())); - tensorflow::str_util::TitlecaseString(&name, "-"); + tensorflow::str_util::TitlecaseString(&name, "-"); // non-absl ok name.erase(std::remove(name.begin(), name.end(), '-'), name.end()); if (instruction->opcode() == HloOpcode::kFusion) { 
string fusion_name = ToString(instruction->fusion_kind()); - StrAppend(&name, tensorflow::StringPiece(fusion_name).substr(1)); + StrAppend(&name, absl::string_view(fusion_name).substr(1)); } return name; } @@ -101,11 +100,11 @@ const string& HloTfGraphBuilder::GetNodeNameForInstruction( } }; string node_name; - if (debug_options_.xla_hlo_tfgraph_device_scopes() && - instruction->has_sharding() && - instruction->sharding().HasUniqueDevice()) { - node_name = StrCat( - "dev", instruction->sharding().UniqueDevice().ConsumeValueOrDie()); + if (debug_options_.xla_hlo_tfgraph_device_scopes()) { + auto device = instruction->sharding_unique_device(); + if (device) { + node_name = StrCat("dev", *device); + } } // If an instruction is fused, put it in the subgraph of the fusion; // otherwise, put it in the computation subgraph. @@ -166,7 +165,9 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction, layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape()); } else { layout_string = StrCat( - "{", Join(LayoutUtil::MinorToMajor(instruction->shape()), ","), "}"); + "{", + absl::StrJoin(LayoutUtil::MinorToMajor(instruction->shape()), ","), + "}"); } attrs["layout"].set_s(layout_string); } @@ -215,10 +216,10 @@ Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) { NodeDef* node_def = graph_def_.add_node(); node_def->set_name(GetNodeNameForInstruction(instruction)); node_def->set_op(GetOpDefName(instruction)); - if (instruction->has_sharding() && - instruction->sharding().HasUniqueDevice()) { - TF_ASSIGN_OR_RETURN(int64 device, instruction->sharding().UniqueDevice()); - node_def->set_device(GetDeviceName(device)); + + auto device = instruction->sharding_unique_device(); + if (device) { + node_def->set_device(GetDeviceName(*device)); } SetNodeAttrs(instruction, node_def); if (instruction->opcode() == HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc 
b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc index be156d765dc10d54eaf301e90883babbc5693e28..1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc @@ -90,7 +90,7 @@ TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) { TEST_F(HloTfGraphBuilderTest, CheckScalarValue) { auto builder = HloComputation::Builder("Const"); HloInstruction *instruction = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(123))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(123))); OpMetadata metadata; metadata.set_op_name("x"); metadata.set_op_type("y"); diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h similarity index 84% rename from tensorflow/compiler/xla/tools/parser/hlo_token.h rename to tensorflow/compiler/xla/service/hlo_token.h index 7928bee5c2097f353b182095a555c334d7b69c95..4458c251dee4af365e39027dd4289925c8890efd 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_token.h +++ b/tensorflow/compiler/xla/service/hlo_token.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ #include @@ -22,9 +22,11 @@ limitations under the License. #include "tensorflow/core/platform/types.h" namespace xla { -namespace tools { // Defines different kinds of tokens in a hlo module string. +// +// You shouldn't need to use this directly unless you're using HloLexer +// directly, and you probably don't need to do that. Use hlo_parser instead. 
enum class TokKind { // Markers kEof, @@ -42,7 +44,6 @@ enum class TokKind { kRparen, // ( ) kArrow, // -> - kComment, // /*xxx*/ // Keywords kw_HloModule, @@ -72,7 +73,6 @@ enum class TokKind { string TokKindToString(TokKind kind); -} // namespace tools } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index 7b27dbfec376b8ba16d00285f10e2cc291e07a61..773fc7d22537ab81d945c197b713b00d322a7f24 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -18,8 +18,10 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -30,16 +32,13 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace xla { -using ::tensorflow::str_util::Join; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; +using absl::StrAppend; +using absl::StrCat; const Shape& HloPosition::shape() const { return ShapeUtil::GetSubshape(instruction->shape(), index); @@ -125,7 +124,7 @@ bool MayUseOperandValue(int64 operand_number, const ShapeIndex& index, // transparently. 
CHECK_EQ(operand_number, 0); return index.empty(); - case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: // Select does not use any nested elements of its selected-from operands // (operand 1 and 2) CHECK_GE(operand_number, 0); @@ -150,7 +149,7 @@ bool MayUseOperandValue(int64 operand_number, const ShapeIndex& index, } // namespace void HloValue::SetPositionsAndComputeUses( - tensorflow::gtl::ArraySlice positions) { + absl::Span positions) { CHECK_EQ(positions_.size(), 1) << "SetPositions should only be called once."; // The positions must be unique and should not contain the defining position @@ -216,14 +215,14 @@ void HloValueSet::SortAndUniquifyValues() { } string HloValueSet::ToString() const { - return StrCat("HloValueSet: ", - Join(values_, ", ", [](string* result, const HloValue* value) { - result->append(value->ToShortString()); - })); + return StrCat( + "HloValueSet: ", + absl::StrJoin(values_, ", ", [](string* result, const HloValue* value) { + result->append(value->ToShortString()); + })); } -bool HloValueSet::AssignUnionOf( - tensorflow::gtl::ArraySlice inputs) { +bool HloValueSet::AssignUnionOf(absl::Span inputs) { HloValueSet union_set; for (const HloValueSet* input : inputs) { for (const HloValue* value : input->values()) { @@ -254,7 +253,7 @@ std::ostream& operator<<(std::ostream& out, const HloValueSet& value_set) { } bool InstructionValueSet::AssignUnionOf( - tensorflow::gtl::ArraySlice inputs) { + absl::Span inputs) { CHECK_GT(inputs.size(), 0); for (int i = 1; i < inputs.size(); ++i) { DCHECK(ShapeUtil::Compatible(inputs[0]->shape(), inputs[i]->shape())); @@ -283,8 +282,7 @@ std::ostream& operator<<(std::ostream& out, string InstructionValueSet::ToString() const { string out = StrCat("InstructionValueSet(", ShapeUtil::HumanString(shape()), ")\n"); - ForEachElement([this, &out](const ShapeIndex& index, - const HloValueSet& value_set) { + ForEachElement([&out](const ShapeIndex& index, const HloValueSet& value_set) { StrAppend(&out, " ", 
index.ToString(), " : ", value_set.ToString(), "\n"); }); return out; diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index a1151f65e07dffdcd52f645f61dcc9b4f26459c0..b6670d409b92e8be42f5cdb40fba8d662ae83958 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -20,13 +20,13 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -108,8 +108,7 @@ class HloValue : public BufferValue { // Sets the positions in the module at which the HloValue appears. Updates // uses. Should be called once and only once. The defining position should not // be included in 'positions' as this is set at construction time. - void SetPositionsAndComputeUses( - tensorflow::gtl::ArraySlice positions); + void SetPositionsAndComputeUses(absl::Span positions); // Returns whether this value is a phi value. bool is_phi() const { return is_phi_; } @@ -186,14 +185,14 @@ class HloValueSet { public: HloValueSet() = default; - explicit HloValueSet(tensorflow::gtl::ArraySlice values) + explicit HloValueSet(absl::Span values) : values_(values.begin(), values.end()) { SortAndUniquifyValues(); } // Sets this value set to the union of the given value sets. Returns whether // this value set changed. - bool AssignUnionOf(tensorflow::gtl::ArraySlice inputs); + bool AssignUnionOf(absl::Span inputs); // Return the vector of HloValues in the set. 
Values in the vector are unique // and stably sorted by value id. @@ -247,8 +246,7 @@ class InstructionValueSet : public ShapeTree { // Sets this value set to the union of the given value sets. Returns whether // this value set changed. - bool AssignUnionOf( - tensorflow::gtl::ArraySlice inputs); + bool AssignUnionOf(absl::Span inputs); string ToString() const; }; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 9cfd8a9bf74bc69ac40b1e0974d9e084d31071c9..95516dec74bd253212901a3d9a92285d11fe122f 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -15,9 +15,13 @@ limitations under the License. #include +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatmap.h" @@ -39,6 +43,10 @@ Status ShapeVerifier::HandleSelect(HloInstruction* select) { return CheckTernaryShape(select); } +Status ShapeVerifier::HandleTupleSelect(HloInstruction* tuple_select) { + return CheckTernaryShape(tuple_select); +} + Status ShapeVerifier::HandleConcatenate(HloInstruction* concatenate) { std::vector operand_shapes; for (const HloInstruction* operand : concatenate->operands()) { @@ -78,7 +86,8 @@ Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) { const Shape expected, ShapeInference::InferConvolveShape( convolution->operand(0)->shape(), convolution->operand(1)->shape(), - convolution->window(), convolution->convolution_dimension_numbers())); + convolution->window(), convolution->convolution_dimension_numbers(), + convolution->feature_group_count())); 
return CheckShape(convolution, expected); } @@ -99,6 +108,20 @@ Status ShapeVerifier::HandleCrossReplicaSum(HloInstruction* crs) { ShapeInference::InferCrossReplicaSumShape(operand_shapes)); } +Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) { + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + operand_shapes.push_back(&operand->shape()); + } + return CheckShape(hlo, + ShapeInference::InferAllToAllTupleShape(operand_shapes)); +} + +Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { + return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape( + hlo->operand(0)->shape())); +} + Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape( reduce_precision->operand(0)->shape(), @@ -106,29 +129,119 @@ Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { reduce_precision->mantissa_bits())); } -Status ShapeVerifier::HandleInfeed(HloInstruction*) { return Status::OK(); } +Status ShapeVerifier::CheckIsTokenOperand(const HloInstruction* instruction, + int64 operand_no) { + const HloInstruction* token = instruction->operand(operand_no); + if (!ShapeUtil::Equal(token->shape(), ShapeUtil::MakeTokenShape())) { + return InternalError( + "Expected operand %d to be token-shaped, actual shape is " + "%s:\n%s", + operand_no, StringifyShape(token->shape()), instruction->ToString()); + } + return Status::OK(); +} + +Status ShapeVerifier::CheckOperandAndParameter( + const HloInstruction* instruction, int64 operand_number, + const HloComputation* computation, int64 parameter_number) { + const HloInstruction* operand = instruction->operand(operand_number); + const HloInstruction* parameter = + computation->parameter_instruction(parameter_number); + if (!ShapesSame(operand->shape(), parameter->shape())) { + return InternalError("Operand %s shape does not match parameter's %s in %s", + 
operand->ToString(), parameter->ToString(), + instruction->ToString()); + } + return Status::OK(); +} + +Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) { + HloInfeedInstruction* infeed = Cast(instruction); + TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0)); + + // The output of infeed is a tuple containing the data value and a token. + return CheckShape(infeed, + ShapeUtil::MakeTupleShape( + {infeed->infeed_shape(), ShapeUtil::MakeTokenShape()})); +} + +Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) { + HloOutfeedInstruction* outfeed = Cast(instruction); + TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1)); -Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) { // Outfeed has a separate shape field for the value which is outfed to the - // host. The shape of the instruction itself is always nil because the outfeed - // produces no HLO value in the graph. - if (!ShapeUtil::Compatible(outfeed->outfeed_shape(), - outfeed->operand(0)->shape())) { + // host. The shape of the instruction itself is always a token. 
+ if (!ShapesSame(outfeed->outfeed_shape(), outfeed->operand(0)->shape())) { return InternalError( - "Expected outfeed to have shape compatible with operand's shape %s, " + "Expected outfeed shape to be equal to operand's shape %s, " "actual shape is %s:\n%s", - ShapeUtil::HumanString(outfeed->operand(0)->shape()).c_str(), - ShapeUtil::HumanString(outfeed->outfeed_shape()).c_str(), - outfeed->ToString().c_str()); + StringifyShape(outfeed->operand(0)->shape()), + StringifyShape(outfeed->outfeed_shape()), outfeed->ToString()); } - return CheckShape(outfeed, ShapeUtil::MakeNil()); + return CheckShape(outfeed, ShapeUtil::MakeTokenShape()); } -Status ShapeVerifier::HandleHostCompute(HloInstruction*) { - return Status::OK(); +bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0, + const Shape& shape_1, + const Shape& result_shape) { + return ShapeUtil::SameElementType(shape_0, shape_1) && + (ShapeUtil::SameElementType(shape_0, result_shape) || + (allow_mixed_precision_ && + ShapeUtil::SameElementTypeIgnoringFpPrecision(shape_0, + result_shape))); } -Status ShapeVerifier::HandleRng(HloInstruction*) { return Status::OK(); } +Status ShapeVerifier::HandleRng(HloInstruction* instruction) { + if (instruction->operand_count() != 2) { + return InternalError("Expected two operands for Rng instruction: %s", + instruction->ToString()); + } + + const Shape& shape_0 = instruction->operand(0)->shape(); + const Shape& shape_1 = instruction->operand(1)->shape(); + if (!ShapeUtil::IsScalar(shape_0) || !ShapeUtil::IsScalar(shape_1)) { + return InternalError( + "Expected scalar types for the two operands of Rng instruction: %s", + instruction->ToString()); + } + + if (!HasCompatibleElementTypes(shape_0, shape_1, instruction->shape())) { + return InternalError( + "Expected compatible element types for the result and the two operands" + " of Rng instruction: %s", + instruction->ToString()); + } + + PrimitiveType element_type = shape_0.element_type(); + switch 
(instruction->random_distribution()) { + case RNG_UNIFORM: + if (!primitive_util::IsFloatingPointType(element_type) && + !primitive_util::IsIntegralType(element_type) && + element_type != PRED) { + return InternalError( + "Element type not supported." + " Expected element to be of floating point type, integral type or" + " predicate type for RngUniform: %s", + instruction->ToString()); + } + break; + + case RNG_NORMAL: + if (!primitive_util::IsFloatingPointType(element_type)) { + return InternalError( + "Element type not supported." + " Expected element to be FloatingPointType for RngNormal: %s", + instruction->ToString()); + } + break; + default: + return InternalError( + "Invalid Rng distribution %s", + RandomDistribution_Name(instruction->random_distribution())); + } + + return Status::OK(); +} Status ShapeVerifier::HandleReverse(HloInstruction* reverse) { return CheckShape( @@ -137,13 +250,36 @@ Status ShapeVerifier::HandleReverse(HloInstruction* reverse) { } Status ShapeVerifier::HandleSort(HloInstruction* sort) { - return CheckUnaryShape(sort); + if (sort->operand_count() == 2 && + !ShapeUtil::SameDimensions(sort->operand(0)->shape(), + sort->operand(1)->shape())) { + return InternalError( + "Expected sort to have to have the same dimensions for the keys and " + "the values. 
Keys shape is: %s\n, Values shape is: %s", + StringifyShape(sort->operand(0)->shape()), + StringifyShape(sort->operand(1)->shape())); + } + return CheckVariadicShape(sort); } Status ShapeVerifier::HandleConstant(HloInstruction* constant) { return CheckShape(constant, constant->literal().shape()); } +Status ShapeVerifier::HandleIota(HloInstruction* instruction) { + auto* iota = Cast(instruction); + const int64 rank = ShapeUtil::Rank(iota->shape()); + if (rank == 0) { + return InternalError("Iota does not support scalars."); + } + int64 iota_dimension = iota->iota_dimension(); + if (iota_dimension >= rank) { + return InternalError( + "The iota dimension cannot go beyond the operation rank."); + } + return Status::OK(); +} + Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) { return CheckShape(get_tuple_element, ShapeInference::InferGetTupleElementShape( @@ -152,11 +288,13 @@ Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) { } Status ShapeVerifier::HandleReduce(HloInstruction* reduce) { - return CheckShape( - reduce, - ShapeInference::InferReduceShape( - reduce->operand(0)->shape(), reduce->operand(1)->shape(), - reduce->dimensions(), reduce->to_apply()->ComputeProgramShape())); + std::vector operand_shapes; + for (const HloInstruction* operand : reduce->operands()) { + operand_shapes.push_back(&operand->shape()); + } + return CheckShape(reduce, ShapeInference::InferReduceShape( + operand_shapes, reduce->dimensions(), + reduce->to_apply()->ComputeProgramShape())); } Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { @@ -200,11 +338,25 @@ Status ShapeVerifier::HandleParameter(HloInstruction* hlo) { return Status::OK(); } -Status ShapeVerifier::HandleFusion(HloInstruction*) { return Status::OK(); } +Status ShapeVerifier::HandleFusion(HloInstruction* fusion) { + for (HloInstruction* fused_param : fusion->fused_parameters()) { + int64 param_no = fused_param->parameter_number(); + if 
(!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) { + return InternalError( + "Shape mismatch between parameter number %d and its operand in " + "%s.", + param_no, fusion->ToString().c_str()); + } + } + return Status::OK(); +} Status ShapeVerifier::HandleCall(HloInstruction* call) { + for (int64 i = 0; i < call->to_apply()->num_parameters(); ++i) { + TF_RETURN_IF_ERROR(CheckOperandAndParameter(call, i, call->to_apply(), i)); + } // The shape of kCall should match the shape of the computation it calls. - return CheckShape(call, call->to_apply()->ComputeProgramShape().result()); + return CheckShape(call, call->to_apply()->root_instruction()->shape()); } Status ShapeVerifier::HandleCustomCall(HloInstruction*) { return Status::OK(); } @@ -273,19 +425,36 @@ Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) { } Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) { + TF_RETURN_IF_ERROR( + CheckOperandAndParameter(xla_while, 0, xla_while->while_body(), 0)); + TF_RETURN_IF_ERROR( + CheckOperandAndParameter(xla_while, 0, xla_while->while_condition(), 0)); + const Shape& conditional_shape = + xla_while->while_condition()->root_instruction()->shape(); + if (!ShapesSame(conditional_shape, ShapeUtil::MakeShape(PRED, {}))) { + return InternalError( + "Conditional computation shape does not lead to a scalar predicate " + "shape: %s", + StringifyShape(conditional_shape)); + } // The shape of kWhile should match the shape of the body computation it // calls. 
return CheckShape(xla_while, - xla_while->while_body()->ComputeProgramShape().result()); + xla_while->while_body()->root_instruction()->shape()); } Status ShapeVerifier::HandleConditional(HloInstruction* conditional) { + TF_RETURN_IF_ERROR(CheckOperandAndParameter( + conditional, 1, conditional->true_computation(), 0)); + TF_RETURN_IF_ERROR(CheckOperandAndParameter( + conditional, 2, conditional->false_computation(), 0)); + TF_RETURN_IF_ERROR( + CheckShape(conditional, + conditional->true_computation()->root_instruction()->shape())); TF_RETURN_IF_ERROR(CheckShape( conditional, - conditional->true_computation()->ComputeProgramShape().result())); - return CheckShape( - conditional, - conditional->false_computation()->ComputeProgramShape().result()); + conditional->false_computation()->root_instruction()->shape())); + return Status::OK(); } Status ShapeVerifier::HandlePad(HloInstruction* pad) { @@ -295,39 +464,29 @@ Status ShapeVerifier::HandlePad(HloInstruction* pad) { } Status ShapeVerifier::HandleSend(HloInstruction* send) { - TF_RET_CHECK(send->users().size() == 1); - const HloInstruction* send_done = send->users().front(); - TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone); - TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done)); - return CheckShape( - send, ShapeUtil::MakeTupleShape( - {send->operand(0)->shape(), ShapeUtil::MakeShape(U32, {})})); + return CheckShape(send, + ShapeUtil::MakeTupleShape({send->operand(0)->shape(), + ShapeUtil::MakeShape(U32, {}), + ShapeUtil::MakeTokenShape()})); } Status ShapeVerifier::HandleSendDone(HloInstruction* send_done) { - TF_RET_CHECK(send_done->operands().size() == 1); - const HloInstruction* send = send_done->operand(0); - TF_RET_CHECK(send->opcode() == HloOpcode::kSend); - TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done)); - return CheckShape(send_done, ShapeUtil::MakeNil()); + return CheckShape(send_done, ShapeUtil::MakeTokenShape()); } Status ShapeVerifier::HandleRecv(HloInstruction* recv) { - 
TF_RET_CHECK(recv->users().size() == 1); - const HloInstruction* recv_done = recv->users().front(); - TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone); - TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done)); - return CheckShape(recv, - ShapeUtil::MakeTupleShape( - {recv_done->shape(), ShapeUtil::MakeShape(U32, {})})); + return CheckShape( + recv, ShapeUtil::MakeTupleShape( + {ShapeUtil::GetTupleElementShape(recv->shape(), 0), + ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})); } Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) { - TF_RET_CHECK(recv_done->operands().size() == 1); - const HloInstruction* recv = recv_done->operand(0); - TF_RET_CHECK(recv->opcode() == HloOpcode::kRecv); - TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done)); - return CheckShape(recv_done, recv->shape().tuple_shapes(0)); + return CheckShape( + recv_done, + ShapeUtil::MakeTupleShape( + {ShapeUtil::GetTupleElementShape(recv_done->operand(0)->shape(), 0), + ShapeUtil::MakeTokenShape()})); } Status ShapeVerifier::HandleBatchNormTraining( @@ -368,9 +527,9 @@ namespace { // inputs. Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { switch (instruction->opcode()) { - // White list the following opcodes for mixed-precision check, because they - // involve data pass through or grouping via tuples, where the precisions - // of buffers can be different. + // White list the following opcodes for mixed-precision check, because + // they involve data pass through or grouping via tuples, where the + // precisions of buffers can be different. 
case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kConstant: @@ -386,6 +545,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { case HloOpcode::kRecvDone: case HloOpcode::kReducePrecision: case HloOpcode::kSelect: + case HloOpcode::kTupleSelect: case HloOpcode::kSend: case HloOpcode::kSendDone: case HloOpcode::kTuple: @@ -406,7 +566,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { return InternalError( "Seen floating point types of different precisions in " "%s, but mixed precision is disallowed.", - instruction->ToString().c_str()); + instruction->ToString()); } return Status::OK(); })); @@ -423,7 +583,24 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) { gather, ShapeInference::InferGatherShape( gather->operand(0)->shape(), gather->operand(1)->shape(), - gather->gather_dimension_numbers(), gather->gather_window_bounds())); + gather->gather_dimension_numbers(), gather->gather_slice_sizes())); +} + +Status ShapeVerifier::HandleScatter(HloInstruction* scatter) { + return CheckShape( + scatter, ShapeInference::InferScatterShape( + scatter->operand(0)->shape(), scatter->operand(1)->shape(), + scatter->operand(2)->shape(), + scatter->to_apply()->ComputeProgramShape(), + scatter->scatter_dimension_numbers())); +} + +Status ShapeVerifier::HandleAfterAll(HloInstruction* token) { + std::vector operand_shapes; + for (const HloInstruction* operand : token->operands()) { + operand_shapes.push_back(&operand->shape()); + } + return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes)); } Status ShapeVerifier::CheckShape(const HloInstruction* instruction, @@ -436,59 +613,51 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction, } // Check if the output shape matches the expected shape. - bool compatible; + // // We treat BF16 and F32 as compatible types if mixed precision is allowed, // but only when the instruction defines the BF16/F32 buffer. 
- switch (instruction->opcode()) { - case HloOpcode::kSelect: - if (ShapeUtil::IsTuple(inferred_shape) || !allow_mixed_precision_) { - // Select only defines the top-level buffer, which in this case is the - // tuple, so we cannot allow mixed precision. - compatible = - ShapeUtil::Compatible(instruction->shape(), inferred_shape); - } else { - compatible = ShapeUtil::CompatibleIgnoringFpPrecision( - instruction->shape(), inferred_shape); - } - break; - case HloOpcode::kGetTupleElement: - case HloOpcode::kTuple: - // Tuple and GetTupleElement do not define BF16/F32 buffers, so mixed - // precision is disallowed. - case HloOpcode::kConstant: - case HloOpcode::kBitcast: - case HloOpcode::kBitcastConvert: - case HloOpcode::kCall: - case HloOpcode::kConditional: - case HloOpcode::kConvert: - case HloOpcode::kCustomCall: - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kParameter: - case HloOpcode::kRecv: - case HloOpcode::kRecvDone: - case HloOpcode::kSend: - case HloOpcode::kSendDone: - case HloOpcode::kWhile: - // The above opcodes should match the expected shapes exactly. - compatible = ShapeUtil::Compatible(instruction->shape(), inferred_shape); - break; - default: - if (allow_mixed_precision_) { - compatible = ShapeUtil::CompatibleIgnoringFpPrecision( - instruction->shape(), inferred_shape); - } else { - compatible = - ShapeUtil::Compatible(instruction->shape(), inferred_shape); - } - } - if (!compatible) { + bool equal = [&] { + switch (instruction->opcode()) { + // The opcodes below can't have implicit layout conversions, nor can they + // implicitly transform f32 -> bf16. Fundamentally these are either + // reinterpreting existing data (e.g. kBitcast) or shuffling data around + // without modifying it (e.g. kGetTupleElement, kTupleSelect). 
+ case HloOpcode::kBitcast: + case HloOpcode::kCall: + case HloOpcode::kConditional: + case HloOpcode::kConstant: + case HloOpcode::kCustomCall: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kParameter: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kTuple: + case HloOpcode::kTupleSelect: + case HloOpcode::kWhile: + return ShapesSame(instruction->shape(), inferred_shape); + + // We allow arbitrary layout and f32->bf16 transformations on all other + // instructions, although this may be made more strict pending discussion + // in b/112709536. + default: + if (allow_mixed_precision_) { + return ShapeUtil::CompatibleIgnoringFpPrecision(instruction->shape(), + inferred_shape); + } else { + return ShapeUtil::Compatible(instruction->shape(), inferred_shape); + } + } + }(); + if (!equal) { return InternalError( - "Expected instruction to have shape compatible with %s, actual " + "Expected instruction to have shape equal to %s, actual " "shape is %s:\n%s", - ShapeUtil::HumanString(inferred_shape).c_str(), - ShapeUtil::HumanString(instruction->shape()).c_str(), - instruction->ToString().c_str()); + StringifyShape(inferred_shape), StringifyShape(instruction->shape()), + instruction->ToString()); } return Status::OK(); } @@ -530,30 +699,17 @@ Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) { instruction->opcode(), instruction->operands())); } -// Checks if the given two instructions shares the same channel id. 
-Status ShapeVerifier::CheckSameChannel(const HloInstruction* instr1, - const HloInstruction* instr2) { - if (instr1->channel_id() != instr2->channel_id()) { - return InternalError( - "Expected to have the same channel id, actual channel ids are: %s " - "(%lld), %s (%lld)", - instr1->ToString().c_str(), instr1->channel_id(), - instr2->ToString().c_str(), instr2->channel_id()); - } - return Status::OK(); -} - -string ComputationsToString( - tensorflow::gtl::ArraySlice computations) { - return tensorflow::str_util::Join( - computations, ",", [](string* s, const HloComputation* computation) { - s->append(computation->name()); - }); +string ComputationsToString(absl::Span computations) { + return absl::StrJoin(computations, ",", + [](string* s, const HloComputation* computation) { + s->append(computation->name()); + }); } // Verifies various invariants about the structure of the HLO: // -// (1) each instruction has a non-null parent() set to the HloComputation which +// (1) each instruction has a non-null parent() set to the HloComputation +// which // contains it. 
// // (2) each computation has a non-null parent() set to the HloModule which @@ -565,31 +721,31 @@ Status VerifyHloStructure(HloModule* module) { for (const HloComputation* computation : module->computations()) { if (computation->parent() == nullptr) { return InternalError("Computation %s has a null parent pointer", - computation->name().c_str()); + computation->name()); } if (computation->parent() != module) { return InternalError( "Computation %s parent() does not point to parent module", - computation->name().c_str()); + computation->name()); } for (const HloInstruction* instruction : computation->instructions()) { if (instruction->parent() == nullptr) { return InternalError("Instruction %s has a null parent pointer", - instruction->name().c_str()); + instruction->name()); } if (instruction->parent() != computation) { return InternalError( "Instruction %s parent() does not point to parent computation", - instruction->name().c_str()); + instruction->name()); } } } // Check that operands are in the same computation separately from verifying - // parent() correctness so conditions like a null HloInstruction::parent() are - // identified and reported explicitly above rather than reporting a mismatched - // operand. + // parent() correctness so conditions like a null HloInstruction::parent() + // are identified and reported explicitly above rather than reporting a + // mismatched operand. 
for (const HloComputation* computation : module->computations()) { for (const HloInstruction* instruction : computation->instructions()) { for (int i = 0; i < instruction->operand_count(); ++i) { @@ -598,9 +754,8 @@ Status VerifyHloStructure(HloModule* module) { return InternalError( "Operand %d (%s) of instruction %s is in a different " "computation: %s vs %s", - i, operand->name().c_str(), instruction->name().c_str(), - operand->parent()->name().c_str(), - instruction->parent()->name().c_str()); + i, operand->name(), instruction->name(), + operand->parent()->name(), instruction->parent()->name()); } } } @@ -613,13 +768,14 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { HloComputation* fused_computation = fusion->fused_instructions_computation(); if (fusion != fused_computation->FusionInstruction()) { return InternalError( - "Instruction of fused computation does not match expected instruction " + "Instruction of fused computation does not match expected " + "instruction " "%s.", - fusion->ToString().c_str()); + fusion->ToString()); } - // Fused root instruction and fused parameters must all be owned by the fusion - // computation. + // Fused root instruction and fused parameters must all be owned by the + // fusion computation. 
bool root_owned = false; const std::vector& fused_parameters = fusion->fused_parameters(); @@ -629,7 +785,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (fused_root == instruction) { if (root_owned) { return InternalError("Root appears more than once in %s.", - fusion->ToString().c_str()); + fusion->ToString()); } root_owned = true; } @@ -637,7 +793,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (fused_parameters[i] == instruction) { if (parameter_owned[i]) { return InternalError("Parameter appears more than once in %s.", - fusion->ToString().c_str()); + fusion->ToString()); } parameter_owned[i] = true; } @@ -645,76 +801,68 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { } if (!root_owned) { return InternalError("Root not found in computation of %s.", - fusion->ToString().c_str()); + fusion->ToString()); } // Make sure all the parameter_owned entries are set for (int i = 0; i < parameter_owned.size(); i++) { if (!parameter_owned[i]) { return InternalError("Parameter %d not found in computation of %s.", i, - fusion->ToString().c_str()); + fusion->ToString()); } } // Fused root must have no users. if (fused_root->user_count() != 0) { - return InternalError("Root of %s may not have users.", - fusion->ToString().c_str()); + return InternalError("Root of %s may not have users.", fusion->ToString()); } - // All uses of fused instructions must be in the fusion computation, and every - // non-root instruction must have at least one use. + // All uses of fused instructions must be in the fusion computation, and + // every non-root instruction must have at least one use. 
for (auto* instruction : fusion->fused_instructions_computation()->instructions()) { if (instruction != fused_root) { if (instruction->user_count() == 0) { return InternalError("Non-root instruction %s in %s must have users.", - instruction->ToString().c_str(), - fusion->ToString().c_str()); + instruction->ToString(), fusion->ToString()); } for (auto& user : instruction->users()) { if (fused_computation != user->parent()) { return InternalError( "Non-root instruction %s in %s may not have external users.", - instruction->ToString().c_str(), fusion->ToString().c_str()); + instruction->ToString(), fusion->ToString()); } } } } // Fused parameter instructions must be numbered contiguously and match up - // (shapes compatible) with their respective operand. + // (shapes equal) with their respective operand. CHECK_EQ(fusion->operands().size(), fused_parameters.size()); std::vector parameter_numbers(fused_parameters.size(), false); for (auto fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); if (param_no < 0) { - return InternalError("Unexpected negative parameter number %lld in %s.", - param_no, fusion->ToString().c_str()); + return InternalError("Unexpected negative parameter number %d in %s.", + param_no, fusion->ToString()); } if (param_no >= fused_parameters.size()) { return InternalError( - "Unexpected parameter number %lld in %s: higher then number of " + "Unexpected parameter number %d in %s: higher then number of " "parameters %lu.", - param_no, fusion->ToString().c_str(), fused_parameters.size()); + param_no, fusion->ToString(), fused_parameters.size()); } if (parameter_numbers[param_no]) { return InternalError( - "Did not expect parameter number %lld more than once in %s.", - param_no, fusion->ToString().c_str()); + "Did not expect parameter number %d more than once in %s.", param_no, + fusion->ToString()); } parameter_numbers[param_no] = true; - if (!ShapeUtil::Compatible(fused_param->shape(), - 
fusion->operand(param_no)->shape())) { - return InternalError( - "Shape mismatch between parameter number %lld and its operand in %s.", - param_no, fusion->ToString().c_str()); - } } // Make sure all the parameter_numbers entries were seen. for (int i = 0; i < parameter_numbers.size(); i++) { if (!parameter_numbers[i]) { return InternalError("Did not see parameter number %d in %s.", i, - fusion->ToString().c_str()); + fusion->ToString()); } } @@ -729,46 +877,34 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { auto* while_body = instruction->while_body(); if (while_cond->num_parameters() != 1) { return FailedPrecondition( - "While condition must have exactly 1 parameter; had %lld : %s", - while_cond->num_parameters(), while_cond->ToString().c_str()); + "While condition must have exactly 1 parameter; had %d : %s", + while_cond->num_parameters(), while_cond->ToString()); } if (while_body->num_parameters() != 1) { return FailedPrecondition( - "While body must have exactly 1 parameter; had %lld : %s", - while_body->num_parameters(), while_body->ToString().c_str()); + "While body must have exactly 1 parameter; had %d : %s", + while_body->num_parameters(), while_body->ToString()); } if (instruction->operand_count() != 1) { return FailedPrecondition( - "While loop must have exactly one operand; had %lld : %s", - instruction->operand_count(), instruction->ToString().c_str()); + "While loop must have exactly one operand; had %d : %s", + instruction->operand_count(), instruction->ToString()); } - auto* init = instruction->operand(0); - auto* cond_param = while_cond->parameter_instruction(0); - if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) { - return FailedPrecondition( - "While condition's parameter must have the same shape as the " - "loop's 'init'. 
init: %s, param: %s", - init->ToString().c_str(), cond_param->ToString().c_str()); - } - auto* cond_root = while_cond->root_instruction(); - if (!ShapeUtil::Compatible(cond_root->shape(), - ShapeUtil::MakeShape(PRED, {}))) { - return FailedPrecondition("While condition should have shape PRED: %s", - cond_root->ToString().c_str()); - } - auto* body_param = while_body->parameter_instruction(0); - if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) { + return Status::OK(); +} + +Status HloVerifier::CheckConditionalInstruction(HloInstruction* instruction) { + if (instruction->true_computation()->num_parameters() != 1) { return FailedPrecondition( - "While body's parameter must have the same shape as the loop's" - " 'init'. init: %s, param: %s", - init->ToString().c_str(), body_param->ToString().c_str()); + "True computation %s of %s must have 1 parameter insted of %d", + instruction->true_computation()->name(), instruction->ToString(), + instruction->true_computation()->num_parameters()); } - auto* body_root = while_body->root_instruction(); - if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) { + if (instruction->false_computation()->num_parameters() != 1) { return FailedPrecondition( - "While body should have same shape as the loop's 'init'." 
- "init: %s, body: %s", - init->ToString().c_str(), body_root->ToString().c_str()); + "False computation %s of %s must have 1 parameter insted of %d", + instruction->false_computation()->name(), instruction->ToString(), + instruction->false_computation()->num_parameters()); } return Status::OK(); } @@ -777,22 +913,152 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { const Shape& out_shape = instruction->shape(); for (HloInstruction* operand : instruction->operands()) { const Shape& operand_shape = operand->shape(); - if (!ShapeUtil::IsScalar(operand_shape) && - !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) { + if (!ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) { return FailedPrecondition( "Implicit broadcast is not allowed in HLO." - "Found non-compatible shapes for instruction %s.\n" + "Found different shapes for instruction %s.\n" "output: %s\noperand: %s\n", - HloOpcodeString(instruction->opcode()).c_str(), - ShapeUtil::HumanString(out_shape).c_str(), - ShapeUtil::HumanString(operand_shape).c_str()); + HloOpcodeString(instruction->opcode()), + ShapeUtil::HumanString(out_shape), + ShapeUtil::HumanString(operand_shape)); } } return Status::OK(); } +namespace { + +// Returns true if the given Shape has a TOKEN shape as any subshape. +bool ShapeContainsToken(const Shape& shape) { + bool contains_token = false; + ShapeUtil::ForEachSubshape( + shape, [&contains_token](const Shape& subshape, const ShapeIndex&) { + if (ShapeUtil::IsToken(subshape)) { + contains_token = true; + } + }); + return contains_token; +} + +// Verifies that all types entering and exiting the entry computation are +// legal. +Status VerifyEntryAndExitShapes(const HloModule& module) { + // Tokens cannot be passed as entry parameters. + // TODO(b/80000000): Remove this constraint. 
+ for (int i = 0; i < module.entry_computation()->num_parameters(); ++i) { + HloInstruction* param = + module.entry_computation()->parameter_instruction(i); + if (ShapeContainsToken(param->shape())) { + return InternalError( + "Entry parameter %d is or contains a token shape: %s", i, + ShapeUtil::HumanString(param->shape())); + } + } + return Status::OK(); +} + +// Checks if the given two instructions share the same channel id. +Status CheckSameChannel(const HloInstruction* instr1, + const HloInstruction* instr2) { + if (instr1->channel_id() != instr2->channel_id()) { + return InternalError( + "Expected to have the same channel id, actual channel ids are: %s " + "(%d), %s (%d)", + instr1->ToString(), instr1->channel_id(), instr2->ToString(), + instr2->channel_id()); + } + return Status::OK(); +} + +// Checks if the given two instructions have the same is_host_transfer +// attribute value. Intsructions must be send/recv instructions or their +// 'done' variant. +Status CheckSameIsHostTransfer(const HloInstruction* instr1, + const HloInstruction* instr2) { + const HloSendRecvInstruction* send_recv1 = + DynCast(instr1); + const HloSendRecvInstruction* send_recv2 = + DynCast(instr2); + TF_RET_CHECK(send_recv1 != nullptr); + TF_RET_CHECK(send_recv2 != nullptr); + if (send_recv1->is_host_transfer() != send_recv2->is_host_transfer()) { + return InternalError( + "Expected instructions to have the same is-host-transfer property: " + "%s, " + "%s ", + instr1->ToString(), instr2->ToString()); + } + return Status::OK(); +} + +// Checks various invariants of send and recv instructions. +Status VerifySendsAndRecvs(const HloModule& module) { + tensorflow::gtl::FlatMap host_channels; + // Host send/recv instructions must have their own unique channel. 
+ auto check_unique_host_channel = [&](const HloInstruction* instruction) { + const HloSendRecvInstruction* sendrecv = + DynCast(instruction); + if (sendrecv->is_host_transfer()) { + auto it_inserted = + host_channels.insert({sendrecv->channel_id(), sendrecv}); + if (!it_inserted.second) { + return FailedPrecondition( + "Channel %d is used for multiple host send/recv instructions: " + "%s " + "and " + "%s", + sendrecv->channel_id(), sendrecv->ToString(), + it_inserted.first->second->ToString()); + } + } + + return Status::OK(); + }; + + // Send/Recv instruction must have a single user: the corresponding + // SendDone/RecvDone. with matching channel. + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + switch (instruction->opcode()) { + case HloOpcode::kSend: { + TF_RETURN_IF_ERROR(check_unique_host_channel(instruction)); + TF_RET_CHECK(instruction->users().size() == 1); + const HloInstruction* send_done = instruction->users().front(); + TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone); + TF_RETURN_IF_ERROR(CheckSameChannel(instruction, send_done)); + TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, send_done)); + break; + } + case HloOpcode::kRecv: { + TF_RETURN_IF_ERROR(check_unique_host_channel(instruction)); + TF_RET_CHECK(instruction->users().size() == 1); + const HloInstruction* recv_done = instruction->users().front(); + TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone); + TF_RETURN_IF_ERROR(CheckSameChannel(instruction, recv_done)); + TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, recv_done)); + break; + } + case HloOpcode::kSendDone: + TF_RET_CHECK(instruction->operands().size() == 1); + TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kSend); + break; + case HloOpcode::kRecvDone: + TF_RET_CHECK(instruction->operands().size() == 1); + TF_RET_CHECK(instruction->operand(0)->opcode() == HloOpcode::kRecv); + break; + default: + 
break; + } + } + } + return Status::OK(); +} + +} // namespace + StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(VerifyHloStructure(module)); + TF_RETURN_IF_ERROR(VerifySendsAndRecvs(*module)); tensorflow::gtl::FlatMap instructions; @@ -801,9 +1067,9 @@ StatusOr HloVerifier::Run(HloModule* module) { TF_RET_CHECK(instruction->parent() == computation); if (instruction->opcode() == HloOpcode::kFusion) { TF_RETURN_IF_ERROR(CheckFusionInstruction(instruction)); - TF_RET_CHECK( - ContainersEqual(instruction->called_computations(), - {instruction->fused_instructions_computation()})) + TF_RET_CHECK(instruction->called_computations() == + absl::Span( + {instruction->fused_instructions_computation()})) << "Fusion HLO calls computations other than the " "fused_instructions_computation: " << instruction->ToString() @@ -832,7 +1098,11 @@ StatusOr HloVerifier::Run(HloModule* module) { << " != " << ShapeUtil::Rank(instruction->operand(0)->shape()); } else if (instruction->opcode() == HloOpcode::kWhile) { TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction)); - } else if (instruction->IsElementwise()) { + } else if (instruction->opcode() == HloOpcode::kConditional) { + TF_RETURN_IF_ERROR(CheckConditionalInstruction(instruction)); + } else if (instruction->opcode() != + HloOpcode::kRng /* Rng operands are always scalar. 
*/ + && instruction->IsElementwise()) { TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction)); } @@ -851,6 +1121,8 @@ StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get())); } + TF_RETURN_IF_ERROR(VerifyEntryAndExitShapes(*module)); + return false; } diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 1392a78097aa026b2f7cffa2b0135402d3ca7ae5..42e3027bf14a827bd0a791510c2d9c107d989ab9 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/shape_inference.h" namespace xla { @@ -27,15 +28,17 @@ namespace xla { // TODO(b/26024837): Check output shape for all instruction types. class ShapeVerifier : public DfsHloVisitor { public: - explicit ShapeVerifier() : allow_mixed_precision_(false) {} - explicit ShapeVerifier(bool allow_mixed_precision) - : allow_mixed_precision_(allow_mixed_precision) {} + explicit ShapeVerifier(bool layout_sensitive, bool allow_mixed_precision) + : layout_sensitive_(layout_sensitive), + allow_mixed_precision_(allow_mixed_precision) {} Status HandleElementwiseUnary(HloInstruction* hlo) override; Status HandleElementwiseBinary(HloInstruction* hlo) override; Status HandleClamp(HloInstruction* clamp) override; Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleConcatenate(HloInstruction* concatenate) override; + Status HandleIota(HloInstruction* iota) override; Status HandleConvert(HloInstruction* convert) override; Status HandleBitcastConvert(HloInstruction* convert) override; Status HandleCopy(HloInstruction* copy) override; @@ -43,6 +46,8 @@ class ShapeVerifier : public DfsHloVisitor { Status 
HandleConvolution(HloInstruction* convolution) override; Status HandleFft(HloInstruction* fft) override; Status HandleCrossReplicaSum(HloInstruction* crs) override; + Status HandleAllToAll(HloInstruction* hlo) override; + Status HandleCollectivePermute(HloInstruction* hlo) override; Status HandleReducePrecision(HloInstruction* reduce_precision) override; Status HandleInfeed(HloInstruction*) override; Status HandleOutfeed(HloInstruction*) override; @@ -60,7 +65,6 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleFusion(HloInstruction*) override; Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction*) override; - Status HandleHostCompute(HloInstruction*) override; Status HandleSlice(HloInstruction* slice) override; Status HandleDynamicSlice(HloInstruction* dynamic_slice) override; Status HandleDynamicUpdateSlice( @@ -81,6 +85,8 @@ class ShapeVerifier : public DfsHloVisitor { HloInstruction* batch_norm_inference) override; Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override; Status HandleGather(HloInstruction* gather) override; + Status HandleScatter(HloInstruction* scatter) override; + Status HandleAfterAll(HloInstruction* token) override; Status FinishVisit(HloInstruction*) override { return Status::OK(); } @@ -100,11 +106,43 @@ class ShapeVerifier : public DfsHloVisitor { Status CheckTernaryShape(const HloInstruction* instruction); Status CheckVariadicShape(const HloInstruction* instruction); - // Checks if the given two instructions share the same channel id. - Status CheckSameChannel(const HloInstruction* instr1, - const HloInstruction* instr2); - private: + // Helpers that switch on layout_sensitive_. + bool ShapesSame(const Shape& a, const Shape& b) { + return layout_sensitive_ ? ShapeUtil::Equal(a, b) + : ShapeUtil::Compatible(a, b); + } + bool ShapesSameIgnoringFpPrecision(const Shape& a, const Shape& b) { + return layout_sensitive_ ? 
ShapeUtil::EqualIgnoringFpPrecision(a, b) + : ShapeUtil::CompatibleIgnoringFpPrecision(a, b); + } + string StringifyShape(const Shape& s) { + return layout_sensitive_ ? ShapeUtil::HumanStringWithLayout(s) + : ShapeUtil::HumanString(s); + } + + // Checks that the given operand of the given instruction is of type TOKEN. + Status CheckIsTokenOperand(const HloInstruction* instruction, + int64 operand_no); + + // Checks that the shape of the given operand of the given instruction matches + // the given parameter of the given computation. + Status CheckOperandAndParameter(const HloInstruction* instruction, + int64 operand_number, + const HloComputation* computation, + int64 parameter_number); + + // Returns true if the shapes of the two operands have the same element type, + // and the result shape either has the same element type as the operand shapes + // or mixed precision is allowed and the result shape and the operand shapes + // have floating point element types. + bool HasCompatibleElementTypes(const Shape& shape_0, const Shape& shape_1, + const Shape& result_shape); + + // If the verifier is layout-sensitive, shapes must be equal to what's + // expected. Otherwise, the shapes must simply be compatible. + bool layout_sensitive_; + // Whether the inputs and output of an instruction can contain both F32s and // BF16s. Tuples that include both F32s and BF16s are allowed regardless of // this flag. @@ -117,14 +155,10 @@ class HloVerifier : public HloPassInterface { public: using ShapeVerifierFactory = std::function()>; - // Uses standard shape inference. 
- explicit HloVerifier() - : shape_verifier_factory_( - [] { return MakeUnique(false); }) {} - - explicit HloVerifier(bool allow_mixed_precision) - : shape_verifier_factory_([allow_mixed_precision] { - return MakeUnique(allow_mixed_precision); + explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision) + : shape_verifier_factory_([layout_sensitive, allow_mixed_precision] { + return absl::make_unique(layout_sensitive, + allow_mixed_precision); }) {} // Uses custom shape verification. @@ -132,10 +166,9 @@ class HloVerifier : public HloPassInterface { : shape_verifier_factory_(std::move(shape_verifier_factory)) {} ~HloVerifier() override = default; - tensorflow::StringPiece name() const override { return "verifier"; } + absl::string_view name() const override { return "verifier"; } - // Note: always returns false (no instructions are ever modified by this - // pass). + // Never returns true; no instructions are ever modified by this pass. StatusOr Run(HloModule* module) override; private: @@ -144,6 +177,8 @@ class HloVerifier : public HloPassInterface { Status CheckWhileInstruction(HloInstruction* instruction); + Status CheckConditionalInstruction(HloInstruction* instruction); + // Checks that the non-scalar operand shapes are compatible to the output // shape, i.e., that there are no implicit broadcasts of size-one dimensions. Status CheckElementwiseInstruction(HloInstruction* instruction); diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index c92db0be14dceb32ea86521dcc99b8f63738e4a5..0cac210c2413e979300e191cb54860bcd0ab79b5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -33,7 +34,21 @@ namespace { using ::testing::HasSubstr; -using HloVerifierTest = HloTestBase; +// This class cannot be converted to use HloVerifiedTestBase. It explicitly +// uses HloTestBase to create and test malformed HLOs. +class HloVerifierTest : public HloTestBase { + public: + HloVerifierTest() + : HloTestBase(/*verifier_layout_sensitive=*/false, + /*allow_mixed_precision_in_hlo_verifier=*/false) {} +}; + +class HloVerifierTestAllowMixedPrecision : public HloTestBase { + public: + HloVerifierTestAllowMixedPrecision() + : HloTestBase(/*verifier_layout_sensitive=*/false, + /*allow_mixed_precision_in_hlo_verifier=*/true) {} +}; TEST_F(HloVerifierTest, NullInstructionParent) { HloComputation::Builder builder(TestName()); @@ -123,5 +138,225 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) { EXPECT_FALSE(verifier().Run(module.get()).status().ok()); } +TEST_F(HloVerifierTest, CheckCallOperandParameterShapesMismatch) { + const char* const hlo_string = R"( +HloModule Module + +callme { + ROOT param = (s32[], f32[4]) parameter(0) +} + +ENTRY entry { + p0 = (f32[4], s32[]) parameter(0) + ROOT mycall = (s32[], f32[4]) call(p0), to_apply=callme +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("shape does not match parameter")); +} + +TEST_F(HloVerifierTest, CheckConditionalOperandParameterShapesMismatch) { + const char* const hlo_string = R"( +HloModule Module + +true_branch { + tparam = (s32[], f32[4]) parameter(0) + ROOT tgte1 
= f32[4] get-tuple-element(tparam), index=1 +} + +false_branch { + fparam = (s32[], f32[4]) parameter(0) + ROOT fgte1 = f32[4] get-tuple-element(fparam), index=1 +} + +ENTRY entry { + p0 = (f32[4], s32[]) parameter(0) + constant = pred[] constant(true) + ROOT conditional = f32[4] conditional(constant, p0, p0), + true_computation=true_branch, false_computation=false_branch +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("shape does not match parameter")); +} + +TEST_F(HloVerifierTest, RngOpnd0NotScalar) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngOpnd0NotScalar { + constant.0 = f32[] constant(0) + constant.1 = f16[2] constant({1, 3}) + ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[2] constant.1), + distribution=rng_uniform + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Expected scalar type")); +} + +TEST_F(HloVerifierTest, RngOperandElementTypesDoNotMatch) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngOperandElementTypesNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f16[] constant(1) + ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected compatible element types")); +} + +TEST_F(HloVerifierTest, RngMixedPrecisionNotAllowed) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngResultElementTypeNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f32[] constant(1) + ROOT rng.0 = f16[10]{0} 
rng(f32[] constant.0, f32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected compatible element types")); +} + +TEST_F(HloVerifierTestAllowMixedPrecision, RngMixedPrecisionAllowed) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngResultElementTypeNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f32[] constant(1) + ROOT rng.0 = f16[10]{0} rng(f32[] constant.0, f32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_TRUE(status.ok()); +} + +TEST_F(HloVerifierTest, RngElementTypeNotSupported) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngElementTypeNotSupported { + constant.0 = s32[] constant(0) + constant.1 = s32[] constant(1) + ROOT rng.0 = s32[10]{0} rng(s32[] constant.0, s32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Element type not supported")); +} + +TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) { + // This testcase can't be written using textual HLO, because it doesn't parse + // negative interior padding. That's probably a feature. 
:) + HloComputation::Builder builder(TestName()); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {100}), "param")); + PaddingConfig padding_config; + padding_config.add_dimensions()->set_interior_padding(-1); + builder.AddInstruction(HloInstruction::CreatePad( + ShapeUtil::MakeShape(F32, {100}), param, + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(F32).CloneToUnique())), + padding_config)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Interior padding cannot be negative")); +} + +TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) { + // This testcase can't be written using textual HLO, because it doesn't parse + // negative interior padding. That's probably a feature. :) + HloComputation::Builder builder(TestName()); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {100}), "param")); + PaddingConfig padding_config; + padding_config.add_dimensions()->set_interior_padding(-1); + builder.AddInstruction(HloInstruction::CreatePad( + ShapeUtil::MakeShape(F32, {100}), param, + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(F32).CloneToUnique())), + padding_config)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("Interior padding cannot be negative")); +} + +// Simple module containing a convolution as the root. 
+static const char* const kConvHloString = R"( +HloModule module +ENTRY entry_computation { + param0 = f16[128,128,56,56] parameter(0) + param1 = f16[3,3,128,128] parameter(1) + zero_f16 = f16[] constant(0) + ROOT conv = f16[128,128,28,28] convolution(param0, param1), + window={size=3x3 stride=2x2}, dim_labels=bf01_01io->bf01 +})"; + +TEST_F(HloVerifierTest, ConvNegativeWindowDilationNotAllowed) { + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kConvHloString)); + auto* conv = module->entry_computation()->root_instruction(); + Window w = conv->window(); + w.mutable_dimensions(0)->set_window_dilation(-1); + conv->set_window(w); + + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("non-positive window dilation factor")); +} + +TEST_F(HloVerifierTest, ConvNegativeBaseDilationNotAllowed) { + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kConvHloString)); + auto* conv = module->entry_computation()->root_instruction(); + Window w = conv->window(); + w.mutable_dimensions(0)->set_base_dilation(-1); + conv->set_window(w); + + EXPECT_THAT(verifier().Run(module.get()).status().error_message(), + HasSubstr("non-positive base area dilation factor")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index dc3bfce0c495bc40a2df7b985cab67e02a3e15ce..e76b93107c923b41666f6b0a388dda143a8cb50a 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -14,29 +14,30 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { -using tensorflow::strings::Appendf; +using absl::StrAppend; +using absl::StrAppendFormat; +using absl::StrCat; +using absl::StrFormat; using tensorflow::strings::HumanReadableElapsedTime; using tensorflow::strings::HumanReadableNumBytes; -using tensorflow::strings::Printf; -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; string HumanReadableProfileBuilder::ToString() const { string s; - Appendf(&s, "Execution profile for %s: (%s @ f_nom)\n", - computation_name_.c_str(), - HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); + StrAppendFormat(&s, "Execution profile for %s: (%s @ f_nom)\n", + computation_name_, + HumanReadableElapsedTime(CyclesToSeconds(total_cycles_))); - auto print_op = [&](const OpInfo& op) { + int64 cumulative_cycles = 0; + auto print_op = [&](const OpInfo& op, bool is_total = false) { // Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that // were expected to be free and are actually free -- things like (on most // backends) kParameter or kConstant HLOs. 
There's no need to clutter the @@ -55,31 +56,45 @@ string HumanReadableProfileBuilder::ToString() const { if (op.bytes_accessed > op.cycles) { bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle"); } else { - bytes_per_cycle = Printf("%.3fB/cycle", bpc); + bytes_per_cycle = StrFormat("%.3fB/cycle", bpc); } } + double cumulative_cycles_percent = 0; double cycles_percent = 0; + if (!is_total) { + cumulative_cycles += op.cycles; + } if (total_cycles_ > 0) { cycles_percent = op.cycles / static_cast(total_cycles_) * 100; + cumulative_cycles_percent = + cumulative_cycles / static_cast(total_cycles_) * 100; + } + + string cycles_percent_str; + if (is_total) { + // Leaving off the two trailing decimal points of "100.%" lets us save two + // columns in the output. + cycles_percent_str = "100.% 100Σ"; + } else { + cycles_percent_str = StrFormat("%5.2f%% %2.0fΣ", cycles_percent, + cumulative_cycles_percent); } double nsecs = op.cycles / clock_rate_ghz_; - Appendf(&s, - "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s " - ":: %18s :: %14s :: %16s :: %s\n", - op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), - op.optimal_seconds < 0 - ? "" - : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), - op.flop_count <= 0 - ? "" - : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), - op.transcendental_count <= 0 ? "" - : HumanReadableNumTranscendentalOps( - op.transcendental_count, nsecs) - .c_str(), - bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); + StrAppendFormat( + &s, + "%15d cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: " + "%16s :: %s\n", + op.cycles, cycles_percent_str, CyclesToMicroseconds(op.cycles), + op.optimal_seconds < 0 + ? "" + : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6), + op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs), + op.transcendental_count <= 0 + ? 
"" + : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs), + bytes_per_sec, bytes_per_cycle, op.name); }; float optimal_seconds_sum = 0.0; @@ -98,7 +113,8 @@ string HumanReadableProfileBuilder::ToString() const { VLOG(1) << "Total floating point ops: " << total_flops; print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, - total_transcendentals, total_bytes, optimal_seconds_sum}); + total_transcendentals, total_bytes, optimal_seconds_sum}, + /*is_total=*/true); // Sort ops in decreasing order of cycles, and print them. std::vector sorted_ops(op_infos_); @@ -169,6 +185,23 @@ string HumanReadableProfileBuilder::ToString() const { StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_))); } } + + if (total_bytes > 0) { + MetricTableReport table; + table.SetMetricName("MiB read+written"); + table.SetEntryName("ops"); + table.SetShowCategoryTable(); + for (const auto& op : op_infos_) { + MetricTableReport::Entry entry; + entry.text = op.name; + entry.short_text = op.short_name; + entry.category_text = op.category; + entry.metric = static_cast(op.bytes_accessed) / (1 << 20); + table.AddEntry(std::move(entry)); + } + StrAppend(&s, + table.MakeReport(static_cast(total_bytes) / (1 << 20))); + } return s; } diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h index 6f56c3aa82e9d1c942fd67ff7a5948cf2e54370d..925111fa1f1e48650b0089f402d92e431043eabe 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.h +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h @@ -18,8 +18,8 @@ limitations under the License. 
#include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -29,10 +29,10 @@ namespace xla { // computation, suitable for consumption by humans. class HumanReadableProfileBuilder { public: - explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name, + explicit HumanReadableProfileBuilder(absl::string_view computation_name, int64 total_cycles, double clock_rate_ghz) - : computation_name_(std::string(computation_name)), + : computation_name_(computation_name), total_cycles_(total_cycles), clock_rate_ghz_(clock_rate_ghz) { CHECK_GE(clock_rate_ghz, 1e-9); @@ -43,15 +43,13 @@ class HumanReadableProfileBuilder { // Adds an operation to the profile. If you don't know the number of // floating-point ops or bytes touched by the op, or if you don't know how // fast it would run optimally, pass -1 for that param. - void AddOp(tensorflow::StringPiece op_name, - tensorflow::StringPiece short_name, - tensorflow::StringPiece category, int64 cycles, int64 flop_count, + void AddOp(absl::string_view op_name, absl::string_view short_name, + absl::string_view category, int64 cycles, int64 flop_count, int64 transcendental_count, int64 bytes_accessed, float optimal_seconds) { - op_infos_.push_back({std::string(op_name), std::string(short_name), - std::string(category), cycles, flop_count, - transcendental_count, bytes_accessed, - optimal_seconds}); + op_infos_.push_back({string(op_name), string(short_name), string(category), + cycles, flop_count, transcendental_count, + bytes_accessed, optimal_seconds}); } // Gets the human-readable profile. 
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h index aa325dc8a353c5bfbfded0c2774c66bfcc71c9cb..85bb4a8b2450a48d461f1d84e0609a38a6818d9c 100644 --- a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h @@ -30,7 +30,7 @@ class ImplicitBroadcastRemover : public HloPassInterface { ImplicitBroadcastRemover() {} ~ImplicitBroadcastRemover() override {} - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "implicit-broadcast-remover"; } diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc index 8c7b38dd1bf73e0be7b669d7215812aaef1cee17..f85d31d5225b8012b68f851b2bfec219d736ba0d 100644 --- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc index 8b3fa6c1572cf0ed91fc427722edcb23d8b8529d..a4de02a89039e07b22b1ad8c268c2f760aa95880 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc +++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc @@ -14,12 +14,16 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/indexed_array_analysis.h" + +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace gtl = ::tensorflow::gtl; @@ -28,28 +32,33 @@ namespace { using Analysis = IndexedArrayAnalysis; using UnknownArray = Analysis::UnknownArray; using ConstantArray = Analysis::ConstantArray; +using ReshapedArray = Analysis::ReshapedArray; using ScalarIndexedArray = Analysis::ScalarIndexedArray; -using tensorflow::gtl::ArraySlice; -using tensorflow::str_util::Join; +using absl::StrJoin; } // namespace string IndexedArrayAnalysis::ToString(Array* root, bool print_constants) { switch (root->kind()) { case Array::kUnknown: { auto* unknown_tensor = root->as(); - return tensorflow::strings::StrCat("%", - unknown_tensor->instruction().name()); + return absl::StrCat("%", unknown_tensor->instruction().name()); } case Array::kConstant: { if (print_constants) { string contents = root->as()->literal()->ToString(); - return tensorflow::strings::StrCat( - "(constant ", ShapeUtil::HumanString(root->shape()), " ", contents, - ")"); + return absl::StrCat("(constant ", ShapeUtil::HumanString(root->shape()), + " ", contents, ")"); } - return tensorflow::strings::StrCat( - "(constant ", ShapeUtil::HumanString(root->shape()), ")"); + return absl::StrCat("(constant ", ShapeUtil::HumanString(root->shape()), + ")"); + } + + case Array::kReshaped: { + ReshapedArray* reshaped_array = root->as(); + return absl::StrCat( + "(reshape ", 
ToString(reshaped_array->operand(), print_constants), + " to ", ShapeUtil::HumanString(reshaped_array->shape()), ")"); } case Array::kScalarIndexedConstant: @@ -58,11 +67,11 @@ string IndexedArrayAnalysis::ToString(Array* root, bool print_constants) { string name = root->kind() == Array::kScalarIndexedConstant ? "scalar-indexed-const" : "scalar-indexed"; - return tensorflow::strings::StrCat( + return absl::StrCat( "(", name, " ", ToString(indexed_array->source(), print_constants), " ", ToString(indexed_array->indices(), print_constants), " ", indexed_array->source_dim(), "->[", - Join(indexed_array->output_dims(), ","), "])"); + StrJoin(indexed_array->output_dims(), ","), "])"); } } } @@ -83,7 +92,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache( // Depth first search over the DAG, invoking ComputeArrayFor in post order. // The HLO instructions already in the cache are considered leaves. - gtl::InlinedVector stack; + absl::InlinedVector stack; enum DfsState { kDiscovered, kVisited }; gtl::FlatMap dfs_state_map; @@ -144,7 +153,7 @@ StatusOr IndexedArrayAnalysis::ComputeArrayFor( TF_ASSIGN_OR_RETURN( computed_array, ComputeArrayForGather(instr->shape(), instr->gather_dimension_numbers(), - instr->gather_window_bounds(), + instr->gather_slice_sizes(), FindOrDie(cache_, instr->operand(0)), FindOrDie(cache_, instr->operand(1)))); } else if (instr->opcode() == HloOpcode::kReshape) { @@ -152,6 +161,12 @@ StatusOr IndexedArrayAnalysis::ComputeArrayFor( computed_array, ComputeArrayForReshape(instr->shape(), FindOrDie(cache_, instr->operand(0)))); + } else if (instr->opcode() == HloOpcode::kDot) { + TF_ASSIGN_OR_RETURN( + computed_array, + ComputeArrayForDot(instr->shape(), instr->dot_dimension_numbers(), + FindOrDie(cache_, instr->operand(0)), + FindOrDie(cache_, instr->operand(1)))); } else { computed_array = nullptr; } @@ -170,7 +185,7 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForConstant( StatusOr IndexedArrayAnalysis::FoldGatherOfGather( 
ScalarIndexedArray* source, Array* indices, int64 source_dim, - tensorflow::gtl::ArraySlice output_dims, Shape shape) { + absl::Span output_dims, Shape shape) { // We want to transform Gather(Gather(A, X), Y) => Gather(A, Gather(X, Y)). // `source` is the inner Gather(A, X). @@ -236,29 +251,51 @@ StatusOr IndexedArrayAnalysis::FoldGatherOfGather( StatusOr IndexedArrayAnalysis::ComputeArrayForGather( const Shape& shape, const GatherDimensionNumbers& dim_numbers, - tensorflow::gtl::ArraySlice window_bounds, Array* source, - Array* indices) { + absl::Span slice_sizes, Array* source, Array* indices) { if (dim_numbers.index_vector_dim() != indices->shape().dimensions_size()) { + VLOG(3) << "ComputeArrayForGather: indices are not scalar"; return nullptr; } - CHECK_EQ(dim_numbers.gather_dims_to_operand_dims_size(), 1); - if (!c_binary_search(dim_numbers.elided_window_dims(), - dim_numbers.gather_dims_to_operand_dims(0))) { + CHECK_EQ(dim_numbers.start_index_map_size(), 1); + + // We can also handle dim_numbers.collapsed_slice_dims_size() == 0 here, + // should it become relevant. + + if (dim_numbers.collapsed_slice_dims_size() != 1 || + dim_numbers.collapsed_slice_dims(0) != dim_numbers.start_index_map(0)) { + VLOG(3) << "ComputeArrayForGather: gather operations must elide " + "start_index_map[0] and " + "start_index_map[0] only"; return nullptr; } - int64 source_dim = dim_numbers.gather_dims_to_operand_dims(0); + // ScalarIndexedArray cannot represent gathers that "slice" along some + // dimensions -- for instance it cannot represent a gather that picks 5 [2,3] + // arrays from an array of size [7,4,6]. We check that condition down below: + + for (int64 i = 0, e = source->shape().dimensions_size(); i < e; i++) { + if (i != dim_numbers.collapsed_slice_dims(0) && + source->shape().dimensions(i) != slice_sizes[i]) { + VLOG(3) << "ComputeArrayForGather: slice_sizes[" << i + << "] != source->shape().dimensions(" << i << ") -- " + << source->shape().dimensions(i) << " vs. 
" << slice_sizes[i] + << " with dim_numbers.collapsed_slice_dims(0) = " + << dim_numbers.collapsed_slice_dims(0); + return nullptr; + } + } + + int64 source_dim = dim_numbers.start_index_map(0); std::vector output_dims; for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) { - if (!c_binary_search(dim_numbers.output_window_dims(), i)) { + if (!absl::c_binary_search(dim_numbers.offset_dims(), i)) { output_dims.push_back(i); } } if (auto* indexed = dynamic_cast(source)) { - auto it = c_find(indexed->output_dims(), source_dim); - if (it != indexed->output_dims().end()) { + if (absl::c_linear_search(indexed->output_dims(), source_dim)) { return FoldGatherOfGather(indexed, indices, source_dim, output_dims, shape); } @@ -275,8 +312,8 @@ namespace { // Returns an index into `values` such that the product of the range // [values.begin()+index, values.end()) is equal to `product`. If there is no // such index, return -1. All integers in `values` must be positive. -int64 FindSuffixWithProduct(ArraySlice values, int64 product) { - DCHECK(c_all_of(values, [](int64 value) { return value > 0; })); +int64 FindSuffixWithProduct(absl::Span values, int64 product) { + DCHECK(absl::c_all_of(values, [](int64 value) { return value > 0; })); int64 current_product = 1; int64 i; @@ -304,7 +341,8 @@ struct ReshapePassthroughDimPair { // The returned vector of pairs is sorted in both the result_dim and the // operand_dim components. std::vector ComputeReshapePassthroughDimPairs( - ArraySlice operand_shape, ArraySlice result_shape) { + absl::Span operand_shape, + absl::Span result_shape) { // A reshape can be seen as an index mapping from output index to input index: // // (i_0, ..., i_n) = f(o_0, ..., o_m) @@ -336,7 +374,11 @@ std::vector ComputeReshapePassthroughDimPairs( // result_subarray_size does not include the elements in the current // `result_dim` dimension (we multiply in result_shape[result_dim] at the // end of loop body) so candidate_operand_dim can never be zero. 
- CHECK_NE(candidate_operand_dim, 0); + CHECK_NE(candidate_operand_dim, 0) + << "result_dim = " << result_dim + << ", result_subarray_size = " << result_subarray_size + << ", result_shape = [" << StrJoin(result_shape, ",") << "]" + << ", operand_shape = [" << StrJoin(operand_shape, ",") << "]"; if (candidate_operand_dim != -1 && result_shape[result_dim] == operand_shape[candidate_operand_dim - 1]) { @@ -346,26 +388,27 @@ std::vector ComputeReshapePassthroughDimPairs( result_subarray_size *= result_shape[result_dim]; } - c_reverse(result); + absl::c_reverse(result); if (VLOG_IS_ON(3)) { std::vector result_strings; - c_transform(result, std::back_inserter(result_strings), - [](ReshapePassthroughDimPair value) { - return tensorflow::strings::StrCat(value.result_dim, "->", - value.operand_dim); - }); - VLOG(3) << "For a reshape from [" << Join(operand_shape, ",") << "] to [" - << Join(result_shape, ",") << "] passthrough indices are [" - << Join(result_strings, ",") << "]"; + absl::c_transform(result, std::back_inserter(result_strings), + [](ReshapePassthroughDimPair value) { + return absl::StrCat(value.result_dim, "->", + value.operand_dim); + }); + VLOG(3) << "For a reshape from [" << StrJoin(operand_shape, ",") << "] to [" + << StrJoin(result_shape, ",") << "] passthrough indices are [" + << StrJoin(result_strings, ",") + << "] (legend: `result`->`operand`)"; } - DCHECK(c_is_sorted( + DCHECK(absl::c_is_sorted( result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) { return lhs.result_dim < rhs.result_dim; })); - DCHECK(c_is_sorted( + DCHECK(absl::c_is_sorted( result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) { return lhs.operand_dim < rhs.operand_dim; })); @@ -376,44 +419,224 @@ std::vector ComputeReshapePassthroughDimPairs( // Return true if `dim` is stated as an passthrough operand dim in // `passthrough_dims`. 
bool IsReshapePassthroughOperandDim( - ArraySlice passthrough_dims, int64 dim) { - return c_any_of(passthrough_dims, - [&](ReshapePassthroughDimPair passthrough_dim_pair) { - return passthrough_dim_pair.operand_dim == dim; - }); + absl::Span passthrough_dims, int64 dim) { + return absl::c_any_of(passthrough_dims, + [&](ReshapePassthroughDimPair passthrough_dim_pair) { + return passthrough_dim_pair.operand_dim == dim; + }); } // Maps `operand_dim` which must be an passthrough operand dimension to its // corresponding passthrough result dimension based on `passthrough_dims`. int64 MapPassthroughOperandDimToResultDim( - ArraySlice passthrough_dims, int64 operand_dim) { - auto it = c_find_if(passthrough_dims, - [&](ReshapePassthroughDimPair passthrough_dim_pair) { - return passthrough_dim_pair.operand_dim == operand_dim; - }); + absl::Span passthrough_dims, + int64 operand_dim) { + auto it = absl::c_find_if( + passthrough_dims, [&](ReshapePassthroughDimPair passthrough_dim_pair) { + return passthrough_dim_pair.operand_dim == operand_dim; + }); CHECK(it != passthrough_dims.end()); return it->result_dim; } -int64 FindSourcePositionForPassthroughResultDim(ArraySlice operand_shape, - ArraySlice result_shape, - int64 source_passthrough_dim) { +int64 FindSourcePositionForPassthroughResultDim( + absl::Span operand_shape, absl::Span result_shape, + int64 source_passthrough_dim) { + VLOG(3) << "FindSourcePositionForPassthroughResultDim([" + << StrJoin(operand_shape, ",") << "], [" << StrJoin(result_shape, ",") + << "], " << source_passthrough_dim << ")"; + int64 indexed_source_subarray_size = std::accumulate(operand_shape.begin() + source_passthrough_dim + 1, - operand_shape.end(), 1, std::multiplies()); + operand_shape.end(), 1LL, std::multiplies()); return FindSuffixWithProduct(result_shape, indexed_source_subarray_size); } +Shape StripDegenerateDimensions(const Shape& shape) { + DimensionVector new_dims; + absl::c_copy_if(shape.dimensions(), std::back_inserter(new_dims), + 
[](int64 dim) { return dim != 1; }); + return ShapeUtil::MakeShape(shape.element_type(), new_dims); +} }; // namespace -StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( - const Shape& shape, Array* operand) { - auto* scalar_indexed = dynamic_cast(operand); - if (!scalar_indexed) { +StatusOr +IndexedArrayAnalysis::ReshapeToRemoveDegenerateDims( + ScalarIndexedArray* operand) { + const Shape& shape = operand->shape(); + if (!ShapeUtil::HasDegenerateDimensions(shape)) { + return operand; + } + + // We only need to reshape out the degenerate dims from the indices and the + // source (except the source dim). + + const Shape& source_shape = operand->source()->shape(); + DimensionVector new_source_shape_dims; + for (int64 i = 0, e = source_shape.dimensions_size(); i < e; i++) { + if (i == operand->source_dim() || source_shape.dimensions(i) != 1) { + new_source_shape_dims.push_back(source_shape.dimensions(i)); + } + } + + Shape new_source_shape = + ShapeUtil::MakeShape(shape.element_type(), new_source_shape_dims); + Shape new_indices_shape = + StripDegenerateDimensions(operand->indices()->shape()); + + TF_ASSIGN_OR_RETURN( + Array* const new_source, + ComputeArrayForReshape(new_source_shape, operand->source())); + TF_ASSIGN_OR_RETURN( + Array* const new_indices, + ComputeArrayForReshape(new_indices_shape, operand->indices())); + + // Build the new output dims while keeping track of the degenerate dims that + // will no longer be present. + DimensionVector new_output_dims; + int64 degenerate_dims_seen = 0; + for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) { + if (shape.dimensions(i) == 1) { + degenerate_dims_seen++; + } else if (absl::c_linear_search(operand->output_dims(), i)) { + new_output_dims.push_back(i - degenerate_dims_seen); + } + } + + // Similarly, build the new source dim while keeping track of the degenerate + // dims that will no longer be present. 
+ int64 degenerate_dims_before_source_dim = + std::count(source_shape.dimensions().begin(), + source_shape.dimensions().begin() + operand->source_dim(), 1); + int64 new_source_dim = + operand->source_dim() - degenerate_dims_before_source_dim; + + return ConstructScalarIndexedArray( + new_source, new_indices, new_source_dim, + InlinedVectorToVector(new_output_dims), + StripDegenerateDimensions(operand->shape())); +} + +StatusOr IndexedArrayAnalysis::ReshapeToAddDegenerateDims( + ScalarIndexedArray* operand, absl::Span degenerate_dims) { + if (degenerate_dims.empty()) { + return operand; + } + + CHECK(!ShapeUtil::HasDegenerateDimensions(operand->shape())); + + DimensionVector new_output_dims = [&]() { + // To make things easy we use a "scratch" buffer of bools where the i'th + // element is true iff the i'th component of the result index is an output + // index. + + absl::InlinedVector output_dims_bitvector( + operand->shape().dimensions_size()); + for (int64 output_dim : operand->output_dims()) { + output_dims_bitvector[output_dim] = true; + } + + for (int64 degenerate_dim : degenerate_dims) { + InsertAt(&output_dims_bitvector, degenerate_dim, false); + } + + DimensionVector result; + result.reserve(operand->output_dims().size()); + for (int64 i = 0, e = output_dims_bitvector.size(); i < e; i++) { + if (output_dims_bitvector[i]) { + result.push_back(i); + } + } + + return result; + }(); + + DimensionVector new_result_shape_dims; + absl::c_copy(operand->shape().dimensions(), + std::back_inserter(new_result_shape_dims)); + for (int64 degenerate_dim : degenerate_dims) { + InsertAt(&new_result_shape_dims, degenerate_dim, 1); + } + + DimensionVector new_source_shape_dims = new_result_shape_dims; + for (int64 output_dim : new_output_dims) { + EraseAt(&new_source_shape_dims, output_dim); + } + + int64 new_source_dim = [&]() { + for (int i = 0, e = new_source_shape_dims.size(); i < e; i++) { + int64 non_degenerate_dims_seen = 0; + if (non_degenerate_dims_seen == 
operand->source_dim()) { + return i; + } + if (new_source_shape_dims[new_source_dim] != 1) { + non_degenerate_dims_seen++; + } + } + LOG(FATAL) << "Did not find source dim in " << ToString(operand); + }(); + + int64 source_dim_size = + operand->source()->shape().dimensions(operand->source_dim()); + InsertAt(&new_source_shape_dims, /*index=*/new_source_dim, + /*value=*/source_dim_size); + + Shape new_source_shape = ShapeUtil::MakeShape(operand->shape().element_type(), + new_source_shape_dims); + Shape new_result_shape = ShapeUtil::MakeShape(operand->shape().element_type(), + new_result_shape_dims); + + TF_ASSIGN_OR_RETURN( + Array* const new_source, + ComputeArrayForReshape(new_source_shape, operand->source())); + return ConstructScalarIndexedArray( + new_source, operand->indices(), new_source_dim, + InlinedVectorToVector(new_output_dims), new_result_shape); +} + +StatusOr IndexedArrayAnalysis::FoldReshapeOfGather( + const Shape& shape, ScalarIndexedConstantArray* operand) { + VLOG(3) << "FoldReshapeOfGather(" << ToString(operand) << ")"; + + // To make things easier on ourselves, instead of directly trying to fold the + // reshape of `operand` to `shape`, we call + // `FoldReshapeOfGatherNoDegenerateDims` on shapes without degenerate dims and + // handle the degenerate dimensions here by inserting reshapes. 
+ + TF_ASSIGN_OR_RETURN(ScalarIndexedArray* const operand_without_degenerate_dims, + ReshapeToRemoveDegenerateDims(operand)); + + Shape output_shape_without_degenerate_dims = StripDegenerateDimensions(shape); + TF_ASSIGN_OR_RETURN( + ScalarIndexedArray* const folded_reshape_without_degenerate_dims, + FoldReshapeOfGatherNoDegenerateDims( + output_shape_without_degenerate_dims, + operand_without_degenerate_dims->as())); + + if (folded_reshape_without_degenerate_dims == nullptr) { return nullptr; } + DimensionVector degenerate_result_dims; + for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) { + if (shape.dimensions(i) == 1) { + degenerate_result_dims.push_back(i); + } + } + + return ReshapeToAddDegenerateDims(folded_reshape_without_degenerate_dims, + degenerate_result_dims); +} + +StatusOr +IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims( + const Shape& shape, ScalarIndexedConstantArray* scalar_indexed) { + VLOG(3) << "FoldReshapeOfGatherNoDegenerateDims(" << ToString(scalar_indexed) + << ")"; + CHECK(!ShapeUtil::HasDegenerateDimensions(shape)); + CHECK(!ShapeUtil::HasDegenerateDimensions(scalar_indexed->shape())); + // Try to fold Reshape(ScalarIndexed(Const, Indices)) // => ScalarIndexed(Const', Indices) // @@ -464,7 +687,7 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( std::vector reshape_passthrough_dims = ComputeReshapePassthroughDimPairs( - /*operand_shape=*/AsInt64Slice(operand->shape().dimensions()), + /*operand_shape=*/AsInt64Slice(scalar_indexed->shape().dimensions()), /*result_shape=*/AsInt64Slice(shape.dimensions())); auto is_reshape_passthrough_operand_dim = [&](int64 operand_dim) { @@ -472,8 +695,10 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( operand_dim); }; - if (!c_all_of(scalar_indexed->output_dims(), - is_reshape_passthrough_operand_dim)) { + if (!absl::c_all_of(scalar_indexed->output_dims(), + is_reshape_passthrough_operand_dim)) { + VLOG(3) << "Not all output dims are passthrough dims " + << 
ToString(scalar_indexed); return nullptr; } @@ -510,11 +735,11 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( // operand = s32[3,5,2] constant({...}) // indices = s32[7] parameter(0) // gather = s32[3,2,7] gather(operand, indices), - // output_window_dims={0,1}, - // elided_window_dims={1}, - // gather_dims_to_operand_dims={1}, + // offset_dims={0,1}, + // collapsed_slice_dims={1}, + // start_index_map={1}, // index_vector_dim=1, - // window_bounds={3,1,2} + // slice_sizes={3,1,2} // reshape = s32[6,7] reshape(gather) // // In this case the gather maps to: @@ -527,6 +752,11 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( // (a.k.a. isn't pass-through) than the [3,5,2] array. if (source_dim_for_new_scalar_indexed_node == -1) { + VLOG(3) << "Could not compute the source dim for the new scalar indexed " + "node: scalar_indexed_source_shape = [" + << StrJoin(scalar_indexed_source_shape.dimensions(), ",") + << "] and new_scalar_indexed_source_shape = [" + << StrJoin(new_scalar_indexed_source_shape, ",") << "]"; return nullptr; } @@ -534,6 +764,10 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( &new_scalar_indexed_source_shape, source_dim_for_new_scalar_indexed_node, scalar_indexed_source_shape.dimensions(scalar_indexed->source_dim())); + CHECK_EQ(absl::c_accumulate(new_scalar_indexed_source_shape, 1LL, + std::multiplies()), + ShapeUtil::ElementsIn(scalar_indexed_source_shape)); + CHECK(IsReshapePassthroughOperandDim( ComputeReshapePassthroughDimPairs( /*operand_shape=*/AsInt64Slice( @@ -547,9 +781,9 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( }; std::vector output_dims_for_new_scalar_indexed_node; - c_transform(scalar_indexed->output_dims(), - std::back_inserter(output_dims_for_new_scalar_indexed_node), - map_passthrough_operand_dim_to_result_dim); + absl::c_transform(scalar_indexed->output_dims(), + std::back_inserter(output_dims_for_new_scalar_indexed_node), + map_passthrough_operand_dim_to_result_dim); 
TF_ASSIGN_OR_RETURN(const Literal* new_scalar_indexed_source_literal, TakeOwnership(scalar_indexed->literal().Reshape( @@ -564,6 +798,31 @@ StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( output_dims_for_new_scalar_indexed_node, shape); } +StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( + const Shape& shape, Array* operand) { + if (ShapeUtil::Compatible(operand->shape(), shape)) { + return operand; + } + + if (auto* scalar_indexed = + dynamic_cast(operand)) { + TF_ASSIGN_OR_RETURN(Analysis::Array * reshape_folded_into_gather, + FoldReshapeOfGather(shape, scalar_indexed)); + if (reshape_folded_into_gather) { + return reshape_folded_into_gather; + } + } + + if (auto* constant_array = dynamic_cast(operand)) { + TF_ASSIGN_OR_RETURN(Literal* const new_literal, + TakeOwnership(constant_array->literal()->Reshape( + AsInt64Slice(shape.dimensions())))); + return Construct(new_literal); + } + + return Construct(operand, shape); +} + StatusOr IndexedArrayAnalysis::ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, Array* lhs, @@ -613,13 +872,14 @@ IndexedArrayAnalysis::ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, return nullptr; } - ArraySlice broadcast_dims = broadcast_instr->dimensions(); + absl::Span broadcast_dims = broadcast_instr->dimensions(); auto is_broadcasted_dim = [&](int64 output_dim) { - return c_find(broadcast_dims, output_dim) == broadcast_dims.end(); + return absl::c_find(broadcast_dims, output_dim) == broadcast_dims.end(); }; // All of the output dims must be "broadcasted" dims for the other operand. - if (!c_all_of(scalar_indexed_const->output_dims(), is_broadcasted_dim)) { + if (!absl::c_all_of(scalar_indexed_const->output_dims(), + is_broadcasted_dim)) { return nullptr; } @@ -635,7 +895,7 @@ IndexedArrayAnalysis::ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, // The scalar-indexed node "removes" the source dim and "inserts" the output // dims. We do the opposite here to undo the scalar-indexed operation. 
- ArraySlice output_dims = scalar_indexed_const->output_dims(); + absl::Span output_dims = scalar_indexed_const->output_dims(); for (int64 i = output_dims.size() - 1; i >= 0; --i) { CHECK(simulated_index[output_dims[i]] == IndexComponent::Broadcasted); EraseAt(&simulated_index, output_dims[i]); @@ -703,12 +963,178 @@ IndexedArrayAnalysis::ComputeArrayForElementwiseUnaryOp(HloOpcode opcode, return Construct( new_source, scalar_indexed_const->indices(), scalar_indexed_const->source_dim(), - std::vector(scalar_indexed_const->output_dims().begin(), - scalar_indexed_const->output_dims().end()), + ArraySliceToVector(scalar_indexed_const->output_dims()), scalar_indexed_const->shape()); } -tensorflow::StringPiece IndexedArrayAnalysisPrinterPass::name() const { +namespace { + +// Returns the non-contracting non-batch dimension (as per `contracting_dims` +// and `batch_dims`) if there is exactly one, otherwise returns nullopt. +absl::optional GetOnlyNonContractingNonBatchDim( + int64 rank, absl::Span contracting_dims, + absl::Span batch_dims) { + absl::optional result; + for (int64 dim = 0; dim < rank; dim++) { + if (!absl::c_linear_search(contracting_dims, dim) && + !absl::c_linear_search(batch_dims, dim)) { + if (result.has_value()) { + return absl::nullopt; + } + result = dim; + } + } + return result; +} + +// Returns true if `indexed_array`, which is either the LHS or the RHS of a Dot +// HLO, can be folded into the dot operation. For now these conditions are both +// necessary and sufficient. +// +// `tag` describes the caller. Used only for logging. +// +// `contracting_dims` and `batch_dims` are the contracting and batch dimensions +// of whatever operand `indexed_array` is to the dot (LHS or RHS). 
+bool CanFoldDotIntoIndexedArray( + absl::string_view tag, Analysis::ScalarIndexedConstantArray* indexed_array, + absl::Span contracting_dims, + absl::Span batch_dims) { + absl::optional non_contracting_non_batch_dim = + GetOnlyNonContractingNonBatchDim(ShapeUtil::Rank(indexed_array->shape()), + contracting_dims, batch_dims); + if (!non_contracting_non_batch_dim.has_value()) { + VLOG(3) << tag << ": multiple or no non-contracting non-batch dimensions"; + return false; + } + + if (indexed_array->output_dims().size() != 1 || + indexed_array->output_dims()[0] != *non_contracting_non_batch_dim) { + VLOG(3) << tag << ": output dims != the lhs non-contracting non-batch dim"; + return false; + } + + int64 indexed_array_rank = ShapeUtil::Rank(indexed_array->shape()); + if (indexed_array->source_dim() < (indexed_array_rank - 2)) { + // This restriction can be lifted by inserting reshape nodes. + VLOG(3) << tag + << ": source dim is not in the low two dims, won't be able to form " + "a matmul"; + return false; + } + + return true; +} + +} // namespace + +StatusOr +IndexedArrayAnalysis::ComputeArrayForDotWithIndexedLhs( + const Shape& shape, const DotDimensionNumbers& dim_numbers, + ScalarIndexedConstantArray* lhs, ConstantArray* rhs) { + VLOG(3) << "ComputeArrayForDotWithIndexedLhs(" << ToString(lhs) << " " + << ToString(rhs); + if (!CanFoldDotIntoIndexedArray( + "ComputeArrayForDotWithIndexedLhs", lhs, /*contracting_dims=*/ + AsInt64Slice(dim_numbers.lhs_contracting_dimensions()), + /*batch_dims=*/AsInt64Slice(dim_numbers.lhs_batch_dimensions()))) { + return nullptr; + } + + int64 lhs_rank = ShapeUtil::Rank(lhs->shape()); + DotDimensionNumbers new_dim_numbers = dim_numbers; + new_dim_numbers.set_lhs_contracting_dimensions( + 0, lhs->source_dim() == (lhs_rank - 1) ? 
(lhs_rank - 2) : (lhs_rank - 1)); + + TF_ASSIGN_OR_RETURN(Literal * literal_for_new_source, + TakeOwnership(HloEvaluator{}.EvaluateDotOp( + new_dim_numbers, lhs->literal(), *rhs->literal()))); + + // The new source dimension is wherever the non-batch non-contracting LHS + // dimension "went". + int64 new_source_dim = dim_numbers.lhs_batch_dimensions_size() + + dim_numbers.rhs_batch_dimensions_size(); + + ConstantArray* new_source = Construct(literal_for_new_source); + return Construct( + new_source, lhs->indices(), new_source_dim, + ArraySliceToVector(lhs->output_dims()), shape); +} + +StatusOr +IndexedArrayAnalysis::ComputeArrayForDotWithIndexedRhs( + const Shape& shape, const DotDimensionNumbers& dim_numbers, + ConstantArray* lhs, ScalarIndexedConstantArray* rhs) { + VLOG(3) << "ComputeArrayForDotWithIndexedRhs(" << ToString(lhs) << " " + << ToString(rhs); + if (!CanFoldDotIntoIndexedArray( + "ComputeArrayForDotWithIndexedRhs", rhs, /*contracting_dims=*/ + AsInt64Slice(dim_numbers.rhs_contracting_dimensions()), + /*batch_dims=*/AsInt64Slice(dim_numbers.rhs_batch_dimensions()))) { + return nullptr; + } + + int64 rhs_rank = ShapeUtil::Rank(rhs->shape()); + + DotDimensionNumbers new_dim_numbers = dim_numbers; + new_dim_numbers.set_rhs_contracting_dimensions( + 0, rhs->source_dim() == (rhs_rank - 1) ? (rhs_rank - 2) : (rhs_rank - 1)); + + TF_ASSIGN_OR_RETURN(Literal * literal_for_new_source, + TakeOwnership(HloEvaluator{}.EvaluateDotOp( + new_dim_numbers, *lhs->literal(), rhs->literal()))); + + // The new source dimension is wherever the non-batch non-contracting RHS + // dimension "went". 
+ int64 new_source_dim = dim_numbers.lhs_batch_dimensions_size() + + dim_numbers.rhs_batch_dimensions_size() + 1; + + ConstantArray* new_source = Construct(literal_for_new_source); + return Construct( + new_source, rhs->indices(), new_source_dim, + ArraySliceToVector(rhs->output_dims()), shape); +} + +StatusOr IndexedArrayAnalysis::ComputeArrayForDot( + const Shape& shape, const DotDimensionNumbers& dim_numbers, Array* lhs, + Array* rhs) { + // Intuitively, if + // + // - The LHS of a dot product is a gathered sequence of rows from a constant + // array (i.e. LHS[I,J] = Const[Indices[I],J]) and the RHS is a constant + // + // OR + // + // - If the RHS of a dot product is a gathered sequence of columns from a + // constant array (i.e. RHS[I,J] = Const[I, Indices[J]]) and the LHS is a + // constant + // + // then the result of the dot product itself is a gather from a constant + // array. E.g. Dot(LHS, ConstRhs) where LHS[I,J] = Const[Indices[I],J] can be + // rewritten as Result where Result[I,J] = Dot(Const, ConstRhs)[Indices[I], + // J]. + // + // We do a general version of this rewrite here. 
+ VLOG(3) << "ComputeArrayForDot(" << ToString(lhs) << " " << ToString(rhs); + if (auto* lhs_indexed_array = + dynamic_cast(lhs)) { + if (auto* rhs_constant = dynamic_cast(rhs)) { + return ComputeArrayForDotWithIndexedLhs(shape, dim_numbers, + lhs_indexed_array, rhs_constant); + } + } + + if (auto* rhs_indexed_array = + dynamic_cast(rhs)) { + if (auto* lhs_constant = dynamic_cast(lhs)) { + return ComputeArrayForDotWithIndexedRhs(shape, dim_numbers, lhs_constant, + rhs_indexed_array); + } + } + + return nullptr; +} + +absl::string_view IndexedArrayAnalysisPrinterPass::name() const { return "indexed-array-analysis-printer-pass"; } diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h index ce92fd2919c90fa8a2fb7b796ed6f0fdaf48fe62..dcfb7255358ae08660fe2c6eae5af9f10370e762 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis.h +++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h @@ -39,7 +39,13 @@ class IndexedArrayAnalysis { // Array instances are immutable once created. class Array { public: - enum Kind { kUnknown, kConstant, kScalarIndexedConstant, kScalarIndexed }; + enum Kind { + kUnknown, + kConstant, + kReshaped, + kScalarIndexedConstant, + kScalarIndexed + }; virtual Kind kind() const = 0; virtual const Shape& shape() const = 0; @@ -96,6 +102,27 @@ class IndexedArrayAnalysis { friend class IndexedArrayAnalysis; }; + // Represents an Array that is a reshape of another Array. + class ReshapedArray : public Array { + public: + Kind kind() const override { return kReshaped; } + + // The array to reshape. + Array* operand() const { return operand_; } + + // The output shape. 
+ const Shape& shape() const override { return shape_; } + + private: + explicit ReshapedArray(Array* operand, Shape shape) + : operand_(operand), shape_(shape) {} + + Array* operand_; + const Shape shape_; + + friend class IndexedArrayAnalysis; + }; + // --------------------------------------------------------------------------- // Indexed Array Overview // --------------------------------------------------------------------------- @@ -161,9 +188,7 @@ class IndexedArrayAnalysis { // `output_dims` are the dimensions in the output array that are being used // to compute an index into the `indices` array. See the class // documentation and the overview for more details. - tensorflow::gtl::ArraySlice output_dims() const { - return output_dims_; - } + absl::Span output_dims() const { return output_dims_; } private: explicit ScalarIndexedArray(Array* source, Array* indices, int64 source_dim, @@ -238,8 +263,19 @@ class IndexedArrayAnalysis { StatusOr ComputeArrayForGather( const Shape& shape, const GatherDimensionNumbers& dim_numbers, - tensorflow::gtl::ArraySlice window_bounds, Array* source, - Array* indices); + absl::Span slice_sizes, Array* source, Array* indices); + + StatusOr ComputeArrayForDotWithIndexedLhs( + const Shape& shape, const DotDimensionNumbers& dim_numbers, + ScalarIndexedConstantArray* lhs, ConstantArray* rhs); + + StatusOr ComputeArrayForDotWithIndexedRhs( + const Shape& shape, const DotDimensionNumbers& dim_numbers, + ConstantArray* lhs, ScalarIndexedConstantArray* rhs); + + StatusOr ComputeArrayForDot(const Shape& shape, + const DotDimensionNumbers& dim_numbers, + Array* lhs, Array* rhs); // This tries to fold a ScalarIndexedArray which has another // ScalarIndexedArray as a source into a ScalarIndexedArray that instead has a @@ -264,8 +300,22 @@ class IndexedArrayAnalysis { // G1 = [Arr[i] for i in I2] StatusOr FoldGatherOfGather( ScalarIndexedArray* source, Array* indices, int64 source_dim, - tensorflow::gtl::ArraySlice output_dims, Shape shape); 
- + absl::Span output_dims, Shape shape); + + // Reshapes a scalar-indexed node to remove the degenerate dimensions in its + // output. The result is always a scalar-indexed node. + StatusOr ReshapeToRemoveDegenerateDims( + ScalarIndexedArray* operand); + + // Reshapes a scalar-indexed node such that the result has the degenerate + // dimensions `degenerate_dims`. The result is always a scalar-indexed node. + StatusOr ReshapeToAddDegenerateDims( + ScalarIndexedArray* operand, absl::Span degenerate_dims); + + StatusOr FoldReshapeOfGather( + const Shape& shape, ScalarIndexedConstantArray* operand); + StatusOr FoldReshapeOfGatherNoDegenerateDims( + const Shape& shape, ScalarIndexedConstantArray* scalar_indexed); StatusOr ComputeArrayForReshape(const Shape& shape, Array* operand); StatusOr ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, @@ -317,7 +367,7 @@ class IndexedArrayAnalysis { // unconditionally add to the regular HLO pass pipeline. class IndexedArrayAnalysisPrinterPass : public HloPassInterface { public: - tensorflow::StringPiece name() const override; + absl::string_view name() const override; StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc index 373556ebeba883f7dc2116bdf0ffc3274182f775..2d03aebc1aca4c55cca588072233b7a18e70a306 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc +++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/xla/service/indexed_array_analysis.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" @@ -34,6 +36,27 @@ class IndexedArrayAnalysisTest : public HloVerifiedTestBase { } private: + // Replaces seqences of whitespace with a single space. This makes the + // strings being matched against "whitespace insensitive" which lets us indent + // them for readability. + string CanonicalizeWhitespace(const string& text) { + string result; + + for (char c : text) { + if (!isspace(c)) { + result.push_back(c); + } else if (!result.empty() && result.back() != ' ') { + result.push_back(' '); + } + } + + while (!result.empty() && result.back() == ' ') { + result.pop_back(); + } + + return result; + } + void AssertArrayForRootExpressionIsImpl(const string& hlo_text, const string& root_expression, bool print_constants) { @@ -44,10 +67,10 @@ class IndexedArrayAnalysisTest : public HloVerifiedTestBase { IndexedArrayAnalysis::Array* const array_result, indexed_tensor_analysis.GetArrayFor( module().entry_computation()->root_instruction())); - string string_result = - indexed_tensor_analysis.ToString(array_result, print_constants); + string string_result = CanonicalizeWhitespace( + indexed_tensor_analysis.ToString(array_result, print_constants)); LOG(INFO) << string_result; - ASSERT_EQ(string_result, root_expression); + ASSERT_EQ(string_result, CanonicalizeWhitespace(root_expression)); } }; @@ -59,11 +82,11 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[5] parameter(1) ROOT gather = s32[5,3] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,3} + slice_sizes={1,3} } )"; @@ -79,11 +102,11 @@ ENTRY main { operand = 
s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}}) indices = s32[5] parameter(0) ROOT gather = s32[5,3] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,3} + slice_sizes={1,3} } )"; @@ -91,6 +114,82 @@ ENTRY main { hlo_text, "(scalar-indexed-const (constant s32[3,3]) %indices 0->[0])"); } +TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed0) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}}) + indices = s32[5,2] parameter(0) + ROOT gather = s32[5] gather(operand, indices), + offset_dims={}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, + index_vector_dim=1, + slice_sizes={1,1} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%gather"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed1) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3,1] parameter(0) + indices = s32[5] parameter(1) + ROOT gather = s32[5,3] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0,2}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,3,1} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%gather"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed2) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3,1] parameter(0) + indices = s32[5] parameter(1) + ROOT gather = s32[5,2,3] gather(operand, indices), + offset_dims={1,2}, + collapsed_slice_dims={2}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={2,3,1} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%gather"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed3) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[5] parameter(1) + ROOT gather = 
s32[5,2] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,2} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%gather"); +} + TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOne) { string hlo_text = R"( HloModule SimpleGather @@ -100,17 +199,17 @@ ENTRY main { indices_a = s32[5] parameter(0) indices_b = s32[2] parameter(1) gather_a = s32[5,3] gather(operand, indices_a), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,3} + slice_sizes={1,3} ROOT gather_b = s32[2,3] gather(gather_a, indices_b), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,3} + slice_sizes={1,3} } )"; @@ -129,17 +228,17 @@ ENTRY main { indices_a = s32[5,7] parameter(1) indices_b = s32[2] parameter(2) gather_a = s32[5,3,7] gather(operand, indices_a), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3,1} + slice_sizes={3,1} ROOT gather_b = s32[5,3,2] gather(gather_a, indices_b), - output_window_dims={0,1}, - elided_window_dims={2}, - gather_dims_to_operand_dims={2}, + offset_dims={0,1}, + collapsed_slice_dims={2}, + start_index_map={2}, index_vector_dim=1, - window_bounds={5,3,1} + slice_sizes={5,3,1} } )"; @@ -157,17 +256,17 @@ ENTRY main { indices_a = s32[2] parameter(1) indices_b = s32[5,7] parameter(2) gather_a = s32[2,6] gather(operand, indices_a), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,6} + slice_sizes={1,6} ROOT 
gather_b = s32[5,6,7] gather(gather_a, indices_b), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=2, - window_bounds={1,6} + slice_sizes={1,6} } )"; @@ -185,17 +284,17 @@ ENTRY main { indices_a = s32[5,7] parameter(1) indices_b = s32[4,8] parameter(2) gather_a = s32[5,3,7] gather(operand, indices_a), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3,1} + slice_sizes={3,1} ROOT gather_b = s32[4,5,3,8] gather(gather_a, indices_b), - output_window_dims={1,2}, - elided_window_dims={2}, - gather_dims_to_operand_dims={2}, + offset_dims={1,2}, + collapsed_slice_dims={2}, + start_index_map={2}, index_vector_dim=2, - window_bounds={5,3,1} + slice_sizes={5,3,1} } )"; @@ -213,11 +312,11 @@ ENTRY main { operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) indices = s32[5] parameter(0) gather = s32[5,4] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT reshape = s32[5,2,2] reshape(gather) } )"; @@ -234,11 +333,11 @@ ENTRY main { operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) indices = s32[5,7] parameter(0) gather = s32[5,4,7] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=2, - window_bounds={1,4} + slice_sizes={1,4} ROOT reshape = s32[5,2,2,7] reshape(gather) } )"; @@ -259,11 +358,11 @@ ENTRY main { {{1,2,3,4,5,6},{1,2,3,4,5,6}}}) indices = s32[5,7] parameter(0) gather = s32[5,2,6,7] gather(operand, indices), - 
output_window_dims={1,2}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1,2}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=2, - window_bounds={1,2,6} + slice_sizes={1,2,6} ROOT reshape = s32[5,3,4,7] reshape(gather) } )"; @@ -273,7 +372,157 @@ ENTRY main { "(scalar-indexed-const (constant s32[3,3,4]) %indices 0->[0,3])"); } -TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNegative0) { +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather3) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[2,6] constant(s32[2,6]{ + {1,2,3,4,5,6},{1,2,3,4,5,6}}) + indices = s32[1] parameter(0) + gather = s32[1,6] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,6} + ROOT reshape = s32[1,1,6] reshape(gather) +} +)"; + + const char* expected_root_expression = R"( +(scalar-indexed-const + (constant s32[2,1,1,6]) + (reshape %indices to s32[]) + 0->[]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather4) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 1, 2, 3 } }) + + i.0 = s64[1,3]{1,0} parameter(0) + g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, + index_vector_dim=2, slice_sizes={1,3} + + i.1 = s64[1] parameter(1) + g.1 = s32[1,1,3]{2,1,0} gather(g.0, i.1), offset_dims={0,2}, + collapsed_slice_dims={1}, start_index_map={1}, + index_vector_dim=1, slice_sizes={1,1,3} + + ROOT reshape = s32[1,3]{1,0} reshape(g.1) +} +)"; + + const char* expected_root_expression = R"( +(scalar-indexed-const + (constant s32[2,1,3]) + (reshape + (scalar-indexed %i.0 %i.1 1->[1]) + to s64[]) + 0->[]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, 
ReshapeOfGather5) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[1,6] constant(s32[1,6]{{1,2,3,4,5,6}}) + indices = s32[1] parameter(0) + gather = s32[1,6] gather(operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,6} + ROOT reshape = s32[1,1,6] reshape(gather) +} +)"; + + const char* expected_root_expression = R"( +(scalar-indexed-const + (constant s32[1,1,1,6]) + (reshape %indices to s32[]) + 0->[]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather6) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[1,2,6] constant(s32[1,2,6]{{ + {1,2,3,4,5,6},{1,2,3,4,5,6}}}) + indices = s32[1] parameter(0) + gather = s32[1,1,6] gather(operand, indices), + offset_dims={1,2}, + collapsed_slice_dims={1}, + start_index_map={1}, + index_vector_dim=1, + slice_sizes={1,1,6} + ROOT reshape = s32[1,1,1,6] reshape(gather) +} +)"; + + const char* expected_root_expression = R"( +(scalar-indexed-const + (constant s32[2,1,1,1,6] s32[2,1,1,1,6] { + { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } }, + { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } } }) + (reshape %indices to s32[]) + 0->[]) +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, + expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather7) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[2,6] constant(s32[2,6]{ + {1,2,3,4,5,6},{1,2,3,4,5,6}}) + indices = s32[1,5] parameter(0) + gather = s32[1,5,6] gather(operand, indices), + offset_dims={2}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=2, + slice_sizes={1,6} + ROOT reshape = s32[1,1,5,6] reshape(gather) +} +)"; + + const char* expected_root_expression = R"( +(scalar-indexed-const + (constant s32[2,1,1,6] s32[2,1,1,6] { + { /*i0=0*/ { /*i1=0*/ 
{1, 2, 3, 4, 5, 6} } }, + { /*i0=1*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } } }) + (reshape %indices to s32[5]) + 0->[2]) +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, + expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold0) { string hlo_text = R"( HloModule ReshapeOfGather @@ -281,19 +530,28 @@ ENTRY main { operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) indices = s32[5,6] parameter(0) gather = s32[5,4,6] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=2, - window_bounds={1,4} + slice_sizes={1,4} ROOT reshape = s32[5,2,2,2,3] reshape(gather) } )"; - AssertArrayForRootExpressionIs(hlo_text, "%reshape"); + const char* expected_root_expression = R"( +(reshape + (scalar-indexed-const + (constant s32[3,4]) + %indices + 0->[0,2]) + to s32[5,2,2,2,3]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); } -TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNegative1) { +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold1) { string hlo_text = R"( HloModule ReshapeOfGather @@ -304,16 +562,57 @@ ENTRY main { {{1,2},{3,4},{5,6},{7,8},{9,10}}}) indices = s32[7] parameter(0) gather = s32[3,2,7] gather(operand, indices), - output_window_dims={0,1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0,1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3,1,2} + slice_sizes={3,1,2} ROOT reshape = s32[6,7] reshape(gather) } )"; - AssertArrayForRootExpressionIs(hlo_text, "%reshape"); + const char* expected_root_expression = R"( +(reshape + (scalar-indexed-const + (constant s32[3,5,2]) + %indices + 1->[2]) + to s32[6,7]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold2) { + string hlo_text = 
R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,4,1] constant(s32[3,4,1]{ + {{1},{2},{3},{4}}, + {{1},{2},{3},{4}}, + {{1},{2},{3},{4}}}) + indices = s32[5,6] parameter(0) + gather = s32[5,4,6,1] gather(operand, indices), + offset_dims={1,3}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=2, + slice_sizes={1,4,1} + ROOT reshape = s32[5,2,2,2,3,1] reshape(gather) +} +)"; + + const char* expected_root_expression = R"( +(reshape + (scalar-indexed-const + (constant s32[3,4,1]) + %indices + 0->[0,2]) + to s32[5,2,2,2,3,1]) +)"; + + AssertArrayForRootExpressionIs(hlo_text, expected_root_expression); } TEST_F(IndexedArrayAnalysisTest, UnaryOpOfGather) { @@ -324,20 +623,20 @@ ENTRY main { operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) indices = s32[5] parameter(0) gather = f32[5,4] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT tanh = f32[5,4] tanh(gather) } )"; AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( (scalar-indexed-const (constant f32[3,4] f32[3,4] { - { 0.761594176, 0.964027584, 0.995054781, 0.999329329 }, - { 0.761594176, 0.995054781, 0.964027584, 0.999329329 }, - { 0.999329329, 0.995054781, 0.964027584, 0.761594176 } + { 0.761594, 0.964028, 0.995055, 0.999329 }, + { 0.761594, 0.995055, 0.964028, 0.999329 }, + { 0.999329, 0.995055, 0.964028, 0.761594 } }) %indices 0->[0]))"); } @@ -351,11 +650,11 @@ ENTRY main { constant_broadcasted = s32[5,4] broadcast(constant), dimensions={} indices = s32[5] parameter(0) gather = s32[5,4] gather(gather_operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT add = 
s32[5,4] add(gather, constant_broadcasted) } )"; @@ -379,11 +678,11 @@ ENTRY main { constant_broadcasted = s32[5,4] broadcast(constant), dimensions={} indices = s32[5] parameter(0) gather = s32[5,4] gather(gather_operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT sub = s32[5,4] subtract(gather, constant_broadcasted) } )"; @@ -407,11 +706,11 @@ ENTRY main { constant_broadcasted = s32[5,4] broadcast(constant), dimensions={} indices = s32[5] parameter(0) gather = s32[5,4] gather(gather_operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT sub = s32[5,4] subtract(constant_broadcasted, gather) } )"; @@ -434,11 +733,11 @@ ENTRY main { constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1} indices = s32[5] parameter(0) gather = s32[5,4] gather(gather_operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT add = s32[5,4] add(gather, constant_broadcasted) } )"; @@ -461,11 +760,11 @@ ENTRY main { constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0} indices = s32[5] parameter(0) gather = s32[5,4] gather(gather_operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1,4} + slice_sizes={1,4} ROOT add = s32[5,4] add(gather, constant_broadcasted) } )"; @@ -500,5 +799,170 @@ ENTRY main { AssertArrayForRootExpressionIs(hlo_text, 
"%add"); } +TEST_F(IndexedArrayAnalysisTest, DotOpBasic_0) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}}) + dot_rhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}) + indices = s32[5] parameter(0) + dot_lhs = s32[5,4] gather(gather_operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,4} + ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, R"( +(scalar-indexed-const + (constant s32[3,3] s32[3,3] { + { 70, 80, 90 }, + { 158, 184, 210 }, + { 246, 288, 330 } }) + %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, DotOpBasic_1) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}}) + dot_rhs_constant = s32[3,3] constant(s32[3,3]{{1,2,3},{4,5,6},{7,8,9}}) + indices = s32[5] parameter(0) + dot_lhs = s32[3,5] gather(gather_operand, indices), + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, + index_vector_dim=1, + slice_sizes={3,1} + ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={0}, rhs_contracting_dims={0} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, R"( +(scalar-indexed-const + (constant s32[4,3] s32[4,3] { + { 84, 99, 114 }, + { 96, 114, 132 }, + { 108, 129, 150 }, + { 120, 144, 168 } }) + %indices 0->[1]))"); +} + +TEST_F(IndexedArrayAnalysisTest, DotOpBasic_2) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}}) + dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}) + indices = s32[5] parameter(0) + dot_rhs = s32[3,5] gather(gather_operand, indices), + offset_dims={0}, + 
collapsed_slice_dims={1}, + start_index_map={1}, + index_vector_dim=1, + slice_sizes={3,1} + ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, R"( +(scalar-indexed-const + (constant s32[4,4] s32[4,4] { + { 38, 44, 50, 56 }, + { 83, 98, 113, 128 }, + { 128, 152, 176, 200 }, + { 173, 206, 239, 272 } }) + %indices 1->[1]) +)"); +} + +TEST_F(IndexedArrayAnalysisTest, DotOpBasic_3) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}) + dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}) + indices = s32[5] parameter(0) + dot_rhs = s32[5,3] gather(gather_operand, indices), + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, + index_vector_dim=1, + slice_sizes={1,3} + ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={1} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, R"( +(scalar-indexed-const + (constant s32[4,4] s32[4,4] { + { 14, 32, 50, 68 }, + { 32, 77, 122, 167 }, + { 50, 122, 194, 266 }, + { 68, 167, 266, 365 } }) + %indices 1->[0]) +)"); +} + +TEST_F(IndexedArrayAnalysisTest, DotOpWithBatch) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[2,3,2] constant(s32[2,3,2]{{{1,2},{3,4},{5,6}},{{7,8},{9,10},{11,12}}}) + dot_lhs_constant = s32[2,2,3] constant(s32[2,2,3]{{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}}) + indices = s32[4] parameter(0) + dot_rhs = s32[2,3,4] gather(gather_operand, indices), + offset_dims={0,1}, + collapsed_slice_dims={2}, + start_index_map={2}, + index_vector_dim=1, + slice_sizes={2,3,1} + ROOT dot = s32[2,2,4] dot(dot_lhs_constant, dot_rhs), + lhs_contracting_dims={2}, rhs_contracting_dims={1}, + lhs_batch_dims={0}, rhs_batch_dims={0} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 
R"( +(scalar-indexed-const + (constant s32[2,2,2] s32[2,2,2] { + { { 22, 28 }, + { 49, 64 } }, + { { 220, 244 }, + { 301, 334 } } }) + %indices 3->[2]) +)"); +} + +TEST_F(IndexedArrayAnalysisTest, DotOpNegative) { + string hlo_text = R"( +HloModule DotOp + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}}) + dot_rhs_constant = s32[2,3] constant(s32[2,3]{{1,2,3},{4,5,6}}) + indices = s32[2] parameter(0) + dot_lhs = s32[3,2] gather(gather_operand, indices), + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, + index_vector_dim=1, + slice_sizes={3,1} + ROOT dot = s32[3,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, "%dot"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/inliner.cc b/tensorflow/compiler/xla/service/inliner.cc index 5c193fceb984448cf0532d7e1010281268614293..5fd779ebf9b59e34a0844cc3a898bb72ce6044ee 100644 --- a/tensorflow/compiler/xla/service/inliner.cc +++ b/tensorflow/compiler/xla/service/inliner.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -27,7 +28,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/inliner.h b/tensorflow/compiler/xla/service/inliner.h index a523811f6c141a7dc24b1c88897d82d046aa1a2d..efa8ed3abcc6cd7cd8d31ec2170eae8752988c09 100644 --- a/tensorflow/compiler/xla/service/inliner.h +++ b/tensorflow/compiler/xla/service/inliner.h @@ -27,7 +27,7 @@ namespace xla { class Inliner : public HloPassInterface { public: ~Inliner() override = default; - tensorflow::StringPiece name() const override { return "inline"; } + absl::string_view name() const override { return "inline"; } // Run inlining on the given computation. Returns whether the computation was // changed. diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc index d2af261008f40ee83e0676cfc7e67c45f8be1844..5695bc242057c037a1999e7d63f5b4f21b5f658a 100644 --- a/tensorflow/compiler/xla/service/inliner_test.cc +++ b/tensorflow/compiler/xla/service/inliner_test.cc @@ -18,8 +18,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -51,10 +51,10 @@ TEST_F(InlinerTest, MapMax) { auto max_f32 = max_builder.Build(); auto builder = HloComputation::Builder("MapMaxFunction"); - auto lhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); - auto rhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({4, 3, 2, 1}))); + auto lhs = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); + auto rhs = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({4, 3, 2, 1}))); builder.AddInstruction( HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get())); @@ -70,7 +70,7 @@ TEST_F(InlinerTest, MapMax) { // Verify execution on CPU. 
auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - auto expected = Literal::CreateR1({4, 3, 3, 4}); + auto expected = LiteralUtil::CreateR1({4, 3, 3, 4}); EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } @@ -83,12 +83,12 @@ TEST_F(InlinerTest, MapConstant) { HloInstruction::CreateParameter(0, r0f32, "x")); (void)param1; const2_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); auto const2_f32 = const2_builder.Build(); auto builder = HloComputation::Builder("MapConstFunction"); auto lhs = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3, 4}, {5, 6, 7, 8}}))); + LiteralUtil::CreateR2({{1, 2, 3, 4}, {5, 6, 7, 8}}))); builder.AddInstruction( HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get())); @@ -104,7 +104,7 @@ TEST_F(InlinerTest, MapConstant) { // Verify execution on CPU. auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - auto expected = Literal::CreateR2({{2, 2, 2, 2}, {2, 2, 2, 2}}); + auto expected = LiteralUtil::CreateR2({{2, 2, 2, 2}, {2, 2, 2, 2}}); EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } @@ -123,10 +123,10 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) { auto max_f32 = max_builder.Build(); auto builder = HloComputation::Builder("MapSubFunction"); - auto lhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); - auto rhs = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({4, 3, 2, 1}))); + auto lhs = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); + auto rhs = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({4, 3, 2, 1}))); builder.AddInstruction( HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get())); @@ -142,7 +142,7 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) { // Verify execution on CPU. 
auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - auto expected = Literal::CreateR1({3, 1, -1, -3}); + auto expected = LiteralUtil::CreateR1({3, 1, -1, -3}); EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 429c8503432b79f46aa0e5b1970bb565093128dd..8c907eae0cbe7c3764a2bfe8fed6b6098931de38 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/core/lib/core/errors.h" @@ -73,6 +74,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kGt: case HloOpcode::kImag: case HloOpcode::kInfeed: + case HloOpcode::kIota: case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLt: @@ -83,6 +85,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kNegate: case HloOpcode::kNot: case HloOpcode::kOr: + case HloOpcode::kXor: case HloOpcode::kOutfeed: case HloOpcode::kPad: case HloOpcode::kReal: @@ -96,8 +99,10 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kShiftRightLogical: case HloOpcode::kSlice: case HloOpcode::kSubtract: + case HloOpcode::kAfterAll: case HloOpcode::kTranspose: case HloOpcode::kTuple: + case HloOpcode::kTupleSelect: return false; // Cheap instructions for reals, but expensive for complex. 
@@ -116,6 +121,8 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kConditional: case HloOpcode::kConvolution: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: case HloOpcode::kCustomCall: case HloOpcode::kDivide: case HloOpcode::kDomain: @@ -125,7 +132,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kFft: case HloOpcode::kFusion: case HloOpcode::kGather: - case HloOpcode::kHostCompute: case HloOpcode::kLog: case HloOpcode::kLog1p: case HloOpcode::kMap: @@ -137,6 +143,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kReduceWindow: case HloOpcode::kRemainder: case HloOpcode::kRng: + case HloOpcode::kScatter: case HloOpcode::kSelectAndScatter: case HloOpcode::kSend: case HloOpcode::kSendDone: @@ -165,7 +172,8 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) { }); return std::count_if(hlo->operands().begin(), hlo->operands().end(), [output_rank](HloInstruction* operand) { - if (operand->opcode() == HloOpcode::kBroadcast) { + if (operand->opcode() == HloOpcode::kBroadcast || + operand->opcode() == HloOpcode::kIota) { return false; } if (operand->opcode() == HloOpcode::kConstant && @@ -183,13 +191,13 @@ bool InstructionFusion::CanFuseOnAllPaths( if (consumer == producer) { return true; } - if (!consumer->IsFusable()) { + if (!consumer->IsFusible()) { return false; } for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { auto* consumer_operand = consumer->mutable_operand(i); // If the operand is not on a path to the producer, it doesn't matter - // whether it's fusable. + // whether it's fusible. 
if (!reachability_->IsReachable(producer, consumer_operand)) { continue; } @@ -199,7 +207,7 @@ bool InstructionFusion::CanFuseOnAllPaths( } // The producer is reachable from consumer_operand which means we need // to be able to fuse consumer_operand into consumer in order for - // producer to be fusable into consumer on all paths. + // producer to be fusible into consumer on all paths. // Perform the recursive step: make sure producer can be fused into // consumer_operand on all paths. if (!CanFuseOnAllPaths(producer, consumer_operand, do_not_duplicate)) { @@ -210,8 +218,8 @@ bool InstructionFusion::CanFuseOnAllPaths( } InstructionFusion::HloInstructionSet -InstructionFusion::ComputeGloballyUnfusable( - tensorflow::gtl::ArraySlice post_order) { +InstructionFusion::ComputeGloballyUnfusible( + absl::Span post_order) { // Forbid fusion of producers that: // a) Need to be duplicated, unless they can be fused into all consumers // via all paths. @@ -236,23 +244,47 @@ InstructionFusion::ComputeGloballyUnfusable( if (EffectivelyAtMostUnary(producer)) { continue; } + + // If the total size of the inputs is less than or equal to the total size + // of the outputs for the producer then duplicating it won't increase the + // memory traffic. In that case, we do not forbid fusion of the operation + // here. + auto total_size = [](const Shape& shape) { + int64 size = 0; + ShapeUtil::ForEachSubshape( + shape, + [&size](const Shape& subshape, const ShapeIndex& shape_index) { + if (ShapeUtil::IsArray(subshape)) { + size += ShapeUtil::ElementsIn(subshape); + } + }); + return size; + }; + int64 operands_size = 0; + for (const HloInstruction* op : producer->operands()) { + operands_size += total_size(op->shape()); + } + if (operands_size <= total_size(producer->shape())) { + continue; + } + // Otherwise we will forbid fusing the op unless we can fuse it into // all of its consumers on all paths. 
// // That means, that for: - // A --> B (fusable) - // \-> C (non-fusable) + // A --> B (fusible) + // \-> C (non-fusible) // A will be not allowed to be fused into B, as it cannot be fused into C. // // Similarly, for: // A -------------> B // \-> C -> D -/ // If: - // - A is fusable into B and C, and D is fusable into B - // - C is *not* fusable into D + // - A is fusible into B and C, and D is fusible into B + // - C is *not* fusible into D // A will be not allowed to be fused into B, as it cannot be fused via // all paths. - if (producer->IsFusable() && + if (producer->IsFusible() && CanFuseOnAllPaths(producer, consumer, do_not_duplicate)) { continue; } @@ -280,17 +312,15 @@ StatusOr InstructionFusion::Run(HloModule* module) { // map from HloInstruction* to the instruction's index in the vector. An // instruction is "removed" from the vector by setting it's element to // nullptr. - std::list post_order_list = + std::vector post_order = computation_->MakeInstructionPostOrder(); - std::vector post_order(post_order_list.begin(), - post_order_list.end()); tensorflow::gtl::FlatMap post_order_index; for (size_t i = 0; i < post_order.size(); ++i) { InsertOrDie(&post_order_index, post_order[i], i); } - HloInstructionSet do_not_duplicate = ComputeGloballyUnfusable(post_order); + HloInstructionSet do_not_duplicate = ComputeGloballyUnfusible(post_order); // Instruction fusion effectively fuses edges in the computation graph // (producer instruction -> consumer instruction) so we iterate over all @@ -313,7 +343,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { // consistent. 
post_order_index.erase(instruction); - if (!instruction->IsFusable() && + if (!instruction->IsFusible() && instruction->opcode() != HloOpcode::kFusion) { continue; } @@ -385,7 +415,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); - if (!operand->IsFusable()) { + if (!operand->IsFusible()) { continue; } @@ -469,7 +499,7 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput( bool InstructionFusion::MultiOutputFusionCreatesCycle( HloInstruction* producer, HloInstruction* consumer) { - return c_any_of( + return absl::c_any_of( consumer->operands(), [&](const HloInstruction* consumer_operand) { // The fusion algorithm traverses the HLO graph in reverse post order. // Thus `cosumers` is visited before its operands (including diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index f73ca9adf768ed26f9ec9f162e01b7b160f50daf..00b658959a2cceeb30d2ec03f243119ec0a8ee47 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -36,7 +36,7 @@ class InstructionFusion : public HloPassInterface { bool may_duplicate = true) : is_expensive_(is_expensive), may_duplicate_(may_duplicate) {} ~InstructionFusion() override = default; - tensorflow::StringPiece name() const override { return "fusion"; } + absl::string_view name() const override { return "fusion"; } // Run instruction fusion on the given computation. Returns whether the // computation was changed (instructions were fused). @@ -122,8 +122,8 @@ class InstructionFusion : public HloPassInterface { // Computes the set of nodes that we do not want to fuse into any of their // consumers based on a global analysis of the HLO graph. 
- HloInstructionSet ComputeGloballyUnfusable( - tensorflow::gtl::ArraySlice post_order); + HloInstructionSet ComputeGloballyUnfusible( + absl::Span post_order); // Used to determine if an HLO is expensive. Expensive operations will not be // duplicated. diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index df109df7877eefe4c337f93cc5a3a7a48e2e76c7..da1ad90959dc0ab1a840b3390281ce9d4999651e 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { @@ -47,7 +47,7 @@ class InstructionFusionForTesting : public InstructionFusion { }; TEST_F(InstructionFusionTest, FuseInstructions) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY entry_computation { p0 = f32[4,3]{1,0} parameter(0) @@ -67,7 +67,7 @@ TEST_F(InstructionFusionTest, FuseInstructions) { } TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module fused_computation { p1 = f32[4,3] parameter(0) @@ -90,7 +90,7 @@ TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) { } TEST_F(InstructionFusionTest, FuseInstructionsIntoMultiOutput) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY entry_computation { p0 = f32[4,3]{1,0} parameter(0) @@ -158,7 +158,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) { .ValueOrDie()); } -TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { 
+TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusible) { HloComputation::Builder builder(TestName()); auto shape = ShapeUtil::MakeShape(F32, {16, 16}); auto param0 = @@ -167,7 +167,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1")); HloInstruction* binary1 = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); - builder.AddInstruction(HloInstruction::CreateSend(binary1, 0)); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + builder.AddInstruction(HloInstruction::CreateSend(binary1, token, 0)); HloInstruction* unary = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); @@ -195,7 +196,7 @@ static int Count(const HloModule& module, HloOpcode op) { } TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -215,12 +216,12 @@ TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) { EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); } -TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) { // Make sure we do not duplicate the add, as we cannot fuse through the rng. 
// // p0 -> add -------------------------> sub // \-> abs1 -> rng -> abs2 -/ - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) @@ -251,14 +252,15 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { // p0 -> add -------------------------> sub // \-> abs1 -> log -> abs2 -/ // \-> send - module = tools::Parse(R"( + module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) add = f32[4,3]{1,0} add(p0, p0) abs1 = f32[4,3]{1,0} abs(add) log = f32[4,3]{1,0} log(abs1) - send = f32[4,3]{1,0} send(log), channel_id=0 + token = token[] after-all() + send = f32[4,3]{1,0} send(log, token), channel_id=0 abs2 = f32[4,3]{1,0} abs(log) ROOT root = f32[4,3]{1,0} subtract(abs2, add) })") @@ -282,13 +284,14 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { // \ \-> add2 -/ // \-> log -/ // \-> send - module = tools::Parse(R"( + module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) add1 = f32[4,3]{1,0} add(p0, p0) log = f32[4,3]{1,0} log(p0) - send = f32[4,3]{1,0} send(log), channel_id=0 + token = token[] after-all() + send = f32[4,3]{1,0} send(log, token), channel_id=0 add2 = f32[4,3]{1,0} add(log, add1) ROOT root = f32[4,3]{1,0} subtract(add1, add2) })") @@ -306,7 +309,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); // A variant of the above that allows the algorithm to put add2 into the set - // of unfusable ops to short-circuit the decision whether add1 should be fused + // of unfusible ops to short-circuit the decision whether add1 should be fused // into sub2. 
// // /---------------\ @@ -314,14 +317,15 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { // \------> sub1 // log -/ // \-> send - module = tools::Parse(R"( + module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { p0 = f32[4,3]{1,0} parameter(0) add1 = f32[4,3]{1,0} add(p0, p0) add2 = f32[4,3]{1,0} add(add1, add1) log = f32[4,3]{1,0} log(add2) - send = f32[4,3]{1,0} send(log), channel_id=0 + token = token[] after-all() + send = f32[4,3]{1,0} send(log, token), channel_id=0 sub1 = f32[4,3]{1,0} subtract(log, add2) sub2 = f32[4,3]{1,0} subtract(add2, add1) ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2) @@ -352,7 +356,8 @@ TEST_F(InstructionFusionTest, AllowUnaryDuplication) { builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "0")); HloInstruction* unary1 = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kFloor, param0)); - builder.AddInstruction(HloInstruction::CreateSend(unary1, 0)); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + builder.AddInstruction(HloInstruction::CreateSend(unary1, token, 0)); HloInstruction* unary2 = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1)); @@ -375,7 +380,8 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "1")); HloInstruction* binary1 = builder.AddInstruction( HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); - builder.AddInstruction(HloInstruction::CreateSend(binary1, 0)); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + builder.AddInstruction(HloInstruction::CreateSend(binary1, token, 0)); HloInstruction* unary = builder.AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1)); @@ -390,7 +396,7 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { TEST_F(InstructionFusionTest, 
WideningConvertsAreAlwaysDuplicableIntoConsumers) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY Test { p0 = f16[100] parameter(0) diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 524d3234eb4eff9c7d000eca1a0d9f5c4fae90af..146c9052f10cca8b199a480491d9a672d8bebdff 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -12,12 +12,11 @@ cc_library( srcs = ["interpreter_transfer_manager.cc"], hdrs = ["interpreter_transfer_manager.h"], deps = [ - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service/interpreter:platform_id", "//tensorflow/core:lib", + "@com_google_absl//absl/memory", ], alwayslink = True, # Contains per-platform transfer manager registration ) @@ -32,8 +31,6 @@ cc_library( "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:algebraic_simplifier", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", @@ -54,6 +51,7 @@ cc_library( "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/core:lib", "//tensorflow/stream_executor", + "@com_google_absl//absl/memory", ], alwayslink = True, # Contains compiler registration ) @@ -74,12 +72,11 @@ cc_library( hdrs = ["executable.h"], deps = [ ":executor", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - 
"//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", @@ -91,6 +88,8 @@ cc_library( "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -116,5 +115,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_headers_lib", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index c1666530687f2f8407a9dcb4e271c9d95552a689..bb69cb9c47ff2c7de8d13832c4b8e6216c62da73 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" @@ -44,7 +44,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { HloPassPipeline pipeline("Interpreter"); pipeline.AddPass( - hlo_module->mutable_device_entry_computation_layout()); + hlo_module->mutable_entry_computation_layout()); return pipeline.Run(hlo_module).status(); } @@ -69,8 +69,8 @@ StatusOr> InterpreterCompiler::RunBackend( // Create executable from only the Hlo module. 
std::unique_ptr executable = - xla::MakeUnique(std::move(hlo_module), - xla::MakeUnique()); + absl::make_unique( + std::move(hlo_module), absl::make_unique()); return std::move(executable); } @@ -103,11 +103,11 @@ HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction() static bool InitModule() { xla::Compiler::RegisterCompilerFactory( se::interpreter::kXlaInterpreterPlatformId, []() { - return xla::MakeUnique(); + return absl::make_unique(); }); xla::ComputationPlacer::RegisterComputationPlacer( se::interpreter::kXlaInterpreterPlatformId, - []() { return xla::MakeUnique(); }); + []() { return absl::make_unique(); }); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 029e71058a7373b9310c6d9ffdb65f72ca28e5af..5dea12476849db6f7a9a9214398b4e57262aeda0 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -21,8 +21,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" @@ -47,7 +47,7 @@ InterpreterExecutable::~InterpreterExecutable() {} StatusOr InterpreterExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { se::Stream* stream = run_options->stream(); se::StreamExecutor* executor = stream->parent(); @@ -75,9 +75,9 @@ StatusOr InterpreterExecutable::ExecuteOnStream( // consumes. 
std::vector> arg_literals; for (int64 p = 0; p < computation->num_parameters(); ++p) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr arg_literal, - transfer_manager->TransferLiteralFromDevice(executor, *arguments[p])); + TF_ASSIGN_OR_RETURN(std::unique_ptr arg_literal, + transfer_manager->TransferLiteralFromDevice( + run_options->stream(), *arguments[p])); arg_literals.push_back(std::move(arg_literal)); } @@ -96,7 +96,7 @@ StatusOr InterpreterExecutable::ExecuteOnStream( result_literal->shape(), run_options->allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( - executor, *result_literal, result)); + run_options->stream(), *result_literal, result)); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -111,7 +111,7 @@ StatusOr InterpreterExecutable::ExecuteOnStream( StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { return tensorflow::errors::Unimplemented( "ExecuteAsyncOnStream is not yet supported on Interpreter."); } diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index 91d8148d26dc8eddbafdaf4870d9efbb73a12816..3b1ebce0c75457d65e6834c809fe488a9c4a159a 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" @@ -29,7 +30,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -48,13 +48,13 @@ class InterpreterExecutable : public Executable { StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override LOCKS_EXCLUDED(evaluator_lock_); StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) override; + absl::Span arguments) override; static int64 ShapeSizeBytes(const Shape& shape); diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index 97e9fa2c8e8ecd918ffe3df2fd4e731f3b91e6db..4fb67bd0b72fc591c1ffa76ebb0513bf14ed3737 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -53,6 +53,7 @@ bool XlaInterpreterExecutor::Memcpy(Stream *stream, void *host_dst, AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() { port::Status ok = SynchronousMemcpy(host_dst, dev_src, size); }); + AsExecutorStream(stream)->BlockUntilDone(); return true; } @@ -61,6 +62,7 @@ bool XlaInterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst, AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() { port::Status ok = SynchronousMemcpy(dev_dst, host_src, size); }); + AsExecutorStream(stream)->BlockUntilDone(); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 9b109022fbfc698f7dadc678ef837da270a5e74a..fbb99457847dca69a1901006d5d8ff713882f918 100644 
--- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -22,9 +22,9 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/blas.h" #include "tensorflow/stream_executor/device_description.h" @@ -47,7 +47,7 @@ limitations under the License. namespace stream_executor { namespace interpreter { -using Args = tensorflow::gtl::ArraySlice; +using Args = absl::Span; class XlaInterpreterExecutor : public internal::StreamExecutorInterface { public: @@ -104,7 +104,7 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { } // No "synchronize all activity" implemented for this platform at the moment. - bool SynchronizeAllActivity() override { return false; } + bool SynchronizeAllActivity() override { return true; } bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override { return false; } diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc index d27cd7502f10a1f615fc5b0d610acafdf55e3e43..7955ee5cf37f3fa45b942d8ab05a60076857dc6c 100644 --- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" @@ -31,7 +31,7 @@ InterpreterTransferManager::InterpreterTransferManager() static std::unique_ptr CreateInterpreterTransferManager() { - return xla::MakeUnique(); + return absl::make_unique(); } static bool InitModule() { diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h index 2b44f308218e2f61f08012769246b8a0e9639822..b732230fdd88b694f21ad5bc03d373331f8fb8f9 100644 --- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h +++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_ #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/core/platform/macros.h" @@ -33,4 +33,4 @@ class InterpreterTransferManager : public GenericTransferManager { } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index 42c2c28997d5f3b02f1fe4effca164c893e4071d..c9b40d3c6195f80a19272a0d98890049d02315b9 100644 --- 
a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -17,13 +17,14 @@ limitations under the License. #include +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" #include "tensorflow/stream_executor/device_options.h" #include "tensorflow/stream_executor/lib/initialize.h" #include "tensorflow/stream_executor/lib/ptr_util.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/status_macros.h" -#include "tensorflow/stream_executor/lib/stringprintf.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" @@ -70,15 +71,15 @@ port::StatusOr XlaInterpreterPlatform::GetExecutor( port::StatusOr> XlaInterpreterPlatform::GetUncachedExecutor( const StreamExecutorConfig& config) { - auto executor = MakeUnique( - this, MakeUnique(config.plugin_config)); + auto executor = absl::make_unique( + this, absl::make_unique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ port::error::INTERNAL, - port::Printf( + absl::StrFormat( "failed initializing StreamExecutor for device ordinal %d: %s", - config.ordinal, init_status.ToString().c_str())}; + config.ordinal, init_status.ToString())}; } return std::move(executor); diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 7067b6f86a0fb24fb946ad236bca9bbd48d53722..6e17711f575b24ffcfcbf1a78bb803603b001adf 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -26,14 +26,20 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/computation_layout.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" @@ -46,22 +52,11 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" namespace xla { -// For now moving only one API here, but we should have a single top level -// anonymous namespace, instead of three or four spread all over this file. 
-namespace { - - -} // namespace - std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint) { out << constraint.ToString(); @@ -76,9 +71,8 @@ BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout, } string BufferLayoutConstraint::ToString() const { - return tensorflow::strings::Printf("BufferLayoutConstraint %s: %s", - buffer_->ToString().c_str(), - LayoutUtil::HumanString(layout_).c_str()); + return absl::StrFormat("BufferLayoutConstraint %s: %s", buffer_->ToString(), + LayoutUtil::HumanString(layout_)); } OperandLayoutConstraint::OperandLayoutConstraint( @@ -97,15 +91,14 @@ OperandLayoutConstraint::OperandLayoutConstraint( } string OperandLayoutConstraint::ToString() const { - return tensorflow::strings::Printf( - "OperandLayoutConstraint %s, operand %lld: %s", - instruction_->name().c_str(), operand_no_, - shape_layout_.ToString().c_str()); + return absl::StrFormat("OperandLayoutConstraint %s, operand %d: %s", + instruction_->name(), operand_no_, + shape_layout_.ToString()); } string ResultLayoutConstraint::ToString() const { - return tensorflow::strings::Printf("ResultLayoutConstraint: %s", - shape_layout_.ToString().c_str()); + return absl::StrFormat("ResultLayoutConstraint: %s", + shape_layout_.ToString()); } LayoutConstraints::LayoutConstraints( @@ -113,14 +106,18 @@ LayoutConstraints::LayoutConstraints( HloComputation* computation) : points_to_analysis_(points_to_analysis), computation_(computation) { // Gather all array-shaped logical buffers into unconstrained_buffer_ids. - for (LogicalBuffer::Id id = 0; id < points_to_analysis_.num_logical_buffers(); - id++) { - auto& buffer = points_to_analysis_.logical_buffer(id); - // The points to analysis is computed per module, restrict constraints to - // array buffers in this computation. 
- if (buffer.IsArray() && buffer.instruction()->parent() == computation) { - unconstrained_buffer_ids_.insert(buffer.id()); - } + for (HloInstruction* inst : computation_->instructions()) { + points_to_analysis_.GetPointsToSet(inst).ForEachElement( + [&](const ShapeIndex&, const PointsToSet::BufferList& buffers) { + for (const LogicalBuffer* buffer : buffers) { + // The points to analysis is computed per module, restrict + // constraints to array buffers in this computation. + if (buffer->IsArray() && + buffer->instruction()->parent() == computation) { + unconstrained_buffer_ids_.insert(buffer->id()); + } + } + }); } } @@ -132,7 +129,7 @@ PointsToSet::BufferSet* LayoutConstraints::GetBufferSet( } auto& buffer_set = buffer_sets_cache_ - .emplace(instruction, MakeUnique()) + .emplace(instruction, absl::make_unique()) .first->second; const auto& points_to_set = points_to_analysis_.GetPointsToSet(instruction); points_to_set.ForEachElement( @@ -169,47 +166,36 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout, return FailedPrecondition( "Layout of buffer %s cannot be constrained because buffer is not " "array-shaped, has shape: %s", - buffer.ToString().c_str(), - ShapeUtil::HumanString(buffer.shape()).c_str()); + buffer.ToString(), ShapeUtil::HumanString(buffer.shape())); } TF_RETURN_IF_ERROR( LayoutUtil::ValidateLayoutForShape(layout, buffer.shape())); - const BufferLayoutConstraint* curr_constraint = - GetBufferLayoutConstraint(buffer); - if (curr_constraint != nullptr) { - if (LayoutUtil::Equal(curr_constraint->layout(), layout)) { + auto iter = buffer_constraints_.find(&buffer); + if (iter != buffer_constraints_.end()) { + const BufferLayoutConstraint& curr_constraint = iter->second; + if (LayoutUtil::Equal(curr_constraint.layout(), layout)) { // New constraint matches existing constraint. Nothing to do. 
return Status::OK(); } - if (curr_constraint->mandatory()) { + if (curr_constraint.mandatory()) { return FailedPrecondition( "Buffer %s already has the layout constraint %s, cannot add " "incompatible constraint %s", - buffer.ToString().c_str(), - LayoutUtil::HumanString(curr_constraint->layout()).c_str(), - LayoutUtil::HumanString(layout).c_str()); + buffer.ToString(), LayoutUtil::HumanString(curr_constraint.layout()), + LayoutUtil::HumanString(layout)); } - } - - auto iter = buffer_constraints_.find(&buffer); - bool overwrite = iter != buffer_constraints_.end(); - if (!overwrite) { + iter->second = BufferLayoutConstraint(layout, buffer, mandatory, dfs); + } else { + TF_RET_CHECK(unconstrained_buffer_ids_.erase(buffer.id()) == 1) + << buffer.ToString(); iter = buffer_constraints_ .insert(std::make_pair( &buffer, BufferLayoutConstraint(layout, buffer, mandatory, dfs))) .first; - } else { - iter->second = BufferLayoutConstraint(layout, buffer, mandatory, dfs); } added_constraints_.push_back(&iter->second); - - // Remove buffer from the set of unconstrained buffers. 
- TF_RET_CHECK(unconstrained_buffer_ids_.count(buffer.id()) == - static_cast(!overwrite)); - unconstrained_buffer_ids_.erase(buffer.id()); - return Status::OK(); } @@ -231,11 +217,11 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, } if (curr_shape_layout->mandatory()) { return FailedPrecondition( - "Operand %lld of instruction %s already has a layout constraint " + "Operand %d of instruction %s already has a layout constraint " "%s, cannot add incompatible constraint %s", - operand_no, instruction->name().c_str(), - curr_shape_layout->shape_layout().ToString().c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + operand_no, instruction->name(), + curr_shape_layout->shape_layout().ToString(), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } } @@ -244,9 +230,9 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout, // layouts beyond this immediate use and is complicated to handle. if (OperandBufferForwarded(instruction, operand_no)) { return FailedPrecondition( - "Cannot constraint layout of operand %lld of instruction %s " + "Cannot constraint layout of operand %d of instruction %s " "because instruction forwards operand's LogicalBuffer(s)", - operand_no, instruction->name().c_str()); + operand_no, instruction->name()); } auto key = std::make_pair(instruction, operand_no); @@ -288,8 +274,8 @@ Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout, return FailedPrecondition( "Result of computation %s already has the layout constraint %s, " "cannot add incompatible constraint %s", - computation_->name().c_str(), curr_shape_layout->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + computation_->name(), curr_shape_layout->ToString(), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } // New constraint matches existing constraint. Nothing to do. 
return Status::OK(); @@ -311,9 +297,8 @@ Status LayoutConstraints::SetInstructionLayout( if (!ShapeUtil::Compatible(shape_with_layout, instruction->shape())) { return FailedPrecondition( "Instruction %s of shape %s cannot be assigned incompatible layout %s", - instruction->name().c_str(), - ShapeUtil::HumanString(instruction->shape()).c_str(), - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); + instruction->name(), ShapeUtil::HumanString(instruction->shape()), + ShapeUtil::HumanStringWithLayout(shape_with_layout)); } // Create a BufferLayoutConstraint for each array shape in the output of the @@ -372,35 +357,68 @@ const ShapeLayout* LayoutConstraints::ResultLayout() const { string LayoutConstraints::ToString() const { string output; - tensorflow::strings::StrAppend(&output, "LayoutConstraints for computation ", - computation_->name(), ":\n"); + absl::StrAppend(&output, "LayoutConstraints for computation ", + computation_->name(), ":\n"); for (auto* instruction : computation_->MakeInstructionPostOrder()) { - tensorflow::strings::StrAppend(&output, " ", instruction->ToShortString(), - "\n"); + absl::StrAppend(&output, " ", instruction->ToShortString(), "\n"); for (int64 i = 0; i < instruction->operand_count(); ++i) { if (OperandLayout(instruction, i) != nullptr) { - tensorflow::strings::StrAppend( - &output, " operand (", i, - "): ", OperandLayout(instruction, i)->ToString(), "\n"); + absl::StrAppend(&output, " operand (", i, + "): ", OperandLayout(instruction, i)->ToString(), "\n"); } } for (const LogicalBuffer* buffer : points_to_analysis_.GetBuffersDefinedByInstruction(instruction)) { if (BufferLayout(*buffer) != nullptr) { - tensorflow::strings::StrAppend( - &output, " ", buffer->ToString(), " : ", - LayoutUtil::HumanString(*BufferLayout(*buffer)), "\n"); + absl::StrAppend(&output, " ", buffer->ToString(), " : ", + LayoutUtil::HumanString(*BufferLayout(*buffer)), "\n"); } } } if (ResultLayout() != nullptr) { - tensorflow::strings::StrAppend(&output, 
" => ", ResultLayout()->ToString(), - "\n"); + absl::StrAppend(&output, " => ", ResultLayout()->ToString(), "\n"); } return output; } +namespace { + +bool IsHostSendRecv(const HloInstruction* instruction) { + const HloSendRecvInstruction* send_recv_instr = + DynCast(instruction); + return send_recv_instr != nullptr && send_recv_instr->is_host_transfer(); +} + +} // namespace + +Status LayoutAssignment::BuildHostChannelConstraints( + HloComputation* computation) { + for (auto* instruction : computation->instructions()) { + const HloSendRecvInstruction* send_recv_instr = + DynCast(instruction); + if (send_recv_instr == nullptr || !send_recv_instr->is_host_transfer()) { + continue; + } + + // For host transfers the Send and Recv instruction carry the layout. + if (instruction->opcode() == HloOpcode::kSend || + instruction->opcode() == HloOpcode::kRecv) { + const Shape& data_shape = + ShapeUtil::GetTupleElementShape(send_recv_instr->shape(), 0); + TF_RET_CHECK(ShapeUtil::IsArray(data_shape)); + TF_RET_CHECK(LayoutUtil::HasLayout(data_shape)); + const Layout* prev_layout = host_channel_constraints_.ConstrainChannel( + send_recv_instr->channel_id(), data_shape.layout()); + TF_RET_CHECK(prev_layout == nullptr) + << "Cannot constrain host transfer layout as it was set to " + << LayoutUtil::HumanString(*prev_layout) << ": " + << send_recv_instr->ToString(); + } + } + return Status::OK(); +} + Status LayoutAssignment::AddMandatoryConstraints( const ComputationLayout* computation_layout, ChannelLayoutConstraints* channel_constraints, HloComputation* computation, @@ -408,6 +426,11 @@ Status LayoutAssignment::AddMandatoryConstraints( VLOG(3) << "Adding mandatory layout constraints to computation " << computation->name(); + auto get_channel_constraints = [&](const HloInstruction* instruction) { + return IsHostSendRecv(instruction) ? &host_channel_constraints_ + : channel_constraints; + }; + // Constrain layouts of instructions which define values with pre-existing // layouts. 
for (auto* instruction : computation->instructions()) { @@ -444,18 +467,21 @@ Status LayoutAssignment::AddMandatoryConstraints( if (instruction->opcode() == HloOpcode::kSend || instruction->opcode() == HloOpcode::kRecv) { - CHECK(channel_constraints) + CHECK(get_channel_constraints(instruction)) << "Multi-module layout assignment requires ChannelLayoutConstraints"; int64 channel_id = instruction->channel_id(); - if (!channel_constraints->IsChannelConstrained(channel_id)) { + if (!get_channel_constraints(instruction) + ->IsChannelConstrained(channel_id)) { continue; } if (instruction->opcode() == HloOpcode::kSend) { // TODO(b/68493863): Change to use SetOperandLayout(). const Shape send_buffer_shape = instruction->operand(0)->shape(); TF_RET_CHECK(ShapeUtil::IsArray(send_buffer_shape)); - Shape new_buffer_shape = channel_constraints->LayoutShapeForChannel( - send_buffer_shape, instruction->channel_id()); + Shape new_buffer_shape = + get_channel_constraints(instruction) + ->LayoutShapeForChannel(send_buffer_shape, + instruction->channel_id()); TF_RETURN_IF_ERROR(constraints->SetInstructionLayout( new_buffer_shape, instruction->operand(0))); } else { @@ -466,8 +492,9 @@ Status LayoutAssignment::AddMandatoryConstraints( const LogicalBuffer* buffer, constraints->points_to_analysis().GetBufferDefinedAt(instruction, {0})); - Shape new_shape = channel_constraints->LayoutShapeForChannel( - recv_buffer_shape, instruction->channel_id()); + Shape new_shape = get_channel_constraints(instruction) + ->LayoutShapeForChannel( + recv_buffer_shape, instruction->channel_id()); TF_RETURN_IF_ERROR( constraints->SetBufferLayout(new_shape.layout(), *buffer)); } @@ -716,11 +743,12 @@ Status CheckParameterLayout(HloInstruction* parameter, const ComputationLayout& computation_layout) { const ShapeLayout& parameter_layout = computation_layout.parameter_layout(parameter->parameter_number()); - if (!parameter_layout.MatchesLayoutInShape(parameter->shape())) { + if 
(parameter_layout.LayoutIsSet() && + !parameter_layout.MatchesLayoutInShape(parameter->shape())) { return InternalError( "parameter instruction %s does not match layout of computation " "shape: %s", - parameter->ToString().c_str(), parameter_layout.ToString().c_str()); + parameter->ToString(), parameter_layout.ToString()); } return Status::OK(); } @@ -731,8 +759,8 @@ Status CheckConstantLayout(HloInstruction* constant) { constant->shape())) { return InternalError( "constant instruction %s does not match the layout of its literal %s", - constant->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(constant->literal().shape()).c_str()); + constant->ToString(), + ShapeUtil::HumanStringWithLayout(constant->literal().shape())); } return Status::OK(); } @@ -831,8 +859,8 @@ void LayoutAssignment::SetupCopiedInstruction(const HloInstruction& instruction, // HostCompute module. // Otherwise it is preferable to leave the new instruction without device, // and let the automatic device placer to choose the best location. 
- if (!sharding.HasUniqueDevice() || - HloSharding::IsReservedDevice(sharding.UniqueDevice().ValueOrDie())) { + auto device = sharding.UniqueDevice(); + if (!device || HloSharding::IsReservedDevice(*device)) { copy->set_sharding(sharding); } } @@ -865,13 +893,10 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", - instruction->name().c_str(), - tensorflow::str_util::Join(index, ",").c_str(), - buffer->ToString().c_str(), - ShapeUtil::HumanStringWithLayout(instruction_subshape) - .c_str(), - ShapeUtil::HumanStringWithLayout(buffer->shape()) - .c_str()); + instruction->name(), absl::StrJoin(index, ","), + buffer->ToString(), + ShapeUtil::HumanStringWithLayout(instruction_subshape), + ShapeUtil::HumanStringWithLayout(buffer->shape())); } } } @@ -936,14 +961,15 @@ LayoutAssignment::LayoutAssignment( ComputationLayout* entry_computation_layout, ChannelLayoutConstraints* channel_constraints) : entry_computation_layout_(entry_computation_layout), + saved_entry_computation_layout_(*entry_computation_layout), channel_layout_constraints_(channel_constraints) { + if (channel_layout_constraints_ != nullptr) { + // Save a copy of the input ChannelLayoutConstraints so that we can reset it + // if we have to undo previous operations (ClearPreviousPassSideEffects()). + channel_constraints_ = *channel_layout_constraints_; + } VLOG(1) << "Entry computation layout given to layout assignment: " << entry_computation_layout_->ToString(); - // Layouts of all parameter instructions must be set. 
- for (const ShapeLayout& parameter_layout : - entry_computation_layout_->parameter_layouts()) { - CHECK(parameter_layout.LayoutIsSet()); - } } std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( @@ -954,17 +980,18 @@ std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( CHECK(ShapeUtil::IsArray(instruction->shape())); CHECK(ShapeUtil::IsArray(operand->shape())); - if (instruction->IsElementwiseOnOperand(operand_no) && - !ShapeUtil::IsScalar(operand->shape()) && + if (!ShapeUtil::IsScalar(operand->shape()) && ShapeUtil::Rank(operand->shape()) == - ShapeUtil::Rank(instruction->shape())) { - // Assign operands the same layout as the instruction, so that + ShapeUtil::Rank(instruction->shape()) && + InstructionRequiresInputLayoutEqualToOutputLayout(instruction)) { + // Propagate the result layout to the operand layout if the instruction + // requires the same layout out for the result and the operand. + // + // For elementwise operations, using the same layout for the operands and + // the result also has the following benefits: // 1) the elementwise operation can reuse its operand's buffer, and // 2) the input and output elements can reuse the same linear index. - // - // TODO(jingyue): Other operations, such as kSlice and kConcat, can benefit - // from assigning the same layout to input and output. 
- return MakeUnique(output_layout); + return absl::make_unique(output_layout); } if (instruction->opcode() == HloOpcode::kReshape) { @@ -987,13 +1014,13 @@ std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( *operand_shape.mutable_layout() = LayoutUtil::GetDefaultLayoutForShape(operand_shape); if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) { - return MakeUnique(operand_shape.layout()); + return absl::make_unique(operand_shape.layout()); } if (ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)) { *operand_shape.mutable_layout() = output_layout; if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) { - return MakeUnique(output_layout); + return absl::make_unique(output_layout); } } auto aligned_operand_shape = @@ -1002,7 +1029,7 @@ std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( auto operand_layout = aligned_operand_shape.value().layout(); TF_CHECK_OK( LayoutUtil::ValidateLayoutForShape(operand_layout, operand_shape)); - return MakeUnique(operand_layout); + return absl::make_unique(operand_layout); } } @@ -1018,7 +1045,7 @@ std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( Layout operand_layout = LayoutUtil::MakeLayout(new_minor_to_major); TF_CHECK_OK( LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape())); - return MakeUnique(operand_layout); + return absl::make_unique(operand_layout); } return nullptr; @@ -1032,11 +1059,11 @@ std::unique_ptr LayoutAssignment::ChooseOutputLayoutFromOperandLayout( CHECK(ShapeUtil::IsArray(user->shape()) && ShapeUtil::IsArray(operand->shape())); - if (user->IsElementwiseOnOperand(operand_no) && - !ShapeUtil::IsScalar(operand->shape()) && - ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape())) { + if (!ShapeUtil::IsScalar(operand->shape()) && + ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape()) && + InstructionRequiresInputLayoutEqualToOutputLayout(user)) { // Assign 
users the same layout as the operand. - return MakeUnique(operand_layout); + return absl::make_unique(operand_layout); } if (user->opcode() == HloOpcode::kReshape) { @@ -1059,13 +1086,13 @@ std::unique_ptr LayoutAssignment::ChooseOutputLayoutFromOperandLayout( *output_shape.mutable_layout() = LayoutUtil::GetDefaultLayoutForShape(output_shape); if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) { - return MakeUnique(output_shape.layout()); + return absl::make_unique(output_shape.layout()); } if (ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape)) { *output_shape.mutable_layout() = operand_layout; if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) { - return MakeUnique(operand_layout); + return absl::make_unique(operand_layout); } } auto aligned_user_shape = @@ -1074,7 +1101,7 @@ std::unique_ptr LayoutAssignment::ChooseOutputLayoutFromOperandLayout( auto user_layout = aligned_user_shape.value().layout(); TF_CHECK_OK( LayoutUtil::ValidateLayoutForShape(user_layout, output_shape)); - return MakeUnique(user_layout); + return absl::make_unique(user_layout); } } @@ -1090,7 +1117,7 @@ std::unique_ptr LayoutAssignment::ChooseOutputLayoutFromOperandLayout( } Layout user_layout = LayoutUtil::MakeLayout(new_minor_to_major); TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(user_layout, user->shape())); - return MakeUnique(user_layout); + return absl::make_unique(user_layout); } return nullptr; @@ -1184,7 +1211,7 @@ Status LayoutAssignment::PropagateUseConstraintToDefs( const PointsToSet& points_to_set = constraints->points_to_analysis().GetPointsToSet(instruction); return points_to_set.ForEachElementWithStatus( - [this, &shape_layout, constraints]( + [&shape_layout, constraints]( const ShapeIndex& index, const PointsToSet::BufferList& buffers) -> Status { if (ShapeUtil::IsLeafIndex(shape_layout.shape(), index)) { @@ -1341,7 +1368,7 @@ StatusOr InferArrayLayout( // This should not happen because we've assigned layouts 
to all // instructions preceding this one. return InternalError("LogicalBuffer %s does not have a layout", - source_buffer->ToString().c_str()); + source_buffer->ToString()); } if (first_buffer_layout == nullptr) { @@ -1356,9 +1383,8 @@ StatusOr InferArrayLayout( return FailedPrecondition( "Array at index {%s} in instruction %s aliases buffers %s " "and %s which have different layouts", - tensorflow::str_util::Join(index, ",").c_str(), - instruction->name().c_str(), source_buffers[0]->ToString().c_str(), - source_buffer->ToString().c_str()); + absl::StrJoin(index, ","), instruction->name(), + source_buffers[0]->ToString(), source_buffer->ToString()); } } @@ -1519,14 +1545,14 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // and the computation result. The latter two are specified in // computation_layout, so we only need to keep the existing layouts for // infeeds. Clearing the layouts here avoids hiding potential bugs in the - // layout assignment pass that may accidently use the existing layout. + // layout assignment pass that may accidentally use the existing layout. for (HloInstruction* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kBitcast) { // bitcasts are inherently layout sensitive and so a bitcast instruction // present in the IR before layout assignment is a bug. return InternalError( "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString().c_str()); + instruction->ToString()); } if (instruction->opcode() != HloOpcode::kInfeed) { LayoutUtil::ClearLayout(instruction->mutable_shape()); @@ -1542,6 +1568,10 @@ Status LayoutAssignment::RunOnComputation( ChannelLayoutConstraints* channel_constraints) { VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name() << ")"; + + // Must be run before clearing layouts. 
+ TF_RETURN_IF_ERROR(BuildHostChannelConstraints(computation)); + TF_RETURN_IF_ERROR(ClearComputationLayouts(computation)); if (computation_layout != nullptr) { auto it = computation_layouts_.find(computation); @@ -1572,6 +1602,13 @@ Status LayoutAssignment::RunOnComputation( // Propagates layouts from mandatory and backend constraints. TF_RETURN_IF_ERROR(PropagateConstraints(&constraints)); + // Prior to applying default layouts, we take note of all HLO instructions + // which lack a layout constraint. + for (LogicalBuffer::Id buffer_id : constraints.unconstrained_buffer_ids()) { + unconstrained_layout_instructions_.insert( + points_to_analysis.GetBuffer(buffer_id).instruction()); + } + // While any unconstrained buffers remain, pick an arbitrary buffer, give it a // layout and propagate the change. while (!constraints.unconstrained_buffer_ids().empty()) { @@ -1614,13 +1651,65 @@ Status LayoutAssignment::RunOnComputation( // Record the layouts assigned for any communication ops in // channel_constraints so that they are constrained for future modules. + if (channel_constraints != nullptr) { + TF_RETURN_IF_ERROR( + ConstrainChannelLayouts(computation, channel_constraints)); + } + return Status::OK(); +} + +Status LayoutAssignment::ConstrainChannelLayouts( + HloComputation* computation, + ChannelLayoutConstraints* channel_constraints) { + auto get_channel_constraints = [&](const HloInstruction* instruction) { + return IsHostSendRecv(instruction) ? &host_channel_constraints_ + : channel_constraints; + }; + // We go through the kRecvDone before. These must either impose their layout, + // or find a matching one already existing (ConstrainChannel() returns + // nullptr). 
for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kRecvDone) { + const Layout* layout = + get_channel_constraints(instruction) + ->ConstrainChannel( + instruction->channel_id(), + ShapeUtil::GetSubshape(instruction->shape(), {0}).layout()); + TF_RET_CHECK(layout == nullptr) + << instruction->ToString() + << " cannot constrain layout as it was set to " + << LayoutUtil::HumanString(*layout); + } + } + // After that we go through the kSend. These are likely going to have a kCopy + // as operand (otherwise we add it), so in case the constrained layout does + // not match, we can change the kCopy layout (and the kSend one as well). + for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { if (instruction->opcode() == HloOpcode::kSend) { - channel_constraints->ConstrainChannel( - instruction->channel_id(), instruction->operand(0)->shape().layout()); - } else if (instruction->opcode() == HloOpcode::kRecvDone) { - channel_constraints->ConstrainChannel(instruction->channel_id(), - instruction->shape().layout()); + HloInstruction* operand = instruction->mutable_operand(0); + const Layout* layout = get_channel_constraints(instruction) + ->ConstrainChannel(instruction->channel_id(), + operand->shape().layout()); + if (layout != nullptr) { + // We found an already constrained layout which does not match the one + // the kSend wants to impose. Either add a new kCopy, or use the + // existing one to marshal the correct shape. 
+ Shape shape = operand->shape(); + *shape.mutable_layout() = *layout; + if (operand->opcode() != HloOpcode::kCopy) { + HloInstruction* copy = operand->parent()->AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, operand)); + RegisterAddedCopy(copy); + SetupCopiedInstruction(*operand, copy, {}); + TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(0, copy)); + operand = copy; + } else { + *operand->mutable_shape() = shape; + } + Shape* send_shape = + ShapeUtil::GetMutableSubshape(instruction->mutable_shape(), {0}); + *send_shape = shape; + } } } return Status::OK(); @@ -1672,13 +1761,14 @@ StatusOr LayoutAssignment::Run(HloModule* module) { // when seen from an outer instruction, which has across-computation // constraints to impose. // For example, the kWhile instruction needs to enforce the same layouts for - // the parameters and root of the bosy, as well as the condition parameters. + // the parameters and root of the body, as well as the condition parameters. // Similarly, the kConditional instruction needs to enforce the same layouts // for the root of the true and false computations. // So in the first pass, while allowing the layouts to flow to parameters and // root, we also fix up the eventually inconsistent ComputationLayout, which // will be then made mandatory by the second pass. for (int64 i = 0; i < 2; ++i) { + VLOG(5) << "Running " << (i == 0 ? 
"un" : "") << "constrained pass"; TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module)); TF_ASSIGN_OR_RETURN(auto points_to_analysis, TuplePointsToAnalysis::Run(module)); @@ -1714,12 +1804,115 @@ StatusOr LayoutAssignment::Run(HloModule* module) { return true; } +bool LayoutAssignment::InstructionRequiresInputLayoutEqualToOutputLayout( + const HloInstruction* instruction) { + switch (instruction->opcode()) { + case HloOpcode::kAbs: + case HloOpcode::kAdd: + case HloOpcode::kAnd: + case HloOpcode::kAtan2: + case HloOpcode::kBitcastConvert: + case HloOpcode::kCeil: + case HloOpcode::kClamp: + case HloOpcode::kClz: + case HloOpcode::kComplex: + case HloOpcode::kConcatenate: + case HloOpcode::kConditional: + case HloOpcode::kConvert: + case HloOpcode::kCos: + case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: + case HloOpcode::kCustomCall: + case HloOpcode::kDivide: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kEq: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFft: + case HloOpcode::kFloor: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLe: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kLt: + case HloOpcode::kMap: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNe: + case HloOpcode::kNegate: + case HloOpcode::kNot: + case HloOpcode::kOr: + case HloOpcode::kXor: + case HloOpcode::kPad: + case HloOpcode::kPower: + case HloOpcode::kReal: + case HloOpcode::kReducePrecision: + case HloOpcode::kReduceWindow: + case HloOpcode::kRemainder: + case HloOpcode::kReverse: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kSelect: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + case HloOpcode::kSign: + case 
HloOpcode::kSin: + case HloOpcode::kSlice: + case HloOpcode::kSort: + case HloOpcode::kSubtract: + case HloOpcode::kTanh: + case HloOpcode::kTupleSelect: + case HloOpcode::kWhile: + return true; + case HloOpcode::kBatchNormGrad: + case HloOpcode::kBatchNormInference: + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBitcast: + case HloOpcode::kBroadcast: + case HloOpcode::kCall: + case HloOpcode::kConstant: + case HloOpcode::kConvolution: + case HloOpcode::kCopy: + case HloOpcode::kDomain: + case HloOpcode::kDot: + case HloOpcode::kFusion: + case HloOpcode::kGather: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kIota: + case HloOpcode::kOutfeed: + case HloOpcode::kParameter: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kReduce: + case HloOpcode::kReshape: + case HloOpcode::kRng: + case HloOpcode::kScatter: + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kAfterAll: + case HloOpcode::kTrace: + case HloOpcode::kTranspose: + case HloOpcode::kTuple: + return false; + } +} + Status LayoutAssignment::Init() { computation_layouts_.clear(); + *entry_computation_layout_ = saved_entry_computation_layout_; return Status::OK(); } Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) { + VLOG(5) << "Clearing previous side effects"; // Clear all the copies which have been added, and all the related // instructions (like GTE and tuples). 
int64 removed_copies = 0; @@ -1737,12 +1930,14 @@ Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) { } } added_copies_.clear(); + unconstrained_layout_instructions_.clear(); if (removed_copies > 0) { TupleSimplifier tuple_simplifier; HloDCE dce; TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); TF_RETURN_IF_ERROR(dce.Run(module).status()); } + ResetChannelConstraints(); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index c287cca0c54ba1bb514bd8d243c137eca99b258f..cf545031d3c7c66770ea4a2392a2df3b8c24cd38 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -249,25 +249,30 @@ class ChannelLayoutConstraints { // Given `shape`, apply the layout for `channel_id`. `channel_id` must already // be constrained. Shape LayoutShapeForChannel(Shape shape, int64 channel_id) const { - CHECK(IsChannelConstrained(channel_id)); - *shape.mutable_layout() = constraints_.at(channel_id); + auto it = constraints_.find(channel_id); + CHECK(it != constraints_.end()) << "Channel " << channel_id; + *shape.mutable_layout() = it->second; return shape; } // Returns the layout constraint for `channel_id`, which must already be // constrained. - Layout LayoutForChannel(int64 channel_id) const { - CHECK(IsChannelConstrained(channel_id)); - return constraints_.at(channel_id); + const Layout& LayoutForChannel(int64 channel_id) const { + auto it = constraints_.find(channel_id); + CHECK(it != constraints_.end()) << "Channel " << channel_id; + return it->second; } // Adds a new layout constraint for `channel_id`. If a constraint for - // `channel_id` already exists, this operation requires that the new layout is - // the same as the previously constrained layout. 
- void ConstrainChannel(int64 channel_id, const Layout& layout) { - CHECK(!IsChannelConstrained(channel_id) || - LayoutUtil::Equal(layout, constraints_[channel_id])); - constraints_[channel_id] = layout; + // `channel_id` has been added, this API returns nullptr, otherwise returns + // the layout which has already been set for the channel. + const Layout* ConstrainChannel(int64 channel_id, const Layout& layout) { + auto it = constraints_.emplace(std::make_pair(channel_id, layout)); + if (it.second) { + return nullptr; + } + return LayoutUtil::Equal(layout, it.first->second) ? nullptr + : &it.first->second; } private: @@ -292,12 +297,17 @@ class LayoutAssignment : public HloPassInterface { ComputationLayout* entry_computation_layout, ChannelLayoutConstraints* channel_constraints = nullptr); ~LayoutAssignment() override {} - tensorflow::StringPiece name() const override { return "layout-assignment"; } + absl::string_view name() const override { return "layout-assignment"; } // Assign layouts to the given module. Returns whether the module was changed // (any layouts were changed). StatusOr Run(HloModule* module) override; + // Returns true if the instruction requires that operands with the same rank + // as the output have to have the same layout as the output. + virtual bool InstructionRequiresInputLayoutEqualToOutputLayout( + const HloInstruction* instruction); + protected: // These methods, invoked by PropagateConstraints, propagate a layout // constraint to its neighbors (i.e. operands and users) in order to minimize @@ -427,8 +437,13 @@ class LayoutAssignment : public HloPassInterface { Status PropagateComputationLayouts(HloComputation* computation, ComputationLayout* computation_layout); + // The pointer to the ComputationLayout passed as constructor parameter. ComputationLayout* entry_computation_layout_; + // A copy of entry_computation_layout_ used to reset it to the initial values + // during the multiple passes done by the layout assignment operation. 
+ ComputationLayout saved_entry_computation_layout_; + protected: // Sets up the copy instruction according to the characteristic (sharding, // metadata, ...) of the reference instruction. The index argument is used @@ -464,6 +479,23 @@ class LayoutAssignment : public HloPassInterface { // itself). Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number); + // Apply the channel layout constraints by populating the channel_constraints + // data structure passed in at constructor time. Eventually adds copies in + // case two ends of a channel ended up with a different layout. + Status ConstrainChannelLayouts(HloComputation* computation, + ChannelLayoutConstraints* channel_constraints); + + // Resets the input ChannelLayoutConstraints to the original copy received + // from the constructor input. + void ResetChannelConstraints() { + if (channel_layout_constraints_ != nullptr) { + *channel_layout_constraints_ = channel_constraints_; + } + } + + // Adds constraints related to host Send/Recv instructions. + Status BuildHostChannelConstraints(HloComputation* computation); + // Map containing the layouts of all computations assigned so // far. Computations are handled in a topological sort where computations are // handled before their caller instructions so the layouts of caller @@ -474,7 +506,23 @@ class LayoutAssignment : public HloPassInterface { // here. tensorflow::gtl::FlatSet added_copies_; - ChannelLayoutConstraints* channel_layout_constraints_; + // The pointer to the channel layout constraints passed in with the + // constructor. If not nullptr, this is an input/output argument. + ChannelLayoutConstraints* channel_layout_constraints_ = nullptr; + + // A copy of the input layout constraints used to reset the above pointer in + // case we have to undo operations due to the multiple passes over the + // computations/instructions. 
+ ChannelLayoutConstraints channel_constraints_; + + // Layout constraints for send/recv instructions which communicate with the + // host. + ChannelLayoutConstraints host_channel_constraints_; + + // The set of HLO instructions which lacked any layout constraint, thus + // receiving propagated default layouts. + tensorflow::gtl::FlatSet + unconstrained_layout_instructions_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 7508013199a82267efc0e1426cb5989d5fe844a0..021fe630ff6329c51e297d0bb2bee8269a42904b 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -20,8 +20,9 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -29,18 +30,17 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace op = xla::testing::opcode_matchers; @@ -52,10 +52,18 @@ using ::testing::ElementsAre; class LayoutAssignmentTest : public HloTestBase { protected: void AssignLayouts(HloModule* module, - ComputationLayout* entry_computation_layout) { - LayoutAssignment layout_assignment(entry_computation_layout); + ComputationLayout* entry_computation_layout, + ChannelLayoutConstraints* channel_constraints = nullptr) { + LayoutAssignment layout_assignment( + entry_computation_layout, /*channel_constraints=*/channel_constraints); EXPECT_IS_OK(layout_assignment.Run(module).status()); } + + std::vector LayoutOf(HloModule* module, absl::string_view name) { + auto minor_to_major = + FindInstruction(module, name)->shape().layout().minor_to_major(); + return std::vector(minor_to_major.begin(), minor_to_major.end()); + } }; TEST_F(LayoutAssignmentTest, ComputationLayout) { @@ -133,9 +141,9 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) { std::vector> minor_to_majors = {{0, 1}, {1, 0}}; for (auto& minor_to_major : minor_to_majors) { auto builder = HloComputation::Builder(TestName()); - auto constant_literal1 = Literal::CreateR2WithLayout( + auto 
constant_literal1 = LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout(minor_to_major)); - auto constant_literal2 = Literal::CreateR2WithLayout( + auto constant_literal2 = LiteralUtil::CreateR2WithLayout( {{5.0, 6.0}, {7.0, 8.0}}, LayoutUtil::MakeLayout(minor_to_major)); Shape ashape = constant_literal1->shape(); @@ -184,10 +192,10 @@ TEST_F(LayoutAssignmentTest, TupleLayout) { // match their source). auto builder = HloComputation::Builder(TestName()); auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0})))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant0, constant1})); @@ -221,10 +229,10 @@ TEST_F(LayoutAssignmentTest, TupleSelect) { // Verify layouts of a select with tuple operands is assigned properly. 
auto builder = HloComputation::Builder(TestName()); auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0})))); auto tuple0 = builder.AddInstruction( HloInstruction::CreateTuple({constant0, constant1})); @@ -232,7 +240,7 @@ TEST_F(LayoutAssignmentTest, TupleSelect) { HloInstruction::CreateTuple({constant0, constant1})); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( tuple0->shape(), HloOpcode::kSelect, pred, tuple0, tuple1)); @@ -266,7 +274,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { // tuple and assigning the layouts of the copied arrays as needed. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); auto inner_tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant})); auto nested_tuple = builder.AddInstruction( @@ -576,7 +584,7 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) { auto builder = HloComputation::Builder(TestName()); Shape input_shape = ShapeUtil::MakeShape(F32, {3, 5, 6, 7}); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(input_shape, constant, {})); auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose( @@ -651,7 +659,7 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) { } )"; - auto module = tools::Parse(module_str).ValueOrDie(); + auto module = ParseHloString(module_str).ValueOrDie(); module = backend() @@ -691,7 +699,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { } )"; - auto module = tools::Parse(module_str).ValueOrDie(); + auto module = ParseHloString(module_str).ValueOrDie(); ComputationLayout computation_layout( module->entry_computation()->ComputeProgramShape()); Shape param_shape = ShapeUtil::MakeTupleShape( @@ -707,17 +715,10 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { LayoutUtil::MakeLayout({2, 1, 0})); AssignLayouts(module.get(), &computation_layout); - auto layout_of = [&](tensorflow::StringPiece name) { - return FindInstruction(module.get(), name) - ->shape() - .layout() - .minor_to_major(); - }; - - EXPECT_THAT(layout_of("gte0"), ElementsAre(0, 1, 2)); - EXPECT_THAT(layout_of("gte1a"), ElementsAre(1, 2, 0)); - EXPECT_THAT(layout_of("gte1b"), ElementsAre(2, 0, 1)); - EXPECT_THAT(layout_of("fresult"), ElementsAre(2, 1, 0)); + 
EXPECT_THAT(LayoutOf(module.get(), "gte0"), ElementsAre(0, 1, 2)); + EXPECT_THAT(LayoutOf(module.get(), "gte1a"), ElementsAre(1, 2, 0)); + EXPECT_THAT(LayoutOf(module.get(), "gte1b"), ElementsAre(2, 0, 1)); + EXPECT_THAT(LayoutOf(module.get(), "fresult"), ElementsAre(2, 1, 0)); EXPECT_THAT(FindInstruction(module.get(), "gte1") ->shape() .tuple_shapes(0) @@ -769,9 +770,12 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { false_builder.AddInstruction( HloInstruction::CreateParameter(0, tshape, "param")); // Using infeed as layout assignment does not mess up with it. - auto infeed = - false_builder.AddInstruction(HloInstruction::CreateInfeed(xshape, "")); - false_builder.AddInstruction(HloInstruction::CreateTuple({infeed})); + auto token = false_builder.AddInstruction(HloInstruction::CreateToken()); + auto infeed = false_builder.AddInstruction( + HloInstruction::CreateInfeed(xshape, token, "")); + auto infeed_data = false_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(xshape, infeed, 0)); + false_builder.AddInstruction(HloInstruction::CreateTuple({infeed_data})); } HloComputation* false_computation = module->AddEmbeddedComputation(false_builder.Build()); @@ -798,7 +802,7 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { auto builder = HloComputation::Builder(TestName()); auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2WithLayout( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); builder.AddInstruction(HloInstruction::CreateUnary( constant0->shape(), HloOpcode::kBitcast, constant0)); @@ -816,5 +820,156 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { "Unexpected bitcast operation seen during layout assignment")); } +TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { + // Pin non matching layouts to parameter and root. 
+ const char* module_str = R"( + HloModule test_module + + ENTRY entry_computation { + param = (f32[2,2]) parameter(0) + gte = f32[2,2] get-tuple-element(param), index=0 + token = token[] after-all() + recv = (f32[2,2], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=1} + recv-done = (f32[2,2], token[]) recv-done(recv), channel_id=1, + sharding={maximal device=1} + ROOT root = f32[2,2] get-tuple-element(recv-done), index=0 + send = (f32[2,2], u32[], token[]) send(gte, token), channel_id=1, + sharding={maximal device=0} + send-done = token[] send-done(send), channel_id=1, sharding={maximal device=0} + } + )"; + + auto module = ParseHloString(module_str).ValueOrDie(); + ComputationLayout computation_layout( + module->entry_computation()->ComputeProgramShape()); + Shape param_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})}); + TF_ASSERT_OK( + computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape( + param_shape)); + computation_layout.mutable_result_layout()->ResetLayout( + LayoutUtil::MakeLayout({1, 0})); + + ChannelLayoutConstraints channel_constraints; + AssignLayouts(module.get(), &computation_layout, &channel_constraints); + + EXPECT_THAT(LayoutOf(module.get(), "gte"), ElementsAre(0, 1)); + EXPECT_THAT(LayoutOf(module.get(), "root"), ElementsAre(1, 0)); + EXPECT_TRUE( + ShapeUtil::Equal(ShapeUtil::GetSubshape( + FindInstruction(module.get(), "send")->shape(), {0}), + ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}))); +} + +TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) { + const char* module_str = R"( + HloModule CopySliceOperandToAvoidImplicitLayoutChange + + ENTRY CopySliceOperandToAvoidImplicitLayoutChange { + par0 = f32[3,4]{1,0} parameter(0) + par1 = f32[4,5]{0,1} parameter(1) + slice0 = f32[3,4] slice(par1), slice={[1:4],[1:5]} + ROOT add0 = f32[3,4]{1,0} add(par0,slice0) + } + )"; + + auto module = ParseHloString(module_str).ValueOrDie(); + module = + 
backend() + .compiler() + ->RunHloPasses(std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr) + .ConsumeValueOrDie(); + + auto copy = FindInstruction(module.get(), "copy.1"); + auto slice = FindInstruction(module.get(), "slice0"); + EXPECT_EQ(slice->operand(0), copy); + EXPECT_TRUE( + LayoutUtil::Equal(slice->shape().layout(), copy->shape().layout())); +} + +TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) { + const char* module_str = R"( + HloModule CopyDSliceOperandToAvoidImplicitLayoutChange + + ENTRY CopyDSliceOperandToAvoidImplicitLayoutChange { + par0 = f32[3,4]{1,0} parameter(0) + par1 = f32[4,5]{0,1} parameter(1) + par2 = s32[2] parameter(2) + dslice0 = f32[3,4] dynamic-slice(par1, par2), dynamic_slice_sizes={3,4} + ROOT add0 = f32[3,4]{1,0} add(par0,dslice0) + } + )"; + + auto module = ParseHloString(module_str).ValueOrDie(); + module = + backend() + .compiler() + ->RunHloPasses(std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr) + .ConsumeValueOrDie(); + + auto copy = FindInstruction(module.get(), "copy.1"); + auto dslice = FindInstruction(module.get(), "dslice0"); + EXPECT_EQ(dslice->operand(0), copy); + EXPECT_TRUE( + LayoutUtil::Equal(dslice->shape().layout(), copy->shape().layout())); +} + +TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) { + const char* module_str = R"( + HloModule CopyConcatOperandToAvoidImplicitLayoutChange + + ENTRY CopyConcatOperandToAvoidImplicitLayoutChange { + par0 = f32[3,8]{1,0} parameter(0) + par1 = f32[3,5]{0,1} parameter(1) + par2 = f32[3,3]{1,0} parameter(2) + concat0 = f32[3,8] concatenate(f32[3,5] par1, f32[3,3] par2), + dimensions={1} + ROOT add0 = f32[3,8]{1,0} add(par0,concat0) + } + )"; + + auto module = ParseHloString(module_str).ValueOrDie(); + module = + backend() + .compiler() + ->RunHloPasses(std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr) + 
.ConsumeValueOrDie(); + + auto copy = FindInstruction(module.get(), "copy.1"); + auto concat = FindInstruction(module.get(), "concat0"); + EXPECT_EQ(concat->operand(0), copy); + EXPECT_TRUE( + LayoutUtil::Equal(concat->shape().layout(), copy->shape().layout())); +} + +TEST_F(LayoutAssignmentTest, + ConvolutionOperandWithImplicitLayoutChangeNotCopied) { + const char* module_str = R"( + HloModule ConvolutionOperandWithImplicitLayoutChangeNotCopied + + ENTRY ConvolutionOperandWithImplicitLayoutChangeNotCopied { + par0 = f32[128,3,230,230]{2,3,1,0} parameter(0) + par1 = f32[7,7,3,64]{3,2,0,1} parameter(1) + ROOT convolution0 = f32[128,64,112,112]{3,2,1,0} convolution(par0, par1), + window={size=7x7 stride=2x2}, dim_labels=bf01_01io->bf01, + feature_group_count=1 + } + )"; + + auto module = ParseHloString(module_str).ValueOrDie(); + module = + backend() + .compiler() + ->RunHloPasses(std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr) + .ConsumeValueOrDie(); + + auto copy = FindInstruction(module.get(), "copy.1"); + EXPECT_EQ(copy, nullptr); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index f1e7fc29532ce7e6841010a5258f4000a7c70383..540bbb7c7a74f65ab70f4c6704d6600db2adbb60 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -21,6 +21,11 @@ filegroup( ]), ) +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) + cc_library( name = "alias_analysis", srcs = ["alias_analysis.cc"], @@ -33,16 +38,30 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:logical_buffer", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", "@llvm//:core", ], ) +tf_cc_test( + name = "alias_analysis_test", + srcs = ["alias_analysis_test.cc"], + deps = [ + ":alias_analysis", + "//tensorflow/compiler/xla/service:hlo_parser", + 
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:test", + ], +) + cc_library( name = "llvm_util", srcs = ["llvm_util.cc"], hdrs = ["llvm_util.h"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -51,6 +70,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:name_uniquer", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", "@llvm//:target", @@ -70,6 +91,9 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) @@ -85,6 +109,8 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) @@ -102,6 +128,23 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/strings:str_format", + "@llvm//:core", + ], +) + +cc_library( + name = "kernel_tiling", + srcs = ["kernel_tiling.cc"], + hdrs = ["kernel_tiling.h"], + deps = [ + ":ir_array", + ":llvm_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:lib", "@llvm//:core", ], ) @@ -112,6 +155,7 @@ cc_library( hdrs = ["fused_ir_emitter.h"], deps = [ ":ir_array", + ":kernel_tiling", ":llvm_util", ":loop_emitter", 
":tuple_ops", @@ -122,14 +166,15 @@ cc_library( "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) cc_library( - name = "ops", - srcs = ["ops.cc"], - hdrs = ["ops.h"], + name = "dynamic_update_slice_util", + srcs = ["dynamic_update_slice_util.cc"], + hdrs = ["dynamic_update_slice_util.h"], deps = [ ":fused_ir_emitter", ":ir_array", @@ -143,6 +188,26 @@ cc_library( ], ) +cc_library( + name = "sort_util", + srcs = ["sort_util.cc"], + hdrs = ["sort_util.h"], + deps = [ + ":ir_array", + ":llvm_loop", + ":llvm_util", + ":loop_emitter", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter", + "//tensorflow/compiler/xla/service/gpu:partition_assignment", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@llvm//:core", + "@llvm//:support", + ], +) + cc_library( name = "tuple_ops", srcs = ["tuple_ops.cc"], @@ -154,6 +219,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", "@llvm//:core", ], ) @@ -165,7 +231,36 @@ cc_library( deps = [ ":llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", - "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@llvm//:core", + ], +) + +cc_library( + name = "buffer_assignment_util", + srcs = ["buffer_assignment_util.cc"], + hdrs = ["buffer_assignment_util.h"], + deps = [ + "//tensorflow/compiler/xla/service:buffer_assignment", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "math_ops", + srcs = ["math_ops.cc"], + hdrs = ["math_ops.h"], + deps = [ + ":llvm_util", + "@llvm//:core", + ], +) + +cc_library( + name = "ir_builder_mixin", + srcs = [], + hdrs = ["ir_builder_mixin.h"], + deps = [ "@llvm//:core", ], ) diff --git 
a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc index 21bca1d6beff5b2804531724b94b123d4523c173..e5370eca56f2e3a891523ba2b72961d66ec809aa 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc @@ -28,19 +28,20 @@ namespace llvm_ir { // Sentry allocation used to represent parameters of the entry computation in // alias_scope_metadata_ and noalias_metadata_. static const BufferAllocation* kParameterAllocation = new BufferAllocation( - /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false, - LogicalBuffer::Color(0)); + /*index=*/-1, /*size=*/0, LogicalBuffer::Color(0)); void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo, - llvm_ir::IrArray* array) { + llvm_ir::IrArray* array, + const ShapeIndex& index) { BufferAllocation::Slice buffer_slice; - if (hlo.opcode() == HloOpcode::kParameter) { - // Parameters may alias with each other but may not alias with our temporary - // buffers. + if (hlo.opcode() == HloOpcode::kParameter && + hlo.parent() == hlo.parent()->parent()->entry_computation()) { + // Entry computation parameters may alias with each other but may not alias + // with our temporary buffers. buffer_slice = BufferAllocation::Slice(kParameterAllocation, 0, 0); } else { const std::set slices = - assignment_.GetAllSlices(&hlo, /*index=*/{}); + assignment_.GetAllSlices(&hlo, index); if (slices.empty() || slices.size() > 1) { // Skip HLOs which don't have a buffer assigned or for which the // buffer can't be determined statically. We cannot determine their @@ -137,16 +138,18 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( // 2. Operands of users of the given hlo. // 3. Operands of the given hlo. // - // This set can be increased as we need. 
For now only consider top-level - // buffers (index = {}) not buffers nested within the instruction's - // operands/output which are not typically touched. + // This set can be increased as we need. std::vector worklist; auto add_buffers_to_worklist = [&worklist, &assignment](const HloInstruction* instruction) { - for (const LogicalBuffer* buffer : - assignment.GetSourceBuffers(instruction, /*index=*/{})) { - worklist.push_back(buffer); - } + ShapeUtil::ForEachSubshape( + instruction->shape(), + [&](const Shape& /*shape*/, const ShapeIndex& index) { + for (const LogicalBuffer* buffer : + assignment.GetSourceBuffers(instruction, index)) { + worklist.push_back(buffer); + } + }); }; for (HloInstruction* user : hlo.users()) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h index 5244ac61e56307857aca659854647bd6c3e991d7..8d9fa99d82b4e49b653d9f05cc9baa5e3fdcefa6 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_ +#include "absl/strings/str_cat.h" #include "llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -23,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace llvm_ir { @@ -38,7 +38,8 @@ class AliasAnalysis { // Augments IrArray with aliasing information. 
void AddAliasingInformationToIrArray(const HloInstruction& hlo, - llvm_ir::IrArray* array); + llvm_ir::IrArray* array, + const ShapeIndex& index = {}); private: // Returns a unique alias domain for this emitter. diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6ae4932f5707f1d15af1e09a735a7de2e48fac5 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -0,0 +1,83 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { +namespace { +class AliasAnalysisTest : public CpuCodegenTest {}; + +void FakeCustomCallTarget(float* out, float** in) {} + +REGISTER_CUSTOM_CALL_TARGET(FakeCustomCallTarget); + +TEST_F(AliasAnalysisTest, EmbeddedComputationParamsMayAliasTemps) { + const char* hlo_string = R"( +HloModule while + +body { + const.0.125 = f32[] constant(0.125) + body.state = f32[] parameter(0) + ROOT add.2.2 = f32[] add(const.0.125, body.state) +} + +condition { + const.100 = f32[] constant(100) + condition.state = f32[] parameter(0) + addend = f32[] custom-call(condition.state), custom_call_target="FakeCustomCallTarget" + add = f32[] add(addend, condition.state) + ROOT greater-than = pred[] greater-than(const.100, add) +} + +ENTRY while3 { + const.0 = f32[] constant(0) + ROOT while = f32[] while(const.0), condition=condition, body=body +} +)"; + + CompileAndVerifyIr(hlo_string, R"( +; CHECK-LABEL: @body(i8* %retval +; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] +; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:[0-9]+]] +; +; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params +; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %buffer_table, i64 0 +; CHECK: %[[cond_state_buf_untyped:.*]] = load i8*, i8** %[[cond_state_buf_ptr]] +; CHECK: %[[cond_state_buf_typed:.*]] = bitcast i8* %[[cond_state_buf_untyped]] to float* +; CHECK: load float, float* %[[cond_state_buf_typed]], !alias.scope 
![[alias_scope_md_for_store]], !noalias ![[noalias_md_for_load:.*]] +; +; CHECK-LABEL: @while3( + +![[alias_scope_md_for_store]] = !{![[buffer_idx_0:.*]]} +![[buffer_idx_0]] = !{!"buffer: {index:0, offset:0, size:4}", ![[aa_md_root:.*]]} +![[aa_md_root]] = !{!"XLA global AA domain"} +![[buffer_idx_1:.*]] = !{!"buffer: {index:1, offset:0, size:4}", !3} +![[buffer_idx_1_offset_16:.*]] = !{!"buffer: {index:1, offset:16, size:1}", !3} +![[noalias_md_for_load]] = !{![[buffer_idx_1_offset_16]], ![[buffer_idx_1]]} +} +)"); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..bdce4a171b8a58f617f1d56e6cf6db5354846703 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc @@ -0,0 +1,60 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" +#include "absl/strings/str_cat.h" + +namespace xla { +namespace llvm_ir { +static const HloInstruction& InstrForConstantBufferAllocation( + const BufferAllocation& allocation) { + CHECK(allocation.is_constant()); + HloInstruction* const_instr = nullptr; + for (const auto& buffer_offset_pair : allocation.assigned_buffers()) { + const LogicalBuffer* buffer = buffer_offset_pair.first; + // BufferAssignment may have assigned non-constant instructions to this + // allocation too so we can't CHECK this condition. E.g. for + // + // while(init = constant, body = identity, cond = ...) + // + // the LogicalBuffer for the kWhile instruction will have the same + // BufferAllocation as the LogicalBuffer for the (init) constant. + if (buffer->instruction()->opcode() == HloOpcode::kConstant) { + CHECK_EQ(const_instr, nullptr) + << const_instr->ToString() << " " << buffer->ToString(); + const_instr = buffer->instruction(); + } + } + CHECK_NE(const_instr, nullptr); + return *const_instr; +} + +string ConstantBufferAllocationToGlobalName( + const BufferAllocation& allocation) { + string instr_name = InstrForConstantBufferAllocation(allocation).name(); + for (char& c : instr_name) { + if (c == '.') { + c = '_'; + } + } + return absl::StrCat("buffer_for_", instr_name); +} + +const Literal& LiteralForConstantAllocation( + const BufferAllocation& allocation) { + return InstrForConstantBufferAllocation(allocation).literal(); +} +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h new file mode 100644 index 0000000000000000000000000000000000000000..bfb6eecb87f6a1b756b3a8da3377f608dd7f0be7 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h @@ -0,0 +1,34 @@ +/* 
Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_ + +#include "tensorflow/compiler/xla/service/buffer_assignment.h" + +namespace xla { +namespace llvm_ir { +// In XLA:GPU we map constant buffer allocations to globals in the generated +// LLVM IR. This function gives us the name of the global variable a constant +// buffer is mapped to. Not used on XLA:CPU. +string ConstantBufferAllocationToGlobalName(const BufferAllocation& allocation); + +// Returns the Literal corresponding to `allocation`, which must be a constant +// allocation. +const Literal& LiteralForConstantAllocation(const BufferAllocation& allocation); +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc2e862f2eb9a49099c5f90efe1b29fb77c8f106 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc @@ -0,0 +1,194 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" +#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" + +namespace xla { +namespace llvm_ir { + +bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, + const BufferAssignment& assignment) { + CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode()); + const HloInstruction* operand = dynamic_update_slice->operand(0); + return assignment.HasTopLevelAllocation(dynamic_update_slice) && + assignment.HasTopLevelAllocation(operand) && + assignment.SharesTopLevelSlice(dynamic_update_slice, operand); +} + +// Shared implementation of EmitDynamicUpdateSliceInPlace and +// EmitFusedDynamicUpdateSliceInPlace. +// +// Emits a sequential loop if launch_dimensions is null. 
+static Status EmitDynamicUpdateSliceInPlaceImpl( + const Shape& update_shape, const ElementGenerator& start_indices_generator, + bool is_signed, ElementGenerator update_array_generator, + const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions, + absl::string_view name, llvm::IRBuilder<>* b) { + const Shape& output_shape = output_array.GetShape(); + + // Read start indices from start_indices_generator. + const int64 rank = ShapeUtil::Rank(output_shape); + IrArray::Index start_index(b->getInt64Ty(), rank); + for (int64 i = 0; i < rank; ++i) { + IrArray::Index dim_index({b->getInt64(i)}); + TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index)); + llvm::Value* output_dim_size = llvm::ConstantInt::get( + start_index[i]->getType(), output_shape.dimensions(i)); + llvm::Value* update_dim_size = llvm::ConstantInt::get( + start_index[i]->getType(), update_shape.dimensions(i)); + + // Clamp the start index so that the update region fits in the operand. + // start_index = clamp(start_index, 0, output_dim_size - update_dim_size) + llvm::Value* max_bound = b->CreateSub(output_dim_size, update_dim_size); + llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0); + start_index[i] = + b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE + : llvm::ICmpInst::ICMP_UGE, + zero, start_index[i]), + zero, start_index[i]); + + start_index[i] = + b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE + : llvm::ICmpInst::ICMP_ULE, + max_bound, start_index[i]), + max_bound, start_index[i]); + } + + auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status { + // Calculate output_index, where we'll write the value from update. 
For + // each dimension, + // + // output_index[dim] = start_index[dim] + update_index[dim] + // + IrArray::Index output_index(start_index.GetType(), rank); + for (int64 i = 0; i < rank; ++i) { + llvm::Value* start_index0 = + b->CreateSExtOrBitCast(start_index[i], update_index[i]->getType()); + output_index[i] = b->CreateAdd(start_index0, update_index[i]); + } + + // Do output[output_index] = update[update_index]. + TF_ASSIGN_OR_RETURN(llvm::Value * update_data, + update_array_generator(update_index)); + output_array.EmitWriteArrayElement(output_index, update_data, b); + return Status::OK(); + }; + + if (launch_dimensions != nullptr) { + return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape, + *launch_dimensions, b) + .EmitLoop(name); + } + return LoopEmitter(loop_body_emitter, update_shape, b).EmitLoop(name); +} + +Status EmitDynamicUpdateSliceInPlace(absl::Span operand_arrays, + const IrArray& output_array, + absl::string_view name, + llvm::IRBuilder<>* b) { + VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name; + + // No need to use operand_arrays[0], the input array of the + // dynamic-update-slice, because we know it aliases the op's output. 
+ IrArray update_array = operand_arrays[1]; + IrArray start_indices_array = operand_arrays[2]; + Shape output_shape = output_array.GetShape(); + Shape update_shape = update_array.GetShape(); + + ElementGenerator start_indices_generator = [&](const IrArray::Index& index) { + return start_indices_array.EmitReadArrayElement(index, b); + }; + ElementGenerator update_array_generator = [&](const IrArray::Index& index) { + return update_array.EmitReadArrayElement(index, b); + }; + + bool is_signed = ShapeUtil::ElementIsSigned(start_indices_array.GetShape()); + return EmitDynamicUpdateSliceInPlaceImpl( + update_shape, start_indices_generator, is_signed, update_array_generator, + output_array, /*launch_dimensions=*/nullptr, name, b); +} + +// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and +// EmitParallelFusedDynamicUpdateSliceInPlace. +// +// Emits a sequential loop if launch_dimensions is null. +static Status EmitFusedDynamicUpdateSliceInPlaceImpl( + HloInstruction* fusion, absl::Span fusion_operand_arrays, + const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, + const gpu::LaunchDimensions* launch_dimensions, llvm::IRBuilder<>* b) { + CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); + VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for " + << fusion->ToShortString(); + + auto* dynamic_update_slice = fusion->fused_expression_root(); + + const auto* update = dynamic_update_slice->operand(1); + const auto* start_indices = dynamic_update_slice->operand(2); + Shape update_shape = update->shape(); + + // Our in-place dynamic-update-slice implementation emits a loop over + // update_shape. To emit a cache-friendly loop, we need to know that shape's + // layout. + // + // update_shape is inside a fusion node -- it's never materialized in memory + // and thus doesn't have a layout. In this case we use the layout of the + // fusion node for iteration, since that corresponds to the order in memory of + // the buffer we'll be writing to. 
+ // + // (This isn't necessarily optimal; in some cases it might be faster to peek + // through the chain of ops that gives us the update operand and use the + // layout of its source buffer(s). But this is no worse than we do with + // fusion elsewhere.) + TF_RETURN_IF_ERROR( + LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape)); + + // Create element generators for update and start_indices. + FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter); + TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter)); + ElementGenerator update_array_generator = fused_emitter.GetGenerator(update); + ElementGenerator start_indices_generator = + fused_emitter.GetGenerator(start_indices); + + bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape()); + return EmitDynamicUpdateSliceInPlaceImpl( + update_shape, start_indices_generator, is_signed, update_array_generator, + fusion_output_array, launch_dimensions, IrName(fusion), b); +} + +Status EmitFusedDynamicUpdateSliceInPlace( + HloInstruction* fusion, absl::Span fusion_operand_arrays, + const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, + llvm::IRBuilder<>* b) { + return EmitFusedDynamicUpdateSliceInPlaceImpl( + fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter, + /*launch_dimensions=*/nullptr, b); +} + +Status EmitParallelFusedDynamicUpdateSliceInPlace( + HloInstruction* fusion, absl::Span fusion_operand_arrays, + const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, + const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b) { + return EmitFusedDynamicUpdateSliceInPlaceImpl( + fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter, + &launch_dimensions, b); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h new file mode 100644 index 
0000000000000000000000000000000000000000..fb3e4eb97cae06f2a0c87dd7118b8332048df56e --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h @@ -0,0 +1,90 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_DYNAMIC_UPDATE_SLICE_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_DYNAMIC_UPDATE_SLICE_UTIL_H_ + +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" + +// Utilities related to emitting LLVM IR for various HLO ops. + +namespace xla { +namespace llvm_ir { + +// Checks if we can emit code for the given DynamicUpdateSlice node that updates +// its input in place. Returns true if the dynamic-update-slice's +// array-to-be-updated and output share the same BufferAllocation::Slice. +// +// dynamic_update_slice must be a DynamicUpdateSlice op. +bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, + const BufferAssignment& assignment); + +// Checks if the given fusion node is amenable to being implemented by +// EmitFusedDynamicUpdateSliceInPlace. 
+inline bool CanEmitFusedDynamicUpdateSliceInPlace( + HloInstruction* fusion, const BufferAssignment& assignment) { + CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); + HloInstruction* fused_root = fusion->fused_expression_root(); + if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice || + fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) { + return false; + } + // Walk DynamicUpdateSlice operand(0) to fused parameter and get its + // associated operand. See if it shares an allocation with this operand. + HloInstruction* fusion_operand; + ShapeIndex index; + std::tie(fusion_operand, index) = + fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex(); + if (fusion_operand->opcode() != HloOpcode::kParameter) { + return false; + } + auto* operand = fusion->operand(fusion_operand->parameter_number()); + return assignment.HasAllocationAt(operand, index) && + assignment.HasAllocationAt(fusion, {}) && + assignment.SharesSliceAtIndex(fusion, {}, operand, index); +} + +// Emits IR for running the given dynamic-update-slice op in-place -- that is, +// where the input and output buffers share the same slice, so we can simply +// modify the input/output buffer without touching any of the other elements. +Status EmitDynamicUpdateSliceInPlace(absl::Span operand_arrays, + const IrArray& output_array, + absl::string_view name, + llvm::IRBuilder<>* b); + +// Given a loop-fusion node whose root is a dynamic-update-slice op whose +// array-to-be-updated and output share the same buffer slice, emits +// (sequential) code for a fusion node that does the dynamic-update-slice in +// place. +Status EmitFusedDynamicUpdateSliceInPlace( + HloInstruction* fusion, absl::Span fusion_operand_arrays, + const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, + llvm::IRBuilder<>* b); + +// Same as EmitFusedDynamicUpdateSliceInPlace, except emits a parallel loop with +// the given launch dimensions. 
+Status EmitParallelFusedDynamicUpdateSliceInPlace( + HloInstruction* fusion, absl::Span fusion_operand_arrays, + const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, + const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b); + +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_DYNAMIC_UPDATE_SLICE_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index f172b1d87c870270436f7301ed200b47d08431a7..b606c993a2d58a6d177af10de7b214de130c2279 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -52,7 +52,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) { // that would be regenerated without caching. But this might increase the // JIT compilation time. if (generated_value_bb == nullptr || - generated_value_bb == ir_builder_->GetInsertBlock()) { + generated_value_bb == b_->GetInsertBlock()) { VLOG(3) << "The cached generated value is reused."; return generated_value; } @@ -60,8 +60,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) { "a different BB (" << llvm_ir::AsString(generated_value_bb->getName()) << ") from the current insertion block (" - << llvm_ir::AsString(ir_builder_->GetInsertBlock()->getName()) - << ")."; + << llvm_ir::AsString(b_->GetInsertBlock()->getName()) << ")."; } TF_ASSIGN_OR_RETURN( @@ -77,12 +76,14 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) { llvm::Constant* initializer = llvm_ir::ConvertLiteralToIrConstant(literal, module_); llvm::GlobalVariable* global = new llvm::GlobalVariable( - *ir_builder_->GetInsertBlock()->getModule(), initializer->getType(), + *b_->GetInsertBlock()->getModule(), initializer->getType(), /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer, /*Name=*/""); + llvm::Constant* shape_constant = 
llvm::ConstantExpr::getBitCast( + global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo()); generators_[constant] = [=](const IrArray::Index& index) { - return IrArray(global, constant->shape()) - .EmitReadArrayElement(index, ir_builder_); + return IrArray(shape_constant, constant->shape()) + .EmitReadArrayElement(index, b_); }; return Status::OK(); @@ -97,12 +98,12 @@ Status FusedIrEmitter::HandleGetTupleElement( return Unimplemented( "GetTupleElement fusion currently only supports" " parameter operands, but found operand: %s", - operand->name().c_str()); + operand->name()); } // Emit code to lookup tuple element pointer, and store it in 'gte_values_'. llvm::Value* tuple_element_ptr = llvm_ir::EmitGetTupleElement( get_tuple_element->shape(), get_tuple_element->tuple_index(), - /*alignment=*/1, it->second, ir_builder_, module_); + /*alignment=*/1, it->second, b_, module_); gte_values_.insert(std::make_pair(get_tuple_element, tuple_element_ptr)); // Emit code to read base tuple element array (if non-tuple shaped). if (!ShapeUtil::IsTuple(get_tuple_element->shape())) { @@ -110,16 +111,32 @@ Status FusedIrEmitter::HandleGetTupleElement( [=](const IrArray::Index& index) -> StatusOr { // TODO(b/34080002) Add aliasing information to tuple element IrArray. return IrArray(tuple_element_ptr, get_tuple_element->shape()) - .EmitReadArrayElement(index, ir_builder_); + .EmitReadArrayElement(index, b_); }; } return Status::OK(); } Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) { - generators_[parameter] = [=](const IrArray::Index& index) { + generators_[parameter] = [=](const IrArray::Index& index) -> llvm::Value* { + if (tiled_parameter_info_) { + if (llvm::Value* param_tile_buffer = + tiled_parameter_info_->GetBufferForParameter( + parameter->parameter_number())) { + // TODO(jlebar): Add AA metadata to this load. Tile buffers are global + // variables, so LLVM's points-to analysis doesn't help us much. 
And we + // want the AA info to be present before address spaces are inferred + // (which is pretty late in the pipeline), so even if we had + // address-space-based AA in LLVM, it wouldn't help us much here. + return b_->CreateLoad( + b_->CreateGEP(param_tile_buffer, {index.GetConstantWithIndexType(0), + tiled_parameter_info_->x(), + tiled_parameter_info_->y()}), + "tiled_buffer"); + } + } return parameter_arrays_[parameter->parameter_number()] - .EmitReadArrayElement(index, ir_builder_); + .EmitReadArrayElement(index, b_); }; // Store ir value for fusion operand associated with fusion parameter to be // accessed by subsequent fused GetTupleElement instructions. @@ -130,7 +147,7 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) { } Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) { - tensorflow::gtl::ArraySlice operands(tuple->operands()); + absl::Span operands(tuple->operands()); std::vector operand_elemental_ir_types; for (HloInstruction* operand : operands) { operand_elemental_ir_types.push_back(llvm_ir::PrimitiveTypeToIrType( @@ -138,11 +155,11 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) { } generators_[tuple] = [=](const IrArray::Index& index) -> StatusOr { - llvm::Value* ret = llvm::UndefValue::get(llvm::StructType::get( - ir_builder_->getContext(), operand_elemental_ir_types)); + llvm::Value* ret = llvm::UndefValue::get( + llvm::StructType::get(b_->getContext(), operand_elemental_ir_types)); for (size_t i = 0; i < ShapeUtil::TupleElementCount(tuple->shape()); ++i) { TF_ASSIGN_OR_RETURN(llvm::Value * val_i, generators_[operands[i]](index)); - ret = ir_builder_->CreateInsertValue(ret, val_i, i); + ret = b_->CreateInsertValue(ret, val_i, i); } return ret; }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index b3b6026ef17daa184c0a015fdea618597ef068b3..44d21fa750a532633f46614002d59c90fc0b5d40 100644 --- 
a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -19,16 +19,17 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -53,11 +54,12 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { public: using Generator = llvm_ir::ElementGenerator; - FusedIrEmitter(tensorflow::gtl::ArraySlice parameter_arrays, + FusedIrEmitter(absl::Span parameter_arrays, ElementalIrEmitter* elemental_emitter) : parameter_arrays_(parameter_arrays), + tiled_parameter_info_(nullptr), elemental_emitter_(elemental_emitter), - ir_builder_(elemental_emitter->ir_builder()), + b_(elemental_emitter->b()), module_(elemental_emitter->module()) {} Status DefaultAction(HloInstruction* hlo) override; @@ -86,9 +88,14 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { return it->second; } + void SetTiledParameterInfo(const llvm_ir::TiledParameterInfo* info) { + tiled_parameter_info_ = info; + } + private: // Arrays of parameters of fusion instruction - tensorflow::gtl::ArraySlice parameter_arrays_; + absl::Span parameter_arrays_; + const llvm_ir::TiledParameterInfo* tiled_parameter_info_; ElementalIrEmitter* elemental_emitter_; @@ -96,7 +103,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { const HloInstruction* fused_root_ = nullptr; // Borrowed - llvm::IRBuilder<>* 
ir_builder_; + llvm::IRBuilder<>* b_; llvm::Module* module_; // Map from instruction pointers to functions to generate elements of their diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc index 7323abeb2077154f82828bcda3e90eb45a67138a..67f7423121177e2ca1e3384341dad2644c8f5e34 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc @@ -29,9 +29,9 @@ limitations under the License. namespace xla { namespace llvm_ir { -static void Delinearize(std::vector* multidim, - llvm::Value* linear, const Shape& shape, - llvm::IRBuilder<>* ir_builder) { +void IrArray::Index::Delinearize(std::vector* multidim, + llvm::Value* linear, const Shape& shape, + llvm::IRBuilder<>* b) const { int64 divisor = 1; const Layout& layout = shape.layout(); for (int64 i = 0; i < layout.minor_to_major_size(); ++i) { @@ -48,10 +48,10 @@ static void Delinearize(std::vector* multidim, // useful because cuda-memcheck can't help us much in XLA: Most of our // memory lives in one big allocation, so cuda-memcheck can't detect // out-of-bounds accesses. 
- auto* quot = ir_builder->CreateUDiv(linear, ir_builder->getInt64(divisor)); + auto* quot = b->CreateUDiv(linear, GetConstantWithIndexType(divisor)); if (i < layout.minor_to_major_size() - 1) { - (*multidim)[dimension] = ir_builder->CreateURem( - quot, ir_builder->getInt64(size_of_current_dimension)); + (*multidim)[dimension] = b->CreateURem( + quot, GetConstantWithIndexType(size_of_current_dimension)); } else { (*multidim)[dimension] = quot; } @@ -60,34 +60,46 @@ static void Delinearize(std::vector* multidim, } IrArray::Index::Index(llvm::Value* linear, const Shape& shape, - llvm::IRBuilder<>* ir_builder) + llvm::IRBuilder<>* b) : multidim_(ShapeUtil::Rank(shape)), linear_(linear), layout_(shape.layout()), dims_(shape.dimensions().begin(), shape.dimensions().end()) { + CHECK_NE(linear, nullptr); + index_type_ = linear->getType(); CHECK(LayoutUtil::HasLayout(shape)) << "Shape " << ShapeUtil::HumanStringWithLayout(shape) << " should have a layout."; - Delinearize(&multidim_, linear, shape, ir_builder); + Delinearize(&multidim_, linear, shape, b); } -IrArray::Index::Index(tensorflow::gtl::ArraySlice multidim, +IrArray::Index::Index(absl::Span multidim, llvm::Value* linear, const Shape& shape) : multidim_(multidim.begin(), multidim.end()), linear_(linear), layout_(shape.layout()), dims_(shape.dimensions().begin(), shape.dimensions().end()) { + if (size()) { + index_type_ = multidim_[0]->getType(); + } else { + CHECK_NE(linear_, nullptr); + index_type_ = linear_->getType(); + } + CHECK_NE(index_type_, nullptr); CHECK_EQ(shape.dimensions_size(), multidim.size()); CHECK(LayoutUtil::HasLayout(shape)) << "Shape " << ShapeUtil::HumanStringWithLayout(shape) << " should have a layout."; } -IrArray::Index::Index(tensorflow::gtl::ArraySlice multidim, - const Shape& shape, llvm::IRBuilder<>* ir_builder) +IrArray::Index::Index(absl::Span multidim, + const Shape& shape, llvm::IRBuilder<>* b) : multidim_(multidim.begin(), multidim.end()), layout_(shape.layout()), 
dims_(shape.dimensions().begin(), shape.dimensions().end()) { + CHECK_GT(multidim_.size(), 0); + index_type_ = multidim[0]->getType(); + CHECK_NE(index_type_, nullptr); CHECK_EQ(shape.dimensions_size(), multidim.size()); CHECK(LayoutUtil::HasLayout(shape)); } @@ -130,29 +142,29 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape( CommonFactors(AsInt64Slice(input_shape.dimensions()), AsInt64Slice(output_shape.dimensions())); std::vector source_multidim_index( - ShapeUtil::Rank(input_shape), - llvm::UndefValue::get(builder->getInt64Ty())); + ShapeUtil::Rank(input_shape), llvm::UndefValue::get(index_type_)); // We compute the source indices in each common factor from only the target // indices in the same common factor. for (ssize_t k = common_factors.size() - 2; k >= 0; --k) { llvm::Value* logical_linear_index = - Index(tensorflow::gtl::ArraySlice( - multidim_, common_factors[k].second, - common_factors[k + 1].second - common_factors[k].second)) - .Linearize( - tensorflow::gtl::ArraySlice( - AsInt64Slice(output_shape.dimensions()), - common_factors[k].second, - common_factors[k + 1].second - common_factors[k].second), - builder); + Index(absl::Span(multidim_).subspan( + common_factors[k].second, + common_factors[k + 1].second - common_factors[k].second), + index_type_) + .Linearize(AsInt64Slice(output_shape.dimensions()) + .subspan(common_factors[k].second, + common_factors[k + 1].second - + common_factors[k].second), + builder); // Delinearizes logical_linear_index for the source array in row-major // collapsed order. The first rank-1 indices are the remainder of the // linear index by each dimension size. 
for (int64 i = common_factors[k + 1].first - 1; i >= common_factors[k].first; --i) { - llvm::Value* divisor = builder->getInt64(input_shape.dimensions(i)); + llvm::Value* divisor = + GetConstantWithIndexType(input_shape.dimensions(i)); if (input_shape.dimensions(i) == 1) { - source_multidim_index[i] = builder->getInt64(0); + source_multidim_index[i] = GetConstantWithIndexType(0); } else if (i == common_factors[k].first) { source_multidim_index[i] = logical_linear_index; } else { @@ -168,14 +180,13 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape( ShapeUtil::ReshapeIsBitcast(input_shape, output_shape)) { return Index(source_multidim_index, linear(), input_shape); } - return Index(source_multidim_index); + return Index(source_multidim_index, index_type_); } IrArray::Index IrArray::Index::SourceIndexOfSlice( - const Shape& shape, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice strides, - llvm::IRBuilder<>* builder) const { - Index source_index(multidim_.size()); + const Shape& shape, absl::Span starts, + absl::Span strides, llvm::IRBuilder<>* builder) const { + Index source_index(index_type_, multidim_.size()); for (int i = 0; i < multidim_.size(); ++i) { int64 stride = strides[i]; auto type = multidim_[i]->getType(); @@ -195,7 +206,7 @@ IrArray::Index IrArray::Index::SourceIndexOfSlice( IrArray::Index IrArray::Index::SourceIndexOfTranspose( const Shape& shape, const Shape& operand_shape, - tensorflow::gtl::ArraySlice dimension_mapping, + absl::Span dimension_mapping, llvm::IRBuilder<>* builder) const { std::vector operand_multidim_index = Permute(dimension_mapping, multidim()); @@ -224,11 +235,12 @@ IrArray::Index IrArray::Index::SourceIndexOfBitcast( // the physical index of the element in the buffer. This is like Linearize, // but takes the layout into account. 
int64 scale = 1; - llvm::Value* linear_index = builder->getInt64(0); + llvm::Value* linear_index = GetConstantWithIndexType(0); for (auto dimension : LayoutUtil::MinorToMajor(shape)) { linear_index = builder->CreateAdd( linear_index, - builder->CreateMul(multidim_[dimension], builder->getInt64(scale), "", + builder->CreateMul(multidim_[dimension], + GetConstantWithIndexType(scale), "", /*HasNUW=*/true, /*HasNSW=*/true), "", /*HasNUW=*/true, /*HasNSW=*/true); scale *= shape.dimensions(dimension); @@ -243,7 +255,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBitcast( IrArray::Index IrArray::Index::SourceIndexOfBroadcast( const Shape& shape, const Shape& operand_shape, - tensorflow::gtl::ArraySlice dimension_mapping, + absl::Span dimension_mapping, llvm::IRBuilder<>* builder) const { int64 rank = ShapeUtil::Rank(operand_shape); std::vector source_index(rank); @@ -252,7 +264,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast( } if (linear_ == nullptr || !LayoutUtil::HasLayout(operand_shape) || !LayoutUtil::HasLayout(shape)) { - return Index(source_index); + return Index(source_index, index_type_); } // High-level idea: we can reuse the linear index if the broadcasted // dimensions are contiguous, and this part of the operation is a bitcast. @@ -274,7 +286,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast( bool contiguous_broadcast_dimensions = max_broadcasted_dimension - min_broadcasted_dimension == rank - 1; if (!contiguous_broadcast_dimensions) { - return Index(source_index); + return Index(source_index, index_type_); } // Check if the mapped dimensions are a bitcast. 
std::vector operand_logical_to_physical = @@ -282,7 +294,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast( for (int64 i = 0; i < rank; ++i) { if (operand_logical_to_physical[i] != logical_to_physical[dimension_mapping[i]] - min_broadcasted_dimension) { - return Index(source_index); + return Index(source_index, index_type_); } } llvm::Value* linear = linear_; @@ -291,7 +303,9 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast( divisor *= shape.dimensions(LayoutUtil::Major(shape.layout(), i)); } if (divisor > 1) { - linear = builder->CreateUDiv(linear, builder->getInt64(divisor)); + linear = builder->CreateUDiv( + linear, + IrArray::Index(linear->getType()).GetConstantWithIndexType(divisor)); } if (min_broadcasted_dimension > 0) { int64 mod = 1; @@ -299,22 +313,25 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast( ++i) { mod *= shape.dimensions(LayoutUtil::Major(shape.layout(), i)); } - linear = builder->CreateURem(linear, builder->getInt64(mod)); + linear = builder->CreateURem( + linear, + IrArray::Index(linear->getType()).GetConstantWithIndexType(mod)); } return Index(source_index, linear, operand_shape); } -llvm::Value* IrArray::Index::Linearize( - tensorflow::gtl::ArraySlice dimensions, - llvm::IRBuilder<>* builder) const { +llvm::Value* IrArray::Index::Linearize(absl::Span dimensions, + llvm::IRBuilder<>* builder) const { // Each dimension is multiplied by the product of the sizes of all // earlier dimensions and added to the accumulator logical_linear_index. 
- llvm::Value* logical_linear_index = builder->getInt64(0); + CHECK_EQ(size(), dimensions.size()); + llvm::Value* logical_linear_index = GetConstantWithIndexType(0); int64 multiplier = 1; for (ssize_t i = size() - 1; i >= 0; --i) { llvm::Value* addend = - builder->CreateMul((*this)[i], builder->getInt64(multiplier), "", + builder->CreateMul((*this)[i], GetConstantWithIndexType(multiplier), "", /*HasNUW=*/true, /*HasNSW=*/true); + addend = builder->CreateZExtOrTrunc(addend, index_type_); logical_linear_index = builder->CreateAdd(logical_linear_index, addend, "", /*HasNUW=*/true, /*HasNSW=*/true); multiplier *= dimensions[i]; @@ -322,9 +339,9 @@ llvm::Value* IrArray::Index::Linearize( return logical_linear_index; } -llvm::Value* IrArray::EmitArrayElementAddress( - const IrArray::Index& index, llvm::IRBuilder<>* ir_builder, - tensorflow::StringPiece name) const { +llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index, + llvm::IRBuilder<>* b, + absl::string_view name) const { if (ShapeUtil::IsScalar(*shape_)) { // Special handling of scalars: a scalar pretends to have the same value for // every index, thus effectively implementing broadcasting of its value @@ -334,12 +351,11 @@ llvm::Value* IrArray::EmitArrayElementAddress( CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_)); if (index.LinearValidOnShape(*shape_)) { - llvm::Module* module = - ir_builder->GetInsertBlock()->getParent()->getParent(); - return ir_builder->CreateInBoundsGEP( - ir_builder->CreateBitCast( - base_ptr_, PrimitiveTypeToIrType(shape_->element_type(), module) - ->getPointerTo()), + llvm::Module* module = b->GetInsertBlock()->getParent()->getParent(); + return b->CreateInBoundsGEP( + b->CreateBitCast(base_ptr_, + PrimitiveTypeToIrType(shape_->element_type(), module) + ->getPointerTo()), {index.linear()}, llvm_ir::AsStringRef(name)); } @@ -349,7 +365,8 @@ llvm::Value* IrArray::EmitArrayElementAddress( // index[i] with 0. 
However, setting index[i] to 0 here still allows LLVM to // produce better code in some cases. auto dim = shape_->dimensions(i); - actual_index.push_back(dim == 1 ? ir_builder->getInt64(0) : index[i]); + actual_index.push_back( + dim == 1 ? llvm::ConstantInt::get(index[i]->getType(), 0) : index[i]); } // "base_ptr_" has the type of "*" @@ -357,13 +374,15 @@ llvm::Value* IrArray::EmitArrayElementAddress( // should be computed by // // getelementptr base_ptr_, 0, most major index, ..., most minor index - std::vector gep_indices(1, ir_builder->getInt64(0)); + CHECK_GT(index.size(), 0); + std::vector gep_indices( + 1, llvm::ConstantInt::get(index[0]->getType(), 0)); for (int64 i = 0; i < LayoutUtil::MinorToMajor(*shape_).size(); ++i) { int64 dimension = LayoutUtil::Major(shape_->layout(), i); gep_indices.push_back(actual_index[dimension]); } - return ir_builder->CreateInBoundsGEP(base_ptr_, gep_indices, - llvm_ir::AsStringRef(name)); + return b->CreateInBoundsGEP(base_ptr_, gep_indices, + llvm_ir::AsStringRef(name)); } void IrArray::AnnotateLoadStoreInstructionWithMetadata( @@ -379,38 +398,40 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata( } llvm::Value* IrArray::EmitReadArrayElement(const Index& index, - llvm::IRBuilder<>* ir_builder, - tensorflow::StringPiece name) const { - llvm::Value* element_address = - EmitArrayElementAddress(index, ir_builder, name); - llvm::LoadInst* load = ir_builder->CreateLoad(element_address); + llvm::IRBuilder<>* b, + absl::string_view name) const { + llvm::Value* element_address = EmitArrayElementAddress(index, b, name); + llvm::LoadInst* load = b->CreateLoad(element_address); AnnotateLoadStoreInstructionWithMetadata(load); return load; } void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value, - llvm::IRBuilder<>* ir_builder) const { - llvm::Value* element_address = EmitArrayElementAddress(index, ir_builder); - llvm::StoreInst* store = ir_builder->CreateStore(value, element_address); + llvm::IRBuilder<>* b) 
const { + llvm::Value* element_address = EmitArrayElementAddress(index, b); + llvm::StoreInst* store = b->CreateStore(value, element_address); AnnotateLoadStoreInstructionWithMetadata(store); } IrArray IrArray::CastToShape(const Shape& new_shape, - llvm::IRBuilder<>* ir_builder) const { - llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent(); + llvm::IRBuilder<>* b) const { + llvm::Module* module = b->GetInsertBlock()->getParent()->getParent(); llvm::Type* new_ir_type = llvm_ir::ShapeToIrType(new_shape, module); - return IrArray( - ir_builder->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()), - new_shape); + IrArray new_irarray( + b->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()), new_shape); + new_irarray.metadata_ = metadata_; + return new_irarray; } /* static */ IrArray::Index IrArray::BumpIndex(const Index& index, int64 which_dimension, int64 addend, - llvm::IRBuilder<>* ir_builder) { + llvm::IRBuilder<>* b) { Index new_index = index; - new_index[which_dimension] = ir_builder->CreateAdd( - index[which_dimension], ir_builder->getInt64(addend), "", /*HasNUW=*/true, + new_index[which_dimension] = b->CreateAdd( + index[which_dimension], + llvm::ConstantInt::get(index[which_dimension]->getType(), addend), "", + /*HasNUW=*/true, /*HasNSW=*/true); return new_index; } diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h index 4c3195c29c859c9eef08e3f6531b059edbebfc47..f4b05f29c38529b3cce81b4c8ee6fae5c00cafcc 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h @@ -19,13 +19,14 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -53,40 +54,59 @@ class IrArray { // multidimensional index, which LLVM DCE can delete. class Index { public: - // Constructs an empty zero-dimensional index. - Index() {} - // Constructs an index of rank "size". Each dimension of the index is // initialized to "value". - explicit Index(size_t size, llvm::Value* value = nullptr) - : multidim_(size, value) {} + explicit Index(size_t size, llvm::Value* value) + : multidim_(size, value), index_type_(value->getType()) { + CHECK_NE(index_type_, nullptr); + } + + // Constructs an index of rank "size". Each dimension of the index is + // initialized to nullptr. + explicit Index(llvm::Type* index_ty, size_t size = 0) + : multidim_(size, nullptr), index_type_(index_ty) { + CHECK(index_ty->isIntegerTy()); + } // Constructs an index from multi-dimensional index "multidim". The linear // index is set to nullptr. 
- explicit Index(tensorflow::gtl::ArraySlice multidim) - : multidim_(multidim.begin(), multidim.end()) {} + explicit Index(absl::Span multidim, + llvm::Type* index_ty = nullptr) + : multidim_(multidim.begin(), multidim.end()) { + if (size() == 0) { + index_type_ = index_ty; + } else { + index_type_ = (*this)[0]->getType(); + if (index_ty != nullptr) { + CHECK_EQ(index_type_, index_ty); + } + } + CHECK_NE(index_type_, nullptr); + CHECK(absl::c_all_of(multidim, [&](llvm::Value* v) { + return index_type_ == v->getType(); + })); + } // Constructs an index from linear index "linear" and computes the - // multi-dimensional index from "linear" and "shape". "ir_builder" is the IR + // multi-dimensional index from "linear" and "shape". "b" is the IR // builder to emit the index of each dimension in the multi-dimensional // index. // // Precondition: "shape" has a layout. - Index(llvm::Value* linear, const Shape& shape, - llvm::IRBuilder<>* ir_builder); + Index(llvm::Value* linear, const Shape& shape, llvm::IRBuilder<>* b); // Constructs an index from the given multi-dimensional index and the shape // that it indexes into. // // Precondition: "shape" has a layout. - Index(tensorflow::gtl::ArraySlice multidim, - const Shape& shape, llvm::IRBuilder<>* ir_builder); + Index(absl::Span multidim, const Shape& shape, + llvm::IRBuilder<>* b); // Constructs an index from both a multi-dimensional index and a linear // index. "shape" has the same meaning as that in the constructor that takes // only a linear index. 
- Index(tensorflow::gtl::ArraySlice multidim, - llvm::Value* linear, const Shape& shape); + Index(absl::Span multidim, llvm::Value* linear, + const Shape& shape); const std::vector& multidim() const { return multidim_; } llvm::Value* linear() const { return linear_; } @@ -94,19 +114,19 @@ class IrArray { size_t size() const { return multidim().size(); } llvm::Value* operator[](size_t i) const { return multidim()[i]; } - llvm::Value*& operator[](size_t i) { return multidim()[i]; } + llvm::Value*& operator[](size_t i) { return mutable_multidim()[i]; } - void push_back(llvm::Value* value) { multidim().push_back(value); } + void push_back(llvm::Value* value) { mutable_multidim().push_back(value); } void InsertAt(int64 index, llvm::Value* value) { CHECK_LE(index, size()); - multidim().insert(multidim().begin() + index, value); + mutable_multidim().insert(mutable_multidim().begin() + index, value); } using iterator = std::vector::iterator; using const_iterator = std::vector::const_iterator; - iterator begin() { return multidim().begin(); } - iterator end() { return multidim().end(); } + iterator begin() { return mutable_multidim().begin(); } + iterator end() { return mutable_multidim().end(); } const_iterator begin() const { return multidim().begin(); } const_iterator end() const { return multidim().end(); } @@ -125,17 +145,15 @@ class IrArray { // by starting indices `starts` and stride values `strides`. // // Precondition: "this" is an index into a slice whose shape is `shape`. - Index SourceIndexOfSlice(const Shape& shape, - tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice strides, + Index SourceIndexOfSlice(const Shape& shape, absl::Span starts, + absl::Span strides, llvm::IRBuilder<>* builder) const; // Given that "this" is the target index of a transpose from `operand_shape` // to `shape` with the given dimension mapping, returns the source index. 
- Index SourceIndexOfTranspose( - const Shape& shape, const Shape& operand_shape, - tensorflow::gtl::ArraySlice dimension_mapping, - llvm::IRBuilder<>* builder) const; + Index SourceIndexOfTranspose(const Shape& shape, const Shape& operand_shape, + absl::Span dimension_mapping, + llvm::IRBuilder<>* builder) const; // Given that "this" is the target index of a bitcast from `operand_shape` // to `shape`, returns the source index. @@ -144,23 +162,34 @@ class IrArray { // Given that "this" is the target index of a broadcast from `operand_shape` // to `shape` with the given dimension mapping, returns the source index. - Index SourceIndexOfBroadcast( - const Shape& shape, const Shape& operand_shape, - tensorflow::gtl::ArraySlice dimension_mapping, - llvm::IRBuilder<>* builder) const; + Index SourceIndexOfBroadcast(const Shape& shape, const Shape& operand_shape, + absl::Span dimension_mapping, + llvm::IRBuilder<>* builder) const; // Linearizes the index into the given shape, i.e. reshapes it to rank-1 and // returns the index into the sole dimension 0 of the new shape. - llvm::Value* Linearize(tensorflow::gtl::ArraySlice dimensions, + llvm::Value* Linearize(absl::Span dimensions, llvm::IRBuilder<>* builder) const; + llvm::Type* GetType() const { return index_type_; } + + llvm::Constant* GetConstantWithIndexType(int64 c) const { + // The LLVM function makes sure that the value can be represented by the + // specified type, see ConstantInt::ConstantInt(IntegerType *Ty, const + // APInt &V). + return llvm::ConstantInt::get(index_type_, c); + } + private: // Changing the multi-dimensional index invalidates the linear index. 
- std::vector& multidim() { + std::vector& mutable_multidim() { linear_ = nullptr; return multidim_; } + void Delinearize(std::vector* multidim, llvm::Value* linear, + const Shape& shape, llvm::IRBuilder<>* b) const; + std::vector multidim_; // These values are purely for efficiency; `multidim_` is enough to find the @@ -177,6 +206,8 @@ class IrArray { llvm::Value* linear_ = nullptr; Layout layout_; std::vector dims_; + + llvm::Type* index_type_; }; // Default constructor. Constructs an IrArray in a null status. @@ -206,9 +237,8 @@ class IrArray { // // The optional name is useful for debugging when looking at // the emitted LLVM IR. - llvm::Value* EmitArrayElementAddress(const Index& index, - llvm::IRBuilder<>* ir_builder, - tensorflow::StringPiece name = "") const; + llvm::Value* EmitArrayElementAddress(const Index& index, llvm::IRBuilder<>* b, + absl::string_view name = "") const; // Attach metadata this IrArray instance knows about to "instruction". void AnnotateLoadStoreInstructionWithMetadata( @@ -221,18 +251,16 @@ class IrArray { // // The optional name is useful for debugging when looking at // the emitted LLVM IR. - llvm::Value* EmitReadArrayElement(const Index& index, - llvm::IRBuilder<>* ir_builder, - tensorflow::StringPiece name = "") const; + llvm::Value* EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* b, + absl::string_view name = "") const; // Emit IR to write the given value to the array element at the given index. void EmitWriteArrayElement(const Index& index, llvm::Value* value, - llvm::IRBuilder<>* ir_builder) const; + llvm::IRBuilder<>* b) const; // Returns a new IrArray whose shape is "new_shape" and base pointer is a // bitcast of the base pointer of "this" IrArray. 
- IrArray CastToShape(const Shape& new_shape, - llvm::IRBuilder<>* ir_builder) const; + IrArray CastToShape(const Shape& new_shape, llvm::IRBuilder<>* b) const; void AddAliasScopeMetadata(llvm::MDNode* alias_scope) { CHECK_NE(alias_scope, nullptr); @@ -278,7 +306,7 @@ class IrArray { // Bumps the "which_dimension" value within the provided index by the provided // addend. static Index BumpIndex(const Index& index, int64 which_dimension, - int64 addend, llvm::IRBuilder<>* ir_builder); + int64 addend, llvm::IRBuilder<>* b); private: // Add the specified LLVM IR metadata to loads/stores associated with this diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h new file mode 100644 index 0000000000000000000000000000000000000000..abc06fb7b4245294df2dc20d25a22ac4fdaeb4cf --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h @@ -0,0 +1,400 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ + +#include "llvm/IR/IRBuilder.h" + +namespace xla { + +// Mixin class that injects more ergonomic versions of llvm::IRBuilder methods +// into a class. 
Intended to be used as a CRTP base class, like: +// +// class MyIrEmitter : public IrBuilderMixin { +// llvm::IRBuilder<>* builder() { return builder_; } +// +// void EmitFoo(HloInstruction* foo) { +// Add(Mul(...), FPToUI(...)); +// } +// }; + +template +class IrBuilderMixin { + protected: + template + llvm::Value* Add(Args&&... args) { + return mixin_builder()->CreateAdd(std::forward(args)...); + } + + template + llvm::LoadInst* AlignedLoad(Args&&... args) { + return mixin_builder()->CreateAlignedLoad(std::forward(args)...); + } + + template + llvm::StoreInst* AlignedStore(Args&&... args) { + return mixin_builder()->CreateAlignedStore(std::forward(args)...); + } + + template + llvm::AllocaInst* Alloca(Args&&... args) { + return mixin_builder()->CreateAlloca(std::forward(args)...); + } + + template + llvm::Value* And(Args&&... args) { + return mixin_builder()->CreateAnd(std::forward(args)...); + } + + template + llvm::Value* AtomicCmpXchg(Args&&... args) { + return mixin_builder()->CreateAtomicCmpXchg(std::forward(args)...); + } + + template + llvm::Value* AtomicRMW(Args&&... args) { + return mixin_builder()->CreateAtomicRMW(std::forward(args)...); + } + + template + llvm::Value* BitCast(Args&&... args) { + return mixin_builder()->CreateBitCast(std::forward(args)...); + } + + template + llvm::Value* Br(Args&&... args) { + return mixin_builder()->CreateBr(std::forward(args)...); + } + + llvm::CallInst* Call(llvm::Value* callee, + llvm::ArrayRef args = llvm::None, + const llvm::Twine& name = "", + llvm::MDNode* fp_math_tag = nullptr) { + return mixin_builder()->CreateCall(callee, args, name, fp_math_tag); + } + + template + llvm::BranchInst* CondBr(Args&&... args) { + return mixin_builder()->CreateCondBr(std::forward(args)...); + } + + template + llvm::Value* ConstInBoundsGEP1_32(Args&&... args) { + return mixin_builder()->CreateConstInBoundsGEP1_32( + std::forward(args)...); + } + + template + llvm::Value* FAdd(Args&&... 
args) { + return mixin_builder()->CreateFAdd(std::forward(args)...); + } + + template + llvm::Value* FMul(Args&&... args) { + return mixin_builder()->CreateFMul(std::forward(args)...); + } + + llvm::Value* GEP(llvm::Value* ptr, llvm::ArrayRef idx_list, + const llvm::Twine& name = "") { + return mixin_builder()->CreateGEP(ptr, idx_list, name); + } + + template + llvm::Value* ICmpEQ(Args&&... args) { + return mixin_builder()->CreateICmpEQ(std::forward(args)...); + } + + template + llvm::Value* ICmpNE(Args&&... args) { + return mixin_builder()->CreateICmpNE(std::forward(args)...); + } + + template + llvm::Value* ICmpULE(Args&&... args) { + return mixin_builder()->CreateICmpULE(std::forward(args)...); + } + + template + llvm::Value* ICmpULT(Args&&... args) { + return mixin_builder()->CreateICmpULT(std::forward(args)...); + } + + llvm::Value* InBoundsGEP(llvm::Value* ptr, + llvm::ArrayRef idx_list, + const llvm::Twine& name = "") { + return mixin_builder()->CreateInBoundsGEP(ptr, idx_list, name); + } + + llvm::Value* ExtractValue(llvm::Value* agg, llvm::ArrayRef idxs, + const llvm::Twine& name = "") { + return mixin_builder()->CreateExtractValue(agg, idxs, name); + } + + llvm::Value* InsertValue(llvm::Value* agg, llvm::Value* val, + llvm::ArrayRef idxs, + const llvm::Twine& name = "") { + return mixin_builder()->CreateInsertValue(agg, val, idxs, name); + } + + template + llvm::Value* IntToPtr(Args&&... args) { + return mixin_builder()->CreateIntToPtr(std::forward(args)...); + } + + template + llvm::LoadInst* Load(Args&&... args) { + return mixin_builder()->CreateLoad(std::forward(args)...); + } + + template + llvm::CallInst* MemCpy(Args&&... args) { + return mixin_builder()->CreateMemCpy(std::forward(args)...); + } + + template + llvm::Value* Mul(Args&&... args) { + return mixin_builder()->CreateMul(std::forward(args)...); + } + + template + llvm::Value* NSWAdd(Args&&... 
args) { + return mixin_builder()->CreateNSWAdd(std::forward(args)...); + } + + template + llvm::Value* NSWMul(Args&&... args) { + return mixin_builder()->CreateNSWMul(std::forward(args)...); + } + + template + llvm::Value* NSWSub(Args&&... args) { + return mixin_builder()->CreateNSWSub(std::forward(args)...); + } + + template + llvm::Value* Or(Args&&... args) { + return mixin_builder()->CreateOr(std::forward(args)...); + } + + template + llvm::Value* PointerCast(Args&&... args) { + return mixin_builder()->CreatePointerCast(std::forward(args)...); + } + + template + llvm::Value* PtrToInt(Args&&... args) { + return mixin_builder()->CreatePtrToInt(std::forward(args)...); + } + + template + llvm::Value* SDiv(Args&&... args) { + return mixin_builder()->CreateSDiv(std::forward(args)...); + } + + template + llvm::Value* Select(Args&&... args) { + return mixin_builder()->CreateSelect(std::forward(args)...); + } + + template + llvm::Value* SRem(Args&&... args) { + return mixin_builder()->CreateSRem(std::forward(args)...); + } + + template + llvm::StoreInst* Store(Args&&... args) { + return mixin_builder()->CreateStore(std::forward(args)...); + } + + template + llvm::Value* UDiv(Args&&... args) { + return mixin_builder()->CreateUDiv(std::forward(args)...); + } + + template + llvm::Value* URem(Args&&... args) { + return mixin_builder()->CreateURem(std::forward(args)...); + } + + template + llvm::Value* VectorSplat(Args&&... args) { + return mixin_builder()->CreateVectorSplat(std::forward(args)...); + } + + template + llvm::Value* ZExtOrTrunc(Args&&... args) { + return mixin_builder()->CreateZExtOrTrunc(std::forward(args)...); + } + + template + llvm::Value* AShr(Args&&... args) { + return mixin_builder()->CreateAShr(std::forward(args)...); + } + + template + llvm::Value* FCmpOEQ(Args&&... args) { + return mixin_builder()->CreateFCmpOEQ(std::forward(args)...); + } + + template + llvm::Value* FCmpOLT(Args&&... 
args) { + return mixin_builder()->CreateFCmpOLT(std::forward(args)...); + } + + template + llvm::Value* FCmpONE(Args&&... args) { + return mixin_builder()->CreateFCmpONE(std::forward(args)...); + } + + template + llvm::Value* FCmpUNE(Args&&... args) { + return mixin_builder()->CreateFCmpUNE(std::forward(args)...); + } + + template + llvm::Value* FDiv(Args&&... args) { + return mixin_builder()->CreateFDiv(std::forward(args)...); + } + + template + llvm::Value* FNeg(Args&&... args) { + return mixin_builder()->CreateFNeg(std::forward(args)...); + } + + template + llvm::Value* FPCast(Args&&... args) { + return mixin_builder()->CreateFPCast(std::forward(args)...); + } + + template + llvm::Value* FPToSI(Args&&... args) { + return mixin_builder()->CreateFPToSI(std::forward(args)...); + } + + template + llvm::Value* FPToUI(Args&&... args) { + return mixin_builder()->CreateFPToUI(std::forward(args)...); + } + + template + llvm::Value* FPTrunc(Args&&... args) { + return mixin_builder()->CreateFPTrunc(std::forward(args)...); + } + + template + llvm::Value* FRem(Args&&... args) { + return mixin_builder()->CreateFRem(std::forward(args)...); + } + + template + llvm::Value* FSub(Args&&... args) { + return mixin_builder()->CreateFSub(std::forward(args)...); + } + + template + llvm::Value* ICmpSGE(Args&&... args) { + return mixin_builder()->CreateICmpSGE(std::forward(args)...); + } + + template + llvm::Value* ICmpSLT(Args&&... args) { + return mixin_builder()->CreateICmpSLT(std::forward(args)...); + } + + template + llvm::Value* IntCast(Args&&... args) { + return mixin_builder()->CreateIntCast(std::forward(args)...); + } + + template + llvm::Value* LShr(Args&&... args) { + return mixin_builder()->CreateLShr(std::forward(args)...); + } + + template + llvm::Value* MemSet(Args&&... args) { + return mixin_builder()->CreateMemSet(std::forward(args)...); + } + + template + llvm::Value* Neg(Args&&... 
args) { + return mixin_builder()->CreateNeg(std::forward(args)...); + } + + template + llvm::Value* Not(Args&&... args) { + return mixin_builder()->CreateNot(std::forward(args)...); + } + + template + llvm::PHINode* PHI(Args&&... args) { + return mixin_builder()->CreatePHI(std::forward(args)...); + } + + template + llvm::Value* RetVoid(Args&&... args) { + return mixin_builder()->CreateRetVoid(std::forward(args)...); + } + + template + llvm::Value* SExtOrTrunc(Args&&... args) { + return mixin_builder()->CreateSExtOrTrunc(std::forward(args)...); + } + + template + llvm::Value* Shl(Args&&... args) { + return mixin_builder()->CreateShl(std::forward(args)...); + } + + template + llvm::Value* SIToFP(Args&&... args) { + return mixin_builder()->CreateSIToFP(std::forward(args)...); + } + + template + llvm::Value* Sub(Args&&... args) { + return mixin_builder()->CreateSub(std::forward(args)...); + } + + template + llvm::Value* Trunc(Args&&... args) { + return mixin_builder()->CreateTrunc(std::forward(args)...); + } + + template + llvm::Value* UIToFP(Args&&... args) { + return mixin_builder()->CreateUIToFP(std::forward(args)...); + } + + template + llvm::Value* Unreachable(Args&&... args) { + return mixin_builder()->CreateUnreachable(std::forward(args)...); + } + + template + llvm::Value* Xor(Args&&... 
args) { + return mixin_builder()->CreateXor(std::forward(args)...); + } + + private: + llvm::IRBuilder<>* mixin_builder() { + return static_cast(this)->builder(); + } +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_IR_BUILDER_MIXIN_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc index 23d2d4e87d26f4988ebddcf20f5a27af6a7fe0d6..bd0139f85b6a5c5dc23dad962263038451921e65 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc @@ -15,62 +15,66 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" namespace xla { -void KernelSupportLibrary::For( - tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, +Status KernelSupportLibrary::For( + absl::string_view name, llvm::Value* start, llvm::Value* end, llvm::Value* step, - const std::function& for_body_generator) { - If(ir_builder_->CreateICmpSLT(start, end), [&]() { - for_body_generator(start, /*is_first_iteration=*/true); - For(name, ir_builder_->CreateAdd(start, step), end, step, - [&](llvm::Value* iv) { for_body_generator(iv, false); }); + const std::function& for_body_generator) { + return If(b_->CreateICmpSLT(start, end), [&]() -> Status { + TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true)); + return For(name, b_->CreateAdd(start, step), end, step, + [&](llvm::Value* iv) { return for_body_generator(iv, false); }); }); } -void KernelSupportLibrary::For( - tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, +Status KernelSupportLibrary::For( + absl::string_view name, llvm::Value* start, llvm::Value* end, llvm::Value* step, bool peel_first_iteration, - const 
std::function& for_body_generator) { + const std::function& + for_body_generator) { if (peel_first_iteration) { - For(name, start, end, step, true, - [&](llvm::Value* indvar, bool is_first_iteration) { - for_body_generator(indvar, ir_builder_->getInt1(is_first_iteration)); - }); + return For(name, start, end, step, true, + [&](llvm::Value* indvar, bool is_first_iteration) -> Status { + return for_body_generator(indvar, + b_->getInt1(is_first_iteration)); + }); } else { std::unique_ptr loop = llvm_ir::ForLoop::EmitForLoop( - name, start, end, step, ir_builder_, - /*prevent_unrolling=*/prevent_unrolling_, + name, start, end, step, b_, + /*unroll_mode=*/unroll_mode_, /*prevent_vectorization=*/prevent_vectorization_); - ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back()); - for_body_generator(loop->GetIndVarValue(), - /*is_first_iteration=*/ir_builder_->CreateICmpEQ( - loop->GetIndVarValue(), start)); - llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_); + b_->SetInsertPoint(&loop->GetBodyBasicBlock()->back()); + TF_RETURN_IF_ERROR( + for_body_generator(loop->GetIndVarValue(), + /*is_first_iteration=*/b_->CreateICmpEQ( + loop->GetIndVarValue(), start))); + llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), b_); + return Status::OK(); } } -void KernelSupportLibrary::If( - llvm::Value* condition, const std::function& true_block_generator, - const std::function& false_block_generator) { - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(condition, "", ir_builder_); - ir_builder_->SetInsertPoint(&if_data.true_block->back()); - true_block_generator(); - ir_builder_->SetInsertPoint(&if_data.false_block->back()); - false_block_generator(); - llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_); +Status KernelSupportLibrary::If( + absl::string_view name, llvm::Value* condition, + const std::function& true_block_generator, + const std::function& false_block_generator) { + llvm_ir::LlvmIfData if_data = 
llvm_ir::EmitIfThenElse(condition, name, b_); + b_->SetInsertPoint(&if_data.true_block->back()); + TF_RETURN_IF_ERROR(true_block_generator()); + b_->SetInsertPoint(&if_data.false_block->back()); + TF_RETURN_IF_ERROR(false_block_generator()); + llvm_ir::SetToLastInsertPoint(if_data.after_block, b_); + return Status::OK(); } void KernelSupportLibrary::EmitAndCallOutlinedKernel( - bool enable_fast_math, bool optimize_for_size, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name, + bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b, + absl::string_view kernel_name, KernelSupportLibrary::ArgumentVector arguments, const std::function& kernel_body_generator) { - llvm::Module* module = ir_builder->GetInsertBlock()->getModule(); + llvm::Module* module = b->GetInsertBlock()->getModule(); llvm::Function* function = module->getFunction(llvm_ir::AsStringRef(kernel_name)); @@ -93,22 +97,22 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel( std::back_inserter(arg_types), [](llvm::Value* arg) { return arg->getType(); }); - auto* function_type = llvm::FunctionType::get( - ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false); + auto* function_type = + llvm::FunctionType::get(b->getVoidTy(), arg_types, /*isVarArg=*/false); function = llvm_ir::CreateFunction( function_type, llvm::GlobalValue::InternalLinkage, /*enable_fast_math=*/enable_fast_math, /*optimize_for_size=*/optimize_for_size, kernel_name, module); - llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder); + llvm::IRBuilder<>::InsertPointGuard guard(*b); auto* entry_bb = - llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function); - auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(), + llvm::BasicBlock::Create(b->getContext(), "entry", function); + auto* return_inst = llvm::ReturnInst::Create(b->getContext(), /*retVal=*/nullptr, entry_bb); // Set the insert point to before return_inst. 
- ir_builder->SetInsertPoint(return_inst); + b->SetInsertPoint(return_inst); std::vector arg_values; /* @@ -128,7 +132,7 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel( VLOG(3) << "Re-using kernel for " << kernel_name; } - ir_builder->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args)); + b->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h index 64b935bbf1fb9033cd2e1259b4639cd3780be711..43fec311f150d6054f6ad24f99db332f90ff94a3 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h @@ -13,30 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_ #include +#include "absl/strings/string_view.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/core/lib/core/stringpiece.h" namespace xla { // A thin wrapper around llvm_loop.h to make code generating structured control // flow more readable. class KernelSupportLibrary { public: - // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR. - // If `prevent_unrolling` is true then unrolling is explicitly disabled on - // every loop generated by this instance of KernelSupportLibrary. 
- explicit KernelSupportLibrary(llvm::IRBuilder<>* ir_builder, - bool prevent_unrolling = true, - bool prevent_vectorization = true) - : ir_builder_(ir_builder), - prevent_unrolling_(prevent_unrolling), + // `b` is the llvm::IRBuilder instance used to generate LLVM IR. + // `unroll_mode` specifies the desired LLVM unrolling behavior for every loop + // generated by this instance of KernelSupportLibrary. + explicit KernelSupportLibrary( + llvm::IRBuilder<>* b, + llvm_ir::UnrollMode unroll_mode = llvm_ir::UnrollMode::kNoUnroll, + bool prevent_vectorization = true) + : b_(b), + unroll_mode_(unroll_mode), prevent_vectorization_(prevent_vectorization) {} // Generates the following control flow structure: @@ -46,19 +48,41 @@ class KernelSupportLibrary { // for (i64 i = `start` + `step`; i s< `end`; i += `step`) // `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`; // } - void For( - tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, + Status For( + absl::string_view name, llvm::Value* start, llvm::Value* end, + llvm::Value* step, + const std::function& for_body_generator); + + void ForReturnVoid( + absl::string_view name, llvm::Value* start, llvm::Value* end, llvm::Value* step, const std::function& - for_body_generator); + for_body_generator) { + CHECK_EQ(Status::OK(), + For(name, start, end, step, + [&](llvm::Value* ind_var, bool is_first_iteration) -> Status { + for_body_generator(ind_var, is_first_iteration); + return Status::OK(); + })); + } - void For( - tensorflow::StringPiece name, int64 start, int64 end, int64 step, + Status For(absl::string_view name, int64 start, int64 end, int64 step, + const std::function& + for_body_generator) { + return For(name, /*start=*/b_->getInt64(start), + /*end=*/b_->getInt64(end), + /*step=*/b_->getInt64(step), for_body_generator); + } + + void ForReturnVoid( + absl::string_view name, int64 start, int64 end, int64 step, const std::function& for_body_generator) { - For(name, 
/*start=*/ir_builder_->getInt64(start), - /*end=*/ir_builder_->getInt64(end), - /*step=*/ir_builder_->getInt64(step), for_body_generator); + ForReturnVoid(name, /*start=*/b_->getInt64(start), + /*end=*/b_->getInt64(end), + /*step=*/b_->getInt64(step), for_body_generator); } // Generates the following control flow structure if `peel_first_iteration` is @@ -75,46 +99,100 @@ class KernelSupportLibrary { // for (i64 i = `start`; i s< `end`; i += `step`) // `for_body_generator(/*ind_var=*/,i, // /*is_first_iteration=*/,(i != `start`))`; - void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, - llvm::Value* step, bool peel_first_iteration, - const std::function& - for_body_generator); - - void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, - int64 step, bool peel_first_iteration, - const std::function& - for_body_generator) { - For(name, /*start=*/start, /*end=*/end, - /*step=*/ir_builder_->getInt64(step), peel_first_iteration, - for_body_generator); - } - - void For( - tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, + Status For(absl::string_view name, llvm::Value* start, llvm::Value* end, + llvm::Value* step, bool peel_first_iteration, + const std::function& + for_body_generator); + + void ForReturnVoid(absl::string_view name, llvm::Value* start, + llvm::Value* end, llvm::Value* step, + bool peel_first_iteration, + const std::function& + for_body_generator) { + TF_CHECK_OK(For( + name, start, end, step, peel_first_iteration, + [&](llvm::Value* ind_var, llvm::Value* is_first_iteration) -> Status { + for_body_generator(ind_var, is_first_iteration); + return Status::OK(); + })); + } + + Status For(absl::string_view name, llvm::Value* start, llvm::Value* end, + int64 step, bool peel_first_iteration, + const std::function& + for_body_generator) { + return For(name, /*start=*/start, /*end=*/end, + /*step=*/llvm::ConstantInt::get(start->getType(), step), + peel_first_iteration, for_body_generator); + } + + 
void ForReturnVoid(absl::string_view name, llvm::Value* start, + llvm::Value* end, int64 step, bool peel_first_iteration, + const std::function& + for_body_generator) { + ForReturnVoid(name, /*start=*/start, /*end=*/end, + /*step=*/llvm::ConstantInt::get(start->getType(), step), + peel_first_iteration, for_body_generator); + } + + Status For( + absl::string_view name, llvm::Value* start, llvm::Value* end, + llvm::Value* step, + const std::function& for_body_generator) { + return For(name, start, end, step, + /*peel_first_iteration=*/false, + [&](llvm::Value* indvar, llvm::Value*) -> Status { + return for_body_generator(indvar); + }); + } + + void ForReturnVoid( + absl::string_view name, llvm::Value* start, llvm::Value* end, llvm::Value* step, const std::function& for_body_generator) { - For(name, start, end, step, - /*peel_first_iteration=*/false, - [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); }); + ForReturnVoid(name, start, end, step, + /*peel_first_iteration=*/false, + [&](llvm::Value* indvar, llvm::Value*) { + return for_body_generator(indvar); + }); } - void For( - tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, - int64 step, + Status For( + absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step, + const std::function& for_body_generator) { + return For(name, start, end, llvm::ConstantInt::get(start->getType(), step), + /*peel_first_iteration=*/false, + [&](llvm::Value* indvar, llvm::Value*) -> Status { + return for_body_generator(indvar); + }); + } + + void ForReturnVoid( + absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step, const std::function& for_body_generator) { - For(name, start, end, ir_builder_->getInt64(step), - /*peel_first_iteration=*/false, - [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); }); + ForReturnVoid(name, start, end, + llvm::ConstantInt::get(start->getType(), step), + for_body_generator); + } + + Status For( + absl::string_view name, 
int64 start, int64 end, int64 step, + const std::function& for_body_generator) { + return For(name, /*start=*/b_->getInt64(start), + /*end=*/b_->getInt64(end), + /*step=*/b_->getInt64(step), for_body_generator); } - void For( - tensorflow::StringPiece name, int64 start, int64 end, int64 step, + void ForReturnVoid( + absl::string_view name, int64 start, int64 end, int64 step, const std::function& for_body_generator) { - For(name, /*start=*/ir_builder_->getInt64(start), - /*end=*/ir_builder_->getInt64(end), - /*step=*/ir_builder_->getInt64(step), for_body_generator); + ForReturnVoid(name, /*start=*/b_->getInt64(start), + /*end=*/b_->getInt64(end), + /*step=*/b_->getInt64(step), for_body_generator); } // Generates the following control flow structure: @@ -123,11 +201,41 @@ class KernelSupportLibrary { // `true_block_generator()`; // else // `false_block_generator()`; - void If(llvm::Value* condition, - const std::function& true_block_generator, - const std::function& false_block_generator = []() {}); + Status If(absl::string_view name, llvm::Value* condition, + const std::function& true_block_generator, + const std::function& false_block_generator = + []() -> Status { return Status::OK(); }); + + Status If(llvm::Value* condition, + const std::function& true_block_generator, + const std::function& false_block_generator = + []() -> Status { return Status::OK(); }) { + return If("", condition, true_block_generator, false_block_generator); + } + + void IfReturnVoid(llvm::Value* condition, + const std::function& true_block_generator, + const std::function& false_block_generator = []() { + }) { + IfReturnVoid("", condition, true_block_generator, false_block_generator); + } + + void IfReturnVoid(absl::string_view name, llvm::Value* condition, + const std::function& true_block_generator, + const std::function& false_block_generator = []() { + }) { + TF_CHECK_OK(If(name, condition, + [&]() { + true_block_generator(); + return Status::OK(); + }, + [&]() { + 
false_block_generator(); + return Status::OK(); + })); + } - using ArgumentVector = tensorflow::gtl::ArraySlice; + using ArgumentVector = absl::Span; // Generates the following control flow structure: // @@ -148,44 +256,42 @@ class KernelSupportLibrary { // in a nullptr llvm::Value* in its position to `kernel_body_generator`. // Currently we only support at most one nullptr value in `arguments`. static void EmitAndCallOutlinedKernel( - bool enable_fast_math, bool optimize_for_size, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name, - ArgumentVector arguments, + bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b, + absl::string_view kernel_name, ArgumentVector arguments, const std::function& kernel_body_generator); // Thin wrappers around the more general EmitAndCallOutlinedKernel above. static void EmitAndCallOutlinedKernel( - bool enable_fast_math, bool optimize_for_size, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name, - llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2, + bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b, + absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1, + llvm::Value* arg2, const std::function& kernel_body_generator) { EmitAndCallOutlinedKernel( - enable_fast_math, optimize_for_size, ir_builder, kernel_name, - {arg0, arg1, arg2}, [&](ArgumentVector args) { + enable_fast_math, optimize_for_size, b, kernel_name, {arg0, arg1, arg2}, + [&](ArgumentVector args) { kernel_body_generator(args[0], args[1], args[2]); }); } static void EmitAndCallOutlinedKernel( - bool enable_fast_math, bool optimize_for_size, - llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name, - llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2, - llvm::Value* arg3, + bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b, + absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1, + llvm::Value* arg2, llvm::Value* arg3, const 
std::function& kernel_body_generator) { EmitAndCallOutlinedKernel( - enable_fast_math, optimize_for_size, ir_builder, kernel_name, + enable_fast_math, optimize_for_size, b, kernel_name, {arg0, arg1, arg2, arg3}, [&](ArgumentVector args) { kernel_body_generator(args[0], args[1], args[2], args[3]); }); } private: - llvm::IRBuilder<>* ir_builder_; - bool prevent_unrolling_; + llvm::IRBuilder<>* b_; + llvm_ir::UnrollMode unroll_mode_; bool prevent_vectorization_; }; } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5fbdbd51b8a9aa14decadedd1eeb3bdbf831738 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc @@ -0,0 +1,117 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { +namespace llvm_ir { + +namespace { +// Returns the indices of the first elements of all consecutive subarrays of the +// given array. For example: +// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4} +std::vector ConsecutiveSegments(absl::Span xs) { + std::vector is = {0}; + for (size_t i = 1; i < xs.size(); ++i) { + if (1 != xs[i] - xs[i - 1]) { + is.push_back(i); + } + } + return is; +} + +// Merges the sequences of dimensions of the given shape which start at the +// given indices `segs`. +Shape MergeDimensions(absl::Span segs, const Shape& shape) { + std::vector dimensions; + for (size_t i = 1; i <= segs.size(); ++i) { + dimensions.push_back(std::accumulate( + shape.dimensions().begin() + segs[i - 1], + shape.dimensions().begin() + + (segs.size() == i ? 
shape.dimensions().size() : segs[i]), + 1, std::multiplies())); + } + return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(), + dimensions); +} +} // namespace + +absl::optional > FindTranspose021(const Shape& a, + const Shape& b) { + if (!ShapeUtil::CompatibleIgnoringElementType(a, b)) { + return absl::nullopt; + } + + std::vector perm(a.dimensions().size()); + { + auto layout_a_orig = LayoutUtil::MinorToMajor(a); + std::vector layout_a(layout_a_orig.rbegin(), layout_a_orig.rend()); + auto layout_b_orig = LayoutUtil::MinorToMajor(b); + std::vector layout_b(layout_b_orig.rbegin(), layout_b_orig.rend()); + for (size_t i = 0; i < perm.size(); ++i) { + perm[i] = PositionInContainer(layout_b, layout_a[i]); + } + } + auto segs = ConsecutiveSegments(perm); + if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) { + Shape norm_a = + ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a); + Shape reduced_a = MergeDimensions(segs, norm_a); + auto reduced_a_dims = reduced_a.dimensions(); + std::vector dims_021; + if (2 == segs.size()) { + // The logical component-0 is of size one. 
+ dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]}; + } else { + dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]}; + } + + return dims_021; + } + + return absl::nullopt; +} + +IrArray::Index GetUnreducedOutputIndex( + const IrArray::Index& reduced_output_index, + const Shape& reduced_output_shape, const Shape& unreduced_output_shape, + llvm::IRBuilder<>* b) { + auto bounds = reduced_output_shape.dimensions(); + auto minor_to_major = reduced_output_shape.layout().minor_to_major(); + llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0); + int64 multiplier = 1; + for (int i = 0; i < reduced_output_index.size(); ++i) { + int64 dim = minor_to_major[i]; + llvm::Value* addend = + b->CreateMul(reduced_output_index[dim], + reduced_output_index.GetConstantWithIndexType(multiplier), + "linearizing", + /*HasNUW=*/true, /*HasNSW=*/true); + linear_index = b->CreateAdd(linear_index, addend, "", + /*HasNUW=*/true, /*HasNSW=*/true); + multiplier *= bounds[dim]; + } + + return IrArray::Index(linear_index, unreduced_output_shape, b); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..5ea05b3188a1c0881e4c0c41625d530aff1b1205 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -0,0 +1,80 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ + +#include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" + +namespace xla { +namespace llvm_ir { + +// About 0-2-1 transpose: +// +// If a shape can be viewed as three logical components 0-1-2 in the order of +// major to minor, a 0-2-1-transpose changes the order of such logical +// components to 0-2-1. We call the shape being transposed the input shape and +// the transposed shape the output shape. The logical view of the input and +// output shapes for the transpose are called the 0-1-2 shape or reduced input +// shape and the 0-2-1 shape or the reduced output shape respectively. The +// original input and output shapes are called the unreduced input and output +// shapes. + +// If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the +// reduced shape of `b` or the 0-2-1 shape. +absl::optional > FindTranspose021(const Shape& a, + const Shape& b); + +// Return the unreduced output index corresponding to the given reduced output +// index. +IrArray::Index GetUnreducedOutputIndex( + const IrArray::Index& reduced_output_index, + const Shape& reduced_output_shape, const Shape& unreduced_output_shape, + llvm::IRBuilder<>* b); + +// A class to represent information for tiled parameters to support IR emission +// for 021 transpose. 
+class TiledParameterInfo { + public: + TiledParameterInfo(absl::Span param_buffers, + llvm::Value* y, llvm::Value* x) + : param_buffers_(param_buffers), y_(y), x_(x) {} + + llvm::Value* x() const { return x_; } + llvm::Value* y() const { return y_; } + + void set_x(llvm::Value* x) { x_ = x; } + void set_y(llvm::Value* y) { y_ = y; } + + llvm::Value* GetBufferForParameter(int64 index) const { + return param_buffers_[index]; + } + + private: + // Param_buffers_[i] stores the tile buffer for the ith parameter or nullptr + // if the parameter is not tiled. + absl::Span param_buffers_; + // The y coordinate within a tile. + llvm::Value* y_; + // The x coordinate within a tile. + llvm::Value* x_; +}; + +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_TILING_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc index 497b48ff227d7d1f158080529372df44b6932b24..219a9f221fbd116cdfbaf17985e21d82aefd079d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -25,49 +26,47 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { namespace llvm_ir { -ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix, +ForLoop::ForLoop(absl::string_view prefix, absl::string_view suffix, llvm::Value* start_index, llvm::Value* end_index, - llvm::Value* step, bool prevent_unrolling, + llvm::Value* step, UnrollMode unroll_mode, bool prevent_vectorization) - : prefix_(std::string(prefix)), - suffix_(std::string(suffix)), + : prefix_(prefix), + suffix_(suffix), start_index_(start_index), end_index_(end_index), step_(step), insert_before_bb_(nullptr), - prevent_unrolling_(prevent_unrolling), + unroll_mode_(unroll_mode), prevent_vectorization_(prevent_vectorization) {} /* static */ std::unique_ptr ForLoop::EmitForLoop( - tensorflow::StringPiece prefix, llvm::Value* start_index, - llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder, - bool prevent_unrolling, bool prevent_vectorization) { + absl::string_view prefix, llvm::Value* start_index, llvm::Value* end_index, + llvm::Value* step, llvm::IRBuilder<>* b, UnrollMode unroll_mode, + bool prevent_vectorization) { std::unique_ptr loop(new ForLoop(prefix, /*suffix=*/"", start_index, - end_index, step, prevent_unrolling, + end_index, step, unroll_mode, prevent_vectorization)); - loop->Emit(ir_builder); + loop->Emit(b); return loop; } -void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) { +void ForLoop::Emit(llvm::IRBuilder<>* b) { // The preheader block is the block the builder is currently emitting // code into. 
- preheader_bb_ = ir_builder->GetInsertBlock(); + preheader_bb_ = b->GetInsertBlock(); - llvm::BasicBlock::iterator insert_point = ir_builder->GetInsertPoint(); + llvm::BasicBlock::iterator insert_point = b->GetInsertPoint(); if (insert_point == preheader_bb_->end()) { // We're emitting the loop at the end of a basic block. Verify there is no // terminator (eg, branch) in the basic block. CHECK_EQ(nullptr, preheader_bb_->getTerminator()); - exit_bb_ = CreateLoopBB("loop_exit", ir_builder); + exit_bb_ = CreateLoopBB("loop_exit", b); } else { // We're emitting the loop into the middle of a basic block. splitBasicBlock // requires that this basic block be well-formed (have a terminator). @@ -86,51 +85,50 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) { insert_before_bb_ = exit_bb_; // Create remaining basic block which form the inside of the loop. - header_bb_ = CreateLoopBB("loop_header", ir_builder); - body_bb_ = CreateLoopBB("loop_body", ir_builder); + header_bb_ = CreateLoopBB("loop_header", b); + body_bb_ = CreateLoopBB("loop_body", b); // Function entry basic block. // Emit alloca for the induction variable. We do this at the entry to the // basic block to ensure the alloc only executes once per function (we could // be emitting a nested loop). llvm::Function* func = preheader_bb_->getParent(); - ir_builder->SetInsertPoint(&func->getEntryBlock(), - func->getEntryBlock().getFirstInsertionPt()); + b->SetInsertPoint(&func->getEntryBlock(), + func->getEntryBlock().getFirstInsertionPt()); llvm::Value* indvar_address = - ir_builder->CreateAlloca(ir_builder->getInt64Ty(), nullptr, - AsStringRef(GetQualifiedName("invar_address"))); + b->CreateAlloca(start_index_->getType(), nullptr, + AsStringRef(GetQualifiedName("invar_address"))); // Preheader basic block. // Initialize induction variable starting index. Create branch to the header. 
- ir_builder->SetInsertPoint(preheader_bb_); - ir_builder->CreateStore(start_index_, indvar_address); + b->SetInsertPoint(preheader_bb_); + b->CreateStore(start_index_, indvar_address); // The preheader should not have a branch yet. CHECK_EQ(preheader_bb_->getTerminator(), nullptr); - ir_builder->CreateBr(header_bb_); + b->CreateBr(header_bb_); // Header basic block. // Emit the loop conditional branch. Load and compare indvar with ending // index and jump to loop exit if equal. Jump to body otherwise. - ir_builder->SetInsertPoint(header_bb_); - indvar_ = ir_builder->CreateLoad(indvar_address, - AsStringRef(GetQualifiedName("indvar"))); - llvm::Value* exit_cond = ir_builder->CreateICmpUGE(indvar_, end_index_); - ir_builder->CreateCondBr(/*Cond=*/exit_cond, - /*True=*/exit_bb_, /*False=*/body_bb_); + b->SetInsertPoint(header_bb_); + indvar_ = + b->CreateLoad(indvar_address, AsStringRef(GetQualifiedName("indvar"))); + llvm::Value* exit_cond = b->CreateICmpUGE(indvar_, end_index_); + b->CreateCondBr(/*Cond=*/exit_cond, + /*True=*/exit_bb_, /*False=*/body_bb_); // Body basic block. // Increment indvar, store indvar, and jump to header. 
- ir_builder->SetInsertPoint(body_bb_); + b->SetInsertPoint(body_bb_); llvm::Value* step = step_; llvm::Value* indvar = indvar_; - llvm::Value* indvar_inc = - ir_builder->CreateAdd(indvar, step, "invar.inc", - /*HasNUW=*/true, /*HasNSW=*/true); - ir_builder->CreateStore(indvar_inc, indvar_address); - llvm::BranchInst* back_branch = ir_builder->CreateBr(header_bb_); + llvm::Value* indvar_inc = b->CreateAdd(indvar, step, "invar.inc", + /*HasNUW=*/true, /*HasNSW=*/true); + b->CreateStore(indvar_inc, indvar_address); + llvm::BranchInst* back_branch = b->CreateBr(header_bb_); - std::vector loop_metadata = GetLoopMetadata(ir_builder); + std::vector loop_metadata = GetLoopMetadata(b); if (!loop_metadata.empty()) { llvm::LLVMContext* ctx = &start_index_->getContext(); auto temp_node = llvm::MDNode::getTemporary(*ctx, llvm::None); @@ -141,17 +139,17 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) { } // Re-point the IR builder to the loop exit block. - ir_builder->SetInsertPoint(exit_bb_); + b->SetInsertPoint(exit_bb_); } -std::vector ForLoop::GetLoopMetadata( - llvm::IRBuilder<>* ir_builder) { +std::vector ForLoop::GetLoopMetadata(llvm::IRBuilder<>* b) { const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable"; + const char* const kLlvmLoopUnrollFullMDName = "llvm.loop.unroll.full"; const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable"; llvm::LLVMContext* ctx = &start_index_->getContext(); std::vector result; - if (prevent_unrolling_) { + if (unroll_mode_ == xla::llvm_ir::UnrollMode::kNoUnroll) { result.push_back(llvm::MDNode::get( *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)})); } @@ -159,45 +157,45 @@ std::vector ForLoop::GetLoopMetadata( if (prevent_vectorization_) { result.push_back(llvm::MDNode::get( *ctx, {llvm::MDString::get(*ctx, kLlvmLoopVectorizeMDName), - llvm::ConstantAsMetadata::get(ir_builder->getFalse())})); + llvm::ConstantAsMetadata::get(b->getFalse())})); } + if (unroll_mode_ == 
xla::llvm_ir::UnrollMode::kFullyUnroll) { + result.push_back(llvm::MDNode::get( + *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollFullMDName)})); + } return result; } -string ForLoop::GetQualifiedName(tensorflow::StringPiece name) { +string ForLoop::GetQualifiedName(absl::string_view name) { return llvm_ir::IrName(prefix_, llvm_ir::IrName(name, suffix_)); } -llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder) { - return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name), - ir_builder); +llvm::BasicBlock* ForLoop::CreateLoopBB(absl::string_view name, + llvm::IRBuilder<>* b) { + return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name), b); } -std::unique_ptr ForLoopNest::AddLoop(tensorflow::StringPiece suffix, +std::unique_ptr ForLoopNest::AddLoop(absl::string_view suffix, llvm::Value* start_index, llvm::Value* end_index, - bool prevent_unrolling, + UnrollMode unroll_mode, bool prevent_vectorization) { - return AddLoop(suffix, start_index, end_index, ir_builder_->getInt64(1), - prevent_unrolling, prevent_vectorization); + return AddLoop(suffix, start_index, end_index, GetConstantWithIndexType(1), + unroll_mode, prevent_vectorization); } -std::unique_ptr ForLoopNest::AddLoop(tensorflow::StringPiece suffix, - llvm::Value* start_index, - llvm::Value* end_index, - llvm::Value* stride, - bool prevent_unrolling, - bool prevent_vectorization) { +std::unique_ptr ForLoopNest::AddLoop( + absl::string_view suffix, llvm::Value* start_index, llvm::Value* end_index, + llvm::Value* stride, UnrollMode unroll_mode, bool prevent_vectorization) { if (inner_loop_body_bb_ != nullptr) { // Create this loop inside the previous one. 
- ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt()); + b_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt()); } std::unique_ptr loop(new ForLoop( - /*prefix=*/name_, suffix, start_index, end_index, stride, - prevent_unrolling, prevent_vectorization)); - loop->Emit(ir_builder_); + /*prefix=*/name_, suffix, start_index, end_index, stride, unroll_mode, + prevent_vectorization)); + loop->Emit(b_); if (outer_loop_preheader_bb_ == nullptr) { outer_loop_preheader_bb_ = loop->GetPreheaderBasicBlock(); @@ -214,48 +212,78 @@ std::unique_ptr ForLoopNest::AddLoop(tensorflow::StringPiece suffix, std::unique_ptr ForLoopNest::AddLoop(int64 start_index, int64 end_index, - tensorflow::StringPiece suffix, - bool prevent_unrolling, + absl::string_view suffix, + UnrollMode unroll_mode, bool prevent_vectorization) { CHECK_LE(start_index, end_index); - return AddLoop(suffix, ir_builder_->getInt64(start_index), - ir_builder_->getInt64(end_index), prevent_unrolling, + return AddLoop(suffix, GetConstantWithIndexType(start_index), + GetConstantWithIndexType(end_index), unroll_mode, prevent_vectorization); } std::unique_ptr ForLoopNest::AddLoop(int64 start_index, int64 end_index, int64 stride, - tensorflow::StringPiece suffix, - bool prevent_unrolling, + absl::string_view suffix, + UnrollMode unroll_mode, bool prevent_vectorization) { CHECK_LE(start_index, end_index); - return AddLoop(suffix, ir_builder_->getInt64(start_index), - ir_builder_->getInt64(end_index), - ir_builder_->getInt64(stride), prevent_unrolling, + return AddLoop(suffix, GetConstantWithIndexType(start_index), + GetConstantWithIndexType(end_index), + GetConstantWithIndexType(stride), unroll_mode, prevent_vectorization); } IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape, - tensorflow::StringPiece suffix) { + absl::string_view suffix) { std::vector dimensions(ShapeUtil::Rank(shape)); std::iota(dimensions.begin(), dimensions.end(), 0); return AddLoopsForShapeOnDimensions(shape, 
dimensions, suffix); } IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions( - const Shape& shape, tensorflow::gtl::ArraySlice dimensions, - tensorflow::StringPiece suffix) { - llvm_ir::IrArray::Index index(shape.dimensions_size(), nullptr); + const Shape& shape, absl::Span dimensions, + absl::string_view suffix) { + llvm_ir::IrArray::Index index(index_type_, shape.dimensions_size()); for (int64 dimension : dimensions) { std::unique_ptr loop = AddLoop( /*start_index=*/0, /*end_index=*/shape.dimensions(dimension), /*suffix=*/ - llvm_ir::IrName(suffix, tensorflow::strings::StrCat(dimension))); + llvm_ir::IrName(suffix, absl::StrCat(dimension))); index[dimension] = loop->GetIndVarValue(); } return index; } +IrArray::Index ForLoopNest::EmitOperandArrayLoopNest( + const llvm_ir::IrArray& operand_array, int64 dimension_to_skip, + absl::string_view name_suffix) { + // Prepares the dimension list we will use to emit the loop nest. Outermost + // loops are added first. Add loops in major-to-minor order, and skip the + // 'dimension_to_skip' dimension. + std::vector dimensions; + const Shape& shape = operand_array.GetShape(); + for (int64 dimension : LayoutUtil::MinorToMajor(shape)) { + if (dimension != dimension_to_skip) { + dimensions.push_back(dimension); + } + } + + // Create loop nest with one for-loop for each dimension of the + // output. + llvm_ir::IrArray::Index index = + AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix); + // Verify every dimension except the 'dimension_to_skip' dimension was set in + // the index. 
+ for (size_t dimension = 0; dimension < index.size(); ++dimension) { + if (dimension == dimension_to_skip) { + DCHECK_EQ(nullptr, index[dimension]); + } else { + DCHECK_NE(nullptr, index[dimension]); + } + } + return index; +} + } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h index d915f95db134918a173a9711936bb1e2f1ea0d95..ac3bba3c9fd6a9eb4e7822474963fcc5a394baf7 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h @@ -19,21 +19,27 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace llvm_ir { +enum class UnrollMode { + kDefaultUnroll, + kFullyUnroll, + kNoUnroll, +}; + // A class for constructing a for-loop in LLVM IR. class ForLoop { public: @@ -69,12 +75,13 @@ class ForLoop { // LLVM IR. If non-empty, it is prepended to the name of the induction // variable value and each basic block created for the loop. // - // If `prevent_unrolling` is true then emit metadata that directs LLVM to not - // unroll the generated loop. + // `unroll_mode` specifies the desired LLVM unrolling behavior for generated + // loop. 
static std::unique_ptr EmitForLoop( - tensorflow::StringPiece prefix, llvm::Value* start_index, - llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder, - bool prevent_unrolling = false, bool prevent_vectorization = false); + absl::string_view prefix, llvm::Value* start_index, + llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b, + UnrollMode unroll_mode = llvm_ir::UnrollMode::kDefaultUnroll, + bool prevent_vectorization = false); // The names of the blocks follow LLVM's conventions. Control flow amongst the // blocks for the example C code looks like: @@ -126,23 +133,22 @@ class ForLoop { // Allow ForLoopNest to call this private constructor. friend class ForLoopNest; - ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix, + ForLoop(absl::string_view prefix, absl::string_view suffix, llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step, - bool prevent_unrolling, bool prevent_vectorization); + UnrollMode unroll_mode, bool prevent_vectorization); // Emit the loop at the insert point of the builder. - void Emit(llvm::IRBuilder<>* ir_builder); + void Emit(llvm::IRBuilder<>* b); - llvm::BasicBlock* CreateLoopBB(tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder); + llvm::BasicBlock* CreateLoopBB(absl::string_view name, llvm::IRBuilder<>* b); // Creates a name for an LLVM construct, appending prefix_ and suffix_, if // they are set. - string GetQualifiedName(tensorflow::StringPiece name); + string GetQualifiedName(absl::string_view name); // Return a list of metadata nodes that should be associated with the // llvm::Loop for this `ForLoop`. 
- std::vector GetLoopMetadata(llvm::IRBuilder<>* ir_builder); + std::vector GetLoopMetadata(llvm::IRBuilder<>* b); string prefix_; string suffix_; @@ -161,7 +167,7 @@ class ForLoop { llvm::BasicBlock* body_bb_; llvm::BasicBlock* exit_bb_; llvm::Value* indvar_; - bool prevent_unrolling_; + UnrollMode unroll_mode_; bool prevent_vectorization_; TF_DISALLOW_COPY_AND_ASSIGN(ForLoop); @@ -170,46 +176,51 @@ class ForLoop { // A simple class for constructing nested for-loops. class ForLoopNest { public: - explicit ForLoopNest(llvm::IRBuilder<>* ir_builder) - : ForLoopNest(/*name=*/"", ir_builder) {} + explicit ForLoopNest(llvm::IRBuilder<>* b, llvm::Type* index_ty = nullptr) + : ForLoopNest(/*name=*/"", b) { + SetIndexType(index_ty); + } - ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) - : name_(std::string(name)), + ForLoopNest(absl::string_view name, llvm::IRBuilder<>* b, + llvm::Type* index_ty = nullptr) + : name_(name), outer_loop_preheader_bb_(nullptr), outer_loop_exit_bb_(nullptr), inner_loop_body_bb_(nullptr), - ir_builder_(ir_builder) {} + b_(b) { + SetIndexType(index_ty); + } // Adds a loop to the nest. If no loop has been added yet then emit a loop at // the current insert point of the given builder. If one or more loops have - // been added then emit loop inside the body of the last added loop. If - // prevent_unrolling is true, then metadata is emitting directing LLVM to not - // unroll this loop. - std::unique_ptr AddLoop(tensorflow::StringPiece suffix, - llvm::Value* start_index, - llvm::Value* end_index, llvm::Value* stride, - bool prevent_unrolling = false, - bool prevent_vectorization = false); + // been added then emit loop inside the body of the last added loop. + // unroll_mode is used to emit metadata that controls LLVM unrolling. 
+ std::unique_ptr AddLoop( + absl::string_view suffix, llvm::Value* start_index, + llvm::Value* end_index, llvm::Value* stride, + UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll, + bool prevent_vectorization = false); // Like the above, except that it defaults to a stride of one. - std::unique_ptr AddLoop(tensorflow::StringPiece suffix, - llvm::Value* start_index, - llvm::Value* end_index, - bool prevent_unrolling = false, - bool prevent_vectorization = false); + std::unique_ptr AddLoop( + absl::string_view suffix, llvm::Value* start_index, + llvm::Value* end_index, + UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll, + bool prevent_vectorization = false); // A convenient wrapper of the other flavor of AddLoop. The given start and // end index are constant. - std::unique_ptr AddLoop(int64 start_index, int64 end_index, - int64 stride, tensorflow::StringPiece suffix, - bool prevent_unrolling = false, - bool prevent_vectorization = false); + std::unique_ptr AddLoop( + int64 start_index, int64 end_index, int64 stride, + absl::string_view suffix, + UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll, + bool prevent_vectorization = false); // Like the above, except that it defaults to a stride of one. - std::unique_ptr AddLoop(int64 start_index, int64 end_index, - tensorflow::StringPiece suffix, - bool prevent_unrolling = false, - bool prevent_vectorization = false); + std::unique_ptr AddLoop( + int64 start_index, int64 end_index, absl::string_view suffix, + UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll, + bool prevent_vectorization = false); // Add loops to iterate through the indices within the specified // shape. The returned index collects the induction variables of the @@ -222,8 +233,7 @@ class ForLoopNest { // within the shape. 
One possible order for that sequence would be: // // (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) - IrArray::Index AddLoopsForShape(const Shape& shape, - tensorflow::StringPiece suffix); + IrArray::Index AddLoopsForShape(const Shape& shape, absl::string_view suffix); // Add a loop for each dimension in "dimensions". "suffix" is the // name suffix of the indvar and basic blocks in this new loop nest. @@ -232,8 +242,19 @@ class ForLoopNest { // size equals the rank of shape and there is a null for each // dimension that is not in "dimensions". IrArray::Index AddLoopsForShapeOnDimensions( - const Shape& shape, tensorflow::gtl::ArraySlice dimensions, - tensorflow::StringPiece suffix); + const Shape& shape, absl::Span dimensions, + absl::string_view suffix); + + // Emits a series of nested loops for iterating over an operand array. Loops + // are constructed in major to minor dimension layout order. No loop is + // emitted for the given 'dimension_to_skip'. The function returns an IrArray + // index for the given operand_array containing the indvars of the loops. All + // dimensions of the index are filled except for 'dimension_to_skip'. + // name_suffix is the string to append to the names of LLVM constructs (eg, + // basic blocks) constructed by this method. + IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array, + int64 dimension_to_skip, + absl::string_view name_suffix); // Convenience methods which return particular basic blocks of the outermost // or innermost loops. These methods return nullptr if no loops have been @@ -245,6 +266,14 @@ class ForLoopNest { llvm::BasicBlock* GetInnerLoopBodyBasicBlock() { return inner_loop_body_bb_; } private: + void SetIndexType(llvm::Type* index_ty) { + index_type_ = index_ty == nullptr ? b_->getInt64Ty() : index_ty; + } + + llvm::Constant* GetConstantWithIndexType(int64 c) const { + return llvm::ConstantInt::get(index_type_, c); + } + // Human-friendly name of the loop nest. 
string name_; @@ -257,7 +286,9 @@ class ForLoopNest { // has been added yet. llvm::BasicBlock* inner_loop_body_bb_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; + + llvm::Type* index_type_; TF_DISALLOW_COPY_AND_ASSIGN(ForLoopNest); }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index ec04239b4f9112134ba876fdfbb3905a3baf1f72..1a53c026be340ca3bec3a49b11666d6124728130 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/MDBuilder.h" @@ -26,7 +28,7 @@ limitations under the License. #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" @@ -34,8 +36,7 @@ limitations under the License. #include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -47,8 +48,8 @@ namespace { // Note, this function is only useful in an insertion context; in a global // (e.g. constants) context it will CHECK fail. 
-llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* ir_builder) { - auto block = CHECK_NOTNULL(ir_builder->GetInsertBlock()); +llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* b) { + auto block = CHECK_NOTNULL(b->GetInsertBlock()); auto fn = CHECK_NOTNULL(block->getParent()); auto module = CHECK_NOTNULL(fn->getParent()); return module; @@ -60,7 +61,7 @@ string AsString(const std::string& str) { return string(str.data(), str.length()); } -llvm::StringRef AsStringRef(tensorflow::StringPiece str) { +llvm::StringRef AsStringRef(absl::string_view str) { return llvm::StringRef(str.data(), str.size()); } @@ -82,53 +83,44 @@ string DumpModuleToString(const llvm::Module& module) { return AsString(buffer_string); } -llvm::Value* EmitCallToIntrinsic( - llvm::Intrinsic::ID intrinsic_id, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice overloaded_types, - llvm::IRBuilder<>* ir_builder) { - std::vector types; - for (auto type : overloaded_types) { - types.push_back(type); - } - llvm::Module* module = ModuleFromIRBuilder(ir_builder); - llvm::Function* intrinsic = - llvm::Intrinsic::getDeclaration(module, intrinsic_id, types); - std::vector operands_vec; - for (auto operand : operands) { - operands_vec.push_back(operand); - } - return ir_builder->CreateCall(intrinsic, operands_vec); +llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id, + absl::Span operands, + absl::Span overloaded_types, + llvm::IRBuilder<>* b) { + llvm::Module* module = ModuleFromIRBuilder(b); + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration( + module, intrinsic_id, AsArrayRef(overloaded_types)); + return b->CreateCall(intrinsic, AsArrayRef(operands)); } llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder) { - if (ir_builder->getFastMathFlags().noNaNs()) { - auto cmp = ir_builder->CreateFCmpUGE(lhs_value, rhs_value); - return ir_builder->CreateSelect(cmp, lhs_value, rhs_value); + llvm::IRBuilder<>* b) { + 
if (b->getFastMathFlags().noNaNs()) { + auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); + return b->CreateSelect(cmp, lhs_value, rhs_value); } else { - auto cmp_ge = ir_builder->CreateFCmpOGE(lhs_value, rhs_value); - auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value); - auto sel_lhs = ir_builder->CreateOr(cmp_ge, lhs_is_nan); - return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value); + auto cmp_ge = b->CreateFCmpOGE(lhs_value, rhs_value); + auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value); + auto sel_lhs = b->CreateOr(cmp_ge, lhs_is_nan); + return b->CreateSelect(sel_lhs, lhs_value, rhs_value); } } llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder) { - if (ir_builder->getFastMathFlags().noNaNs()) { - auto cmp = ir_builder->CreateFCmpULE(lhs_value, rhs_value); - return ir_builder->CreateSelect(cmp, lhs_value, rhs_value); + llvm::IRBuilder<>* b) { + if (b->getFastMathFlags().noNaNs()) { + auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); + return b->CreateSelect(cmp, lhs_value, rhs_value); } else { - auto cmp_le = ir_builder->CreateFCmpOLE(lhs_value, rhs_value); - auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value); - auto sel_lhs = ir_builder->CreateOr(cmp_le, lhs_is_nan); - return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value); + auto cmp_le = b->CreateFCmpOLE(lhs_value, rhs_value); + auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value); + auto sel_lhs = b->CreateOr(cmp_le, lhs_is_nan); + return b->CreateSelect(sel_lhs, lhs_value, rhs_value); } } llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index, - llvm::IRBuilder<>* ir_builder) { + llvm::IRBuilder<>* b) { llvm::Type* array_type = array->getType(); CHECK(array_type->isPointerTy()); llvm::PointerType* array_type_as_pointer = @@ -138,16 +130,16 @@ llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index, << " array=" << llvm_ir::DumpToString(*array) << " index=" 
<< llvm_ir::DumpToString(*index); - return ir_builder->CreateInBoundsGEP( + return b->CreateInBoundsGEP( array_type_as_pointer->getElementType(), array, llvm::isa(array) - ? llvm::ArrayRef({ir_builder->getInt64(0), index}) + ? llvm::ArrayRef({b->getInt64(0), index}) : index); } llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index, - llvm::IRBuilder<>* ir_builder) { - return EmitBufferIndexingGEP(array, ir_builder->getInt64(index), ir_builder); + llvm::IRBuilder<>* b) { + return EmitBufferIndexingGEP(array, b->getInt64(index), b); } llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type, @@ -201,6 +193,10 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type, // An Opaque is like a void*, use i8*. case OPAQUE: return llvm::Type::getInt8PtrTy(module->getContext()); + case TOKEN: + // Tokens do not have a physical representation, but the compiler needs + // some placeholder type, so use int8*. + return llvm::Type::getInt8PtrTy(module->getContext()); default: LOG(FATAL) << "unsupported type " << element_type; } @@ -235,14 +231,15 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) { return result_type; } -StatusOr EncodeSelfDescribingShapeConstant( - const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder) { +StatusOr EncodeSelfDescribingShapeConstant(const Shape& shape, + int32* shape_size, + llvm::IRBuilder<>* b) { string encoded_shape = shape.SerializeAsString(); if (encoded_shape.size() > std::numeric_limits::max()) { return InternalError("Encoded shape size exceeded int32 size limit."); } *shape_size = static_cast(encoded_shape.size()); - return ir_builder->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape)); + return b->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape)); } StatusOr DecodeSelfDescribingShapeConstant(const void* shape_ptr, @@ -253,227 +250,107 @@ StatusOr DecodeSelfDescribingShapeConstant(const void* shape_ptr, return shape; } -namespace { - -// Recursively construct a 
multidimensional LLVM constant which represents the -// given literal. The minor-to-major dimension ordering in the constant matches -// that of the literal. For example, given a [2 x 3 x 4] Literal (dimension 0 -// has size 4, dimension 1 has size 3, etc) of primitive type F32 with a -// minor_to_major value of [2, 1, 0] (column major), a LLVM constant of type -// [4 x [3 x [2 x float]] will be returned. -// -// multi_index is a multidimensional index into the array. dimension_index is an -// index into the minor_to_major field in the literal shape. This determines -// which dimension is iterated over in this level of the recursion. Dimensions -// are iterated from most major down to most minor (highest dimension_index -// value down to zero). -llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index, - std::vector* multi_index, - llvm::Module* module) { - const Shape& shape = literal.shape(); - llvm::Type* ir_element_type = - llvm_ir::PrimitiveTypeToIrType(shape.element_type(), module); - if (dimension_index == -1) { - // Base case of the recursion. Index into the data field of the protobuf - // with the multi index. 
- llvm::Constant* value; - switch (shape.element_type()) { - case PRED: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case U8: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case S32: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case U32: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case S64: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case U64: - value = llvm::ConstantInt::get(ir_element_type, - literal.Get(*multi_index)); - break; - case F32: - value = llvm::ConstantFP::get(ir_element_type, - literal.Get(*multi_index)); - break; - case BF16: - value = llvm::ConstantInt::get( - ir_element_type, - tensorflow::bit_cast(literal.Get(*multi_index))); - break; - case F16: - value = llvm::ConstantFP::get( - ir_element_type, - static_cast(literal.Get(*multi_index))); - break; - case F64: - value = llvm::ConstantFP::get(ir_element_type, - literal.Get(*multi_index)); - break; - case C64: { - complex64 x = literal.Get(*multi_index); - value = llvm::ConstantStruct::get( - static_cast(ir_element_type), - llvm::ConstantFP::get(llvm_ir::PrimitiveTypeToIrType(F32, module), - x.real()), - llvm::ConstantFP::get(llvm_ir::PrimitiveTypeToIrType(F32, module), - x.imag())); - break; - } - default: - LOG(FATAL) << "unsupported type " << shape.element_type(); - } - return value; - } - - // The dimension index starts at the one less than the rank of the array and - // decrements with each recursive call. We want to iterate through the - // dimensions in major-to-minor order as we recurse so just index into - // minor_to_major to get the dimension number for this level of the recursion. - int64 dimension = LayoutUtil::Minor(shape.layout(), dimension_index); - - // Recursively call LiteralToConstant to construct subarrays for the - // more-minor dimensions. 
Gather the subarrays into a vector for bundling into - // a new (higher-dimensional) ConstantArray. - std::vector elements; - for (int64 i = 0; i < shape.dimensions(dimension); ++i) { - (*multi_index)[dimension] = i; - elements.push_back( - LiteralToConstant(literal, dimension_index - 1, multi_index, module)); - } - - llvm::Type* element_type; - if (elements.empty()) { - element_type = ir_element_type; - for (int i = 0; i < dimension_index; ++i) { - int64 index = LayoutUtil::Minor(shape.layout(), i); - element_type = - llvm::ArrayType::get(element_type, shape.dimensions(index)); - } - } else { - element_type = elements[0]->getType(); - } - llvm::ArrayType* aggregate_type = - llvm::ArrayType::get(element_type, shape.dimensions(dimension)); - return llvm::ConstantArray::get(aggregate_type, elements); -} - -} // namespace - llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, llvm::Module* module) { - std::vector multi_index(ShapeUtil::Rank(literal.shape()), 0); - llvm::Constant* value = LiteralToConstant( - literal, /*dimension_index=*/ShapeUtil::Rank(literal.shape()) - 1, - &multi_index, module); - return value; + const char* data = static_cast(literal.untyped_data()); + CHECK_EQ(module->getDataLayout().isLittleEndian(), + tensorflow::port::kLittleEndian); + return llvm::ConstantDataArray::getString( + module->getContext(), llvm::StringRef(data, literal.size_bytes()), + /*AddNull=*/false); } llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type, - tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, + absl::string_view name, + llvm::IRBuilder<>* b, int alignment) { - return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, ir_builder, - alignment); -} - -llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount( - llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, int alignment) { - llvm::IRBuilder<>::InsertPoint insert_point = ir_builder->saveIP(); - llvm::Function* function = 
ir_builder->GetInsertBlock()->getParent(); - ir_builder->SetInsertPoint(&function->getEntryBlock(), - function->getEntryBlock().getFirstInsertionPt()); + return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, b, alignment); +} + +llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, + llvm::Value* element_count, + absl::string_view name, + llvm::IRBuilder<>* b, + int alignment) { + llvm::IRBuilder<>::InsertPoint insert_point = b->saveIP(); + llvm::Function* function = b->GetInsertBlock()->getParent(); + b->SetInsertPoint(&function->getEntryBlock(), + function->getEntryBlock().getFirstInsertionPt()); llvm::AllocaInst* alloca = - ir_builder->CreateAlloca(type, element_count, AsStringRef(name)); + b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { alloca->setAlignment(alignment); } - ir_builder->restoreIP(insert_point); + b->restoreIP(insert_point); return alloca; } llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before, - tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder) { + absl::string_view name, + llvm::IRBuilder<>* b) { return llvm::BasicBlock::Create( - /*Context=*/ir_builder->getContext(), + /*Context=*/b->getContext(), /*Name=*/AsStringRef(name), - /*Parent=*/ir_builder->GetInsertBlock()->getParent(), + /*Parent=*/b->GetInsertBlock()->getParent(), /*InsertBefore*/ insert_before); } -LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, bool emit_else) { +LlvmIfData EmitIfThenElse(llvm::Value* condition, absl::string_view name, + llvm::IRBuilder<>* b, bool emit_else) { llvm_ir::LlvmIfData if_data; - if_data.if_block = ir_builder->GetInsertBlock(); - if_data.true_block = CreateBasicBlock( - nullptr, tensorflow::strings::StrCat(name, "-true"), ir_builder); + if_data.if_block = b->GetInsertBlock(); + if_data.true_block = + CreateBasicBlock(nullptr, absl::StrCat(name, "-true"), b); if_data.false_block = - emit_else ? 
CreateBasicBlock(nullptr, - tensorflow::strings::StrCat(name, "-false"), - ir_builder) + emit_else ? CreateBasicBlock(nullptr, absl::StrCat(name, "-false"), b) : nullptr; // Add a terminator to the if block, if necessary. if (if_data.if_block->getTerminator() == nullptr) { - ir_builder->SetInsertPoint(if_data.if_block); - if_data.after_block = CreateBasicBlock( - nullptr, tensorflow::strings::StrCat(name, "-after"), ir_builder); - ir_builder->CreateBr(if_data.after_block); + b->SetInsertPoint(if_data.if_block); + if_data.after_block = + CreateBasicBlock(nullptr, absl::StrCat(name, "-after"), b); + b->CreateBr(if_data.after_block); } else { if_data.after_block = if_data.if_block->splitBasicBlock( - ir_builder->GetInsertPoint(), - AsStringRef(tensorflow::strings::StrCat(name, "-after"))); + b->GetInsertPoint(), AsStringRef(absl::StrCat(name, "-after"))); } // Our basic block should now end with an unconditional branch. Remove it; // we're going to replace it with a conditional branch. if_data.if_block->getTerminator()->eraseFromParent(); - ir_builder->SetInsertPoint(if_data.if_block); - ir_builder->CreateCondBr( - condition, if_data.true_block, - emit_else ? if_data.false_block : if_data.after_block); + b->SetInsertPoint(if_data.if_block); + b->CreateCondBr(condition, if_data.true_block, + emit_else ? 
if_data.false_block : if_data.after_block); - ir_builder->SetInsertPoint(if_data.true_block); - ir_builder->CreateBr(if_data.after_block); + b->SetInsertPoint(if_data.true_block); + b->CreateBr(if_data.after_block); if (emit_else) { - ir_builder->SetInsertPoint(if_data.false_block); - ir_builder->CreateBr(if_data.after_block); + b->SetInsertPoint(if_data.false_block); + b->CreateBr(if_data.after_block); } - ir_builder->SetInsertPoint(if_data.after_block, - if_data.after_block->getFirstInsertionPt()); + b->SetInsertPoint(if_data.after_block, + if_data.after_block->getFirstInsertionPt()); return if_data; } llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate, llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder) { + llvm::IRBuilder<>* b) { llvm::Value* comparison_result; if (lhs_value->getType()->isIntegerTy()) { - comparison_result = ir_builder->CreateICmp(predicate, lhs_value, rhs_value); + comparison_result = b->CreateICmp(predicate, lhs_value, rhs_value); } else { - comparison_result = ir_builder->CreateFCmp(predicate, lhs_value, rhs_value); + comparison_result = b->CreateFCmp(predicate, lhs_value, rhs_value); } // comparison_result is i1, but the NVPTX codegen incorrectly lowers i1 // arrays. So we extend it to i8 so that it's addressable. 
- return ir_builder->CreateZExt( - comparison_result, - llvm_ir::PrimitiveTypeToIrType(PRED, ModuleFromIRBuilder(ir_builder))); + return b->CreateZExt(comparison_result, llvm_ir::PrimitiveTypeToIrType( + PRED, ModuleFromIRBuilder(b))); } // Internal helper that is called from emitted code to log an int64 value with a @@ -482,17 +359,14 @@ static void LogS64(const char* tag, int64 value) { LOG(INFO) << tag << " (int64): " << value; } -void EmitLogging(const char* tag, llvm::Value* value, - llvm::IRBuilder<>* ir_builder) { +void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b) { llvm::FunctionType* log_function_type = llvm::FunctionType::get( - ir_builder->getVoidTy(), - {ir_builder->getInt64Ty(), ir_builder->getInt64Ty()}, /*isVarArg=*/false); - ir_builder->CreateCall( + b->getVoidTy(), {b->getInt64Ty(), b->getInt64Ty()}, /*isVarArg=*/false); + b->CreateCall( log_function_type, - ir_builder->CreateIntToPtr( - ir_builder->getInt64(tensorflow::bit_cast(&LogS64)), - log_function_type->getPointerTo()), - {ir_builder->getInt64(tensorflow::bit_cast(tag)), value}); + b->CreateIntToPtr(b->getInt64(tensorflow::bit_cast(&LogS64)), + log_function_type->getPointerTo()), + {b->getInt64(tensorflow::bit_cast(tag)), value}); } void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) { @@ -538,14 +412,14 @@ string IrName(string a) { return a; } -string IrName(tensorflow::StringPiece a, tensorflow::StringPiece b) { +string IrName(absl::string_view a, absl::string_view b) { if (!a.empty() && !b.empty()) { - return IrName(tensorflow::strings::StrCat(a, ".", b)); + return IrName(absl::StrCat(a, ".", b)); } - return IrName(tensorflow::strings::StrCat(a, b)); + return IrName(absl::StrCat(a, b)); } -string IrName(const HloInstruction* a, tensorflow::StringPiece b) { +string IrName(const HloInstruction* a, absl::string_view b) { return IrName(a->name(), b); } @@ -681,7 +555,7 @@ std::map MergeMetadata( return result; } -static string 
GetProcessUniqueIrFileName(tensorflow::StringPiece prefix) { +static string GetProcessUniqueIrFileName(absl::string_view prefix) { static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); static NameUniquer* uniquer = new NameUniquer(/*separator=*/"-"); @@ -709,18 +583,16 @@ Status DumpIRToDirectory(const string& directory_name, // XlaJitCompiledCpuFunction::Compile. Avoid overwriting IR files previously // dumped from the same process in such cases. string unique_and_safe_file_name = GetProcessUniqueIrFileName( - tensorflow::strings::StrCat("ir-", SanitizeFileName(hlo_module_name), "-", - optimized ? "with" : "no", "-opt")); + absl::StrCat("ir-", SanitizeFileName(hlo_module_name), "-", + optimized ? "with" : "no", "-opt")); string ir_file_name = tensorflow::io::JoinPath( - directory_name, - tensorflow::strings::StrCat(unique_and_safe_file_name, ".ll")); + directory_name, absl::StrCat(unique_and_safe_file_name, ".ll")); // For some models the embedded constants can be huge, so also dump the module // with the constants stripped to get IR that is easier to manipulate. 
string ir_no_constant_initializers_file_name = tensorflow::io::JoinPath( - directory_name, - tensorflow::strings::StrCat(unique_and_safe_file_name, "-noconst.ll")); + directory_name, absl::StrCat(unique_and_safe_file_name, "-noconst.ll")); TF_RETURN_IF_ERROR(CreateAndWriteStringToFile( directory_name, ir_file_name, DumpModuleToString(llvm_module))); @@ -732,8 +604,7 @@ Status DumpIRToDirectory(const string& directory_name, llvm::Function* CreateFunction(llvm::FunctionType* function_type, llvm::GlobalValue::LinkageTypes linkage, bool enable_fast_math, bool optimize_for_size, - tensorflow::StringPiece name, - llvm::Module* module) { + absl::string_view name, llvm::Module* module) { llvm::Function* function = llvm::Function::Create(function_type, linkage, AsStringRef(name), module); function->setCallingConv(llvm::CallingConv::C); @@ -763,7 +634,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) { fake_argv_storage.push_back(""); for (const auto& it : options) { // Skip options the XLA backend itself consumes. 
- if (!tensorflow::str_util::StartsWith(it.first, "xla_")) { + if (!absl::StartsWith(it.first, "xla_")) { if (it.second.empty()) { fake_argv_storage.push_back(it.first); } else { @@ -782,5 +653,56 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) { } } +std::pair UMulLowHigh32(llvm::IRBuilder<>* b, + llvm::Value* src0, + llvm::Value* src1) { + CHECK_EQ(src0->getType()->getPrimitiveSizeInBits(), 32); + CHECK_EQ(src1->getType()->getPrimitiveSizeInBits(), 32); + llvm::Type* int64_ty = b->getInt64Ty(); + src0 = b->CreateZExt(src0, int64_ty); + src1 = b->CreateZExt(src1, int64_ty); + return SplitInt64ToInt32s(b, b->CreateMul(src0, src1)); +} + +std::pair SplitInt64ToInt32s( + llvm::IRBuilder<>* b, llvm::Value* value_64bits) { + CHECK_EQ(value_64bits->getType()->getPrimitiveSizeInBits(), 64); + llvm::Type* int32_ty = b->getInt32Ty(); + llvm::Value* low_32bits = b->CreateTrunc(value_64bits, int32_ty); + llvm::Value* high_32bits = + b->CreateTrunc(b->CreateLShr(value_64bits, 32), int32_ty); + return std::make_pair(low_32bits, high_32bits); +} + +llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState( + llvm::Module* module, llvm::IRBuilder<>* b) { + static const char* kPhiloxRngStateVariableName = "philox_rng_state"; + llvm::GlobalVariable* state_ptr = + module->getNamedGlobal(kPhiloxRngStateVariableName); + if (!state_ptr) { + state_ptr = new llvm::GlobalVariable( + /*M=*/*module, + /*Ty=*/b->getInt64Ty(), + /*isConstant=*/false, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/b->getInt64(0), + /*Name=*/kPhiloxRngStateVariableName); + } + return state_ptr; +} + +void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module, + llvm::IRBuilder<>* builder) { + llvm::GlobalVariable* state_ptr = + GetOrCreateVariableForPhiloxRngState(module, builder); + llvm::Value* state_value_old = builder->CreateLoad(state_ptr, "load_state"); + // If the 64-bit value overflows, we use the wraparound value. 
This should + // be fine in practice as we only add one to the value each time when a RNG is + // executed. + llvm::Value* state_value_new = builder->CreateAdd( + state_value_old, builder->getInt64(value), "inc_state"); + builder->CreateStore(state_value_new, state_ptr); +} + } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 4a10ec466dae6fdb56546fb8d8b353dcff6a5b8d..f59baff263fe7184c6b0821c9dbd9eee205586a6 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" @@ -27,13 +29,11 @@ limitations under the License. #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/raw_ostream.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" namespace llvm { @@ -47,11 +47,11 @@ namespace llvm_ir { // Convert a std::string (used by LLVM's interfaces) to string. string AsString(const std::string& str); -// Convert a tensorflow::StringPiece to a llvm::StringRef. Note: both -// tensorflow::StringPiece and llvm::StringRef are non-owning pointers into a +// Convert a absl::string_view to a llvm::StringRef. Note: both +// absl::string_view and llvm::StringRef are non-owning pointers into a // string in memory. 
This method is used to feed strings to LLVM // & Clang APIs that expect llvm::StringRef. -llvm::StringRef AsStringRef(tensorflow::StringPiece str); +llvm::StringRef AsStringRef(absl::string_view str); template llvm::ArrayRef AsArrayRef(const std::vector& vec) { @@ -59,7 +59,7 @@ llvm::ArrayRef AsArrayRef(const std::vector& vec) { } template -llvm::ArrayRef AsArrayRef(const tensorflow::gtl::ArraySlice& slice) { +llvm::ArrayRef AsArrayRef(const absl::Span& slice) { return llvm::ArrayRef(slice.data(), slice.size()); } @@ -88,8 +88,8 @@ string DumpModuleToString(const llvm::Module& module); // - removing all '%'s. // string IrName(string a); -string IrName(tensorflow::StringPiece a, tensorflow::StringPiece b); -string IrName(const HloInstruction* a, tensorflow::StringPiece b = ""); +string IrName(absl::string_view a, absl::string_view b); +string IrName(const HloInstruction* a, absl::string_view b = ""); // Removes special characters from a function name. // @@ -101,30 +101,29 @@ string SanitizeFunctionName(string function_name); // intrinsics (for example, "minnum") must include a type in overloaded_types // for each overloaded type. Typically, overloaded intrinsics have only a single // overloaded type. -llvm::Value* EmitCallToIntrinsic( - llvm::Intrinsic::ID intrinsic_id, - tensorflow::gtl::ArraySlice operands, - tensorflow::gtl::ArraySlice overloaded_types, - llvm::IRBuilder<>* ir_builder); +llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id, + absl::Span operands, + absl::Span overloaded_types, + llvm::IRBuilder<>* b); // Emit float max. Emit maxnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); // Emit float min. 
Emit minnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); // Convenience methods for emitting a GEP instruction that indexes into a buffer // (1-dimensional array), equivalent to array[index]. The type is automatically // determined from the element type of the array. The int64 index overload // wraps the index in a i64 llvm::Value. llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); // Returns the LLVM type which represents the given XLA primitive type. llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type, @@ -139,8 +138,9 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module); // Returns a value that represents a pointer to a global string constant that // encodes the shape as a serialized protobuf. -StatusOr EncodeSelfDescribingShapeConstant( - const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder); +StatusOr EncodeSelfDescribingShapeConstant(const Shape& shape, + int32* shape_size, + llvm::IRBuilder<>* b); // Inverses the encoding of a Shape protobuf into an LLVM global variable. // @@ -163,22 +163,24 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, // This can be useful to avoid e.g. executing an alloca every time // through a loop. llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type, - tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, + absl::string_view name, + llvm::IRBuilder<>* b, int alignment = 0); // As EmitAllocaAtFunctionEntry, but allocates element_count entries // instead of a single element. 
-llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount( - llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, int alignment = 0); +llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, + llvm::Value* element_count, + absl::string_view name, + llvm::IRBuilder<>* b, + int alignment = 0); // Creates a basic block with the same context and function as for the // builder. Inserts at the end of the function if insert_before is // null. llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before, - tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder); + absl::string_view name, + llvm::IRBuilder<>* b); // Struct with data on a conditional branch in a diamond shape created // via EmitIfThenElse. @@ -209,14 +211,14 @@ struct LlvmIfData { // Currently the insertion point of the builder must be a well-formed // block with a terminator. If you need to use this for a // non-terminated block, just make the function able to do that too. -LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder, bool emit_else = true); +LlvmIfData EmitIfThenElse(llvm::Value* condition, absl::string_view name, + llvm::IRBuilder<>* b, bool emit_else = true); // Emits a compare operation between "lhs" and "rhs" with the given predicate, // and then converts the result to i8 so that it is addressable. llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate, llvm::Value* lhs, llvm::Value* rhs, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); // Emits a call that logs the given value with the given tag as a prefix. // The provided tag and value are passed to a runtime logging call that is @@ -228,8 +230,7 @@ llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate, // Precondition: value must be an int64. 
// Precondition: tag must be a stable pointer for the lifetime of the generated // program (the constant pointer is burned in to the program). -void EmitLogging(const char* tag, llvm::Value* value, - llvm::IRBuilder<>* ir_builder); +void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b); // Adds alignment metadata to a load instruction using the given alignment. // The alignment refers to the result of the load, not the load itself. @@ -285,13 +286,33 @@ Status DumpIRToDirectory(const string& directory_name, llvm::Function* CreateFunction(llvm::FunctionType* function_type, llvm::GlobalValue::LinkageTypes linkage, bool enable_fast_math, bool optimize_for_size, - tensorflow::StringPiece name, - llvm::Module* module); + absl::string_view name, llvm::Module* module); // Extracts the xla_backend_extra_options from `config` and passes those that // don't start with xla_ to LLVM. void InitializeLLVMCommandLineOptions(const HloModuleConfig& config); +// Zero-extends two 32-bit values to 64 bits, multiplies them, and returns the +// result as a pair of (low 32 bits, high 32 bits). +std::pair UMulLowHigh32(llvm::IRBuilder<>* b, + llvm::Value* src0, + llvm::Value* src1); +// Splits the 64-bit integer value into its high and low 32 bits. +std::pair SplitInt64ToInt32s( + llvm::IRBuilder<>* b, llvm::Value* value_64bits); + +// Checks whether a global variable is already created to represent a +// state passed between RNG calls implemented with Philox algorithm. If not, +// creates such a variable. Returns the global variable. +llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState( + llvm::Module* module, llvm::IRBuilder<>* b); + +// Adds a value to the global state variable each time when a RNG hlo is +// executed. The value of this global state variable is added to the seed +// of the Philox RNG algorithm so that calling the same RNG Hlo multiple times +// should rarely produce the same result. 
+void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module, + llvm::IRBuilder<>* b); } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index 0728ccfff7b85e3751f33bc5272a5f22d4e5411a..0dc120e0b0df47f261435f490a8459b49d989b53 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -18,13 +18,13 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -33,26 +33,24 @@ namespace xla { namespace llvm_ir { LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, - llvm::IRBuilder<>* ir_builder) - : body_emitter_(body_emitter), shape_(shape), ir_builder_(ir_builder) {} + llvm::IRBuilder<>* b) + : body_emitter_(body_emitter), shape_(shape), b_(b) {} LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, - const IrArray& target_array, - llvm::IRBuilder<>* ir_builder) + const IrArray& target_array, llvm::IRBuilder<>* b) : body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status { // Convert target_element_generator to a BodyEmitter. 
TF_ASSIGN_OR_RETURN(llvm::Value * target_element, target_element_generator(array_index)); - target_array.EmitWriteArrayElement(array_index, target_element, - ir_builder); + target_array.EmitWriteArrayElement(array_index, target_element, b); return Status::OK(); }), shape_(target_array.GetShape()), - ir_builder_(ir_builder) {} + b_(b) {} static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion( const ElementGenerator& target_element_generator, - const std::vector& target_arrays, llvm::IRBuilder<>* ir_builder) { + const std::vector& target_arrays, llvm::IRBuilder<>* b) { return [=](const llvm_ir::IrArray::Index array_index) { TF_ASSIGN_OR_RETURN(llvm::Value * target_element, target_element_generator(array_index)); @@ -64,57 +62,58 @@ static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion( for (int64 i = 0; i < target_arrays.size(); ++i) { target_arrays[i].EmitWriteArrayElement( - array_index, ir_builder->CreateExtractValue(target_element, i), - ir_builder); + array_index, b->CreateExtractValue(target_element, i), b); } return Status::OK(); }; } LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, - tensorflow::gtl::ArraySlice target_arrays, - llvm::IRBuilder<>* ir_builder) + absl::Span target_arrays, + llvm::IRBuilder<>* b) : body_emitter_(MakeBodyEmitterForMultiOutputFusion( target_element_generator, - std::vector(target_arrays.begin(), target_arrays.end()), - ir_builder)), + std::vector(target_arrays.begin(), target_arrays.end()), b)), shape_(target_arrays[0].GetShape()), - ir_builder_(ir_builder) { + b_(b) { // Sanity check: In multi-output fusion, all shapes produced must have the // same dimensions. 
for (const IrArray& array : target_arrays) { - CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())); + CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())) + << ": '" << shape_.ShortDebugString() << "' does not match '" + << array.GetShape().ShortDebugString() << "'"; } } std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name) { + absl::string_view loop_name, llvm::Type* index_type) { + CHECK_NE(index_type, nullptr); if (ShapeUtil::IsScalar(shape_)) { // No loop needed, so set exit_bb_ to nullptr. exit_bb_ = nullptr; - return {IrArray::Index()}; + return {IrArray::Index(index_type)}; } // Create loop nest with one for-loop for each dimension of the target shape. // Loops are added from outermost to innermost order with the ForLoopNest // class so emit loops in order from most-major dimension down to most-minor // dimension (of the target shape). - ForLoopNest loop_nest(loop_name, ir_builder_); - IrArray::Index array_index(shape_.dimensions_size()); + ForLoopNest loop_nest(loop_name, b_); + IrArray::Index array_index(index_type, shape_.dimensions_size()); for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) { int64 dimension = LayoutUtil::Major(shape_.layout(), i); std::unique_ptr loop = loop_nest.AddLoop( /*start_index=*/0, /*end_index=*/shape_.dimensions(dimension), - /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension)); + /*suffix=*/absl::StrFormat("dim.%d", dimension)); array_index[dimension] = loop->GetIndVarValue(); } // Set IR builder insertion point to the loop body basic block of the // innermost loop. llvm::BasicBlock* innermost_body_bb = loop_nest.GetInnerLoopBodyBasicBlock(); - ir_builder_->SetInsertPoint(innermost_body_bb, - innermost_body_bb->getFirstInsertionPt()); + b_->SetInsertPoint(innermost_body_bb, + innermost_body_bb->getFirstInsertionPt()); // Set exit_bb_ to the exit block of the loop nest. 
exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock(); @@ -123,16 +122,21 @@ std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( return {array_index}; } -Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) { +Status LoopEmitter::EmitLoop(absl::string_view loop_name, + llvm::Type* index_type) { + if (index_type == nullptr) { + index_type = b_->getInt64Ty(); + } + for (const IrArray::Index& array_index : - EmitIndexAndSetExitBasicBlock(loop_name)) { + EmitIndexAndSetExitBasicBlock(loop_name, index_type)) { TF_RETURN_IF_ERROR(body_emitter_(array_index)); } - // Set the insertion point of ir_builder_ to the loop exit, so that + // Set the insertion point of b_ to the loop exit, so that // code emitted for later instructions will be correctly placed. if (exit_bb_ != nullptr) { - ir_builder_->SetInsertPoint(exit_bb_); + b_->SetInsertPoint(exit_bb_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index b70d28ecd3033eb26629718e50ce48f39b162273..a537c00066b0a68404b142e91283510092b46e2d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -41,11 +41,11 @@ class LoopEmitter { using BodyEmitter = std::function; LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, - llvm::IRBuilder<>* ir_builder); + llvm::IRBuilder<>* b); // Constructs a LoopEmitter from an element generator that generates each // element of the given target array. LoopEmitter(const ElementGenerator& target_element_generator, - const IrArray& target_array, llvm::IRBuilder<>* ir_builder); + const IrArray& target_array, llvm::IRBuilder<>* b); // Constructs a LoopEmitter that emits one element into each of N separate // arrays on each iteration of the loop. @@ -53,8 +53,7 @@ class LoopEmitter { // This is used for multi-output fusion. target_element_generator must // produce an LLVM struct with N elements. 
LoopEmitter(const ElementGenerator& target_element_generator, - tensorflow::gtl::ArraySlice target_arrays, - llvm::IRBuilder<>* ir_builder); + absl::Span target_arrays, llvm::IRBuilder<>* b); LoopEmitter(const LoopEmitter&) = delete; LoopEmitter& operator=(const LoopEmitter&) = delete; @@ -65,13 +64,15 @@ class LoopEmitter { // specifies the element, will return multiple indices if the loop is // unrolled. std::vector EmitIndexAndSetExitBasicBlock() { - return EmitIndexAndSetExitBasicBlock(/*loop_name=*/""); + return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"", b_->getInt64Ty()); } + virtual std::vector EmitIndexAndSetExitBasicBlock( - tensorflow::StringPiece loop_name); + absl::string_view loop_name, llvm::Type* index_type); // Emits a complete loop nest for every element in the given shape. - Status EmitLoop(tensorflow::StringPiece loop_name = ""); + Status EmitLoop(absl::string_view loop_name = "", + llvm::Type* index_type = nullptr); protected: // An IR emitter that generates the loop body. @@ -84,7 +85,7 @@ class LoopEmitter { // scalar, no loops are emitted and exit_bb_ is nullptr in that case. llvm::BasicBlock* exit_bb_; - llvm::IRBuilder<>* ir_builder_; + llvm::IRBuilder<>* b_; }; } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e115cdabf4b290617700276dba8f2e5648a7c07 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" + +namespace xla { +namespace llvm_ir { + +llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input) { + llvm::Type* type = input->getType(); + + // Clamp the input to [-9, 9]. + llvm::Value* input_clamped = llvm_ir::EmitFloatMin( + llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b), + llvm::ConstantFP::get(type, 9.0), b); + + static constexpr std::array numerator_coeffs{ + -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, + 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, + 4.89352455891786e-03f}; + + static constexpr std::array denominator_coeffs{ + 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, + 4.89352518554385e-03f}; + + llvm::Value* input_squared = b->CreateFMul(input_clamped, input_clamped); + llvm::Value* numerator = llvm::ConstantFP::get(type, numerator_coeffs[0]); + for (int i = 1; i < numerator_coeffs.size(); i++) { + numerator = b->CreateFAdd(b->CreateFMul(input_squared, numerator), + llvm::ConstantFP::get(type, numerator_coeffs[i])); + } + + numerator = b->CreateFMul(input_clamped, numerator); + + llvm::Value* denominator = llvm::ConstantFP::get(type, denominator_coeffs[0]); + for (int i = 1; i < denominator_coeffs.size(); i++) { + denominator = + b->CreateFAdd(b->CreateFMul(input_squared, denominator), + llvm::ConstantFP::get(type, denominator_coeffs[i])); + } + 
+ return b->CreateFDiv(numerator, denominator); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.h b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..6c8bc3a076367eae2f1829966be2872e5f258178 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_ + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Value.h" + +namespace xla { +namespace llvm_ir { + +// Emits an approximation of tanh. The implementation uses the same rational +// interpolant as implemented in Eigen3. +llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input); + +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.cc b/tensorflow/compiler/xla/service/llvm_ir/ops.cc deleted file mode 100644 index dacc54742c0897bbd92315f1e33a484aae56bb7f..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/llvm_ir/ops.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/llvm_ir/ops.h" -#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" -#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" -#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" -#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" - -namespace xla { -namespace llvm_ir { - -bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, - const BufferAssignment& assignment) { - CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode()); - const HloInstruction* operand = dynamic_update_slice->operand(0); - return assignment.HasTopLevelAllocation(dynamic_update_slice) && - assignment.HasTopLevelAllocation(operand) && - assignment.SharesTopLevelSlice(dynamic_update_slice, operand); -} - -// Shared implementation of EmitDynamicUpdateSliceInPlace and -// EmitFusedDynamicUpdateSliceInPlace. -// -// Emits a sequential loop if launch_dimensions is null. 
-static Status EmitDynamicUpdateSliceInPlaceImpl( - const Shape& update_shape, const ElementGenerator& start_indices_generator, - ElementGenerator update_array_generator, const IrArray& output_array, - const gpu::LaunchDimensions* launch_dimensions, - tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) { - const Shape& output_shape = output_array.GetShape(); - - // Read start indices from start_indices_generator. - const int64 rank = ShapeUtil::Rank(output_shape); - IrArray::Index start_index(rank); - for (int64 i = 0; i < rank; ++i) { - IrArray::Index dim_index({ir_builder->getInt64(i)}); - TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index)); - llvm::Value* output_dim_size = llvm::ConstantInt::get( - start_index[i]->getType(), output_shape.dimensions(i)); - llvm::Value* update_dim_size = llvm::ConstantInt::get( - start_index[i]->getType(), update_shape.dimensions(i)); - - // Clamp the start index so that the update region fits in the operand. - // start_index = clamp(start_index, 0, output_dim_size - update_dim_size) - - // TODO(b/74360564): This is implementation defined behavior, but is - // currently respected by all implementations. Change this if we ever decide - // to oficially document different behavior. - llvm::Value* max_bound = - ir_builder->CreateSub(output_dim_size, update_dim_size); - llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0); - start_index[i] = ir_builder->CreateSelect( - ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SGE, zero, start_index[i]), - zero, start_index[i]); - - start_index[i] = ir_builder->CreateSelect( - ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SLE, max_bound, - start_index[i]), - max_bound, start_index[i]); - } - - auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status { - // Calculate output_index, where we'll write the value from update. 
For - // each dimension, - // - // output_index[dim] = start_index[dim] + update_index[dim] - // - IrArray::Index output_index(rank); - for (int64 i = 0; i < rank; ++i) { - llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast( - start_index[i], update_index[i]->getType()); - output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]); - } - - // Do output[output_index] = update[update_index]. - TF_ASSIGN_OR_RETURN(llvm::Value * update_data, - update_array_generator(update_index)); - output_array.EmitWriteArrayElement(output_index, update_data, ir_builder); - return Status::OK(); - }; - - if (launch_dimensions != nullptr) { - return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape, - *launch_dimensions, ir_builder) - .EmitLoop(name); - } - return LoopEmitter(loop_body_emitter, update_shape, ir_builder) - .EmitLoop(name); -} - -Status EmitDynamicUpdateSliceInPlace( - tensorflow::gtl::ArraySlice operand_arrays, - const IrArray& output_array, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder) { - VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name; - - // No need to use operand_arrays[0], the input array of the - // dynamic-update-slice, because we know it aliases the op's output. 
- IrArray update_array = operand_arrays[1]; - IrArray start_indices_array = operand_arrays[2]; - Shape output_shape = output_array.GetShape(); - Shape update_shape = update_array.GetShape(); - - ElementGenerator start_indices_generator = [&](const IrArray::Index& index) { - return start_indices_array.EmitReadArrayElement(index, ir_builder); - }; - ElementGenerator update_array_generator = [&](const IrArray::Index& index) { - return update_array.EmitReadArrayElement(index, ir_builder); - }; - - return EmitDynamicUpdateSliceInPlaceImpl( - update_shape, start_indices_generator, update_array_generator, - output_array, /*launch_dimensions=*/nullptr, name, ir_builder); -} - -// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and -// EmitParallelFusedDynamicUpdateSliceInPlace. -// -// Emits a sequential loop if launch_dimensions is null. -static Status EmitFusedDynamicUpdateSliceInPlaceImpl( - HloInstruction* fusion, - tensorflow::gtl::ArraySlice fusion_operand_arrays, - const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, - const gpu::LaunchDimensions* launch_dimensions, - llvm::IRBuilder<>* ir_builder) { - CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); - VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for " - << fusion->ToShortString(); - - auto* dynamic_update_slice = fusion->fused_expression_root(); - - const auto* update = dynamic_update_slice->operand(1); - const auto* start_indices = dynamic_update_slice->operand(2); - Shape update_shape = update->shape(); - - // Our in-place dynamic-update-slice implementation emits a loop over - // update_shape. To emit a cache-friendly loop, we need to know that shape's - // layout. - // - // update_shape is inside a fusion node -- it's never materialized in memory - // and thus doesn't have a layout. In this case we use the layout of the - // fusion node for iteration, since that corresponds to the order in memory of - // the buffer we'll be writing to. 
- // - // (This isn't necessarily optimal; in some cases it might be faster to peek - // through the chain of ops that gives us the update operand and use the - // layout of its source buffer(s). But this is no worse than we do with - // fusion elsewhere.) - TF_RETURN_IF_ERROR( - LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape)); - - // Create element generators for update and start_indices. - FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter); - TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter)); - ElementGenerator update_array_generator = fused_emitter.GetGenerator(update); - ElementGenerator start_indices_generator = - fused_emitter.GetGenerator(start_indices); - - return EmitDynamicUpdateSliceInPlaceImpl( - update_shape, start_indices_generator, update_array_generator, - fusion_output_array, launch_dimensions, IrName(fusion), ir_builder); -} - -Status EmitFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, - tensorflow::gtl::ArraySlice fusion_operand_arrays, - const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, - llvm::IRBuilder<>* ir_builder) { - return EmitFusedDynamicUpdateSliceInPlaceImpl( - fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter, - /*launch_dimensions=*/nullptr, ir_builder); -} - -Status EmitParallelFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, - tensorflow::gtl::ArraySlice fusion_operand_arrays, - const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, - const gpu::LaunchDimensions& launch_dimensions, - llvm::IRBuilder<>* ir_builder) { - return EmitFusedDynamicUpdateSliceInPlaceImpl( - fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter, - &launch_dimensions, ir_builder); -} - -} // namespace llvm_ir -} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.h b/tensorflow/compiler/xla/service/llvm_ir/ops.h deleted file mode 100644 index 
175b081e84d31779b15560cb0998011fe046ca01..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/llvm_ir/ops.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_ - -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" -#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" - -// Utilities related to emitting LLVM IR for various HLO ops. - -namespace xla { -namespace llvm_ir { - -// Checks if we can emit code for the given DynamicUpdateSlice node that updates -// its input in place. Returns true if the dynamic-update-slice's -// array-to-be-updated and output share the same BufferAllocation::Slice. -// -// dynamic_update_slice must be a DynamicUpdateSlice op. -bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, - const BufferAssignment& assignment); - -// Checks if the given fusion node is amenable to being implemented by -// EmitFusedDynamicUpdateSliceInPlace. 
-inline bool CanEmitFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, const BufferAssignment& assignment) { - CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); - HloInstruction* fused_root = fusion->fused_expression_root(); - if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice || - fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) { - return false; - } - // Walk DynamicUpdateSlice operand(0) to fused parameter and get its - // associated operand. See if it shares an allocation with this operand. - HloInstruction* fusion_operand; - ShapeIndex index; - std::tie(fusion_operand, index) = - fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex(); - if (fusion_operand->opcode() != HloOpcode::kParameter) { - return false; - } - auto* operand = fusion->operand(fusion_operand->parameter_number()); - return assignment.HasAllocationAt(operand, index) && - assignment.HasAllocationAt(fusion, {}) && - assignment.SharesSliceAtIndex(fusion, {}, operand, index); -} - -// Emits IR for running the given dynamic-update-slice op in-place -- that is, -// where the input and output buffers share the same slice, so we can simply -// modify the input/output buffer without touching any of the other elements. -Status EmitDynamicUpdateSliceInPlace( - tensorflow::gtl::ArraySlice operand_arrays, - const IrArray& output_array, tensorflow::StringPiece name, - llvm::IRBuilder<>* ir_builder); - -// Given a loop-fusion node whose root is a dynamic-update-slice op whose -// array-to-be-updated and output share the same buffer slice, emits -// (sequential) code for a fusion node that does the dynamic-update-slice in -// place. -Status EmitFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, - tensorflow::gtl::ArraySlice fusion_operand_arrays, - const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, - llvm::IRBuilder<>* ir_builder); - -// Same as EmitFusedDynamicUpdateSliceInPlace, except emits a parallel loop with -// the given launch dimensions. 
-Status EmitParallelFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, - tensorflow::gtl::ArraySlice fusion_operand_arrays, - const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter, - const gpu::LaunchDimensions& launch_dimensions, - llvm::IRBuilder<>* ir_builder); - -} // namespace llvm_ir -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..944c79580c133906cd431722fd6b29e6aee5f918 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -0,0 +1,186 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" + +// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace llvm_ir { + +namespace { +// Adds the inner comparison loop where we compare elements pointed to by +// 'keys_index' and 'compare_keys_index'. 
+void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index, + const IrArray::Index& compare_keys_index, + const IrArray& keys_array, + const absl::optional& values_array, + llvm::IRBuilder<>* b) { + // if (is_smaller_index && + // compare_keys[dimension_to_sort] < dimension_to_sort_bound) + llvm::Value* is_smaller_index = b->CreateICmpSLT( + keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]); + int64 dimension_to_sort_bound = + keys_array.GetShape().dimensions(dimension_to_sort); + auto if_data = EmitIfThenElse( + b->CreateAnd(is_smaller_index, + b->CreateICmpSLT(compare_keys_index[dimension_to_sort], + keys_index.GetConstantWithIndexType( + dimension_to_sort_bound))), + "smaller_comparison_index", b, /*emit_else=*/false); + SetToFirstInsertPoint(if_data.true_block, b); + auto key1 = keys_array.EmitReadArrayElement(keys_index, b); + auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b); + auto compare_key1 = key1; + auto compare_key2 = key2; + auto key_type = keys_array.GetShape().element_type(); + bool is_signed_comparison = true; + if (primitive_util::IsFloatingPointType(key_type)) { + // We would like a total order of floating point numbers so that the sort + // has a predictable behavior in the presence of NaNs. Rather than using + // floating point comparison, we use the following trick: + // If f is a float, and + // x = bit_cast(f); + // y = x < 0 ? 0x7FFFFFFF - x : x; + // then y is ordered as an int32 such that finite values have the obvious + // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning + // and end of the ordering. 
+ auto k = b->getInt(llvm::APInt::getSignedMaxValue( + key1->getType()->getPrimitiveSizeInBits())); + auto comparison_type = k->getType(); + auto zero = llvm::ConstantInt::get(comparison_type, 0); + auto maybe_flip = [&](llvm::Value* v) { + return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero), + b->CreateSub(k, v), v); + }; + compare_key1 = b->CreateBitCast(key1, comparison_type); + compare_key2 = b->CreateBitCast(key2, comparison_type); + compare_key1 = maybe_flip(compare_key1); + compare_key2 = maybe_flip(compare_key2); + } else if (!primitive_util::IsSignedIntegralType(key_type)) { + is_signed_comparison = false; + } + auto comparison = + b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT + : llvm::ICmpInst::ICMP_ULT, + compare_key2, compare_key1); + // If key2 < key1 + auto if_smaller_data = + EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false); + SetToFirstInsertPoint(if_smaller_data.true_block, b); + // Swap key1 with key2. + keys_array.EmitWriteArrayElement(keys_index, key2, b); + keys_array.EmitWriteArrayElement(compare_keys_index, key1, b); + if (values_array.has_value()) { + // Also swap the values. + auto value1 = values_array.value().EmitReadArrayElement(keys_index, b); + auto value2 = + values_array.value().EmitReadArrayElement(compare_keys_index, b); + values_array.value().EmitWriteArrayElement(keys_index, value2, b); + values_array.value().EmitWriteArrayElement(compare_keys_index, value1, b); + } +} +} // namespace + +Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, + const absl::optional& values_array, + absl::string_view name, llvm::Value* xor_mask, + llvm::IRBuilder<>* b, + const gpu::LaunchDimensions* launch_dimensions) { + const Shape& keys_shape = keys_array.GetShape(); + + // Create loop nests which loop through the operand dimensions. The sort + // dimension is handled in the innermost loop which performs the sorting. 
+ ForLoopNest loop_nest(name, b); + IrArray::Index keys_index = + loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys"); + if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) { + SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b); + } + + // 'compare_keys_index' is the index of the element that 'keys_index' should + // be compared to. + IrArray::Index compare_keys_index(keys_index.GetType()); + for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) { + if (dimension != dimension_to_sort) { + compare_keys_index.push_back(keys_index[dimension]); + } else { + compare_keys_index.push_back(nullptr); + } + } + + // Naive C++ code for the inner compare loop: + // + // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { + // int64 j = i ^ xor_mask; + // if (i < j && j < dimension_to_sort_bound) { + // int64 min_key = std::min(keys[i], keys[j]); + // keys[j] = std::max(keys[i], keys[j]); + // keys[i] = min_key; + // } + // } + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + + int64 dimension_to_sort_bound = + keys_array.GetShape().dimensions(dimension_to_sort); + Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(), + {dimension_to_sort_bound}); + auto compare_loop_body_emitter = + [&](const IrArray::Index& compare_index) -> Status { + keys_index[dimension_to_sort] = compare_index[0]; + compare_keys_index[dimension_to_sort] = + b->CreateXor(compare_index[0], xor_mask); + EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, + keys_array, values_array, b); + return Status::OK(); + }; + if (launch_dimensions != nullptr) { + TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter, + compare_shape, + *launch_dimensions, b) + .EmitLoop(name)); + } else { + TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b) + .EmitLoop(name)); + } + + // Set the IR builder insert point to the exit basic block of 
the outer most + // loop. This ensures later instructions are inserted after this loop nest. + b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); + + return Status::OK(); +} + +} // namespace llvm_ir +} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h new file mode 100644 index 0000000000000000000000000000000000000000..527ed10374ce9482045a8459e38fd041e0e83001 --- /dev/null +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace llvm_ir { +// Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort' +// dimension of 'keys_array'. All other dimensions are kept as-is. This +// implements the inner loop of BitonicSort. 
If 'launch_dimensions' is nullptr, +// the inner compare loop will not be parallelized. +Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, + const absl::optional& values_array, + absl::string_view name, llvm::Value* xor_mask, + llvm::IRBuilder<>* b, + const gpu::LaunchDimensions* launch_dimensions); +} // namespace llvm_ir +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc index 5fc08aab916e377b245b6221108956c06da70767..7d49b8d6c2c902ee38d72f72b3da9d190cc65bf0 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc @@ -31,12 +31,12 @@ namespace llvm_ir { void EmitTupleSelect(const IrArray& select, const IrArray& pred, llvm::Value* on_true, llvm::Value* on_false, - llvm::IRBuilder<>* ir_builder, llvm::Module* module) { + llvm::IRBuilder<>* b, llvm::Module* module) { CHECK(ShapeUtil::IsScalar(pred.GetShape())); llvm::LoadInst* pred_value = - ir_builder->CreateLoad(pred.GetBasePointer(), "load_predicate_value"); - llvm::Value* pred_cond = ir_builder->CreateICmpNE( + b->CreateLoad(pred.GetBasePointer(), "load_predicate_value"); + llvm::Value* pred_cond = b->CreateICmpNE( pred_value, llvm::ConstantInt::get(PrimitiveTypeToIrType(PRED, module), 0), "boolean_predicate"); @@ -46,47 +46,41 @@ void EmitTupleSelect(const IrArray& select, const IrArray& pred, VLOG(2) << " pred_cond: " << DumpToString(*pred_cond); for (int i = 0; i < ShapeUtil::TupleElementCount(select.GetShape()); ++i) { - llvm::Value* const element_index[] = {ir_builder->getInt64(0), - ir_builder->getInt64(i)}; + llvm::Value* const element_index[] = {b->getInt64(0), b->getInt64(i)}; llvm::Value* on_true_element_address = - ir_builder->CreateInBoundsGEP(on_true, element_index); - llvm::Value* on_true_element = ir_builder->CreateLoad( + b->CreateInBoundsGEP(on_true, 
element_index); + llvm::Value* on_true_element = b->CreateLoad( on_true_element_address, "on_true_element_" + llvm::Twine(i)); llvm::Value* on_false_element_address = - ir_builder->CreateInBoundsGEP(on_false, element_index); - llvm::Value* on_false_element = ir_builder->CreateLoad( + b->CreateInBoundsGEP(on_false, element_index); + llvm::Value* on_false_element = b->CreateLoad( on_false_element_address, "on_false_element_" + llvm::Twine(i)); llvm::Value* output_element_address = - ir_builder->CreateInBoundsGEP(select.GetBasePointer(), element_index); - ir_builder->CreateStore( - ir_builder->CreateSelect(pred_cond, on_true_element, on_false_element, - "select_output_element_" + llvm::Twine(i)), - output_element_address); + b->CreateInBoundsGEP(select.GetBasePointer(), element_index); + b->CreateStore(b->CreateSelect(pred_cond, on_true_element, on_false_element, + "select_output_element_" + llvm::Twine(i)), + output_element_address); } } -void EmitTuple(const IrArray& tuple, - tensorflow::gtl::ArraySlice operands, - llvm::IRBuilder<>* ir_builder, llvm::Module* module) { +void EmitTuple(const IrArray& tuple, absl::Span operands, + llvm::IRBuilder<>* b, llvm::Module* module) { for (size_t i = 0; i < operands.size(); ++i) { - auto* store = ir_builder->CreateStore( - ir_builder->CreatePointerCast(operands[i], - PrimitiveTypeToIrType(TUPLE, module)), - ir_builder->CreateInBoundsGEP( - tuple.GetBasePointer(), - {ir_builder->getInt64(0), ir_builder->getInt64(i)})); + auto* store = b->CreateStore( + b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)), + b->CreateInBoundsGEP(tuple.GetBasePointer(), + {b->getInt64(0), b->getInt64(i)})); tuple.AnnotateLoadStoreInstructionWithMetadata(store); } } llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index, int alignment, llvm::Value* operand, - llvm::IRBuilder<>* ir_builder, - llvm::Module* module) { - llvm::Value* element_ptr = ir_builder->CreateInBoundsGEP( - operand, {ir_builder->getInt64(0), 
ir_builder->getInt64(index)}); - llvm::LoadInst* src_buffer = ir_builder->CreateLoad(element_ptr); + llvm::IRBuilder<>* b, llvm::Module* module) { + llvm::Value* element_ptr = + b->CreateInBoundsGEP(operand, {b->getInt64(0), b->getInt64(index)}); + llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr); // Mark the loaded pointer as dereferenceable if we know its shape. if (!ShapeUtil::IsOpaque(target_shape)) { @@ -98,7 +92,7 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index, llvm::Type* element_type = ShapeToIrType(target_shape, module); llvm::Value* ret_val = - ir_builder->CreateBitCast(src_buffer, element_type->getPointerTo()); + b->CreateBitCast(src_buffer, element_type->getPointerTo()); return ret_val; } diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h index 352d34ebf839c6c2465abade7c3d3eb3b7a34506..887fb613717ef780d6903a3b97bfdf4b735c4f82 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h @@ -16,10 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_TUPLE_OPS_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_TUPLE_OPS_H_ +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" // Utilities for emitting LLVM IR related to HLO tuples. @@ -61,13 +61,12 @@ namespace llvm_ir { // output[i] = pred ? tuple_on_true[i] : tuple_on_false[i] void EmitTupleSelect(const IrArray& select, const IrArray& pred, llvm::Value* on_true, llvm::Value* on_false, - llvm::IRBuilder<>* ir_builder, llvm::Module* module); + llvm::IRBuilder<>* b, llvm::Module* module); // A tuple is an array of pointers, one for each operand. Each pointer points to // the output buffer of its corresponding operand. 
-void EmitTuple(const IrArray& tuple, - tensorflow::gtl::ArraySlice operands, - llvm::IRBuilder<>* ir_builder, llvm::Module* module); +void EmitTuple(const IrArray& tuple, absl::Span operands, + llvm::IRBuilder<>* b, llvm::Module* module); // A tuple is an array of pointers, one for each operand. Each pointer points to // the output buffer of its corresponding operand. A GetTupleElement instruction @@ -75,8 +74,7 @@ void EmitTuple(const IrArray& tuple, // Returns an llvm value representing a pointer to the tuple element buffer. llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index, int alignment, llvm::Value* operand, - llvm::IRBuilder<>* ir_builder, - llvm::Module* module); + llvm::IRBuilder<>* b, llvm::Module* module); } // namespace llvm_ir } // namespace xla diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 0fa4061738612df76c72a18a9353f16bf6a42677..0d0fb7946ae6815905491ca55652d7d0ab278a3c 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -19,27 +19,26 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -75,7 +74,7 @@ namespace { // If the parameter number is invalid for this computation, nullopt is // returned. When the return value has_value(), nullptr will never be // the held value. 
-tensorflow::gtl::optional ParameterMetadata( +absl::optional ParameterMetadata( const XlaComputation& computation, int parameter_number) { for (const HloComputationProto& comp : computation.proto().computations()) { if (comp.id() == computation.proto().entry_computation_id()) { @@ -83,14 +82,14 @@ tensorflow::gtl::optional ParameterMetadata( if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) && instr.parameter_number() == parameter_number) { if (!instr.has_metadata()) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } return &instr.metadata(); } } } } - return tensorflow::gtl::nullopt; + return absl::nullopt; } ExecutionOptions CreateExecutionOptions( @@ -110,6 +109,11 @@ ExecutionOptions CreateExecutionOptions( ->set_xla_dump_optimized_hlo_proto_to( build_options.dump_optimized_hlo_proto_to().value()); } + if (build_options.dump_unoptimized_hlo_proto_to().has_value()) { + execution_options.mutable_debug_options() + ->set_xla_dump_unoptimized_hlo_proto_to( + build_options.dump_unoptimized_hlo_proto_to().value()); + } if (build_options.dump_per_pass_hlo_proto_to().has_value()) { execution_options.mutable_debug_options() ->set_xla_dump_per_pass_hlo_proto_to( @@ -124,78 +128,20 @@ ExecutionOptions CreateExecutionOptions( LayoutUtil::SetToDefaultLayout( execution_options.mutable_shape_with_output_layout()); } - return execution_options; -} - -} // namespace -StatusOr> LocalService::CompileExecutable( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& build_options) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - // Validate incoming layouts. 
- if (argument_layouts.size() != program_shape->parameters_size()) { - return InvalidArgument( - "Invalid number of arguments for computation: expected %d, got %zu.", - program_shape->parameters_size(), argument_layouts.size()); + for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) { + execution_options.mutable_debug_options()->add_xla_disable_hlo_passes( + disabled_pass); } - for (int i = 0; i < argument_layouts.size(); ++i) { - const Shape& argument_shape = *argument_layouts[i]; - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape)); - if (!ShapeUtil::Compatible(argument_shape, program_shape->parameters(i))) { - tensorflow::gtl::optional metadata = - user_computation->ParameterMetadata(i); - auto metadata_string = [&metadata]() -> string { - if (!metadata.has_value()) { - return ""; - } - CHECK(metadata.value() != nullptr); - const OpMetadata& m = *metadata.value(); - if (!m.source_file().empty()) { - return tensorflow::strings::Printf( - " (%s:%d)", m.source_file().c_str(), m.source_line()); - } - return ""; - }; - return InvalidArgument( - "Invalid argument shape for argument %d%s, expected %s, got %s.", i, - metadata_string().c_str(), - ShapeUtil::HumanString(program_shape->parameters(i)).c_str(), - ShapeUtil::HumanString(argument_shape).c_str()); - } - } - if (build_options.result_layout() != nullptr) { - TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout( - *build_options.result_layout(), program_shape->result())); - } - - ExecutionOptions execution_options = - CreateExecutionOptions(build_options, program_shape.get()); - TF_ASSIGN_OR_RETURN(std::unique_ptr module_config, - CreateModuleConfig(*program_shape, argument_layouts, - &execution_options, user_computation)); - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - execute_backend_->stream_executor(build_options.device_ordinal())); - - return BuildExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), executor, - 
build_options.device_allocator()); + return execution_options; } +} // namespace + StatusOr> LocalService::CompileExecutable( const XlaComputation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, + const absl::Span argument_layouts, const ExecutableBuildOptions& build_options) { const HloModuleProto& proto = computation.proto(); TF_RET_CHECK(proto.has_program_shape()); @@ -204,15 +150,16 @@ StatusOr> LocalService::CompileExecutable( // Validate incoming layouts. if (argument_layouts.size() != program_shape.parameters_size()) { return InvalidArgument( - "Invalid number of arguments for computation: expected %d, got %zu.", + "Invalid number of arguments for computation: expected %d, got %u.", program_shape.parameters_size(), argument_layouts.size()); } for (int i = 0; i < argument_layouts.size(); ++i) { const Shape& argument_shape = *argument_layouts[i]; - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape)); + TF_RETURN_IF_ERROR( + ShapeUtil::ValidateShapeWithOptionalLayout(argument_shape)); if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) { - tensorflow::gtl::optional metadata = + absl::optional metadata = ParameterMetadata(computation, /*parameter_number=*/i); auto metadata_string = [&metadata]() -> string { if (!metadata.has_value()) { @@ -221,21 +168,20 @@ StatusOr> LocalService::CompileExecutable( CHECK(metadata.value() != nullptr); const OpMetadata& m = *metadata.value(); if (!m.source_file().empty()) { - return tensorflow::strings::Printf( - " (%s:%d)", m.source_file().c_str(), m.source_line()); + return absl::StrFormat(" (%s:%d)", m.source_file(), m.source_line()); } return ""; }; return InvalidArgument( "Invalid argument shape for argument %d%s, expected %s, got %s.", i, - metadata_string().c_str(), - ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(argument_shape).c_str()); + metadata_string(), + ShapeUtil::HumanString(program_shape.parameters(i)), + 
ShapeUtil::HumanString(argument_shape)); } } if (build_options.result_layout() != nullptr) { - TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout( - *build_options.result_layout(), program_shape.result())); + TF_RETURN_IF_ERROR(ValidateResultShape(*build_options.result_layout(), + program_shape.result())); } ExecutionOptions execution_options = @@ -245,6 +191,9 @@ StatusOr> LocalService::CompileExecutable( std::unique_ptr module_config, CreateModuleConfig(program_shape, argument_layouts, &execution_options)); + VLOG(3) << "Computation Layout: " + << module_config->entry_computation_layout().ToString(); + TF_ASSIGN_OR_RETURN( se::StreamExecutor * executor, execute_backend_->stream_executor(build_options.device_ordinal())); @@ -260,4 +209,15 @@ StatusOr LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) { /*computation_count=*/1); } +StatusOr LocalService::GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number) { + TF_ASSIGN_OR_RETURN(auto buffers, allocation_tracker_.Resolve(data)); + if (replica_number >= buffers.size()) { + return InvalidArgument( + "replica_number %d out of range; must be less than num_replicas = %u.", + replica_number, buffers.size()); + } + return buffers[replica_number]; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 06567cabd6eb28aae53881613cd6beb78e25e222..3b4f0b50832d6d2b64528ffb63eb5c7375396aec 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -18,8 +18,9 @@ limitations under the License. 
#include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { @@ -41,26 +41,14 @@ class LocalService : public Service { static StatusOr> NewService( const ServiceOptions& options); - // Builds an Executable with the given argument layouts and options. If - // result_layout is non-null, then the executable is compiled to produce a - // result of the given layout. If device_allocator is non-null, then the - // compiler may use it to allocate temp space on the device. The compiler is - // responsible for freeing any memory it allocates this way. - StatusOr> CompileExecutable( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& options); - // Builds an Executable with the given XlaComputation, argument layouts and // options. If result_layout is non-null, then the executable is compiled to // produce a result of the given layout. If device_allocator is non-null, // then the compiler may use it to allocate temp space on the device. The // compiler is responsible for freeing any memory it allocates this way. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. 
StatusOr> CompileExecutable( const XlaComputation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, + const absl::Span argument_layouts, const ExecutableBuildOptions& build_options); // Returns the device ordinal that corresponds to the given replica number. @@ -70,6 +58,11 @@ class LocalService : public Service { // the "easy" case where a single replica is a single device. StatusOr ReplicaNumberToDeviceOrdinal(int replica_number); + // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid + // as long as the handle is valid. + StatusOr GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number); + private: explicit LocalService(const ServiceOptions& options, std::unique_ptr backend); diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index c742d35a7bcafa66692195a513992c9cfbb39335..e1f56727bd209797c60f7b3f10c3e232992d01e0 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -15,11 +15,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { @@ -34,11 +34,10 @@ LogicalBuffer::~LogicalBuffer() {} string LogicalBuffer::ToString() const { string color_string; if (has_color()) { - color_string = tensorflow::strings::StrCat(" @", color().value()); + color_string = absl::StrCat(" @", color().value()); } - return tensorflow::strings::StrCat(instruction_->name(), "[", - tensorflow::str_util::Join(index_, ","), - "](#", id(), color_string, ")"); + return absl::StrCat(instruction_->name(), "[", absl::StrJoin(index_, ","), + "](#", id(), color_string, ")"); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h index f9ba5a554740c9d4cc2643fe59d18ba76c30d03b..ceacab4ed7319527312a5a6ad715103b5bbaf40f 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.h +++ b/tensorflow/compiler/xla/service/logical_buffer.h @@ -18,13 +18,13 @@ limitations under the License. 
#include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc index f410921b4b5337192bdeae5924631d9c06b7d5a5..eaa09591b72ee5202e0a9d1225d92eca92904adc 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" @@ -89,7 +90,7 @@ void LogicalBufferAnalysis::NewLogicalBuffer(HloInstruction* instruction, const ShapeIndex& index) { CHECK_EQ(logical_buffers_.size(), next_buffer_id_); logical_buffers_.emplace_back( - MakeUnique(instruction, index, next_buffer_id_)); + absl::make_unique(instruction, index, next_buffer_id_)); output_buffers_[std::make_pair(instruction, index)] = logical_buffers_.back().get(); @@ -131,18 +132,23 @@ Status LogicalBufferAnalysis::HandleDomain(HloInstruction*) { return Status::OK(); } -Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction*) { - // RecvDone doesn't create a new buffer but rather aliases its input (Recv) - // tuple element at {0} to its output. 
+Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction* recv_done) { + // RecvDone produces a two-element tuple containing the data value (which + // aliases part of its operand) and a token. Only the tuple index table and + // the token are defined by the RecvDone. + NewLogicalBuffer(recv_done, /*index=*/{}); + NewLogicalBuffer(recv_done, /*index=*/{1}); return Status::OK(); } Status LogicalBufferAnalysis::HandleSend(HloInstruction* send) { - // Send creates new buffers for the top-level tuple and the context (tuple - // element at {1}). Tuple element at {0} is an alias of the Send operand, so - // we don't need to create a new Logical Buffer for that. + // Send creates new buffers for the top-level tuple, the context (tuple + // element at {1}), and the token (tuple element at {2}). Tuple element at {0} + // is an alias of the Send operand, so we don't need to create a new Logical + // Buffer for that. NewLogicalBuffer(send, /*index=*/{}); NewLogicalBuffer(send, /*index=*/{1}); + NewLogicalBuffer(send, /*index=*/{2}); return Status::OK(); } @@ -152,10 +158,10 @@ Status LogicalBufferAnalysis::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -Status LogicalBufferAnalysis::HandleSelect(HloInstruction* select) { +Status LogicalBufferAnalysis::HandleTupleSelect(HloInstruction* tuple_select) { // Select allocates a new buffer and then shallow copies the on_true or // on_false buffer into this new buffer. 
- NewLogicalBuffer(select, /*index=*/{}); + NewLogicalBuffer(tuple_select, /*index=*/{}); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h index b5ef3967875a58b35631d5f69c210f5cbcd91250..81f524d84a8091e1fff13dc7c55b401143a02753 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h @@ -63,7 +63,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault { Status HandleCopy(HloInstruction* copy) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; - Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* tuple_select) override; // A map from the buffer ID to the logical buffer std::vector> logical_buffers_; diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc new file mode 100644 index 0000000000000000000000000000000000000000..8269842426e3ee15ea974098a43fe7752c7614df --- /dev/null +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" +#include "absl/types/variant.h" +namespace xla { + +se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() { + if (HasOwnership()) { + return absl::get(mem_).AsDeviceMemoryBase(); + } else { + return absl::get(mem_); + } +} + +bool MaybeOwningDeviceMemory::HasOwnership() const { + return absl::holds_alternative(mem_); +} + +absl::optional MaybeOwningDeviceMemory::Release() { + if (!HasOwnership()) { + return {}; + } + OwningDeviceMemory result = std::move(absl::get(mem_)); + mem_ = result.AsDeviceMemoryBase(); + return absl::make_optional(std::move(result)); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h new file mode 100644 index 0000000000000000000000000000000000000000..82e7f1183c086437e10daea85ea99235b06cbb35 --- /dev/null +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h @@ -0,0 +1,70 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_ + +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/owning_device_memory.h" + +namespace xla { + +// MaybeOwningDeviceMemory represents either an owned or unowned device memory. +// Like a variant of OwningDeviceMemory and se::DeviceMemoryBase. When the +// object goes out of scope, it will free the underlying memory if it owns it. +class MaybeOwningDeviceMemory { + public: + MaybeOwningDeviceMemory() = default; + explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned) + : mem_(std::move(owned)) {} + explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned) + : mem_(unowned) {} + MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default; + ~MaybeOwningDeviceMemory() = default; + + MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) { + mem_ = unowned; + return *this; + } + + MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) { + mem_ = std::move(owned); + return *this; + } + + MaybeOwningDeviceMemory& operator=(MaybeOwningDeviceMemory&&) = default; + + // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The + // caller of this function is *not* responsible for freeing the memory. + se::DeviceMemoryBase AsDeviceMemoryBase(); + + // Releases the OwningDeviceMemory without freeing it, and moves the ownership + // of the memory buffer from the object to the caller. + // + // A nullopt is returned if HasOwnership() == false. + absl::optional Release(); + + // Returns true if the device_memory has ownership over underlying memory.
+ bool HasOwnership() const; + + private: + absl::variant mem_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_ diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9ec31c4977be0c31dfff01a0c495902191d7d5b --- /dev/null +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -0,0 +1,338 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/multi_output_fusion.h" + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +StatusOr MultiOutputFusion::Run(HloModule* module) { + bool changed = false; + + for (auto* computation : module->MakeNonfusionComputations()) { + computation_ = computation; + RecomputeReachability(); + candidates_.clear(); + candidates_index_.clear(); + all_fusion_candidates_.clear(); + + int64 index = 0; + for (auto it : computation_->MakeInstructionPostOrder()) { + candidates_.emplace_back(it); + InsertOrDie(&candidates_index_, it, index++); + } + + // Create the initial candidate list for each Node. + for (auto& node : candidates_) { + HloInstruction* instruction = node.hlo; + int64 instruction_id = get_candidate_id(instruction); + FusionCandidate& instr_node = candidates_[instruction_id]; + if (!IsFusible(instruction)) { + continue; + } + all_fusion_candidates_.push_back(instruction); + + std::vector candidates; + tensorflow::gtl::FlatSet candidates_set; + VLOG(10) << "Looking at instruction: " << instruction->name(); + for (auto operand : instruction->operands()) { + // Filter out the non-interesting instructions -- they + // will not generate the savings. 
+ if (!IsProfitableOperand(operand)) { + VLOG(10) << "Operand not profitable: " << operand->name(); + continue; + } + VLOG(10) << "Operand profitable: " << operand->name(); + for (auto user : operand->users()) { + VLOG(10) << "User: " << user->name(); + if (user == instruction || !IsFusible(user)) { + VLOG(10) << "User is not fusible, or is the instruction itself: " + << user->name(); + continue; + } + int64 user_id = get_candidate_id(user); + if (is_connected(instruction, user)) { + VLOG(10) << "User is connected: " << user->name(); + continue; + } + if (instruction_id < user_id && + user->opcode() == HloOpcode::kFusion) { + VLOG(10) << "User ID for user: " << user->name() << " is " + << user_id << " which is higher than " << instruction_id; + continue; + } + if (!LegalToFuse(instruction, user)) { + VLOG(10) << "User not legal to fuse: " << user->name(); + continue; + } + if (candidates_set.insert(user).second) { + VLOG(10) << "User added to candidate list: " << user->name(); + candidates.push_back(user); + } + } + } + + // Iterate over candidates rather than candidates_set to avoid + // nondeterminism. + for (auto candidate : candidates) { + int64 profit = GetProfit(instruction, candidate); + if (profit > 0) { + FusionCandidate& candidate_node = + candidates_[get_candidate_id(candidate)]; + instr_node.fusibles.emplace_back(candidate, profit); + candidate_node.fusibles.emplace_back(instruction, profit); + worklist_.emplace(instruction, candidate, profit); + } + } + } + if (Perform()) { + changed = true; + } + } + return changed; +} + +HloInstruction* MultiOutputFusion::Fuse(HloInstruction* instr1, + HloInstruction* instr2) { + HloInstruction* remaining = instr1; + HloInstruction* fused = instr2; + // Make sure that if only one of the instructions is a fusion, or if only one + // of the instructions is a multi-output fusion, it's what will be fused into. 
+ if (fused->opcode() == HloOpcode::kFusion) { + std::swap(remaining, fused); + } + if (fused->IsMultiOutputFusion()) { + std::swap(remaining, fused); + } + + if (fused->opcode() == HloOpcode::kFusion) { + remaining->MergeFusionInstructionIntoMultiOutput(fused); + } else { + remaining->FuseInstructionIntoMultiOutput(fused); + } + return remaining; +} + +bool MultiOutputFusion::IsProfitableOperand(HloInstruction* instr) { + // kConstant instruction will not have memory reads, so it won't be a profit + // source. Skip them. + if (instr->opcode() == HloOpcode::kConstant && + ShapeUtil::IsEffectiveScalar(instr->shape())) { + return false; + } + // We don't target to fuse producer/consumer instructions -- this should + // be taken care of by the instruction_fusion pass. If instr has only + // one user, it will not have sibling instructions. We won't consider it. + if (instr->user_count() < 2) { + return false; + } + return true; +} + +void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) { + HloInstruction* fusion = instr1; + HloInstruction* fused = instr2; + if (is_fused(instr1)) { + fusion = instr2; + fused = instr1; + } + + // Insert the newly created instruction (if any), to candidates_. + for (auto use : fusion->users()) { + if (candidates_index_.find(use) == candidates_index_.end()) { + int64 index = candidates_.size(); + candidates_.emplace_back(use); + InsertOrDie(&candidates_index_, use, index++); + } + } + FusionCandidate& fusion_node = candidates_[get_candidate_id(fusion)]; + FusionCandidate& fused_node = candidates_[get_candidate_id(fused)]; + + // Update the reachability graph. + UpdateReachability(fusion, fused, all_fusion_candidates_, + [this](HloInstruction* instr) { return is_fused(instr); }); + + // Update the fusible list for fusion. Variable new_fusibles keeps + // track of the new or changed entries. 
+ std::vector> new_fusibles; + tensorflow::gtl::FlatSet in_list; + auto it = fusion_node.fusibles.begin(); + while (it != fusion_node.fusibles.end()) { + HloInstruction* instr = it->first; + if (is_fused(instr) || is_connected(fusion, instr)) { + it = fusion_node.fusibles.erase(it); + continue; + } + in_list.insert(instr); + int64 profit = GetProfit(instr, fusion); + if (profit > it->second) { + it->second = profit; + new_fusibles.emplace_back(instr, profit); + } + ++it; + } + + // fused_node has been fused into fusion_node. Take the fusion candidates + // (fusibles) from fused_node and add them to fusion_node's list. Filter + // out those fusibles that are no longer valid (or already in the list). + for (const auto& it : fused_node.fusibles) { + HloInstruction* instr = it.first; + if (instr == fusion || is_fused(instr) || is_connected(fusion, instr)) { + continue; + } + if (in_list.count(instr) > 0) { + continue; + } + int64 profit = GetProfit(instr, fusion); + fusion_node.fusibles.emplace_back(instr, profit); + new_fusibles.emplace_back(instr, profit); + } + fused_node.fusibles.clear(); + + // Update the worklist_. + for (auto it : new_fusibles) { + worklist_.emplace(fusion, it.first, it.second); + } +} + +bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1, + HloInstruction* instr2) { + if (instr1 == instr2) { + return false; + } + if (instr1->opcode() != HloOpcode::kFusion) { + return false; + } + + // Fusing nodes with 0 users makes no sense and the rest of the implementation + // doesn't support it either. + if (instr1->user_count() == 0 || instr2->user_count() == 0) { + return false; + } + + // Check if any user of a multi-output fusion is not a get-tuple-element. + // If this is the case, we bail out because the transformation assumes + // the users are get-tuple-elements.
+ auto multioutput_user_is_not_gte = [](HloInstruction* instr) { + if (!instr->IsMultiOutputFusion()) { + return false; + } + for (auto user : instr->users()) { + if (user->opcode() != HloOpcode::kGetTupleElement) { + return true; + } + } + return false; + }; + if (multioutput_user_is_not_gte(instr1) || + multioutput_user_is_not_gte(instr2)) { + return false; + } + + if (is_connected(instr1, instr2)) { + return false; + } + if (!ShapesCompatibleForFusion(instr1, instr2)) { + return false; + } + + return true; +} + +void MultiOutputFusion::RecomputeReachability() { + reachability_ = computation_->ComputeReachability(); +} + +void MultiOutputFusion::UpdateReachability( + HloInstruction* instr1, HloInstruction* instr2, + absl::Span instrs_to_update, + const std::function& skip) { + for (auto instr : instrs_to_update) { + if (skip != nullptr && skip(instr)) { + continue; + } + if (reachability_->IsReachable(instr2, instr) && + reachability_->IsReachable(instr1, instr)) { + // If a candidate was already reachable by both, no update needed. + continue; + } + if (reachability_->IsReachable(instr2, instr)) { + reachability_->FastSetReachabilityToUnion({instr, instr1}, instr); + } + if (reachability_->IsReachable(instr1, instr)) { + reachability_->FastSetReachabilityToUnion({instr, instr2}, instr); + } + } +} + +bool MultiOutputFusion::Perform() { + int changed = false; + // Pick the top candidate from queue and try to merge. 
+ while (!worklist_.empty()) { + if (fuel_ <= 0) { + VLOG(2) << "No fusing: run out of fuel."; + break; + } + ToBeFused candidate = worklist_.top(); + worklist_.pop(); + + HloInstruction* instr1 = candidate.instr1; + HloInstruction* instr2 = candidate.instr2; + + if (is_fused(instr1) || is_fused(instr2)) { + continue; + } + + VLOG(1) << "Considering candidate profit_score=" << candidate.score + << "\n\t\tinstr1 = " << instr1->ToString() + << "\n\t\tinstr2 = " << instr2->ToString(); + + if (LegalToFuse(instr1, instr2)) { + VLOG(1) << "Fuse!"; + VLOG(2) << "Before multi_output_fusion:"; + VLOG(2) << "instr1: " << instr1->ToString(); + VLOG(2) << "\n" + << instr1->fused_instructions_computation()->ToString( + HloPrintOptions().set_indent_amount(1)); + VLOG(2) << "instr2: " << instr2->ToString(); + if (instr2->opcode() == HloOpcode::kFusion) { + VLOG(2) << "\n" + << instr2->fused_instructions_computation()->ToString( + HloPrintOptions().set_indent_amount(1)); + } + HloInstruction* ret = Fuse(instr1, instr2); + set_is_fused(ret == instr1 ? instr2 : instr1); + Update(instr1, instr2); + changed = true; + VLOG(2) << "After fusion, \t this: " << ret->name() << "\n" + << ret->fused_instructions_computation()->ToString( + HloPrintOptions().set_indent_amount(1)); + auto users = ret->users(); + --fuel_; + } + } + if (DoProducerConsumerMultiOutputFusion()) { + changed = true; + } + return changed; +} + +bool MultiOutputFusion::DoProducerConsumerMultiOutputFusion() { return false; } +} // namespace xla diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..d2c52651c4f37708906e31b7839d0c9f6f04760e --- /dev/null +++ b/tensorflow/compiler/xla/service/multi_output_fusion.h @@ -0,0 +1,167 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// This class implements the fusing of sibling fusion instructions that share +// common operands. +// It constructs the following associated data structures. +// (1) candidates_: stores the instruction and the set of instructions it can +// fuse to. +// (2) candidates_index_: maps instruction to id. +// (3) reachability_: reachability map in this computation. +// (4) all_fusion_candidates_: the vector of candidate instructions. +// (5) worklist_: a priority queue that contains pairs of instructions to be +// fused and their fusion profit scores. +// +// Function Perform() applies the optimization. It picks the most profitable +// pair in the worklist_, checks if it's legal to fuse, and fuses the pair. +// After fusion, it updates the associated structures such as reachability_, +// candidates_ and worklist_. +// Note that the reachability map is updated based on the original computation. +// This works because the reachability is monotonically increasing with +// instruction fusion.
+class MultiOutputFusion : public HloPassInterface { + public: + MultiOutputFusion(int64 fuel) : fuel_(fuel) {} + + absl::string_view name() const override { return "multi_output_fusion"; } + + // Run multi-output fusion on the given module. Returns whether the module + // was changed. + StatusOr Run(HloModule* module) override; + + protected: + // Main entry for the optimization. Returns true if the optimization happens. + bool Perform(); + + // Test if instr1 and instr2 have compatible shapes that can be legally + // fused. + virtual bool ShapesCompatibleForFusion(HloInstruction* instr1, + HloInstruction* instr2) = 0; + + // Whether the instruction is a candidate for fusion. + virtual bool IsFusible(HloInstruction* instr) = 0; + + // This function estimates the savings by merging instr1 and instr2 into one + // multi-output fusion instruction. + virtual int64 GetProfit(HloInstruction* instr1, HloInstruction* instr2) = 0; + + // Whether fusing the instruction can reduce memory reads. + virtual bool IsProfitableOperand(HloInstruction* instr); + + // Test if it's legal to fuse instr1 and instr2 into one fusion instruction. + virtual bool LegalToFuse(HloInstruction* instr1, HloInstruction* instr2); + + // Fuse HloInstruction instr1 and instr2 and return the fused instruction. + // The other instruction is removed from its parent computation. + virtual HloInstruction* Fuse(HloInstruction* instr1, HloInstruction* instr2); + + // Recompute reachability for the current computation. + void RecomputeReachability(); + + // Returns the reachability map for the current computation. + HloReachabilityMap* reachability() const { return reachability_.get(); } + + // Returns the computation for the pass. + HloComputation* computation() const { return computation_; } + + // Update the reachability map after fusing instr1 and instr2.
+ void UpdateReachability( + HloInstruction* instr1, HloInstruction* instr2, + absl::Span instrs_to_update, + const std::function& skip = nullptr); + + // Hook for multi-output fusion along producer-consumer edges. + // Returns whether any instructions were fused. + // + // TODO(b/80420762): Perform producer-consumer multi-output fusion in + // InstructionFusion instead. + virtual bool DoProducerConsumerMultiOutputFusion(); + + // Optimization fuel is a compiler debugging technique that makes an + // optimization pass stop what it is doing after having made N changes to the + // program, where N is the fuel. By varying N, this can be used to find the + // first single change that makes a test fail. + int64 fuel_; + + private: + // Update the internal data structures after instr1 and instr2 are fused into + // one fusion instruction. + void Update(HloInstruction* instr1, HloInstruction* instr2); + + // Computation for the pass. + HloComputation* computation_; + + // An internal data structure for each instruction in current computation. + // When an instruction is removed, member 'hlo' is set to nullptr. + struct FusionCandidate { + HloInstruction* hlo; + std::list> fusibles; + explicit FusionCandidate(HloInstruction* hlo) : hlo(hlo) {} + }; + std::vector candidates_; + + // A map that maps an instruction to the index_. + tensorflow::gtl::FlatMap candidates_index_; + + // The reachability map of current computation. + std::unique_ptr reachability_; + + // This stores all the candidate instructions in current computation. + std::vector all_fusion_candidates_; + + // The pair of candidates to be fused and the profit score. 
+ struct ToBeFused { + HloInstruction* instr1; + HloInstruction* instr2; + int64 score; + ToBeFused(HloInstruction* instr1, HloInstruction* instr2, int64 score) + : instr1(instr1), instr2(instr2), score(score) {} + bool operator<(const ToBeFused& rhs) const { return score < rhs.score; } + }; + std::priority_queue worklist_; + + int64 get_candidate_id(HloInstruction* instr) { + return FindOrDie(candidates_index_, instr); + } + + bool is_fused(HloInstruction* instr) { + return candidates_[get_candidate_id(instr)].hlo == nullptr; + } + + void set_is_fused(HloInstruction* instr) { + candidates_[get_candidate_id(instr)].hlo = nullptr; + } + + bool is_connected(HloInstruction* instr1, HloInstruction* instr2) { + return reachability_->IsConnected(instr1, instr2); + } +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_ diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc index 3a6a7c25f4b727c7112dbcbcb4f3d892679a0011..bd8fb17a235ea6eeb0e1809e8cb9ad83145fd8d6 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.cc +++ b/tensorflow/compiler/xla/service/name_uniquer.cc @@ -15,8 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/name_uniquer.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -52,8 +53,8 @@ NameUniquer::NameUniquer(const string& separator) { return result; } -string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { - string root = GetSanitizedName(prefix.empty() ? "name" : std::string(prefix)); +string NameUniquer::GetUniqueName(absl::string_view prefix) { + string root = GetSanitizedName(prefix.empty() ? "name" : string(prefix)); // Strip away numeric suffix (if any). 
Only recognize separator if it is in // the middle of the name. @@ -63,26 +64,23 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { if (separator_index != string::npos && (separator_index > 0) && (separator_index < root.size() - 1)) { string after_suffix = root.substr(separator_index + 1); - if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) { + if (absl::SimpleAtoi(after_suffix, &numeric_suffix)) { has_numeric_suffix = true; // Remove numeric suffix from root. root = root.substr(0, separator_index); - // Update count to at least the numeric suffix value to avoid future - // colisions with this name. - generated_names_[root] = std::max(generated_names_[root], numeric_suffix); + } else { + // absl::SimpleAtoi may modify numeric_suffix even if it returns false. + numeric_suffix = 0; } } - int64* count = &(generated_names_[root]); - if (*count == 0) { - *count = 1; - return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0) - : root; - } else { - tensorflow::strings::StrAppend(&root, separator_, *count); - // Increment lookup under old 'root' name. - (*count)++; - return root; + + SequentialIdGenerator& id_generator = generated_names_[root]; + numeric_suffix = id_generator.RegisterId(numeric_suffix); + if (numeric_suffix == 0) { + return has_numeric_suffix ? absl::StrCat(root, separator_, 0) : root; } + absl::StrAppend(&root, separator_, numeric_suffix); + return root; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h index 4139c2700b25e8600182a034a8ac6f4f041c12e6..6dd89c240f81c9f0ccac66e50c7f244bfd5429f1 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.h +++ b/tensorflow/compiler/xla/service/name_uniquer.h @@ -17,10 +17,11 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_NAME_UNIQUER_H_ #include -#include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -37,20 +38,47 @@ class NameUniquer { // Get a sanitized unique name in a string, with an optional prefix for // convenience. - string GetUniqueName(tensorflow::StringPiece prefix = ""); + string GetUniqueName(absl::string_view prefix = ""); // Sanitizes and returns the name. Unallowed characters will be replaced with // '_'. The result will match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*". static string GetSanitizedName(const string& name); private: + // Used to track and generate new identifiers for the same instruction name + // root. + class SequentialIdGenerator { + public: + SequentialIdGenerator() = default; + + // Tries to register id as used identifier. If id is not already used, the + // id itself will be returned. Otherwise a new one will be generated, and + // returned. + int64 RegisterId(int64 id) { + if (used_.insert(id).second) { + return id; + } + while (!used_.insert(next_).second) { + ++next_; + } + return next_++; + } + + private: + // The next identifier to be tried. + int64 next_ = 0; + + // Set of all the identifiers which has been used. + tensorflow::gtl::FlatSet used_; + }; + // The string to use to separate the prefix of the name from the uniquing // integer value. string separator_; - // Map from name prefix to the number of names generated using that prefix - // so far. - std::unordered_map generated_names_; + // Map from name prefix to the generator data structure which tracks used + // identifiers and generates new ones. 
+ tensorflow::gtl::FlatMap generated_names_; TF_DISALLOW_COPY_AND_ASSIGN(NameUniquer); }; diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc index 2ec255558c4ed3695ec6c824458cbedac44dc297..3e2592c6ac626143f1421e545a31d9be91e376bc 100644 --- a/tensorflow/compiler/xla/service/name_uniquer_test.cc +++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc @@ -54,12 +54,13 @@ TEST_F(NameUniquerTest, NumericSuffixes) { EXPECT_EQ("foo", uniquer.GetUniqueName("foo")); EXPECT_EQ("foo.54", uniquer.GetUniqueName("foo.54")); - EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo")); + EXPECT_EQ("foo.1", uniquer.GetUniqueName("foo")); EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1")); - EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1")); - EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000")); - EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000")); - EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1")); + EXPECT_EQ("foo.55.0", uniquer.GetUniqueName("foo.55.1")); + EXPECT_EQ("bar.1000", uniquer.GetUniqueName("bar.1000")); + EXPECT_EQ("bar.2000", uniquer.GetUniqueName("bar.2000")); + EXPECT_EQ("bar.-2000", uniquer.GetUniqueName("bar.-2000")); + EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.1")); } TEST_F(NameUniquerTest, PrefixHasSuffix) { @@ -77,12 +78,12 @@ TEST_F(NameUniquerTest, Sanitize) { EXPECT_EQ("foo.54", uniquer.GetUniqueName("foo.54")); EXPECT_EQ("foo_54", uniquer.GetUniqueName("foo_54")); EXPECT_EQ("foo_54.1", uniquer.GetUniqueName("foo_54.1")); - EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo")); + EXPECT_EQ("foo_2", uniquer.GetUniqueName("foo")); // Invalid characters will be replaced with '_'. 
- EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000")); - EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000")); - EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1")); + EXPECT_EQ("bar_1000", uniquer.GetUniqueName("bar<1000")); + EXPECT_EQ("bar_2000", uniquer.GetUniqueName("bar<2000")); + EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar_1")); // Separator is only recognized in the middle of the prefix. EXPECT_EQ("_10", uniquer.GetUniqueName( @@ -93,5 +94,15 @@ TEST_F(NameUniquerTest, Sanitize) { EXPECT_EQ("foobar__1", uniquer.GetUniqueName("foobar_")); } +TEST_F(NameUniquerTest, KeepNamesInRandomOrder) { + NameUniquer uniquer("."); + + EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11")); + EXPECT_EQ("foo.10", uniquer.GetUniqueName("foo.10")); + EXPECT_EQ("foo.1", uniquer.GetUniqueName("foo.1")); + EXPECT_EQ("foo.12", uniquer.GetUniqueName("foo.12")); + EXPECT_EQ("foo.3", uniquer.GetUniqueName("foo.3")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index d3bc47e61e0e75fa2ef181988700f88cec9c1d76..4869db79e719fa10d61ad6c6ed41ff70a344f733 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -16,11 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_ +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/core/lib/core/stringpiece.h" namespace xla { @@ -86,8 +86,8 @@ namespace xla { // are provided below. 
// // Example nullary instruction: -// Recv() == Op().WithOpcode(HloOpcode::kRecv) -// Recv(&a) == Op(&a).WithOpcode(HloOpcode::kRecv) +// Param() == Op().WithOpcode(HloOpcode::kParam) +// Param(&a) == Op(&a).WithOpcode(HloOpcode::kParam) // // Example unary instruction: // Abs() == Op().WithOpcode(HloOpcode::kAbs) @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } @@ -622,7 +622,7 @@ template class HloInstructionPatternNameImpl { public: explicit HloInstructionPatternNameImpl(const Previous& previous, - tensorflow::StringPiece name) + absl::string_view name) : previous_(previous), name_(name) {} bool Match(const ::xla::HloInstruction* inst) const { @@ -631,7 +631,7 @@ class HloInstructionPatternNameImpl { private: Previous previous_; - tensorflow::StringPiece name_; + absl::string_view name_; }; // An HloInstructionPattern implementation that matches only if the instruction @@ -726,6 +726,32 @@ class HloInstructionPatternFusionKindImpl { ::xla::HloInstruction::FusionKind kind_; }; +// An HloInstructionPattern implementation that matches only if the instruction +// is a kGetTupleElement with a particular tuple index. 
+template +class HloInstructionPatternTupleIndexImpl { + public: + explicit constexpr HloInstructionPatternTupleIndexImpl( + const Previous& previous, int64 tuple_index) + : previous_(previous), tuple_index_(tuple_index) {} + + bool Match(const ::xla::HloInstruction* inst) const { + return previous_.Match(inst) && + inst->opcode() == HloOpcode::kGetTupleElement && + inst->tuple_index() == tuple_index_; + } + + bool Match(::xla::HloInstruction* inst) const { + return previous_.Match(inst) && + inst->opcode() == HloOpcode::kGetTupleElement && + inst->tuple_index() == tuple_index_; + } + + private: + Previous previous_; + int64 tuple_index_; +}; + // A pattern that matches HloInstructions. template class HloInstructionPattern { @@ -758,7 +784,7 @@ class HloInstructionPattern { // Modifies the pattern to match only if the instruction has the given name. HloInstructionPattern> - WithName(tensorflow::StringPiece name) const { + WithName(absl::string_view name) const { return HloInstructionPattern>( HloInstructionPatternNameImpl(impl_, name), matched_inst_); @@ -841,6 +867,17 @@ class HloInstructionPattern { HloInstructionPatternFusionKindImpl(impl_, kind), matched_inst_); } + // Modifies the pattern to match only if the instruction is a + // get-tuple-element with the given tuple index. + constexpr HloInstructionPattern> + WithTupleIndex(int64 tuple_index) const { + return HloInstructionPattern>( + HloInstructionPatternTupleIndexImpl(impl_, tuple_index), + matched_inst_); + } + private: Impl impl_; HloInstructionType** matched_inst_; @@ -880,9 +917,8 @@ Op(::xla::HloInstruction** matched_inst) { return Op(matched_inst).WithOpcode(HloOpcode::k##NAME); \ } XLA_NULLOP_PATTERN(Constant) -XLA_NULLOP_PATTERN(Infeed) XLA_NULLOP_PATTERN(Parameter) -XLA_NULLOP_PATTERN(Recv) +XLA_NULLOP_PATTERN(Iota) #undef XLA_NULLOP_PATTERN // Helpers for unary instructions. 
@@ -919,18 +955,21 @@ XLA_UNOP_PATTERN(Cos) XLA_UNOP_PATTERN(Exp) XLA_UNOP_PATTERN(Fft) XLA_UNOP_PATTERN(Floor) +XLA_UNOP_PATTERN(GetTupleElement) XLA_UNOP_PATTERN(Imag) +XLA_UNOP_PATTERN(Infeed) XLA_UNOP_PATTERN(IsFinite) XLA_UNOP_PATTERN(Log) XLA_UNOP_PATTERN(Not) XLA_UNOP_PATTERN(Negate) -XLA_UNOP_PATTERN(Outfeed) XLA_UNOP_PATTERN(Real) +XLA_UNOP_PATTERN(Recv) +XLA_UNOP_PATTERN(RecvDone) XLA_UNOP_PATTERN(Reduce) XLA_UNOP_PATTERN(ReducePrecision) XLA_UNOP_PATTERN(Reshape) XLA_UNOP_PATTERN(Reverse) -XLA_UNOP_PATTERN(Send) +XLA_UNOP_PATTERN(SendDone) XLA_UNOP_PATTERN(Sign) XLA_UNOP_PATTERN(Sin) XLA_UNOP_PATTERN(Sort) @@ -981,8 +1020,10 @@ XLA_BINOP_PATTERN(Maximum) XLA_BINOP_PATTERN(Minimum) XLA_BINOP_PATTERN(Multiply) XLA_BINOP_PATTERN(Ne) +XLA_BINOP_PATTERN(Outfeed) XLA_BINOP_PATTERN(Power) XLA_BINOP_PATTERN(Remainder) +XLA_BINOP_PATTERN(Send) XLA_BINOP_PATTERN(Subtract) XLA_BINOP_PATTERN(And) XLA_BINOP_PATTERN(Or) @@ -1040,6 +1081,32 @@ inline auto NonConstant(HloInstructionType** matched_inst) return Op(matched_inst).IsNonConstant(); } +// Add overloads for GetTupleElement which take a int64 specifying which tuple +// element is selected. 
+template +inline auto GetTupleElement(Arg&& arg, int64 tuple_index) + -> decltype(Op().WithOpcode(HloOpcode::kGetTupleElement) + .WithOperand(0, std::forward(arg)) + .WithTupleIndex(tuple_index)) { + return Op() + .WithOpcode(HloOpcode::kGetTupleElement) + .WithOperand(0, std::forward(arg)) + .WithTupleIndex(tuple_index); +} + +template +inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, + int64 tuple_index) + -> decltype(Op(matched_inst) + .WithOpcode(HloOpcode::kGetTupleElement) + .WithOperand(0, std::forward(arg)) + .WithTupleIndex(tuple_index)) { + return Op(matched_inst) + .WithOpcode(HloOpcode::kGetTupleElement) + .WithOperand(0, std::forward(arg)) + .WithTupleIndex(tuple_index); +} + } // namespace match } // namespace xla diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index 204e8c99209fa95adb868a676bb9e5144fed432c..a530581c34bf1d699eae3c53203c197f7943cc53 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -29,7 +29,7 @@ TEST(PatternMatcherTest, AddOp) { ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two) } )"; - TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); const HloInstruction* matched_inst; HloInstruction* matched_operand; @@ -182,7 +182,7 @@ TEST(PatternMatcherTest, FusionKind) { p0 = f32[] parameter(0) ROOT fusion = f32[] fusion(p0), kind=kLoop, calls=fused_computation })"; - TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); auto* root = hlo_module->entry_computation()->root_instruction(); EXPECT_TRUE(Match( @@ -193,5 +193,23 @@ TEST(PatternMatcherTest, FusionKind) { HloInstruction::FusionKind::kLoop))); } +TEST(PatternMatcherTest, GetTupleElement) { + constexpr char kModuleStr[] = R"( + HloModule test_module + + ENTRY while.v11 { + p0 = (f32[], f32[], f32[]) parameter(0) + ROOT gte = f32[] get-tuple-element(p0), index=1 + })"; + TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); + + auto* root = hlo_module->entry_computation()->root_instruction(); + EXPECT_FALSE(Match(root, match::Op().WithTupleIndex(0))); + EXPECT_TRUE(Match(root, match::Op().WithTupleIndex(1))); + EXPECT_FALSE(Match(root, match::Op().WithTupleIndex(2))); + EXPECT_FALSE(Match(root, match::GetTupleElement(match::Op(), 0))); + EXPECT_TRUE(Match(root, match::GetTupleElement(match::Op(), 1))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc 
index 7c63c0acc7764d558b2151190f0fa79fac355cbf..178a78ede09c34e71566fdee69793fdb1cda6245 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -19,20 +19,19 @@ limitations under the License. #include #include +#include "absl/strings/ascii.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { -using tensorflow::str_util::Lowercase; - // Minimum supported CUDA compute capability is 3.5. constexpr int kMinCudaComputeCapabilityMajor = 3; constexpr int kMinCudaComputeCapabilityMinor = 5; @@ -43,7 +42,7 @@ constexpr char kInterpreter[] = "interpreter"; namespace { string CanonicalPlatformName(const string& name) { - string platform_str = Lowercase(name); + string platform_str = absl::AsciiStrToLower(name); // "cpu" and "host" mean the same thing. 
if (platform_str == "cpu") { platform_str = "host"; @@ -75,19 +74,6 @@ PlatformUtil::GetSupportedPlatforms() { auto* platform = platform_pair.second; auto compiler_status = Compiler::GetForPlatform(platform); if (compiler_status.ok()) { - if (platform->VisibleDeviceCount() > 0) { - LOG(INFO) << "platform " << platform->Name() << " present with " - << platform->VisibleDeviceCount() << " visible devices"; - } else { - LOG(WARNING) << "platform " << platform->Name() << " present but no " - << "visible devices found"; - } - // Note: currently we call zero device platforms "supported" on the basis - // that, if the platform support was linked in, it was probably intended - // to be used for execution, and this way we can flag an error. - // - // TODO(b/33730287) If we want an alternative version of this behavior we - // could add an --xla_fallback_to_host flag. platforms.push_back(platform); } else { LOG(INFO) << "platform " << platform->Name() << " present but no " @@ -103,41 +89,54 @@ PlatformUtil::GetSupportedPlatforms() { if (platforms.empty()) { return NotFound("no platforms found"); } else if (platforms.size() == 1) { - return platforms[0]; + se::Platform* platform = platforms[0]; + if (!platform->Initialized()) { + TF_RETURN_IF_ERROR(platform->Initialize({})); + } + return platform; } // Multiple platforms present and we can't pick a reasonable default. 
- string platforms_string = tensorflow::str_util::Join( + string platforms_string = absl::StrJoin( platforms, ", ", [](string* out, const se::Platform* p) { out->append(p->Name()); }); return InvalidArgument( "must specify platform because more than one platform found: %s", - platforms_string.c_str()); + platforms_string); } /* static */ StatusOr PlatformUtil::GetDefaultPlatform() { TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms()); + + se::Platform* platform = nullptr; if (platforms.empty()) { return NotFound("no platforms found"); } else if (platforms.size() == 1) { - return platforms[0]; + platform = platforms[0]; } else if (platforms.size() == 2) { for (int i = 0; i < 2; i++) { - if (Lowercase(platforms[i]->Name()) == kInterpreter && - Lowercase(platforms[1 - i]->Name()) != kInterpreter) { - return platforms[1 - i]; + if (absl::AsciiStrToLower(platforms[i]->Name()) == kInterpreter && + absl::AsciiStrToLower(platforms[1 - i]->Name()) != kInterpreter) { + platform = platforms[1 - i]; + break; } } } + if (platform != nullptr) { + if (!platform->Initialized()) { + TF_RETURN_IF_ERROR(platform->Initialize({})); + } + return platform; + } // Multiple platforms present and we can't pick a reasonable default. 
- string platforms_string = tensorflow::str_util::Join( + string platforms_string = absl::StrJoin( platforms, ", ", [](string* out, const se::Platform* p) { out->append(p->Name()); }); return InvalidArgument( "must specify platform because more than one platform (except for the " "interpreter platform) found: %s", - platforms_string.c_str()); + platforms_string); } /*static*/ StatusOr PlatformUtil::GetPlatform( @@ -145,11 +144,14 @@ PlatformUtil::GetSupportedPlatforms() { string platform_str = CanonicalPlatformName(platform_name); TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms()); for (se::Platform* platform : platforms) { - if (Lowercase(platform->Name()) == platform_str) { + if (absl::AsciiStrToLower(platform->Name()) == platform_str) { + if (!platform->Initialized()) { + TF_RETURN_IF_ERROR(platform->Initialize({})); + } return platform; } } - return InvalidArgument("platform %s not found", platform_name.c_str()); + return InvalidArgument("platform %s not found", platform_name); } /*static*/ StatusOr PlatformUtil::GetPlatformExceptFor( @@ -159,23 +161,27 @@ PlatformUtil::GetSupportedPlatforms() { TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms()); std::vector matched; for (se::Platform* platform : platforms) { - if (Lowercase(platform->Name()) != platform_name) { + if (absl::AsciiStrToLower(platform->Name()) != platform_name) { matched.push_back(platform); } } if (matched.empty()) { return InvalidArgument("unable to find platform that is not %s", - platform_name.c_str()); + platform_name); } if (matched.size() == 1) { - return matched[0]; + auto platform = matched[0]; + if (!platform->Initialized()) { + TF_RETURN_IF_ERROR(platform->Initialize({})); + } + return platform; } - string matched_string = tensorflow::str_util::Join( + string matched_string = absl::StrJoin( matched, ", ", [](string* out, const se::Platform* p) { out->append(p->Name()); }); return InvalidArgument( "found multiple platforms %s, but expected 
one platform except for %s", - matched_string.c_str(), platform_name.c_str()); + matched_string, platform_name); } // Returns whether the device underlying the given StreamExecutor is supported @@ -206,7 +212,7 @@ static bool IsDeviceSupported(se::StreamExecutor* executor) { PlatformUtil::GetStreamExecutors(se::Platform* platform) { int device_count = platform->VisibleDeviceCount(); if (device_count <= 0) { - return NotFound("no %s devices found", platform->Name().c_str()); + return NotFound("no %s devices found", platform->Name()); } if (platform->id() == se::host::kHostPlatformId) { // On host "devices", StreamExecutor exports a device for each hardware @@ -245,7 +251,7 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) { if (std::all_of(stream_executors.begin(), stream_executors.end(), [](se::StreamExecutor* s) { return s == nullptr; })) { return InternalError("no supported devices found for platform %s", - platform->Name().c_str()); + platform->Name()); } return stream_executors; } diff --git a/tensorflow/compiler/xla/service/pool.h b/tensorflow/compiler/xla/service/pool.h deleted file mode 100644 index 8e710ebb6dc17e0e204ba6ab3c6c159627cd9d3b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/pool.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_POOL_H_ -#define TENSORFLOW_COMPILER_XLA_POOL_H_ - -#include -#include - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/core/platform/mutex.h" - -namespace xla { - -// Pool of values, which are created as needed and destroyed when the `Pool` is -// destroyed -template -class Pool { - public: - struct Deleter { - void operator()(T* ptr) { pool->Deallocate(ptr); } - - Pool* pool; - }; - - // A pointer to a taken element of a `Pool` which returns it to the pool on - // destruction - using SmartPtr = std::unique_ptr; - - // Constructs a `Pool` with given factory function, which need not be - // thread-safe. - explicit Pool(std::function()> factory) - : factory_(factory) {} - - explicit Pool() : Pool([]() { return MakeUnique(); }) {} - - // Returns a pointer to a value in the pool, creating a new value if none is - // free. The returned smart pointer returns the element to the pool on - // destruction. - // - // This method is thread-safe. - SmartPtr Allocate() { - tensorflow::mutex_lock lock(mu_); - T* ptr; - if (!xs_.empty()) { - ptr = std::move(xs_.back()).release(); - xs_.pop_back(); - } else { - ptr = factory_().release(); - } - Deleter del = {this}; - return std::unique_ptr(ptr, del); - } - - private: - // Puts a pointer to a value back into the pool, leaving it free for future - // use. - // - // This method is thread-safe. 
- void Deallocate(T* ptr) { - tensorflow::mutex_lock lock(mu_); - xs_.push_back(std::unique_ptr(ptr)); - } - - const std::function()> factory_ GUARDED_BY(mu_); - std::vector> xs_ GUARDED_BY(mu_); - tensorflow::mutex mu_; -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_POOL_H_ diff --git a/tensorflow/compiler/xla/service/pool_test.cc b/tensorflow/compiler/xla/service/pool_test.cc deleted file mode 100644 index 8c4fe258e38fff1b2086d8809bfc487e11ef713f..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/pool_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/pool.h" - -#include "tensorflow/compiler/xla/test_helpers.h" - -namespace xla { -namespace { - -using PoolTest = ::testing::Test; - -TEST_F(PoolTest, Test) { - Pool pool; - - { - auto ptr = pool.Allocate(); - EXPECT_NE(nullptr, ptr.get()); - *ptr = 5; - } - - auto ptr = pool.Allocate(); - EXPECT_NE(nullptr, ptr.get()); - EXPECT_EQ(5, *ptr); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h index afde3cf95c721b59a39b74b4e1ff3f15a335fe97..256b231e3af43a2ee85c97a5efab1f022d4de4b1 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h @@ -59,7 +59,7 @@ class ReducePrecisionInsertion : public HloPassInterface { ~ReducePrecisionInsertion() override{}; - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "reduce-precision-insertion"; } diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc index 0f26a025bf125f70199637894741540f89eae7e5..4df746fca9f8320eed72911726f33bb01f06fed5 100644 --- a/tensorflow/compiler/xla/service/reshape_mover.cc +++ b/tensorflow/compiler/xla/service/reshape_mover.cc @@ -38,7 +38,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/reshape_mover.h" #include -#include "tensorflow/compiler/xla/literal_util.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -155,20 +157,15 @@ HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand, case HloOpcode::kConstant: { if (first_reshape_operand->opcode() == HloOpcode::kReshape) { VLOG(5) << "Adding reshape to kConstant operand"; - HloInstruction* reshape = computation->AddInstruction( + return computation->AddInstruction( HloInstruction::CreateReshape(new_shape, operand)); - operand->SetupDerivedInstruction(reshape); - return reshape; } else { CHECK(first_reshape_operand->opcode() == HloOpcode::kTranspose); VLOG(5) << "Adding transpose to kConstant operand"; std::vector inverse_permutation = InversePermutation(first_reshape_operand->dimensions()); - HloInstruction* transpose = - computation->AddInstruction(HloInstruction::CreateTranspose( - new_shape, operand, inverse_permutation)); - operand->SetupDerivedInstruction(transpose); - return transpose; + return computation->AddInstruction(HloInstruction::CreateTranspose( + new_shape, operand, inverse_permutation)); } } case HloOpcode::kRng: { @@ -379,7 +376,7 @@ StatusOr TryReshapeMoveOnCandidates( removed = false; for (auto operand : nontrivial_operands) { - if (c_any_of(operand->users(), [&](HloInstruction* user) { + if (absl::c_any_of(operand->users(), [&](HloInstruction* user) { return !reshape_candidates->count(user); })) { for (auto* user : operand->users()) { diff --git a/tensorflow/compiler/xla/service/reshape_mover.h b/tensorflow/compiler/xla/service/reshape_mover.h index 1f59e3b3147facb6f2fae00d6c810bf54d560e5c..1e86a0823a56a9e52421a5c8bd49e0adb98a2c70 100644 --- a/tensorflow/compiler/xla/service/reshape_mover.h +++ 
b/tensorflow/compiler/xla/service/reshape_mover.h @@ -26,7 +26,7 @@ namespace xla { // them inputward also. class ReshapeMover : public HloPassInterface { public: - tensorflow::StringPiece name() const override { return "reshape-mover"; } + absl::string_view name() const override { return "reshape-mover"; } StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 13e2d3258e3b92f52320201c382594962c0e3b2b..fcf269eee925c2ddb7511d70e71bd815e4b8c24a 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -15,9 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -28,13 +28,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" - -namespace op = xla::testing::opcode_matchers; namespace xla { namespace { -using ReshapeMoverTest = HloVerifiedTestBase; + +namespace op = xla::testing::opcode_matchers; + +class ReshapeMoverTest : public HloVerifiedTestBase {}; TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { HloComputation::Builder builder(TestName()); @@ -76,9 +76,13 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) { HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); - auto rng0 = builder.AddInstruction( - HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}), - RandomDistribution::RNG_UNIFORM, {})); + auto rng0 = builder.AddInstruction(HloInstruction::CreateRng( + ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}), + RandomDistribution::RNG_UNIFORM, + {builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(1.0f)))})); auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, rng0)); @@ -175,8 +179,9 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) { TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) { HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {2, 3}); - auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{true, true, false}, {false, false, true}}))); + auto const0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR2( + {{true, true, false}, {false, false, true}}))); auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), 
"param1")); @@ -255,12 +260,12 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) { HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {3, 2}); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const0)); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, const1)); @@ -309,7 +314,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) { auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1, 3, 1, 2}), "param0")); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); builder.AddInstruction(HloInstruction::CreateBinary( @@ -348,7 +353,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) { auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {1, 3}), "param0")); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({9, 8, 7}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({9, 8, 7}))); auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0)); auto reshape1 = diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f4b2667c405bb23b1c648892c86d337400c14a5 --- /dev/null +++ 
b/tensorflow/compiler/xla/service/scatter_expander.cc @@ -0,0 +1,350 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/scatter_expander.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/while_util.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + + +// Transposes the given scatter_indices such that the index_vector_dim becomes +// the most-minor dimension. 
+static StatusOr TransposeIndexVectorDimToLast( + HloInstruction* scatter_indices, int64 index_vector_dim) { + const Shape& scatter_indices_shape = scatter_indices->shape(); + + if (scatter_indices_shape.dimensions_size() == index_vector_dim) { + return scatter_indices; + } + + if (index_vector_dim == (scatter_indices_shape.dimensions_size() - 1)) { + return scatter_indices; + } + + std::vector permutation; + permutation.reserve(scatter_indices_shape.dimensions_size()); + for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { + if (i != index_vector_dim) { + permutation.push_back(i); + } + } + permutation.push_back(index_vector_dim); + return MakeTransposeHlo(scatter_indices, permutation); +} + +// Canonicalizes the scatter_indices tensor in order to keep them uniform while +// performing the scatter operation. +static StatusOr CanonicalizeScatterIndices( + HloInstruction* scatter_indices, int64 index_vector_dim) { + // Transpose the non-index-vector dimensions to the front. + TF_ASSIGN_OR_RETURN( + HloInstruction * transposed_scatter_indices, + TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim)); + bool indices_are_scalar = + index_vector_dim == scatter_indices->shape().dimensions_size(); + + // The number of dimensions in scatter_indices that are index dimensions. + const int64 index_dims_in_scatter_indices = indices_are_scalar ? 0 : 1; + + // If there is only one index (i.e. scatter_indices has rank 1 and this + // scatter is really just a dynamic update slice) add a leading degenerate + // dimension for uniformity. Otherwise create a "collapsed" leading dimension + // that subsumes all of the non-index-vector dimensions. + const Shape& shape = transposed_scatter_indices->shape(); + if (shape.dimensions_size() == index_dims_in_scatter_indices) { + return PrependDegenerateDims(transposed_scatter_indices, 1); + } else { + // Collapse all but the dimensions (0 or 1) in scatter_indices containing + // the index vectors. 
+ return CollapseFirstNDims( + transposed_scatter_indices, + shape.dimensions_size() - index_dims_in_scatter_indices); + } +} + +// Permutes the `updates` tensor such that all the scatter dims appear in the +// major dimensions and all the window dimensions appear in the minor +// dimensions. +static StatusOr PermuteScatterAndWindowDims( + HloInstruction* updates, absl::Span update_window_dims) { + std::vector permutation; + const int64 updates_rank = ShapeUtil::Rank(updates->shape()); + permutation.reserve(updates_rank); + + for (int64 i = 0; i < updates_rank; ++i) { + bool is_scatter_dim = !absl::c_binary_search(update_window_dims, i); + if (is_scatter_dim) { + permutation.push_back(i); + } + } + for (auto window_dim : update_window_dims) { + permutation.push_back(window_dim); + } + + return MakeTransposeHlo(updates, permutation); +} + +// Expands or contracts the scatter indices in the updates tensor. +static StatusOr AdjustScatterDims( + const Shape& scatter_indices_shape, HloInstruction* updates, + int64 index_vector_dim) { + int64 num_scatter_dims = scatter_indices_shape.dimensions_size(); + if (index_vector_dim < scatter_indices_shape.dimensions_size()) { + --num_scatter_dims; + } + if (num_scatter_dims == 0) { + // If there are no scatter dims, this must be a dynamic-update-slice kind of + // scatter. In this case, we prepend a degenerate dimension to work + // uniformly in the while loop. + return PrependDegenerateDims(updates, 1); + } + return CollapseFirstNDims(updates, num_scatter_dims); +} + +// Expands an index vector from the scatter_indices tensor into a vector that +// can be used to dynamic-update-slice to perform the scatter update. 
+static StatusOr ExpandIndexVectorIntoOperandSpace( + HloInstruction* index_vector, const ScatterDimensionNumbers& dim_numbers, + int64 operand_rank) { + HloComputation* computation = index_vector->parent(); + const Shape& index_shape = index_vector->shape(); + HloInstruction* zero = + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1}))); + + // We extract out individual components from the smaller index and concatenate + // them (interspersing zeros as needed) into the larger index. + std::vector expanded_index_components; + + for (int i = 0; i < operand_rank; i++) { + int64 index_vector_dim_index = + FindIndex(dim_numbers.scatter_dims_to_operand_dims(), i); + if (index_vector_dim_index != + dim_numbers.scatter_dims_to_operand_dims_size()) { + TF_ASSIGN_OR_RETURN( + HloInstruction * component_to_concat, + MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index}, + /*limit_indices=*/{index_vector_dim_index + 1}, + /*strides=*/{1})); + expanded_index_components.push_back(component_to_concat); + } else { + expanded_index_components.push_back(zero); + } + } + + return MakeConcatHlo(expanded_index_components, /*dimension=*/0); +} + +// Body of the while loop that performs the scatter operation using other HLOs. +static StatusOr> ScatterLoopBody( + HloInstruction* scatter, HloInstruction* induction_var, + const std::vector& loop_state) { + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + CHECK_EQ(loop_state.size(), 3); + HloInstruction* operand = loop_state[0]; + HloInstruction* scatter_indices = loop_state[1]; + HloInstruction* updates = loop_state[2]; + + bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1; + CHECK_EQ(has_scalar_indices, + dim_numbers.index_vector_dim() == + scatter->operand(1)->shape().dimensions_size()); + + // Build a vector form of the induction variable of the while loop. 
+ TF_ASSIGN_OR_RETURN( + HloInstruction * induction_var_as_vector, + MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{}, + /*result_shape_bounds=*/{1})); + + // Pick the index to scatter from scatter_indices based on the induction_var + // and transform that to an index into the `operand` space. + HloInstruction* index_vector; + if (has_scalar_indices) { + TF_ASSIGN_OR_RETURN( + index_vector, + MakeDynamicSliceHlo(scatter_indices, induction_var_as_vector, {1})); + } else { + TF_ASSIGN_OR_RETURN( + HloInstruction * index_into_scatter_indices, + PadVectorWithZeros(induction_var_as_vector, + /*zeros_to_prepend=*/0, /*zeros_to_append=*/1)); + int index_vector_size = scatter_indices->shape().dimensions(1); + TF_ASSIGN_OR_RETURN( + HloInstruction * index_vector_2d, + MakeDynamicSliceHlo(scatter_indices, index_into_scatter_indices, + {1, index_vector_size})); + TF_ASSIGN_OR_RETURN(index_vector, + ElideDegenerateDims(index_vector_2d, {0})); + } + TF_ASSIGN_OR_RETURN( + HloInstruction * scatter_slice_start, + ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers, + operand->shape().dimensions_size())); + + // Extract the slice to be used to update from `updates` tensor for the + // induction_var corresponding to this iteration of the while loop. 
+ TF_ASSIGN_OR_RETURN( + HloInstruction * index_into_updates, + PadVectorWithZeros( + induction_var_as_vector, /*zeros_to_prepend=*/0, + /*zeros_to_append=*/updates->shape().dimensions_size() - 1)); + std::vector update_slice_bounds(updates->shape().dimensions().begin(), + updates->shape().dimensions().end()); + update_slice_bounds[0] = 1; + TF_ASSIGN_OR_RETURN( + HloInstruction * update_slice, + MakeDynamicSliceHlo(updates, index_into_updates, update_slice_bounds)); + TF_ASSIGN_OR_RETURN(HloInstruction * update_slice_for_scatter, + ElideDegenerateDims(update_slice, {0})); + TF_ASSIGN_OR_RETURN( + HloInstruction * update_slice_with_dims_inserted, + InsertDegenerateDims(update_slice_for_scatter, + AsInt64Slice(dim_numbers.inserted_window_dims()))); + + // Extract the slice to update from `operand` tensor. + const Shape& update_slice_shape = update_slice_with_dims_inserted->shape(); + TF_ASSIGN_OR_RETURN( + HloInstruction * operand_slice_to_update, + MakeDynamicSliceHlo(operand, scatter_slice_start, + AsInt64Slice(update_slice_shape.dimensions()))); + + // Compute the new value for the slice to be updated in `operand` tensor by + // combining the existing value and the update value using the update + // computation. + TF_ASSIGN_OR_RETURN( + HloInstruction * updated_operand_slice, + MakeMapHlo({operand_slice_to_update, update_slice_with_dims_inserted}, + scatter->to_apply())); + + // Write the updated value of the slice into `operand` tensor. + TF_ASSIGN_OR_RETURN(HloInstruction * updated_operand, + MakeDynamicUpdateSliceHlo(operand, updated_operand_slice, + scatter_slice_start)); + + return StatusOr>{ + {updated_operand, scatter_indices, updates}}; +} + +// High Level Algorithm. +// +// 1. Canonicalize the scatter_indices tensor such that it has rank 2, where +// each row is an index into the operand. +// 2. Canonicalize the updates tensor such that it has rank `num_window_dims+1` +// and the scatter dim is the most-major dimension. +// 3. 
Iterate over the set of indices in the canonicalized scatter_indices +// tensor using a while loop, updating the operand for each such index. Each +// iteration of this while loop performs the following: +// a. Pick the index from scatter_indices for this iteration. +// b. Transform this index into an index into the operand space. +// c. Extract the slice to be used to update from the updates tensor. +// d. Extract the slice to update from the operand tensor. +// e. Compute the new value for the slice to update by combining the slices +// from c. and d. using the update_computation of scatter. +// f. Write the updated value of the slice into the operand tensor. + +StatusOr ScatterExpander::ExpandScatter( + HloInstruction* scatter) { + HloInstruction* operand = scatter->mutable_operand(0); + HloInstruction* scatter_indices = scatter->mutable_operand(1); + HloInstruction* updates = scatter->mutable_operand(2); + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + + // If the updates tensor is empty, there is no need to update the operand. We + // can return the operand as is. + if (ShapeUtil::IsZeroElementArray(updates->shape())) { + return operand; + } + + // Compute the trip count for the while loop to be used for scatter. This + // should be the number of indices we should scatter into the operand. + const Shape& scatter_indices_shape = scatter_indices->shape(); + int64 scatter_loop_trip_count = 1; + for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { + if (i != dim_numbers.index_vector_dim()) { + scatter_loop_trip_count *= scatter_indices_shape.dimensions(i); + } + } + if (!IsInt32(scatter_loop_trip_count)) { + return Unimplemented( + "Scatter operations with more than 2147483647 scatter indices are not " + "supported. 
This error occurred for %s.", + scatter->ToString()); + } + + // Canonicalize the scatter_indices, after which the size of its most-major + // dimension must be same as the while loop trip count. + TF_ASSIGN_OR_RETURN(HloInstruction * canonical_scatter_indices, + CanonicalizeScatterIndices( + scatter_indices, dim_numbers.index_vector_dim())); + CHECK_EQ(scatter_loop_trip_count, + canonical_scatter_indices->shape().dimensions(0)); + + // Canonicalize the updates, after which the size of its most-major dimension + // must be same as the while loop trip count. + TF_ASSIGN_OR_RETURN( + HloInstruction * canonical_updates, + PermuteScatterAndWindowDims( + updates, AsInt64Slice(dim_numbers.update_window_dims()))); + TF_ASSIGN_OR_RETURN( + HloInstruction * adjusted_canonical_updates, + AdjustScatterDims(scatter_indices->shape(), canonical_updates, + dim_numbers.index_vector_dim())); + CHECK_EQ(scatter_loop_trip_count, + adjusted_canonical_updates->shape().dimensions(0)); + + // The while loop that implements the scatter operation. 
+ StatusOr> scatter_loop_result_status = + WhileUtil::MakeCountedLoop( + scatter->parent(), scatter_loop_trip_count, + {operand, canonical_scatter_indices, adjusted_canonical_updates}, + [&](HloInstruction* induction_var, + const std::vector& loop_state) { + return ScatterLoopBody(scatter, induction_var, loop_state); + }); + TF_ASSIGN_OR_RETURN(std::vector scatter_loop_result, + scatter_loop_result_status); + return scatter_loop_result.front(); +} + +StatusOr ScatterExpander::Run(HloModule* module) { + std::vector scatter_instrs; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + for (HloInstruction* instr : computation->instructions()) { + if (instr->opcode() == HloOpcode::kScatter) { + scatter_instrs.push_back(instr); + } + } + } + + for (auto instr : scatter_instrs) { + TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(instr)); + TF_RETURN_IF_ERROR( + instr->parent()->ReplaceInstruction(instr, expanded_root)); + } + + return !scatter_instrs.empty(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/scatter_expander.h b/tensorflow/compiler/xla/service/scatter_expander.h new file mode 100644 index 0000000000000000000000000000000000000000..14f062c89cfd4657097c1a933621a3e945f89c53 --- /dev/null +++ b/tensorflow/compiler/xla/service/scatter_expander.h @@ -0,0 +1,34 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ + +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +class ScatterExpander : public HloPassInterface { + public: + absl::string_view name() const override { return "scatter_expander"; } + StatusOr Run(HloModule* module) override; + + private: + StatusOr ExpandScatter(HloInstruction* scatter); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index cb0f76ebe4d445059fdf37ebf559bef851a57104..f0e2566a3f9ef5c0be8af46d3a16cd9c72793366 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -20,10 +20,12 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -36,8 +38,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/source_map_util.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -46,71 +48,40 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" - -using ::tensorflow::strings::Printf; -using ::tensorflow::strings::StrCat; -using ::xla::source_map_util::InvalidParameterArgument; +#include "tensorflow/core/util/ptr_util.h" namespace xla { - namespace { -// Records the arguments used to invoke a computation in a SessionModule -// proto. -Status RecordArguments( - const tensorflow::gtl::ArraySlice arguments, - se::StreamExecutor* executor, TransferManager* transfer_manager, - SessionModule* module) { - module->clear_arguments(); - for (const ShapedBuffer* argument : arguments) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr literal, - transfer_manager->TransferLiteralFromDevice(executor, *argument)); - *module->add_arguments() = literal->ToProto(); - } - return Status::OK(); -} - -// Records the result of a computation in a SessionModule proto. 
-Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor, - TransferManager* transfer_manager, SessionModule* module) { - module->clear_result(); - TF_ASSIGN_OR_RETURN( - std::unique_ptr literal, - transfer_manager->TransferLiteralFromDevice(executor, result)); - *module->mutable_result() = literal->ToProto(); - return Status::OK(); -} +using absl::StrCat; +using absl::StrFormat; // Records the arguments used to invoke a computation in an HloSnapshot proto. -Status RecordArguments( - const tensorflow::gtl::ArraySlice arguments, - se::StreamExecutor* executor, TransferManager* transfer_manager, - HloSnapshot* module) { +Status RecordArguments(const absl::Span arguments, + se::Stream* stream, TransferManager* transfer_manager, + HloSnapshot* module) { module->clear_arguments(); for (const ShapedBuffer* argument : arguments) { TF_ASSIGN_OR_RETURN( std::unique_ptr literal, - transfer_manager->TransferLiteralFromDevice(executor, *argument)); + transfer_manager->TransferLiteralFromDevice(stream, *argument)); *module->add_arguments() = literal->ToProto(); } return Status::OK(); } // Records the result of a computation in a HloSnapshot proto. -Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor, +Status RecordResult(const ShapedBuffer& result, se::Stream* stream, TransferManager* transfer_manager, HloSnapshot* module) { module->clear_result(); TF_ASSIGN_OR_RETURN( std::unique_ptr literal, - transfer_manager->TransferLiteralFromDevice(executor, result)); + transfer_manager->TransferLiteralFromDevice(stream, result)); *module->mutable_result() = literal->ToProto(); return Status::OK(); } @@ -175,19 +146,19 @@ Service::Service(const ServiceOptions& options, CHECK_GE(execute_backend_->device_count(), options_.number_of_replicas()) << "Requested more replicas than there are devices."; } - LOG(INFO) << Printf( + LOG(INFO) << StrFormat( "XLA service %p executing computations on platform %s. 
Devices:", this, - execute_backend_->platform()->Name().c_str()); + execute_backend_->platform()->Name()); for (int i = 0; i < execute_backend_->device_count(); ++i) { if (execute_backend_->device_ordinal_supported(i)) { se::StreamExecutor* executor = execute_backend_->stream_executor(i).ValueOrDie(); const auto& description = executor->GetDeviceDescription(); - LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i, - description.name().c_str(), - description.platform_version().c_str()); + LOG(INFO) << StrFormat(" StreamExecutor device (%d): %s, %s", i, + description.name(), + description.platform_version()); } else { - LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i); + LOG(INFO) << StrFormat(" StreamExecutor device (%d) not supported", i); } } } else { @@ -195,23 +166,10 @@ Service::Service(const ServiceOptions& options, } } -Status Service::Computation(const ComputationRequest* arg, - ComputationResponse* result) { - if (arg->name().empty()) { - return InvalidArgument("computation request needs a name"); - } - - *result->mutable_computation() = - computation_tracker_.NewComputation(arg->name()); - VLOG(1) << Printf("Created new computation %s on service %p, name %s", - result->computation().ShortDebugString().c_str(), this, - arg->name().c_str()); - return Status::OK(); -} - Status Service::CreateChannelHandle(const CreateChannelHandleRequest* arg, CreateChannelHandleResponse* result) { - *result->mutable_channel() = channel_tracker_.NewChannel(); + TF_ASSIGN_OR_RETURN(*result->mutable_channel(), + channel_tracker_.NewChannel(arg->channel_type())); return Status::OK(); } @@ -233,27 +191,23 @@ Status Service::DeconstructTuple(const DeconstructTupleRequest* arg, return Status::OK(); } -Status Service::ValidateResultShapeWithLayout(const Shape& shape_with_layout, - const Shape& result_shape) const { - if (!ShapeUtil::Compatible(shape_with_layout, result_shape)) { +Status Service::ValidateResultShape(const Shape& client_shape, + const 
Shape& result_shape) const { + TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(client_shape)); + if (!ShapeUtil::Compatible(client_shape, result_shape)) { return InvalidArgument( "Shape used to set computation result layout %s is not compatible " "with result shape %s", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(), - ShapeUtil::HumanString(result_shape).c_str()); + ShapeUtil::HumanStringWithLayout(client_shape), + ShapeUtil::HumanString(result_shape)); } - if (!LayoutUtil::HasLayout(shape_with_layout)) { - return InvalidArgument( - "Shape used to set computation result layout %s does not have layout", - ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str()); - } - return ShapeUtil::ValidateShape(shape_with_layout); + return Status::OK(); } StatusOr>> Service::ResolveAndValidateArguments( - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice stream_executors) { + absl::Span arguments, + absl::Span stream_executors) { CHECK_EQ(options_.number_of_replicas(), stream_executors.size()); std::vector> replicated_arguments; replicated_arguments.resize(options_.number_of_replicas()); @@ -275,9 +229,9 @@ Service::ResolveAndValidateArguments( return InvalidArgument( "argument %lu is on device %s:%d but computation will be executed " "on device %s", - i, shaped_buffer->platform()->Name().c_str(), + i, shaped_buffer->platform()->Name(), shaped_buffer->device_ordinal(), - execute_backend_->device_name(replica_device_ordinal).c_str()); + execute_backend_->device_name(replica_device_ordinal)); } replicated_arguments[replica].push_back(shaped_buffer); } @@ -287,16 +241,13 @@ Service::ResolveAndValidateArguments( StatusOr> Service::CreateModuleConfig( const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice argument_shapes, - const ExecutionOptions* execution_options, - const UserComputation* user_computation) { - auto config = MakeUnique(program_shape); - ComputationLayout* host_computation_layout = - 
config->mutable_host_entry_computation_layout(); - ComputationLayout* device_computation_layout = - config->mutable_device_entry_computation_layout(); + absl::Span argument_shapes, + const ExecutionOptions* execution_options) { + auto config = absl::make_unique(program_shape); + ComputationLayout* computation_layout = + config->mutable_entry_computation_layout(); if (program_shape.parameters_size() != argument_shapes.size()) { - return InvalidArgument("computation takes %d parameters, but %zu given", + return InvalidArgument("computation takes %d parameters, but %u given", program_shape.parameters_size(), argument_shapes.size()); } @@ -305,46 +256,28 @@ StatusOr> Service::CreateModuleConfig( // ProgramShape. if (!ShapeUtil::Compatible(*argument_shapes[i], program_shape.parameters(i))) { - if (user_computation == nullptr) { - return InvalidArgument( - "Argument does not match shape of computation parameter %d: want " - "%s, got %s", - i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(*argument_shapes[i]).c_str()); - } - return InvalidParameterArgument( - *user_computation->ParameterMetadata(i).value(), - "Argument does not match shape of computation parameter %d: want %s, " - "got %s", - i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(*argument_shapes[i]).c_str()); + return InvalidArgument( + "Argument does not match shape of computation parameter %d: want " + "%s, got %s", + i, ShapeUtil::HumanString(program_shape.parameters(i)), + ShapeUtil::HumanString(*argument_shapes[i])); } - TF_RETURN_IF_ERROR(host_computation_layout->mutable_parameter_layout(i) - ->CopyLayoutFromShape(*argument_shapes[i])); - TF_RETURN_IF_ERROR(device_computation_layout->mutable_parameter_layout(i) - ->CopyLayoutFromShape(*argument_shapes[i])); + TF_RETURN_IF_ERROR( + computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( + *argument_shapes[i])); } if (execution_options != nullptr && 
execution_options->has_shape_with_output_layout()) { const auto& shape_with_output_layout = execution_options->shape_with_output_layout(); - TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(shape_with_output_layout, - program_shape.result())); TF_RETURN_IF_ERROR( - host_computation_layout->mutable_result_layout()->CopyLayoutFromShape( - shape_with_output_layout)); + ValidateResultShape(shape_with_output_layout, program_shape.result())); TF_RETURN_IF_ERROR( - device_computation_layout->mutable_result_layout()->CopyLayoutFromShape( + computation_layout->mutable_result_layout()->CopyLayoutFromShape( shape_with_output_layout)); } else { // If the result layout is not set, then choose the default. - // TODO(b/29118294): Allow the compiler to choose a better layout in this - // case. - // TODO(b/78356948): We are forcing the default layout here. We should fix - // clients which expect a default layout, to be explicit about it, by - // passing the proper ExecutionOptions with shape_with_output_layout set. 
- host_computation_layout->mutable_result_layout()->SetToDefaultLayout(); - device_computation_layout->mutable_result_layout()->SetToDefaultLayout(); + computation_layout->mutable_result_layout()->SetToDefaultLayout(); } config->set_replica_count(options_.number_of_replicas()); @@ -365,77 +298,13 @@ StatusOr> Service::CreateModuleConfig( StatusOr> Service::CreateModuleConfig( const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions& execution_options, - const UserComputation* user_computation) { + absl::Span arguments, + const ExecutionOptions& execution_options) { std::vector argument_shapes; for (const auto* arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - return CreateModuleConfig(program_shape, argument_shapes, &execution_options, - user_computation); -} - -StatusOr>> Service::BuildExecutables( - std::vector versioned_handles, - std::vector> module_configs, - Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p", this); - - // Dump computation proto state if flag is set. 
- std::vector> session_modules; - for (int64 i = 0; i < versioned_handles.size(); ++i) { - const string& directory_path = - module_configs[i]->debug_options().xla_dump_computations_to(); - const string& other_directory_path = - module_configs[i]->debug_options().xla_dump_executions_to(); - if (directory_path.empty() && other_directory_path.empty()) { - continue; - } - TF_ASSIGN_OR_RETURN( - std::unique_ptr session_module, - computation_tracker_.SnapshotComputation(versioned_handles[i].handle)); - if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s__version_%lld", - versioned_handles[i].handle.handle(), - session_module->entry().name().c_str(), - versioned_handles[i].version); - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename, - *session_module)); - session_modules.push_back(std::move(session_module)); - } - } - - VLOG(1) << "Computation handles:"; - for (const VersionedComputationHandle& versioned_handle : versioned_handles) { - VLOG(1) << versioned_handle; - } - - CHECK_EQ(versioned_handles.size(), module_configs.size()); - std::vector> modules; - for (int64 i = 0; i < versioned_handles.size(); ++i) { - const VersionedComputationHandle& versioned_handle = versioned_handles[i]; - const HloModuleConfig& config = *module_configs[i]; - TF_ASSIGN_OR_RETURN(auto module, - computation_tracker_.BuildHloModule( - versioned_handle, config, - /*include_unreachable_instructions=*/true)); - modules.push_back(std::move(module)); - } - - TF_ASSIGN_OR_RETURN( - std::vector> executables, - backend->compiler()->Compile(std::move(modules), std::move(executors), - device_allocator)); - - for (size_t i = 0; i < versioned_handles.size(); ++i) { - if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) { - executables[i]->set_session_module(std::move(session_modules[i])); - } - } - - return std::move(executables); + return CreateModuleConfig(program_shape, argument_shapes, &execution_options); } StatusOr>> 
Service::BuildExecutables( @@ -443,7 +312,7 @@ StatusOr>> Service::BuildExecutables( std::vector> module_configs, Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p", this); + VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. std::vector> hlo_snapshots; @@ -455,16 +324,15 @@ StatusOr>> Service::BuildExecutables( if (directory_path.empty() && execution_directory_path.empty()) { continue; } - auto hlo_snapshot = MakeUnique(); + auto hlo_snapshot = absl::make_unique(); *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; if (!directory_path.empty()) { - string filename = - Printf("computation_%lld__%s", module_protos[i]->id(), - module_protos[i]->entry_computation_name().c_str()); + string filename = StrFormat("computation_%d__%s", module_protos[i]->id(), + module_protos[i]->entry_computation_name()); TF_RETURN_IF_ERROR( Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); - hlo_snapshots.push_back(std::move(hlo_snapshot)); } + hlo_snapshots.push_back(std::move(hlo_snapshot)); } VLOG(1) << "Computations:"; @@ -496,125 +364,15 @@ StatusOr>> Service::BuildExecutables( return std::move(executables); } -Status Service::ValidateEntryComputationLayout(HloModule* module) { - const ComputationLayout& on_device = - module->device_entry_computation_layout(); - for (int64 i = 0; i < on_device.parameter_count(); ++i) { - TF_RET_CHECK(ShapeUtil::Equal( - on_device.parameter_shape(i), - execute_backend_->transfer_manager()->HostShapeToDeviceShape( - module->host_entry_computation_layout().parameter_shape(i)))); - } - TF_RET_CHECK(ShapeUtil::Equal( - module->device_entry_computation_layout().result_shape(), - execute_backend_->transfer_manager()->HostShapeToDeviceShape( - module->host_entry_computation_layout().result_shape()))); - return Status::OK(); -} - -StatusOr> Service::BuildExecutable( - const 
VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this, - versioned_handle.ToString().c_str()); - - // Dump computation proto state if flag is set. - std::unique_ptr session_module; - const string& directory_path = - module_config->debug_options().xla_dump_computations_to(); - const string& other_directory_path = - module_config->debug_options().xla_dump_executions_to(); - if (!directory_path.empty() || !other_directory_path.empty()) { - TF_ASSIGN_OR_RETURN( - session_module, - computation_tracker_.SnapshotComputation(versioned_handle.handle)); - if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s__version_%lld", - versioned_handle.handle.handle(), - session_module->entry().name().c_str(), - versioned_handle.version); - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename, - *session_module)); - } - } - - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, *module_config, - /*include_unreachable_instructions=*/ - true)); - - TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module)); - - TF_ASSIGN_OR_RETURN( - module, backend->compiler()->RunHloPasses(std::move(module), executor, - device_allocator)); - // Check that on-host and on-device shapes are consistent. 
- TF_RETURN_IF_ERROR(ValidateEntryComputationLayout(module.get())); - - TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - backend->compiler()->RunBackend( - std::move(module), executor, device_allocator)); - - if (!other_directory_path.empty()) { - executable->set_session_module(std::move(session_module)); - } - - return std::move(executable); -} - -StatusOr> Service::BuildAndCacheExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, ExecutionProfile* profile, - DeviceMemoryAllocator* device_allocator) { - std::shared_ptr executable = - compilation_cache_.LookUp(versioned_handle, *module_config); - - if (executable != nullptr) { - // Executable found in the computation cache. - if (profile != nullptr) { - profile->set_compilation_cache_hit(true); - } - return executable; - } - - uint64 start_micros = - // Avoid reading the clock if we don't want timing info - (profile != nullptr) ? tensorflow::Env::Default()->NowMicros() : 0; - - // Take a copy of the module config, as compilation introduces layouts where - // layouts were optional before. - HloModuleConfig original_module_config = *module_config; - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable_unique_ptr, - BuildExecutable(versioned_handle, std::move(module_config), backend, - executor, device_allocator)); - - if (profile != nullptr) { - uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - uint64 milliseconds = (end_micros - start_micros) / 1000; - profile->set_compilation_cache_hit(false); - profile->set_compile_time_ms(milliseconds); - } - - // Insert executable into the cache. 
- return compilation_cache_.Insert(std::move(executable_unique_ptr), - original_module_config); -} - StatusOr> Service::ExecuteParallelAndRegisterResult( - tensorflow::gtl::ArraySlice executables, - tensorflow::gtl::ArraySlice>> - arguments, - Backend* backend, tensorflow::gtl::ArraySlice device_handles, - tensorflow::gtl::ArraySlice result_tags, - ExecutionProfile* profile) { + absl::Span executables, + absl::Span>> arguments, + Backend* backend, absl::Span device_handles, + absl::Span result_tags, ExecutionProfile* profile) { // Streams where the computation are launched, so we can wait on the streams // to complete. - std::vector::SmartPtr> streams; + std::vector streams; std::vector> timers; // Global data handles for the computation results, one for each computation. @@ -624,9 +382,16 @@ Service::ExecuteParallelAndRegisterResult( // profiled. std::map index_to_profiled_streams; - TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, - backend->computation_placer()->AssignDevices( - options_.number_of_replicas(), executables.size())); + // Build DeviceAssignment for all cores based on the provided device handles. + DeviceAssignment device_assignment(options_.number_of_replicas(), + executables.size()); + for (int64 i = 0; i < executables.size(); i++) { + TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); + CHECK_EQ(replicas.size(), arguments[i].size()); + for (int64 replica = 0; replica < replicas.size(); ++replica) { + device_assignment(replica, i) = replicas[replica]->device_ordinal(); + } + } for (int64 i = 0; i < executables.size(); i++) { // Stream executors for the replicas of the current computation. 
@@ -634,12 +399,13 @@ Service::ExecuteParallelAndRegisterResult( CHECK_EQ(replicas.size(), arguments[i].size()); std::vector result_buffers; for (int64 replica = 0; replica < replicas.size(); ++replica) { - TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, + TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream, backend->BorrowStream(replicas[replica])); streams.push_back(std::move(stream)); if (replica == 0 && profile != nullptr) { - timers.emplace_back(new se::Timer(streams.back()->parent())); + timers.push_back( + absl::make_unique(streams.back()->parent())); streams.back() ->InitTimer(timers.back().get()) .ThenStartTimer(timers.back().get()); @@ -671,7 +437,7 @@ Service::ExecuteParallelAndRegisterResult( streams.back()->ThenStopTimer(timers.back().get()); } - result_buffers.emplace_back(std::move(result)); + result_buffers.push_back(std::move(result)); } TF_ASSIGN_OR_RETURN(GlobalDataHandle handle, allocation_tracker_.RegisterReplicatedBuffers( @@ -683,8 +449,8 @@ Service::ExecuteParallelAndRegisterResult( for (int64 i = 0; i < streams.size(); ++i) { Status block_status = streams[i]->BlockHostUntilDone(); if (!block_status.ok()) { - return InternalError("failed to complete execution for stream %lld: %s", - i, block_status.error_message().c_str()); + return InternalError("failed to complete execution for stream %d: %s", i, + block_status.error_message()); } } @@ -698,7 +464,7 @@ Service::ExecuteParallelAndRegisterResult( HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(), &executable->hlo_profile_index_map()); TF_RETURN_IF_ERROR( - executable->PopulateExecutionProfile(&hlo_profile, stream->parent())); + executable->PopulateExecutionProfile(&hlo_profile, stream)); XLA_LOG_LINES( tensorflow::INFO, hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription())); @@ -742,17 +508,16 @@ Service::ExecuteParallelAndRegisterResult( StatusOr Service::ExecuteAndRegisterResult( Executable* executable, - const tensorflow::gtl::ArraySlice> - arguments, + const 
absl::Span> arguments, Backend* backend, const string& result_tag, ExecutionProfile* profile) { // Set up streams. - std::vector::SmartPtr> streams; + std::vector streams; TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, SingleComputationDeviceHandle())); TF_RET_CHECK(!replicas.empty()); for (se::StreamExecutor* executor : replicas) { - TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, + TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream, backend->BorrowStream(executor)); streams.push_back(std::move(stream)); } @@ -764,7 +529,7 @@ StatusOr Service::ExecuteAndRegisterResult( // Set up run options. std::vector run_options; - for (const Pool::SmartPtr& stream : streams) { + for (const StreamPool::Ptr& stream : streams) { ExecutableRunOptions options; options.set_stream(stream.get()); options.set_device_ordinal(stream->parent()->device_ordinal()); @@ -786,10 +551,9 @@ StatusOr Service::ExecuteAndRegisterResult( // TODO(b/69985541): Support profiling also on this path. - std::vector> - replicated_arguments; + std::vector> replicated_arguments; for (const auto& arg : arguments) { - replicated_arguments.emplace_back(arg); + replicated_arguments.push_back(arg); } TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams( @@ -799,13 +563,6 @@ StatusOr Service::ExecuteAndRegisterResult( result_tag); } -Status Service::SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - return computation->SetReturnValue(arg->operand()); -} - StatusOr> Service::GetExecutors( const ExecutionOptions& execution_options, int64 requests_size, int64 request_index) const { @@ -816,7 +573,7 @@ StatusOr> Service::GetExecutors( if (requests_size > 1 && execution_options.device_handles_size() > 1) { return InvalidArgument( "Parallel requests with multiple device handles is not supported. 
" - "Found %lld parallel requests, with request %lld containing %d device " + "Found %d parallel requests, with request %d containing %d device " "handles.", requests_size, request_index, execution_options.device_handles_size()); } @@ -833,7 +590,7 @@ StatusOr> Service::GetExecutors( StatusOr>> Service::GetArguments( const ExecutionOptions& execution_options, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { // Resolve the allocations for the arguments of the computation, and create // a vector of device memory offsets for the arguments from the allocations. // In the case of partitioned computations, assume all arguments go on the @@ -847,117 +604,6 @@ StatusOr>> Service::GetArguments( return replicated_arguments; } -Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) { - VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString(); - - std::vector>> all_arguments; - std::vector> all_executors; - std::vector versioned_handles; - std::vector> module_configs; - std::vector computation_names; - std::vector device_handles; - - int num_requested_devices = - std::accumulate(arg->requests().begin(), arg->requests().end(), 0, - [](int a, const ExecuteRequest& r) -> int { - return a + r.execution_options().device_handles_size(); - }); - if (num_requested_devices * options_.number_of_replicas() > - execute_backend_->device_count()) { - return FailedPrecondition( - "there are not enough stream executors to execute %d computations", - num_requested_devices); - } - - for (int64 i = 0; i < arg->requests_size(); ++i) { - // Get the stream executor for the i'th computation. This stream executor - // is one of the executors to run the replicated computation. - const ExecutionOptions& execution_options = - arg->requests(i).execution_options(); - - // Get the executors. 
- TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options, - arg->requests_size(), i)); - - // Resolve the UserComputation object associated with the requested - // computation and compute the program shape. - const ExecuteRequest& request = arg->requests(i); - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(request.computation())); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - // Get the replicated arguments. - TF_ASSIGN_OR_RETURN(auto replicated_arguments, - GetArguments(execution_options, request.arguments())); - - // Create an HloModuleConfig object for the computation, given the shape of - // the program and the argument allocations. Here, we care only about the - // shapes of the arguments, so, it is sufficient to use the arguments of - // replica 0. - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - request.execution_options(), user_computation)); - VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: " - << module_config->host_entry_computation_layout().ToString(); - - // Adds to the vectors to build and execute the computations after the loop. 
- all_arguments.push_back(replicated_arguments); - all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}}); - versioned_handles.push_back(versioned_handle); - module_configs.push_back(std::move(module_config)); - computation_names.insert(computation_names.end(), executors.size(), - user_computation->name()); - all_executors.push_back(executors); - device_handles.insert(device_handles.end(), - execution_options.device_handles().begin(), - execution_options.device_handles().end()); - } - - // Build the user computations into HloModules and compile to generate the - // executables. - // - // TODO(jlebar): There's currently no way to pass a device allocator to - // ExecuteParallel, so we have to pass a null device_allocator below. - TF_ASSIGN_OR_RETURN( - std::vector> executables, - BuildExecutables(versioned_handles, std::move(module_configs), - execute_backend_.get(), all_executors, - /*device_allocator=*/nullptr)); - std::vector executable_ptrs; - executable_ptrs.reserve(executables.size()); - for (const auto& executable : executables) { - executable_ptrs.push_back(executable.get()); - } - - // Execute the generated executables in parallel and return the device - // handles for each computation's output. 
- ExecutionProfile profile; - TF_ASSIGN_OR_RETURN( - std::vector outputs, - ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments, - execute_backend_.get(), device_handles, - computation_names, &profile)); - for (const GlobalDataHandle& output : outputs) { - ExecuteResponse response; - *response.mutable_output() = output; - *response.mutable_profile() = profile; - *result->add_responses() = response; - } - - VLOG(1) << "successfully completed 'execute-parallel' request"; - return Status::OK(); -} - Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) { VLOG(1) << "running execute-graph-parallel request"; @@ -1007,11 +653,10 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, std::unique_ptr module_config, CreateModuleConfig(request.computation().program_shape(), replicated_arguments.front(), - request.execution_options(), - /*user_computation=*/nullptr)); + request.execution_options())); VLOG(3) << "ExecuteGraphParallel created HloModuleConfig computation layout: " - << module_config->host_entry_computation_layout().ToString(); + << module_config->entry_computation_layout().ToString(); // Adds to the vectors to build and execute the computations after the loop. all_arguments.push_back(replicated_arguments); @@ -1040,6 +685,17 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, executable_ptrs.push_back(executable.get()); } + for (int i = 0; i < executable_ptrs.size(); i++) { + if (executable_ptrs[i]->dumping_snapshot()) { + TF_ASSIGN_OR_RETURN(auto stream, + execute_backend_->BorrowStream( + all_executors[i][0]->device_ordinal())); + TF_RETURN_IF_ERROR(RecordArguments(all_arguments[i].front(), stream.get(), + execute_backend_->transfer_manager(), + executable_ptrs[i]->hlo_snapshot())); + } + } + // Execute the generated executables in parallel and return the device // handles for each computation's output. 
ExecutionProfile profile; @@ -1055,6 +711,20 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, *result->add_responses() = response; } + for (int i = 0; i < executable_ptrs.size(); i++) { + if (executable_ptrs[i]->dumping_snapshot()) { + TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer, + allocation_tracker_.ResolveForReplica(outputs[i], 0)); + TF_ASSIGN_OR_RETURN(auto stream, + execute_backend_->BorrowStream(all_executors[i][0])); + TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), + execute_backend_->transfer_manager(), + executable_ptrs[i]->hlo_snapshot())); + // Dump out the ith snapshot. + TF_RETURN_IF_ERROR(executable_ptrs[i]->DumpHloSnapshot()); + } + } + VLOG(1) << "successfully completed 'execute-graph-parallel' request"; return Status::OK(); } @@ -1068,8 +738,8 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, } if (available_device_count < arg->device_count() * replica_count) { return ResourceExhausted( - "Requested device count (%lld) exceeds the number of available devices " - "on the target (%lld)", + "Requested device count (%d) exceeds the number of available devices " + "on the target (%d)", arg->device_count(), available_device_count); } @@ -1083,15 +753,6 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, return Status::OK(); } -Status Service::ExecuteOneToN(const ExecuteRequest* arg, - ExecuteResponse* result) { - ExecuteParallelRequest parallel_arg; - *parallel_arg.add_requests() = *arg; - ExecuteParallelResponse parallel_result; - TF_RETURN_IF_ERROR(ExecuteParallel(¶llel_arg, ¶llel_result)); - return PickParallelResponse(parallel_result, result); -} - Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result) { ExecuteGraphParallelRequest parallel_arg; @@ -1124,90 +785,16 @@ Status Service::PickParallelResponse( return Status::OK(); } -Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { - VLOG(1) << 
"running execute request: " << arg->ShortDebugString(); - - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - // If we received multiple device handles, we must partition the module. - if (arg->execution_options().device_handles_size() > 1) { - return ExecuteOneToN(arg, result); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, - SingleComputationDeviceHandle())); - TF_ASSIGN_OR_RETURN( - std::vector> replicated_arguments, - ResolveAndValidateArguments(arg->arguments(), replicas)); - - // Since we care only about the shapes of the arguments, it is sufficient to - // use the arguments of replica 0. 
- TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - arg->execution_options(), user_computation)); - - VLOG(3) << "Execute created HloModuleConfig computation layout: " - << module_config->host_entry_computation_layout().ToString(); - - TF_ASSIGN_OR_RETURN( - std::shared_ptr executable, - BuildAndCacheExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), - execute_backend_->default_stream_executor(), - result->mutable_profile())); - - if (executable->dumping()) { - executable->session_module()->set_execution_platform( - execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), - execute_backend_->default_stream_executor(), - execute_backend_->transfer_manager(), executable->session_module())); - } - - TF_ASSIGN_OR_RETURN( - *result->mutable_output(), - ExecuteAndRegisterResult( - executable.get(), replicated_arguments, execute_backend_.get(), - "result of " + user_computation->name(), result->mutable_profile())); - - if (executable->dumping()) { - TF_ASSIGN_OR_RETURN( - const ShapedBuffer* result_buffer, - allocation_tracker_.ResolveForReplica(result->output(), 0)); - TF_RETURN_IF_ERROR(RecordResult( - *result_buffer, execute_backend_->default_stream_executor(), - execute_backend_->transfer_manager(), executable->session_module())); - TF_RETURN_IF_ERROR(executable->DumpSessionModule()); - } - - VLOG(1) << "successfully completed 'execute' request"; - return Status::OK(); -} - StatusOr> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf( + VLOG(1) << StrFormat( "BuildExecutable on service %p with serialized module proto: %s", this, - module_proto.name().c_str()); + module_proto.name()); // Dump computation proto state if flag is set. 
- auto hlo_snapshot = MakeUnique(); + auto hlo_snapshot = absl::make_unique(); const string& directory_path = module_config->debug_options().xla_dump_computations_to(); const string& execution_directory_path = @@ -1215,8 +802,8 @@ StatusOr> Service::BuildExecutable( if (!directory_path.empty() || !execution_directory_path.empty()) { *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s", module_proto.id(), - module_proto.entry_computation_name().c_str()); + string filename = StrFormat("computation_%d__%s", module_proto.id(), + module_proto.entry_computation_name()); TF_RETURN_IF_ERROR( Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); } @@ -1230,13 +817,15 @@ StatusOr> Service::BuildExecutable( TF_ASSIGN_OR_RETURN( module, backend->compiler()->RunHloPasses(std::move(module), executor, device_allocator)); - // Check that on-host and on-device shapes are consistent. - TF_RETURN_IF_ERROR(ValidateEntryComputationLayout(module.get())); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, backend->compiler()->RunBackend( std::move(module), executor, device_allocator)); + if (!execution_directory_path.empty()) { + executable->set_hlo_snapshot(std::move(hlo_snapshot)); + } + return std::move(executable); } @@ -1274,12 +863,14 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, execute_backend_->default_stream_executor(), /*device_allocator=*/nullptr)); + TF_ASSIGN_OR_RETURN(auto stream, + execute_backend_->BorrowStream( + execute_backend_->default_stream_executor())); if (executable->dumping_snapshot()) { executable->hlo_snapshot()->set_execution_platform( execute_backend_->platform()->Name()); TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), - execute_backend_->default_stream_executor(), + replicated_arguments.front(), stream.get(), execute_backend_->transfer_manager(), executable->hlo_snapshot())); } @@ -1293,9 +884,9 @@ Status 
Service::ExecuteGraph(const ExecuteGraphRequest* arg, TF_ASSIGN_OR_RETURN( const ShapedBuffer* result_buffer, allocation_tracker_.ResolveForReplica(result->output(), 0)); - TF_RETURN_IF_ERROR(RecordResult( - *result_buffer, execute_backend_->default_stream_executor(), - execute_backend_->transfer_manager(), executable->hlo_snapshot())); + TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), + execute_backend_->transfer_manager(), + executable->hlo_snapshot())); TF_RETURN_IF_ERROR(executable->DumpHloSnapshot()); } @@ -1303,86 +894,6 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, return Status::OK(); } -Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) { - VLOG(1) << "running execute-async request: " << arg->ShortDebugString(); - - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, - SingleComputationDeviceHandle())); - TF_RET_CHECK(!replicas.empty()); - TF_ASSIGN_OR_RETURN( - std::vector> replicated_arguments, - ResolveAndValidateArguments(arg->arguments(), replicas)); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - arg->execution_options(), user_computation)); - - VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: " - << module_config->host_entry_computation_layout().ToString(); - - ExecutionProfile profile; - - TF_ASSIGN_OR_RETURN( - std::shared_ptr executable, - BuildAndCacheExecutable( - versioned_handle, std::move(module_config), 
execute_backend_.get(), - execute_backend_->default_stream_executor(), &profile)); - - // Set up streams. - std::vector::SmartPtr> streams; - for (se::StreamExecutor* executor : replicas) { - TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, - execute_backend_->BorrowStream(executor)); - streams.push_back(std::move(stream)); - } - - std::vector result_buffers; - for (size_t i = 0; i < streams.size(); ++i) { - const auto& stream = streams[i]; - ExecutableRunOptions options; - options.set_stream(stream.get()); - options.set_allocator(execute_backend_->memory_allocator()); - options.set_intra_op_thread_pool( - execute_backend_->eigen_intra_op_thread_pool_device()); - - ServiceExecutableRunOptions service_options( - options, execute_backend_->StreamBorrower()); - - TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer, - executable->ExecuteAsyncOnStream( - &service_options, replicated_arguments[i])); - - result_buffers.emplace_back(std::move(this_result_buffer)); - } - - TF_ASSIGN_OR_RETURN( - GlobalDataHandle output, - allocation_tracker_.RegisterReplicatedBuffers( - std::move(result_buffers), "result of " + user_computation->name())); - - *result->mutable_execution() = execution_tracker_.Register( - execute_backend_.get(), std::move(streams), profile, output); - streams.clear(); - - VLOG(1) << "successfully completed 'execute-async' request"; - return Status::OK(); -} - Status Service::WaitForExecution(const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) { TF_ASSIGN_OR_RETURN(const auto execution, @@ -1413,14 +924,13 @@ Status Service::TransferToClient(const TransferToClientRequest* arg, return_shape = &shaped_buffer->on_host_shape(); } - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - execute_backend_->stream_executor(shaped_buffer->device_ordinal())); + TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( + shaped_buffer->device_ordinal())); TF_ASSIGN_OR_RETURN( std::unique_ptr result_literal, 
execute_backend_->transfer_manager()->TransferLiteralFromDevice( - executor, *shaped_buffer)); + stream.get(), *shaped_buffer)); if (LayoutUtil::LayoutsInShapesEqual(*return_shape, result_literal->shape())) { @@ -1438,7 +948,7 @@ namespace { // shape and DeviceMemoryBase values of the clone are identical to the original. std::unique_ptr CloneShapedBufferOnDevice( const ShapedBuffer& shaped_buffer, int device_ordinal) { - auto clone = MakeUnique( + auto clone = absl::make_unique( shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(), shaped_buffer.platform(), device_ordinal); clone->buffers() = shaped_buffer.buffers(); @@ -1470,9 +980,10 @@ Status Service::TransferToServer(const TransferToServerRequest* arg, execute_backend_->transfer_manager()->AllocateScopedShapedBuffer( shape, execute_backend_->memory_allocator(), executor->device_ordinal())); + TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream(executor)); TF_RETURN_IF_ERROR( execute_backend_->transfer_manager()->TransferLiteralToDevice( - executor, *literal, shaped_buffer)); + stream.get(), *literal, shaped_buffer)); replicated_buffers.emplace_back(std::move(shaped_buffer)); } TF_ASSIGN_OR_RETURN(*result->mutable_data(), @@ -1492,8 +1003,7 @@ Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, "%s", StrCat("The replica_id=", arg->replica_id(), " on TransferToInfeedRequest not in range [0, replica_count=", - replica_count, ").") - .c_str()); + replica_count, ").")); } se::StreamExecutor* executor; @@ -1519,8 +1029,7 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, const int64 replica_count = options_.number_of_replicas(); if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) { return FailedPrecondition( - "The replica_id=%lld on TransferFromOutfeedRequest not in range [0, " - "%lld)", + "The replica_id=%d on TransferFromOutfeedRequest not in range [0, %d)", arg->replica_id(), replica_count); } @@ -1536,11 +1045,12 @@ Status 
Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, executor = replicas[arg->replica_id()]; } - Literal literal; + auto literal = Literal::CreateFromShape(arg->shape_with_layout()); + TF_RETURN_IF_ERROR( execute_backend_->transfer_manager()->TransferLiteralFromOutfeed( - executor, arg->shape_with_layout(), &literal)); - *result->mutable_literal() = literal.ToProto(); + executor, arg->shape_with_layout(), *literal)); + *result->mutable_literal() = literal->ToProto(); return Status::OK(); } @@ -1549,117 +1059,6 @@ Status Service::ResetDevice(const ResetDeviceRequest* arg, return execute_backend_->ResetDevices(); } -Status Service::IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandleAtOperation(arg->operand()); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - bool is_constant, - user_computation->IsConstant(arg->operand(), arg->num_parameters())); - - result->set_is_constant(is_constant); - return Status::OK(); -} - -Status Service::ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandleAtOperation(arg->operand()); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - bool is_constant, - user_computation->IsConstant(arg->operand(), arg->parameters_size())); - if (!is_constant) { - StatusOr op_request_status = - user_computation->LookUpRequestForErrorReporting(arg->operand()); - string 
op_request_string = ""; - if (op_request_status.ok()) { - op_request_string = op_request_status.ValueOrDie()->ShortDebugString(); - } - return InvalidArgument( - "Operand to ComputeConstant depends on a parameter.\n\n" - " op requested for constant evaluation: %s\n\n" - "This is an internal error that typically happens when the XLA user " - "(e.g. TensorFlow) is attempting to determine a value that must be a " - "compile-time constant (e.g. an array dimension) but it is not capable " - "of being evaluated at XLA compile time.\n\n" - "Please file a usability bug with the framework being used (e.g. " - "TensorFlow).", - op_request_string.c_str()); - } - - // We can't use ComputeProgramShape because it checks that all parameter - // instructions are present and contiguous. Instead construct ProgramShape - // directly. - ProgramShape program_shape; - TF_ASSIGN_OR_RETURN(*program_shape.mutable_result(), - user_computation->GetShape(arg->operand())); - - TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result())); - - ExecutionOptions execution_options = xla::CreateDefaultExecutionOptions(); - execution_options.mutable_debug_options()->set_xla_enable_fast_math(false); - execution_options.mutable_debug_options() - ->set_xla_eliminate_hlo_implicit_broadcast(true); - *execution_options.mutable_shape_with_output_layout() = - program_shape.result(); - - Shape shape_with_output_layout(program_shape.result()); - if (arg->has_output_layout()) { - TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape( - arg->output_layout(), execution_options.shape_with_output_layout())); - *execution_options.mutable_shape_with_output_layout()->mutable_layout() = - arg->output_layout(); - } - - TF_ASSIGN_OR_RETURN(std::unique_ptr module_config, - CreateModuleConfig(program_shape, {}, execution_options, - user_computation)); - - // Exclude dead parameter instructions for the purpose of computing constants. 
- TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, *module_config, - /*include_unreachable_instructions=*/ - false)); - - std::vector> parameters(arg->parameters_size()); - for (int64 i = 0; i < arg->parameters_size(); ++i) { - TF_ASSIGN_OR_RETURN(parameters[i], - Literal::CreateFromProto(arg->parameters(i))); - } - HloEvaluator evaluator; - TF_ASSIGN_OR_RETURN( - auto result_literal, - evaluator.Evaluate>(*module, parameters)); - - // Since the shape_with_output_layout option in ExecutionOption is - // non-effective to the Evaluator results, explicit relayout here. - // - // TODO(b/77824332): Make HloEvaluator take care of the re-layout. - if (arg->has_output_layout()) { - result_literal = result_literal->Relayout(arg->output_layout()); - } - *result->mutable_literal() = result_literal->ToProto(); - - return Status::OK(); -} - Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) { if (!arg->has_computation()) { @@ -1709,60 +1108,6 @@ Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) { return Status::OK(); } -Status Service::GetComputationShape(const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - computation->GetVersionedHandle(); - - TF_ASSIGN_OR_RETURN(auto program_shape, computation->ComputeProgramShape( - versioned_handle.version)); - *result->mutable_program_shape() = *program_shape; - return Status::OK(); -} - -Status Service::GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - TF_ASSIGN_OR_RETURN(*result->mutable_shape(), - computation->GetShape(arg->operand())); - return Status::OK(); -} - -Status 
Service::GetComputationStats(const ComputationStatsRequest* arg, - ComputationStatsResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - HloModuleConfig config; - config.set_debug_options(arg->debug_options()); - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, config)); - - hlo_graph_dumper::MaybeDumpHloModule(*module, - "computation statistics subject"); - - // Run HLO analysis to get the computation statistics. - HloCostAnalysis analysis( - execute_backend_->compiler()->ShapeSizeBytesFunction()); - - TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis)); - - ComputationStats stats; - stats.set_flop_count(analysis.flop_count()); - stats.set_transcendental_count(analysis.transcendental_count()); - *result->mutable_stats() = stats; - return Status::OK(); -} - Status Service::GetComputationGraphStats( const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) { if (!arg->has_computation()) { @@ -1793,262 +1138,6 @@ Status Service::GetComputationGraphStats( return Status::OK(); } -template -Status Service::AddInstruction( - const RequestT* arg, ResponseT* result, - const std::function(UserComputation*)>& - adder) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - TF_ASSIGN_OR_RETURN(*result->mutable_output(), adder(computation)); - return Status::OK(); -} - -Status Service::Op(const OpRequest* arg, OpResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - StatusOr handle_status; - - switch (arg->op_case()) { - case OpRequest::kBatchNormTrainingRequest: - handle_status = computation->AddBatchNormTrainingInstruction( - arg->batch_norm_training_request()); - break; - case 
OpRequest::kBatchNormInferenceRequest: - handle_status = computation->AddBatchNormInferenceInstruction( - arg->batch_norm_inference_request()); - break; - case OpRequest::kBatchNormGradRequest: - handle_status = computation->AddBatchNormGradInstruction( - arg->batch_norm_grad_request()); - break; - case OpRequest::kBinaryOpRequest: - handle_status = - computation->AddBinaryInstruction(arg->binary_op_request()); - break; - case OpRequest::kBroadcastRequest: - handle_status = - computation->AddBroadcastInstruction(arg->broadcast_request()); - break; - case OpRequest::kCallRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - computation_tracker_.Resolve(arg->call_request().to_apply())); - handle_status = - computation->AddCallInstruction(arg->call_request(), *to_apply); - break; - } - case OpRequest::kConcatenateRequest: - handle_status = - computation->AddConcatenateInstruction(arg->concatenate_request()); - break; - case OpRequest::kConditionalRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * true_computation, - computation_tracker_.Resolve( - arg->conditional_request().true_computation())); - TF_ASSIGN_OR_RETURN(UserComputation * false_computation, - computation_tracker_.Resolve( - arg->conditional_request().false_computation())); - handle_status = computation->AddConditionalInstruction( - arg->conditional_request(), *true_computation, *false_computation); - break; - } - case OpRequest::kConstantRequest: - handle_status = - computation->AddConstantInstruction(arg->constant_request()); - break; - case OpRequest::kConvertRequest: - handle_status = - computation->AddConvertInstruction(arg->convert_request()); - break; - case OpRequest::kBitcastConvertRequest: - handle_status = computation->AddBitcastConvertInstruction( - arg->bitcast_convert_request()); - break; - case OpRequest::kConvolveRequest: - handle_status = - computation->AddConvolveInstruction(arg->convolve_request()); - break; - case OpRequest::kCrossReplicaSumRequest: - handle_status = 
computation->AddCrossReplicaSumInstruction( - arg->cross_replica_sum_request()); - break; - case OpRequest::kCustomCallRequest: - handle_status = - computation->AddCustomCallInstruction(arg->custom_call_request()); - break; - case OpRequest::kDotRequest: - handle_status = computation->AddDotInstruction(arg->dot_request()); - break; - case OpRequest::kDynamicSliceRequest: - handle_status = - computation->AddDynamicSliceInstruction(arg->dynamic_slice_request()); - break; - case OpRequest::kDynamicUpdateSliceRequest: - handle_status = computation->AddDynamicUpdateSliceInstruction( - arg->dynamic_update_slice_request()); - break; - case OpRequest::kFftRequest: - handle_status = computation->AddFftInstruction(arg->fft_request()); - break; - case OpRequest::kGatherRequest: - handle_status = computation->AddGatherInstruction(arg->gather_request()); - break; - case OpRequest::kGetTupleElementRequest: - handle_status = computation->AddGetTupleElementInstruction( - arg->get_tuple_element_request()); - break; - case OpRequest::kInfeedRequest: - handle_status = computation->AddInfeedInstruction(arg->infeed_request()); - break; - case OpRequest::kOutfeedRequest: - handle_status = - computation->AddOutfeedInstruction(arg->outfeed_request()); - break; - case OpRequest::kHostComputeRequest: - handle_status = - computation->AddHostComputeInstruction(arg->host_compute_request()); - break; - case OpRequest::kMapRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - computation_tracker_.Resolve(arg->map_request().to_apply())); - handle_status = - computation->AddMapInstruction(arg->map_request(), *to_apply); - break; - } - case OpRequest::kPadRequest: - handle_status = computation->AddPadInstruction(arg->pad_request()); - break; - case OpRequest::kParameterRequest: - handle_status = - computation->AddParameterInstruction(arg->parameter_request()); - break; - case OpRequest::kReduceRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - 
computation_tracker_.Resolve(arg->reduce_request().to_apply())); - handle_status = - computation->AddReduceInstruction(arg->reduce_request(), *to_apply); - break; - } - case OpRequest::kReducePrecisionRequest: { - handle_status = computation->AddReducePrecisionInstruction( - arg->reduce_precision_request()); - break; - } - case OpRequest::kReduceWindowRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * to_apply, - computation_tracker_.Resolve( - arg->reduce_window_request().to_apply())); - handle_status = computation->AddReduceWindowInstruction( - arg->reduce_window_request(), *to_apply); - break; - } - case OpRequest::kReshapeRequest: - handle_status = - computation->AddReshapeInstruction(arg->reshape_request()); - break; - case OpRequest::kReverseRequest: - handle_status = - computation->AddReverseInstruction(arg->reverse_request()); - break; - case OpRequest::kRngRequest: - handle_status = computation->AddRngInstruction(arg->rng_request()); - break; - case OpRequest::kSelectAndScatterRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * select, - computation_tracker_.Resolve( - arg->select_and_scatter_request().select())); - TF_ASSIGN_OR_RETURN(UserComputation * scatter, - computation_tracker_.Resolve( - arg->select_and_scatter_request().scatter())); - handle_status = computation->AddSelectAndScatterInstruction( - arg->select_and_scatter_request(), *select, *scatter); - break; - } - case OpRequest::kSliceRequest: - handle_status = computation->AddSliceInstruction(arg->slice_request()); - break; - case OpRequest::kTernaryOpRequest: - handle_status = - computation->AddTernaryInstruction(arg->ternary_op_request()); - break; - case OpRequest::kTraceRequest: - return computation->AddTraceInstruction(arg->trace_request()); - case OpRequest::kTransposeRequest: - handle_status = - computation->AddTransposeInstruction(arg->transpose_request()); - break; - case OpRequest::kUnaryOpRequest: - handle_status = computation->AddUnaryInstruction(arg->unary_op_request()); - break; - 
case OpRequest::kVariadicOpRequest: - handle_status = - computation->AddVariadicInstruction(arg->variadic_op_request()); - break; - case OpRequest::kWhileRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * condition, - computation_tracker_.Resolve(arg->while_request().condition())); - TF_ASSIGN_OR_RETURN( - UserComputation * body, - computation_tracker_.Resolve(arg->while_request().body())); - handle_status = computation->AddWhileInstruction(arg->while_request(), - *condition, *body); - break; - } - case OpRequest::kSendRequest: { - TF_RETURN_IF_ERROR( - channel_tracker_.RegisterSend(arg->send_request().channel_handle())); - // Send does not return a value, but we need a handle to be able to - // set OpMetadata and OpSharding (device assignment). - handle_status = computation->AddSendInstruction(arg->send_request()); - break; - } - case OpRequest::kRecvRequest: { - TF_RETURN_IF_ERROR( - channel_tracker_.RegisterRecv(arg->recv_request().channel_handle())); - handle_status = computation->AddRecvInstruction(arg->recv_request()); - break; - } - case OpRequest::OP_NOT_SET: - return InvalidArgument("XLA service received OpRequest with OP_NOT_SET"); - default: - return InvalidArgument("Unsupported operation in XLA service"); - } - TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status); - - // We set the debug metadata here, because we slice off part of the OpRequest - // proto in the above switch statement. 
- TF_ASSIGN_OR_RETURN(ComputationDataHandle handle, handle_status); - TF_RETURN_IF_ERROR(computation->SetOpMetadata(handle, arg->metadata())); - if (arg->has_sharding()) { - TF_RETURN_IF_ERROR(computation->SetOpSharding(handle, arg->sharding())); - } - return Status::OK(); -} - -Status Service::SnapshotComputation(const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.SnapshotComputation(arg->computation())); - - result->set_allocated_module(module.release()); - - return Status::OK(); -} - -Status Service::LoadComputationSnapshot( - const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) { - TF_ASSIGN_OR_RETURN(*result->mutable_computation(), - computation_tracker_.LoadSessionModule(arg->module())); - return Status::OK(); -} - DeviceHandle Service::SingleComputationDeviceHandle() const { DeviceHandle device_handle; device_handle.set_handle(0); diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 81fbd41957887aec763e1cfe165ad0d1d2ac2269..44c5248b150cff57546d3287869787f37c8975ba 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -21,28 +21,23 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/allocation_tracker.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/channel_tracker.h" -#include "tensorflow/compiler/xla/service/compilation_cache.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/execution_tracker.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/service_interface.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -83,11 +78,6 @@ class Service : public ServiceInterface { static StatusOr> NewService( const ServiceOptions& options); - // Creates a new computation with the given name. - // A unique ComputationHandle is returned. - Status Computation(const ComputationRequest* arg, - ComputationResponse* result) override; - // Unregisters a previously-allocated global handle. 
// // If the handle given is not currently allocated, a NOT_FOUND status is @@ -100,35 +90,15 @@ class Service : public ServiceInterface { Status DeconstructTuple(const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - // Modifies the provided computation so that subsequent executions - // will compute the provided ComputationDataHandle, rather than the - // last expression enqueued on that Computation. - Status SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - // Executes a computation with the provided global data passed as - // immutable arguments. Returns global data output and execution timing. - Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override; - // Executes a computation with the provided global data passed as // immutable arguments. The request contains the whole computation graph. // Returns global data output and execution timing. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. Status ExecuteGraph(const ExecuteGraphRequest* arg, ExecuteResponse* result) override; // Executes one or more computations in parallel with the provided global data // passed as immutable arguments. Returns global data output for each // computation. - Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override; - - // Executes one or more computations in parallel with the provided global data - // passed as immutable arguments. Returns global data output for each - // computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) override; @@ -143,16 +113,6 @@ class Service : public ServiceInterface { Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, GetDeviceHandlesResponse* result) override; - // Asynchronously executes a computation with provided arguments. 
Invokes - // the provided computation with the provided global data passed as - // immutable arguments. Returns a handle to the execution. - // - // (Note: The corresponding function in xla::Client was removed as part of - // b/64116060, in an attempt to simplify our API. We're keeping this around - // for now in case we want to expose this to clients in a different way.) - Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; - // Waits until the specified execution is complete and returns the result. // Calling this API multiple times with the same execution handle returns the // method with an error since the execution handle is destroyed after the @@ -190,13 +150,6 @@ class Service : public ServiceInterface { Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; - // Tests if an expression is a compile-time constant. - Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) override; - - // Computes the value of a constant expression. - Status ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) override; @@ -205,54 +158,15 @@ class Service : public ServiceInterface { Status GetShape(const GetShapeRequest* arg, GetShapeResponse* result) override; - // Returns the program shape of the computation associated with the given - // handle. - Status GetComputationShape(const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) override; - - ///// - // Computation-oriented methods. - - // Enqueues an Op on the computation. - Status Op(const OpRequest* arg, OpResponse* result) override; - - // Retrieves the inferred shape for a value within a computation. - Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; - // Retrieves the statistics of a computation. 
- Status GetComputationStats(const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - - // Retrieves the statistics of a computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. Status GetComputationGraphStats(const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) override; - // Snapshots the current state of a computation handle into a serializable - // protocol buffer form, so it can be loaded via - // LoadComputationSnapshot. - Status SnapshotComputation(const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) override; - - // Loads a computation from a serialized protocol buffer created via - // SnapshotComputation. - Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) override; - // Creates a unique channel handle that can be used for Send/Recv // instructions. Status CreateChannelHandle(const CreateChannelHandleRequest* arg, CreateChannelHandleResponse* result) override; - // Returns the ComputationTracker of the current service instance. - // Only used in unit tests to access user computations from client. - const ComputationTracker& computation_tracker() { - return computation_tracker_; - } - // Returns the backend used to execute computations. const Backend& backend() const { return *execute_backend_; } Backend* mutable_backend() { return execute_backend_.get(); } @@ -262,9 +176,8 @@ class Service : public ServiceInterface { // class. StatusOr> CreateModuleConfig( const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions& execution_options, - const UserComputation* user_computation = nullptr); + absl::Span arguments, + const ExecutionOptions& execution_options); // Picks a parallel response and fills the result. 
Status PickParallelResponse(const ExecuteParallelResponse& parallel_result, @@ -278,10 +191,7 @@ class Service : public ServiceInterface { // Prepare the arguments for executing parallel. StatusOr>> GetArguments( const ExecutionOptions& execution_options, - tensorflow::gtl::ArraySlice arguments); - - // Assert that host- and device-shapes are in a consistent state. - Status ValidateEntryComputationLayout(HloModule* module); + absl::Span arguments); protected: friend class LocalExecutable; @@ -297,31 +207,21 @@ class Service : public ServiceInterface { // the corresponding replica. StatusOr>> ResolveAndValidateArguments( - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice stream_executors); + absl::Span arguments, + absl::Span stream_executors); // Create a Hlo module config for the given program shape and arguments. // execution_options is optional; if not given a default is used. StatusOr> CreateModuleConfig( const ProgramShape& program_shape, - tensorflow::gtl::ArraySlice argument_shapes, - const ExecutionOptions* execution_options, - const UserComputation* user_computation = nullptr); + absl::Span argument_shapes, + const ExecutionOptions* execution_options); // Builds an Executable for the given parameters. // // If device_allocator is not null, the compiler may use it to allocate temp // buffers, which the compiler is responsible for freeing. The allocator // given here need not match the allocator used when running the executable. - StatusOr> BuildExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator = nullptr); - - // Builds an Executable for the given HLO module proto. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. 
StatusOr> BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, @@ -330,66 +230,40 @@ class Service : public ServiceInterface { // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. - StatusOr>> BuildExecutables( - std::vector versioned_handles, - std::vector> module_configs, - Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator); StatusOr>> BuildExecutables( const std::vector& module_protos, std::vector> module_configs, Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); - // Similar to BuildExecutable, but look in the compilation cache for the - // executable first. If the executable is not in the cache, it is built and - // inserted into the cache. - StatusOr> BuildAndCacheExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, ExecutionProfile* profile, - DeviceMemoryAllocator* device_allocator = nullptr); - // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is // returned. If the parameter "profile" is not null, it points to an // ExecutionProfile object which will be filled in with profile data. StatusOr ExecuteAndRegisterResult( Executable* executable, - const tensorflow::gtl::ArraySlice> - arguments, + const absl::Span> arguments, Backend* backend, const string& result_tag, ExecutionProfile* profile); // Runs the given executables with the given arguments and register the result // from each executable in the allocation tracker. The handles of the result // from the tracker are returned. 
StatusOr> ExecuteParallelAndRegisterResult( - tensorflow::gtl::ArraySlice executables, - tensorflow::gtl::ArraySlice>> - arguments, - Backend* backend, - tensorflow::gtl::ArraySlice device_handles, - tensorflow::gtl::ArraySlice result_tags, - ExecutionProfile* profile); - - // Convenience function for adding a function to a user computation. - template - Status AddInstruction( - const RequestT* arg, ResponseT* result, - const std::function(UserComputation*)>& - adder); + absl::Span executables, + absl::Span>> arguments, + Backend* backend, absl::Span device_handles, + absl::Span result_tags, ExecutionProfile* profile); // Executes a single computation which has more than one target device. // The N devices are expected to all return an empty tuple, but one, which // will be the result of this computation. - Status ExecuteOneToN(const ExecuteRequest* arg, ExecuteResponse* result); Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result); - // Convenience function which checks whether the given shape_with_layout + // Convenience function which checks whether the given client_shape // (presumably passed by the client to set the result layout) is valid for the // given computation result shape. - Status ValidateResultShapeWithLayout(const Shape& shape_with_layout, - const Shape& result_shape) const; + Status ValidateResultShape(const Shape& client_shape, + const Shape& result_shape) const; // Returns the stream executors assigned to the replicas represented by the // given device handle. Each device_handle is a virtual replicated device that @@ -405,9 +279,6 @@ class Service : public ServiceInterface { ServiceOptions options_; - // Tracks computations built via the API. - ComputationTracker computation_tracker_; - // Tracks channels created via the API. ChannelTracker channel_tracker_; @@ -417,9 +288,6 @@ class Service : public ServiceInterface { // Tracks asynchronously launched executions via the API. 
ExecutionTracker execution_tracker_; - // Cache containing previously built Executables. - CompilationCache compilation_cache_; - // Backend to compile and execute computations on. std::unique_ptr execute_backend_; diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 7f3910cdb0366078b97fb5f6a2dc498b37570926..dbfed628bfcabffe66bef41a82e0e2430897d80d 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_ #include "tensorflow/compiler/xla/executable_run_options.h" -#include "tensorflow/compiler/xla/service/pool.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/stream_executor/stream_executor.h" @@ -27,8 +27,7 @@ namespace xla { // data, now only a stream cache for GPU backend. class ServiceExecutableRunOptions { public: - using StreamBorrower = - std::function::SmartPtr>(int)>; + using StreamBorrower = std::function(int)>; ServiceExecutableRunOptions() : ServiceExecutableRunOptions(ExecutableRunOptions()) {} @@ -51,7 +50,7 @@ class ServiceExecutableRunOptions { // Borrows a stream and returns a smart pointer which returns the stream on // destruction. - StatusOr::SmartPtr> BorrowStream(int device_ordinal) const { + StatusOr BorrowStream(int device_ordinal) const { return borrow_stream_ ? 
borrow_stream_(device_ordinal) : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache"); diff --git a/tensorflow/compiler/xla/service/session.proto b/tensorflow/compiler/xla/service/session.proto deleted file mode 100644 index bb8d1cd2a106ea3e5bb61eee5052bd60c38cd0e2..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/session.proto +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This proto file defines messages which store the state of XLA -// computations within the XLA service. A computation is stored as a record -// of the operation requests used to build it. -syntax = "proto3"; - -import "tensorflow/compiler/xla/xla_data.proto"; - -package xla; - -// Describes a single operation request. -message OperationRequest { - ComputationDataHandle output_handle = 1; - Shape output_shape = 2; - - // For operations which call embedded computations such as "Map", these are - // the version(s) that the embedded computation should be called at. A version - // value of a computation is the ComputationDataHandle of the root of the - // computation at the point in time. - // - // "Call", "Map", "Reduce", and "ReduceWindow" operations take a single - // embedded computation so this field will have a single value for those - // operations. 
- // - // "While" operation takes two; index 0 is the "condition" version and index 1 - // is the "body" version. - repeated int64 embedded_computation_versions = 3; - - // The actual request, which in itself is a tagged union of all possible - // operation request types. - OpRequest request = 4; -} - -// Describes a sequence of operation requests which define an XLA -// computation. -message SessionComputation { - string name = 1; - - // The ComputationHandle used to refer to this computation in the XLA - // service. - ComputationHandle computation_handle = 2; - - // Map from ComputationDataHandle value to operation request. The highest - // ComputationDataHandle value corresponds to the root of the computation. - map requests = 3; -} - -// Describes a group of SessionComputations with an "entry point" computation -// that may refer to the other non-entry (AKA embedded) computations. -// -// This message is used to serialize a computation that has been built via the -// XLA service API, along with its dependencies, for purposes such as -// analysis/replay/file-storage. -message SessionModule { - // The entry computation, which was requested for serialization. This may have - // referred to embedded computations, which are reflected below. - SessionComputation entry = 1; - - // Embedded computations that are transitively referred to by the entry - // computation. - repeated SessionComputation embedded_computations = 2; - - // The arguments passed to the computation. - repeated LiteralProto arguments = 3; - - // The result of the computation. - LiteralProto result = 4; - - // The name of the platform used to run the computation. 
- string execution_platform = 5; -} diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index d624f548b1ba65e6f6dfd7b329e8c86ab29112a0..26117498621450d56259507761b6b0a6ea8d3a15 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -21,6 +21,11 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -28,225 +33,124 @@ limitations under the License. #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/math/math_util.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" -using tensorflow::str_util::Join; -using tensorflow::strings::Printf; - namespace xla { - namespace { -// Return the UnaryOperation proto enum value associated with the given HLO -// opcode. 
-UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kAbs: - return UNOP_ABS; - case HloOpcode::kCeil: - return UNOP_CEIL; - case HloOpcode::kClz: - return UNOP_CLZ; - case HloOpcode::kCos: - return UNOP_COS; - case HloOpcode::kExp: - return UNOP_EXP; - case HloOpcode::kExpm1: - return UNOP_EXPM1; - case HloOpcode::kFloor: - return UNOP_FLOOR; - case HloOpcode::kImag: - return UNOP_IMAG; - case HloOpcode::kIsFinite: - return UNOP_IS_FINITE; - case HloOpcode::kLog: - return UNOP_LOG; - case HloOpcode::kLog1p: - return UNOP_LOG1P; - case HloOpcode::kNot: - return UNOP_NOT; - case HloOpcode::kNegate: - return UNOP_NEGATE; - case HloOpcode::kReal: - return UNOP_REAL; - case HloOpcode::kRoundNearestAfz: - return UNOP_ROUND_NEAREST_AFZ; - case HloOpcode::kSign: - return UNOP_SIGN; - case HloOpcode::kSin: - return UNOP_SIN; - case HloOpcode::kSort: - return UNOP_SORT; - case HloOpcode::kTanh: - return UNOP_TANH; - default: - LOG(FATAL) << "Unhandled opcode for conversion to unary operation: " - << opcode; - } -} - -// Return the BinaryOperation proto enum value associated with the given HLO -// opcode. 
-BinaryOperation OpcodeToBinaryOperation(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kAtan2: - return BINOP_ATAN2; - case HloOpcode::kComplex: - return BINOP_COMPLEX; - case HloOpcode::kMultiply: - return BINOP_MUL; - case HloOpcode::kAdd: - return BINOP_ADD; - case HloOpcode::kSubtract: - return BINOP_SUB; - case HloOpcode::kDivide: - return BINOP_DIV; - case HloOpcode::kEq: - return BINOP_EQ; - case HloOpcode::kGe: - return BINOP_GE; - case HloOpcode::kGt: - return BINOP_GT; - case HloOpcode::kLe: - return BINOP_LE; - case HloOpcode::kLt: - return BINOP_LT; - case HloOpcode::kNe: - return BINOP_NE; - case HloOpcode::kMaximum: - return BINOP_MAX; - case HloOpcode::kMinimum: - return BINOP_MIN; - case HloOpcode::kPower: - return BINOP_POW; - case HloOpcode::kRemainder: - return BINOP_REM; - case HloOpcode::kOr: - return BINOP_OR; - case HloOpcode::kAnd: - return BINOP_AND; - case HloOpcode::kShiftLeft: - return BINOP_SHIFT_LEFT; - case HloOpcode::kShiftRightArithmetic: - return BINOP_SHIFT_RIGHT_ARITHMETIC; - case HloOpcode::kShiftRightLogical: - return BINOP_SHIFT_RIGHT_LOGICAL; - default: - LOG(FATAL) << "unhandled opcode " << opcode; - } -} - -// Return the TernaryOperation proto enum value associated with the given HLO -// opcode. -TernaryOperation OpcodeToTernaryOperation(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kClamp: - return TRIOP_CLAMP; - case HloOpcode::kSelect: - return TRIOP_SELECT; - default: - LOG(FATAL) << "unhandled opcode " << opcode; - } -} - -// Return the VariadicOperation proto enum value associated with the given HLO -// opcode. -VariadicOperation OpcodeToVariadicOperation(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kTuple: - return VAROP_TUPLE; - default: - LOG(FATAL) << "unhandled opcode " << opcode; - } -} +using absl::StrFormat; +using absl::StrJoin; // Returns true if no element is present in slice more than once. 
-bool AllUnique(tensorflow::gtl::ArraySlice slice) { +bool AllUnique(absl::Span slice) { return std::set(slice.begin(), slice.end()).size() == slice.size(); } -Status ExpectNotTupleOrOpaque(const Shape& shape, - tensorflow::StringPiece op_type) { - if (ShapeUtil::IsTuple(shape)) { - return InvalidArgument("Expected non-tuple argument for %s, but got %s.", - std::string(op_type).c_str(), - ShapeUtil::HumanString(shape).c_str()); - } else if (ShapeUtil::IsOpaque(shape)) { - return InvalidArgument("Expected non-opaque argument for %s, but got %s.", - std::string(op_type).c_str(), - ShapeUtil::HumanString(shape).c_str()); - } else { - return Status::OK(); +Status ExpectArray(const Shape& shape, absl::string_view op_type) { + if (!ShapeUtil::IsArray(shape)) { + return InvalidArgument("Expected array argument for %s, but got %s.", + string(op_type), ShapeUtil::HumanString(shape)); } + return Status::OK(); } Status VerifyReducerShape(const ProgramShape& reducer_shape, - const Shape& init_value_shape, - const PrimitiveType& input_element_type) { - if (reducer_shape.parameters_size() != 2) { + absl::Span init_value_shapes, + absl::Span input_element_types, + int64 inputs) { + if (reducer_shape.parameters_size() != inputs * 2) { return InvalidArgument( - "Reduction function must take 2 parameters, but " + "Reduction function must take %d parameters, but " "takes %d parameter(s).", - reducer_shape.parameters_size()); + inputs * 2, reducer_shape.parameters_size()); } const Shape& accumulator_shape = reducer_shape.result(); - if (ShapeUtil::Rank(accumulator_shape) != 0) { - return InvalidArgument( - "Reduction function must have rank 0 (rank %lld reduction function " - "given).", - ShapeUtil::Rank(accumulator_shape)); - } - - // Check that the accumulator can be passed in as the first argument. - // Note: comparing here and below with Compatible since we don't care about - // layout in scalars - see b/26668201 for a longer-term vision. 
- if (!ShapeUtil::Compatible(accumulator_shape, reducer_shape.parameters(0))) { - return InvalidArgument( - "Reduction function's first parameter shape differs from the " - "result shape: %s vs %s", - ShapeUtil::HumanString(reducer_shape.parameters(0)).c_str(), - ShapeUtil::HumanString(accumulator_shape).c_str()); - } - - // Check that init_value's shape is suitable for reducer_shape. - if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape, - init_value_shape)) { + std::vector accumulator_subshapes; + if (ShapeUtil::IsArray(accumulator_shape)) { + if (inputs != 1) { + return InvalidArgument( + "Reduction function must produce a tuple with %d elements, but " + "produces a scalar", + inputs); + } + accumulator_subshapes.push_back(&accumulator_shape); + } else if (ShapeUtil::IsTuple(accumulator_shape)) { + if (ShapeUtil::TupleElementCount(accumulator_shape) != inputs) { + return InvalidArgument( + "Reduction function must produce a tuple with %d elements, but has " + "%d elements", + inputs, ShapeUtil::TupleElementCount(accumulator_shape)); + } + for (const Shape& element_shape : accumulator_shape.tuple_shapes()) { + accumulator_subshapes.push_back(&element_shape); + } + } else { return InvalidArgument( - "Reduction function's accumulator shape differs from the " - "init_value shape: %s vs %s", - ShapeUtil::HumanString(accumulator_shape).c_str(), - ShapeUtil::HumanString(init_value_shape).c_str()); + "Reduction function must produce a scalar or tuple of scalars, but has " + "shape: %s", + ShapeUtil::HumanString(accumulator_shape)); } - // Check that the inputs can be passed in as the second argument. 
- const Shape& input_element_shape = - ShapeUtil::MakeShape(input_element_type, {}); - if (!ShapeUtil::CompatibleIgnoringFpPrecision(input_element_shape, - reducer_shape.parameters(1))) { - return InvalidArgument( - "Reduction function's second parameter shape differs from the " - "input type element type: %s vs %s", - ShapeUtil::HumanString(reducer_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(input_element_shape).c_str()); + for (const Shape* element_shape : accumulator_subshapes) { + if (ShapeUtil::Rank(*element_shape) != 0) { + return InvalidArgument( + "Reduction function must return a scalar or tuple of scalars but " + "returns shape: %s", + ShapeUtil::HumanString(accumulator_shape)); + } } - // Currently the accumulator and inputs must be the same type, - // though that restriction could be relaxed. - if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape, - reducer_shape.parameters(1))) { - return InvalidArgument( - "Reduction function's second parameter shape must " - "match the result shape, but got %s vs %s.", - ShapeUtil::HumanString(reducer_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(accumulator_shape).c_str()); + for (int64 i = 0; i < inputs; ++i) { + // Check that the accumulator can be passed in as the first argument. + // Note: comparing here and below with Compatible since we don't care about + // layout in scalars - see b/26668201 for a longer-term vision. + if (!ShapeUtil::Compatible(*accumulator_subshapes[i], + reducer_shape.parameters(i))) { + return InvalidArgument( + "Reduction function's %d-th parameter shape differs from the " + "result shape: %s vs %s", + i, ShapeUtil::HumanString(reducer_shape.parameters(i)), + ShapeUtil::HumanString(*accumulator_subshapes[i])); + } + // Check that init_value's shapes are suitable for reducer_shape. 
+ if (!ShapeUtil::CompatibleIgnoringFpPrecision(*accumulator_subshapes[i], + *init_value_shapes[i])) { + return InvalidArgument( + "Reduction function's accumulator shape at index %d differs from " + "the init_value shape: %s vs %s", + i, ShapeUtil::HumanString(*accumulator_subshapes[i]), + ShapeUtil::HumanString(*init_value_shapes[i])); + } + // Check that the inputs can be passed in as the non-accumulator arguments. + const Shape input_element_shape = + ShapeUtil::MakeShape(input_element_types[i], {}); + if (!ShapeUtil::CompatibleIgnoringFpPrecision( + input_element_shape, reducer_shape.parameters(inputs + i))) { + return InvalidArgument( + "Reduction function's %d-th parameter shape differs from the " + "input type element type: %s vs %s", + inputs + i, + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)), + ShapeUtil::HumanString(input_element_shape)); + } + // Check that the accumulator and inputs to the reducer function match. + // If the accumulator is scalar, it must have the same type as the inputs + // (up to fp precision). If it is a tuple, then the k-th element of the + // tuple must have the same type as the K-th input (again, up to fp + // precision.) 
+ if (!ShapeUtil::CompatibleIgnoringFpPrecision( + *accumulator_subshapes[i], reducer_shape.parameters(inputs + i))) { + return InvalidArgument( + "Reduction function's %d-th parameter shape must " + "match the result shape, but got %s vs %s.", + inputs + i, + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)), + ShapeUtil::HumanString(*accumulator_subshapes[i])); + } } return Status::OK(); @@ -258,7 +162,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, bool allow_negative_padding) { if (window.dimensions_size() != ShapeUtil::Rank(base_shape)) { return InvalidArgument( - "Window has dimension %d but base shape has dimension %lld.", + "Window has dimension %d but base shape has dimension %d.", window.dimensions_size(), ShapeUtil::Rank(base_shape)); } @@ -267,29 +171,29 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, const auto& dim = window.dimensions(i); if (dim.size() <= 0) { return InvalidArgument("Window %s has a non-positive dimension.", - window.DebugString().c_str()); + window.DebugString()); } if (dim.stride() <= 0) { return InvalidArgument("Window %s has a non-positive stride.", - window.DebugString().c_str()); + window.DebugString()); } if (!allow_negative_padding && dim.padding_low() < 0) { return InvalidArgument("Window %s has a negative low padding.", - window.DebugString().c_str()); + window.DebugString()); } if (!allow_negative_padding && dim.padding_high() < 0) { return InvalidArgument("Window %s has a negative high padding.", - window.DebugString().c_str()); + window.DebugString()); } if (dim.base_dilation() < 1) { return InvalidArgument( "Window %s has a non-positive base area dilation factor.", - window.DebugString().c_str()); + window.DebugString()); } if (dim.window_dilation() < 1) { return InvalidArgument( "Window %s has a non-positive window dilation factor.", - window.DebugString().c_str()); + window.DebugString()); } const int64 dilated_base = window_util::DilatedBound( @@ -321,102 +225,127 @@ StatusOr 
InferWindowOutputShape(const Shape& base_shape, return shape; } - return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), shape); -} + TF_RETURN_IF_ERROR(ExpectArray(shape, "operand of unary operation")); -/* static */ StatusOr ShapeInference::InferUnaryOpShape( - UnaryOperation operation, const Shape& arg) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of unary operation")); - - TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(arg)); - switch (operation) { - case UNOP_FLOOR: - case UNOP_CEIL: - if (!ShapeUtil::ElementIsFloating(arg)) { + TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); + switch (opcode) { + case HloOpcode::kFloor: + case HloOpcode::kCeil: + case HloOpcode::kRoundNearestAfz: + if (!ShapeUtil::ElementIsFloating(shape)) { + return InvalidArgument( + "Expected element type in shape to be floating for %s operation; " + "got %s.", + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); + } + return shape; + case HloOpcode::kCos: + case HloOpcode::kSin: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kTanh: + if (!ShapeUtil::ElementIsFloating(shape) && + !ShapeUtil::ElementIsComplex(shape)) { return InvalidArgument( - "Expected element type in shape to be floating for floor/ceil " + "Expected element type in shape to be floating or complex for %s " "operation; got %s.", - PrimitiveType_Name(arg.element_type()).c_str()); + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); + } + return shape; + case HloOpcode::kReal: + case HloOpcode::kImag: + if (ShapeUtil::ElementIsComplex(shape)) { + return ShapeUtil::ComplexComponentShape(shape); + } else if (ShapeUtil::ElementIsFloating(shape)) { + return shape; + } else { + return InvalidArgument( + "Expected element type in shape to be floating or complex for " + "%s operation; got %s.", + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } - return arg; - case 
UNOP_COS: - case UNOP_SIN: - case UNOP_EXP: - case UNOP_EXPM1: - case UNOP_LOG: - case UNOP_LOG1P: - case UNOP_TANH: - if (!ShapeUtil::ElementIsFloating(arg) && - !ShapeUtil::ElementIsComplex(arg)) { + case HloOpcode::kAbs: + if (ShapeUtil::ElementIsComplex(shape)) { + return ShapeUtil::ChangeElementType( + shape, primitive_util::ComplexComponentType(shape.element_type())); + } else if (ShapeUtil::ElementIsSigned(shape)) { + return shape; + } else { return InvalidArgument( "Expected element type in shape to be floating or complex for " - "sin/cos/exp/log/tanh operation; got %s.", - PrimitiveType_Name(arg.element_type()).c_str()); + "%s operation; got %s.", + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } - return arg; - case UNOP_REAL: - case UNOP_IMAG: - if (!ShapeUtil::ElementIsComplex(arg)) { + case HloOpcode::kClz: + if (!ShapeUtil::ElementIsIntegral(shape)) { return InvalidArgument( - "Expected element type in shape to be complex for real/imag " + "Expected an integral element type in argument to Clz " "operation; got %s.", - PrimitiveType_Name(arg.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } - return ShapeUtil::ChangeElementType(arg, F32); - case UNOP_ABS: - if (ShapeUtil::ElementIsComplex(arg)) { - return ShapeUtil::ChangeElementType( - arg, primitive_util::ComplexComponentType(arg.element_type())); + return shape; + case HloOpcode::kNegate: + if (!ShapeUtil::ElementIsIntegral(shape) && + !ShapeUtil::ElementIsFloating(shape) && + !ShapeUtil::ElementIsComplex(shape)) { + return InvalidArgument( + "Expected element type in shape to be integral, floating or " + "complex for %s operation; got %s.", + HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); + } + return shape; + case HloOpcode::kSign: + if (!ShapeUtil::ElementIsSigned(shape) && + !ShapeUtil::ElementIsComplex(shape)) { + return InvalidArgument( + "Expected element type in shape to be signed or complex for " + "%s operation; got %s.", + 
HloOpcodeString(opcode), PrimitiveType_Name(shape.element_type())); } - return arg; - case UNOP_CLZ: - case UNOP_NEGATE: - case UNOP_ROUND_NEAREST_AFZ: - case UNOP_SIGN: - case UNOP_SORT: - return arg; - - case UNOP_NOT: - if (arg.element_type() != PRED && - !primitive_util::IsIntegralType(arg.element_type())) { + return shape; + + case HloOpcode::kNot: + if (shape.element_type() != PRED && + !primitive_util::IsIntegralType(shape.element_type())) { return InvalidArgument( "Expected pred or an integral element type in argument to Not " "operation; got %s.", - PrimitiveType_Name(arg.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } - return arg; + return shape; - case UNOP_IS_FINITE: - if (!ShapeUtil::ElementIsFloating(arg)) { + case HloOpcode::kIsFinite: + if (!ShapeUtil::ElementIsFloating(shape)) { return InvalidArgument( - "Expected element type in shape to be floating point for IsFinite " + "Expected element type in shape to be floating " + "point for IsFinite " "operation; got %s.", - PrimitiveType_Name(arg.element_type()).c_str()); + PrimitiveType_Name(shape.element_type())); } - return ShapeUtil::ChangeElementType(arg, PRED); + return ShapeUtil::ChangeElementType(shape, PRED); default: return InvalidArgument( "Unknown operation for unary shape inference: \"%s\".", - UnaryOperation_Name(operation).c_str()); + HloOpcodeString(opcode)); } } /* static */ StatusOr ShapeInference::InferConcatOpShape( - tensorflow::gtl::ArraySlice arg_shapes, - const int64 dimension) { + absl::Span arg_shapes, const int64 dimension) { if (arg_shapes.empty()) { return InvalidArgument("Concatenate expects at least one argument."); } if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) { - return InvalidArgument("Concatenate dimension out of bounds: %lld.", + return InvalidArgument("Concatenate dimension out of bounds: %d.", dimension); } const Shape* arg_shape = nullptr; PrimitiveType element_type = PRIMITIVE_TYPE_INVALID; for (const Shape* shape : 
arg_shapes) { - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(*shape, "operand of concatenation")); + TF_RETURN_IF_ERROR(ExpectArray(*shape, "operand of concatenation")); if (!arg_shape) { arg_shape = shape; element_type = arg_shape->element_type(); @@ -424,17 +353,16 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, } if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) { return InvalidArgument( - "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld " + "Cannot concatenate arrays with different ranks: %d (%s) vs %d " "(%s).", - ShapeUtil::Rank(*arg_shape), - ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape), - ShapeUtil::HumanString(*shape).c_str()); + ShapeUtil::Rank(*arg_shape), ShapeUtil::HumanString(*arg_shape), + ShapeUtil::Rank(*shape), ShapeUtil::HumanString(*shape)); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) { return InvalidArgument( "Cannot concatenate arrays with different element types: %s vs %s.", - PrimitiveType_Name(arg_shape->element_type()).c_str(), - PrimitiveType_Name(shape->element_type()).c_str()); + PrimitiveType_Name(arg_shape->element_type()), + PrimitiveType_Name(shape->element_type())); } for (int64 dimension_number = 0; dimension_number < ShapeUtil::Rank(*arg_shape); ++dimension_number) { @@ -447,9 +375,9 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Cannot concatenate arrays that differ in dimensions other than " "the one being concatenated (the other array dimensions must be " - "the same): %s vs %s in dimension %lld.", - ShapeUtil::HumanString(*arg_shape).c_str(), - ShapeUtil::HumanString(*shape).c_str(), dimension); + "the same): %s vs %s in dimension %d.", + ShapeUtil::HumanString(*arg_shape), ShapeUtil::HumanString(*shape), + dimension); } } element_type = ShapeUtil::HigherPrecisionElementType(*shape, *arg_shape); @@ -463,6 +391,17 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return 
ShapeUtil::MakeShape(element_type, new_dimensions); } +/* static */ StatusOr ShapeInference::InferAfterAllShape( + absl::Span arg_shapes) { + for (const Shape* arg_shape : arg_shapes) { + if (arg_shape->element_type() != TOKEN) { + return InvalidArgument( + "Operands of token instructions must be TOKEN types."); + } + } + return ShapeUtil::MakeTokenShape(); +} + /* static */ StatusOr ShapeInference::InferConvertShape( const Shape& operand_shape, PrimitiveType new_element_type) { auto old_element_type = operand_shape.element_type(); @@ -470,17 +409,18 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, !primitive_util::IsComplexType(new_element_type)) { return Unimplemented( "Conversion from complex to real type %s => %s is not implemented.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } - if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) { + if (!ShapeUtil::IsArray(operand_shape) || + !primitive_util::IsArrayType(new_element_type)) { // Note: we may want to support tuple conversions via this operation in the // future, by recursing into the tuple elements to check all sub-conversions // are valid. For now we just reject them, though. 
return InvalidArgument( - "Convert does not allow tuples, so cannot convert from %s to %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + "Convert does not allow non-arrays, so cannot convert from %s to %s.", + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } return ShapeUtil::ChangeElementType(operand_shape, new_element_type); @@ -492,24 +432,25 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, if (primitive_util::IsComplexType(old_element_type) != primitive_util::IsComplexType(new_element_type)) { return InvalidArgument("Conversion from complex to real type %s => %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } - if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) { + if (!ShapeUtil::IsArray(operand_shape) || + !primitive_util::IsArrayType(new_element_type)) { // Note: we may want to support tuple conversions via this operation in the // future, by recursing into the tuple elements to check all sub-conversions // are valid. For now we just reject them, though. 
return InvalidArgument( "Cannot convert from or to tuple type; requested conversion: %s => %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + ShapeUtil::HumanString(operand_shape), + PrimitiveType_Name(new_element_type)); } if (primitive_util::BitWidth(old_element_type) != primitive_util::BitWidth(new_element_type)) { return InvalidArgument( "Cannot bitcast types with different bit-widths: %s => %s.", - PrimitiveType_Name(old_element_type).c_str(), - PrimitiveType_Name(new_element_type).c_str()); + PrimitiveType_Name(old_element_type), + PrimitiveType_Name(new_element_type)); } return ShapeUtil::ChangeElementType(operand_shape, new_element_type); @@ -522,7 +463,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "Expected element type in shape to be floating point for " "ReducePrecision operation; got %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (exponent_bits < 1) { // One exponent bit is necessary to distinguish 0 from infinity. 
Having @@ -542,7 +483,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, /* static */ StatusOr ShapeInference::InferPadShape( const Shape& operand_shape, const Shape& padding_value_shape, const PaddingConfig& padding_config) { - if (ShapeUtil::IsTuple(operand_shape)) { + if (!ShapeUtil::IsArray(operand_shape)) { return InvalidArgument( "Pad operation does not support tuple-shape operands."); } @@ -554,21 +495,29 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return InvalidArgument( "The rank of the operand and the padding configuration do not match: " "%s vs %s.", - ShapeUtil::HumanString(operand_shape).c_str(), - padding_config.ShortDebugString().c_str()); + ShapeUtil::HumanString(operand_shape), + padding_config.ShortDebugString()); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape, padding_value_shape)) { return InvalidArgument( "The element types of the operands to Pad do not match."); } + if (absl::c_any_of(padding_config.dimensions(), + [](const PaddingConfig::PaddingConfigDimension& p) { + return p.interior_padding() < 0; + })) { + return InvalidArgument("Interior padding cannot be negative: %s", + padding_config.ShortDebugString()); + } + std::vector dimensions(ShapeUtil::Rank(operand_shape)); for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) { - dimensions[i] = operand_shape.dimensions(i) + - padding_config.dimensions(i).edge_padding_low() + - padding_config.dimensions(i).edge_padding_high() + + const auto& p = padding_config.dimensions(i); + dimensions[i] = operand_shape.dimensions(i) + p.edge_padding_low() + + p.edge_padding_high() + std::max(operand_shape.dimensions(i) - 1, 0LL) * - padding_config.dimensions(i).interior_padding(); + p.interior_padding(); } return ShapeUtil::MakeShape( ShapeUtil::HigherPrecisionElementType(operand_shape, padding_value_shape), @@ -599,22 +548,22 @@ Status ValidateDotDimensionNumbers( const Shape& lhs, const Shape& rhs, const DotDimensionNumbers& dimension_numbers) { // 
Check that dimension numbers are in range. - auto dims_in_range = - [](const int64 rank, tensorflow::gtl::ArraySlice contracting_dims, - tensorflow::gtl::ArraySlice batch_dims) -> bool { + auto dims_in_range = [](const int64 rank, + absl::Span contracting_dims, + absl::Span batch_dims) -> bool { auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; }; return std::all_of(contracting_dims.begin(), contracting_dims.end(), in_range) && std::all_of(batch_dims.begin(), batch_dims.end(), in_range); }; - tensorflow::gtl::ArraySlice lhs_contracting_dimensions = + absl::Span lhs_contracting_dimensions = AsInt64Slice(dimension_numbers.lhs_contracting_dimensions()); - tensorflow::gtl::ArraySlice rhs_contracting_dimensions = + absl::Span rhs_contracting_dimensions = AsInt64Slice(dimension_numbers.rhs_contracting_dimensions()); - tensorflow::gtl::ArraySlice lhs_batch_dimensions = + absl::Span lhs_batch_dimensions = AsInt64Slice(dimension_numbers.lhs_batch_dimensions()); - tensorflow::gtl::ArraySlice rhs_batch_dimensions = + absl::Span rhs_batch_dimensions = AsInt64Slice(dimension_numbers.rhs_batch_dimensions()); if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions, @@ -622,12 +571,12 @@ Status ValidateDotDimensionNumbers( !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions, rhs_batch_dimensions)) { return InvalidArgument("A dimension number is out of range in Dot: %s.", - dimension_numbers.DebugString().c_str()); + dimension_numbers.DebugString()); } // Check that dimension numbers are unique. 
- auto dims_unique = [](tensorflow::gtl::ArraySlice contracting_dims, - tensorflow::gtl::ArraySlice batch_dims) -> bool { + auto dims_unique = [](absl::Span contracting_dims, + absl::Span batch_dims) -> bool { tensorflow::gtl::FlatSet dim_set; auto is_unique = [&dim_set](int64 i) -> bool { return dim_set.insert(i).second; @@ -640,7 +589,7 @@ Status ValidateDotDimensionNumbers( if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) || !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) { return InvalidArgument("A dimension number is not unique in Dot: %s.", - dimension_numbers.DebugString().c_str()); + dimension_numbers.DebugString()); } // Check that the count of non-contracting-non-batch dimensions is in {0, 1}. @@ -681,18 +630,17 @@ Status ValidateDotDimensionNumbers( /* static */ StatusOr ShapeInference::InferDotOpShape( const Shape& lhs, const Shape& rhs, const DotDimensionNumbers& dimension_numbers) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of dot")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of dot")); + TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of dot")); + TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of dot")); auto fail = [lhs, rhs](const string& addendum) -> Status { - string message = tensorflow::strings::Printf( - "Cannot infer shape for dot operation: %s %s.", - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + string message = + StrFormat("Cannot infer shape for dot operation: %s %s.", + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs)); if (!addendum.empty()) { message += " " + addendum; } - return InvalidArgument("%s", message.c_str()); + return InvalidArgument("%s", message); }; // Check if both element types are the same. 
@@ -768,8 +716,9 @@ Status ValidateDotDimensionNumbers( } /* static */ StatusOr -ShapeInference::InferDegenerateDimensionBroadcastShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs) { +ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, + const Shape& lhs, + const Shape& rhs) { TF_RET_CHECK(ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)); // The shapes have to be compatible. That is, if some dimension d has a @@ -787,9 +736,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } else { return InvalidArgument( "Binary op %s with incompatible shapes: %s and %s.", - BinaryOperation_Name(operation).c_str(), - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + HloOpcodeString(operation), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs)); } } return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs), @@ -797,22 +745,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } /* static */ StatusOr ShapeInference::InferInDimBroadcastShape( - BinaryOperation operation, const Shape& smaller_shape, - const Shape& larger_shape, - tensorflow::gtl::ArraySlice broadcast_dimensions) { + const Shape& smaller_shape, const Shape& larger_shape, + absl::Span broadcast_dimensions) { if (broadcast_dimensions.empty() && !ShapeUtil::IsScalar(smaller_shape)) { // Reject "magic" inference for binops on different shapes, requiring // the user to provide an explicit broadcast dimension in this case. // See b/25177275 for more details. 
return InvalidArgument("Automatic shape inference not supported: %s and %s", - ShapeUtil::HumanString(smaller_shape).c_str(), - ShapeUtil::HumanString(larger_shape).c_str()); + ShapeUtil::HumanString(smaller_shape), + ShapeUtil::HumanString(larger_shape)); } else if (broadcast_dimensions.size() != ShapeUtil::Rank(smaller_shape)) { return InvalidArgument( "Size of broadcast_dimensions has to match lower-rank operand's " "rank; " - " lower-rank operand's rank is %lld, size of broadcast_dimensions is " - "%zu.", + " lower-rank operand's rank is %d, size of broadcast_dimensions is " + "%u.", ShapeUtil::Rank(smaller_shape), broadcast_dimensions.size()); } @@ -862,12 +809,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( int64 dimension_to_match = broadcast_dimensions.at(i); if (dimension_to_match < 0) { return InvalidArgument( - "Broadcast dimension number (%lld) cannot be negative.", + "Broadcast dimension number (%d) cannot be negative.", dimension_to_match); } if (dimension_to_match >= larger_shape.dimensions_size()) { return InvalidArgument( - "Broadcast dimension number (%lld) too large; higher-rank " + "Broadcast dimension number (%d) too large; higher-rank " "operand has rank %d.", dimension_to_match, larger_shape.dimensions_size()); } @@ -879,16 +826,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (small_dimension_size != large_dimension_size && small_dimension_size != 1 && large_dimension_size != 1) { return InvalidArgument( - "Broadcast dimension %d mismatch: %lld != %lld; %s and %s.", i, + "Broadcast dimension %d mismatch: %d != %d; %s and %s.", i, small_dimension_size, large_dimension_size, - ShapeUtil::HumanString(smaller_shape).c_str(), - ShapeUtil::HumanString(larger_shape).c_str()); + ShapeUtil::HumanString(smaller_shape), + ShapeUtil::HumanString(larger_shape)); } // Make sure the broadcast dimensions are listed in a strictly increasing // order. 
if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) { return InvalidArgument( - "Broadcast dimensions order is wrong: %lld comes after %lld.", + "Broadcast dimensions order is wrong: %d comes after %d.", dimension_to_match, broadcast_dimensions.at(i - 1)); } @@ -899,19 +846,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } /* static */ StatusOr ShapeInference::InferElementwiseBinaryOpShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(lhs, "lhs of elementwise binary operation")); - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(rhs, "rhs of elementwise binary operation")); + HloOpcode operation, const Shape& lhs, const Shape& rhs, + absl::Span broadcast_dimensions) { + TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of elementwise binary operation")); + TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of elementwise binary operation")); if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { return InvalidArgument( "Binary op %s with different element types: %s and %s.", - BinaryOperation_Name(operation).c_str(), - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + HloOpcodeString(operation), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs)); } if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) { @@ -943,10 +887,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? rhs : lhs; // After InDim broadcasting, perform degenerate dimensions broadcasting. 
- TF_ASSIGN_OR_RETURN( - Shape indim_broadcast_shape, - InferInDimBroadcastShape(operation, smaller_shape, larger_shape, - broadcast_dimensions)); + TF_ASSIGN_OR_RETURN(Shape indim_broadcast_shape, + InferInDimBroadcastShape(smaller_shape, larger_shape, + broadcast_dimensions)); return InferDegenerateDimensionBroadcastShape( operation, indim_broadcast_shape, larger_shape); @@ -955,59 +898,49 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferBinaryOpShape( HloOpcode opcode, const HloInstruction* lhs, const HloInstruction* rhs) { - return InferBinaryOpShape(OpcodeToBinaryOperation(opcode), lhs->shape(), - rhs->shape(), /*broadcast_dimensions=*/{}); + return InferBinaryOpShape(opcode, lhs->shape(), rhs->shape(), + /*broadcast_dimensions=*/{}); } /* static */ StatusOr ShapeInference::InferBinaryOpShape( HloOpcode opcode, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return InferBinaryOpShape(OpcodeToBinaryOperation(opcode), lhs, rhs, - broadcast_dimensions); -} - -/* static */ StatusOr ShapeInference::InferBinaryOpShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - VLOG(2) << tensorflow::strings::Printf( + absl::Span broadcast_dimensions) { + VLOG(2) << StrFormat( "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", - BinaryOperation_Name(operation).c_str(), - ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), - Join(broadcast_dimensions, ", ").c_str()); + HloOpcodeString(opcode), ShapeUtil::HumanString(lhs), + ShapeUtil::HumanString(rhs), StrJoin(broadcast_dimensions, ", ")); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - lhs, tensorflow::strings::StrCat("lhs of binary operation ", - BinaryOperation_Name(operation)))); - 
TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - rhs, tensorflow::strings::StrCat("rhs of binary operation ", - BinaryOperation_Name(operation)))); - switch (operation) { - case BINOP_MAX: - case BINOP_MIN: - case BINOP_SUB: - case BINOP_ADD: - case BINOP_ATAN2: - case BINOP_POW: - case BINOP_DIV: - case BINOP_REM: - case BINOP_MUL: - case BINOP_SHIFT_LEFT: - case BINOP_SHIFT_RIGHT_ARITHMETIC: - case BINOP_SHIFT_RIGHT_LOGICAL: - return InferElementwiseBinaryOpShape(operation, lhs, rhs, + TF_RETURN_IF_ERROR(ExpectArray( + lhs, absl::StrCat("lhs of binary operation ", HloOpcodeString(opcode)))); + TF_RETURN_IF_ERROR(ExpectArray( + rhs, absl::StrCat("rhs of binary operation ", HloOpcodeString(opcode)))); + switch (opcode) { + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kSubtract: + case HloOpcode::kAdd: + case HloOpcode::kAtan2: + case HloOpcode::kPower: + case HloOpcode::kDivide: + case HloOpcode::kRemainder: + case HloOpcode::kMultiply: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + return InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions); - case BINOP_COMPLEX: { + case HloOpcode::kComplex: { if (!ShapeUtil::ElementIsFloating(lhs)) { return InvalidArgument( "Expected element type in shape to be floating for complex compose " "operation; got %s.", - PrimitiveType_Name(lhs.element_type()).c_str()); + PrimitiveType_Name(lhs.element_type())); } TF_ASSIGN_OR_RETURN(const Shape& shape, - InferElementwiseBinaryOpShape(operation, lhs, rhs, + InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions)); if (lhs.element_type() == F32 && rhs.element_type() == F32) { return ShapeUtil::ChangeElementType(shape, C64); @@ -1015,33 +948,34 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return Unimplemented("Complex component type is not implemented."); } } - case BINOP_AND: - case BINOP_OR: + case HloOpcode::kAnd: + case HloOpcode::kOr: + case 
HloOpcode::kXor: if (lhs.element_type() != PRED && !primitive_util::IsIntegralType(lhs.element_type())) { return InvalidArgument( "Expected pred or integral type in argument to and/or operation; " "got %s.", - PrimitiveType_Name(lhs.element_type()).c_str()); + PrimitiveType_Name(lhs.element_type())); } - return InferElementwiseBinaryOpShape(operation, lhs, rhs, + return InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions); - case BINOP_EQ: - case BINOP_GE: - case BINOP_GT: - case BINOP_LE: - case BINOP_LT: - case BINOP_NE: { + case HloOpcode::kEq: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kLe: + case HloOpcode::kLt: + case HloOpcode::kNe: { TF_ASSIGN_OR_RETURN(const Shape& shape, - InferElementwiseBinaryOpShape(operation, lhs, rhs, + InferElementwiseBinaryOpShape(opcode, lhs, rhs, broadcast_dimensions)); return ShapeUtil::ChangeElementType(shape, PRED); } default: return Unimplemented( "Binary op shape inference: %s; lhs: %s; rhs: %s is not implemented.", - BinaryOperation_Name(operation).c_str(), - lhs.ShortDebugString().c_str(), rhs.ShortDebugString().c_str()); + HloOpcodeString(opcode), lhs.ShortDebugString(), + rhs.ShortDebugString()); } } @@ -1053,30 +987,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferTernaryOpShape( HloOpcode opcode, const Shape& lhs, const Shape& rhs, const Shape& ehs) { - return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs, rhs, ehs); -} - -/* static */ StatusOr ShapeInference::InferTernaryOpShape( - TernaryOperation operation, const Shape& lhs, const Shape& rhs, - const Shape& ehs) { TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(ehs)); - switch (operation) { - case TRIOP_CLAMP: + switch (opcode) { + case HloOpcode::kClamp: return InferClampShape(lhs, rhs, ehs); - case TRIOP_SELECT: + case 
HloOpcode::kSelect: return InferSelectShape(lhs, rhs, ehs); + case HloOpcode::kTupleSelect: + return InferTupleSelectShape(lhs, rhs, ehs); default: - return InvalidArgument("Unknown operation %s.", - TernaryOperation_Name(operation).c_str()); + return InvalidArgument("Unknown operation %s.", HloOpcodeString(opcode)); } } /* static */ StatusOr ShapeInference::InferVariadicOpShape( - HloOpcode opcode, - tensorflow::gtl::ArraySlice operands) { + HloOpcode opcode, absl::Span operands) { std::vector operand_shapes; + operand_shapes.reserve(operands.size()); for (const HloInstruction* operand : operands) { operand_shapes.push_back(&operand->shape()); } @@ -1084,36 +1013,44 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } /* static */ StatusOr ShapeInference::InferVariadicOpShape( - HloOpcode opcode, - tensorflow::gtl::ArraySlice operand_shapes) { - return InferVariadicOpShape(OpcodeToVariadicOperation(opcode), - operand_shapes); -} - -/* static */ StatusOr ShapeInference::InferVariadicOpShape( - VariadicOperation operation, - tensorflow::gtl::ArraySlice operand_shapes) { + HloOpcode opcode, absl::Span operand_shapes) { for (const Shape* shape : operand_shapes) { TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(*shape)); } - switch (operation) { - case VAROP_TUPLE: { + switch (opcode) { + case HloOpcode::kTuple: { Shape result = ShapeUtil::MakeTupleShape({}); + result.mutable_tuple_shapes()->Reserve(operand_shapes.size()); for (const Shape* shape : operand_shapes) { ShapeUtil::AppendShapeToTuple(*shape, &result); } return result; } + case HloOpcode::kSort: { + if (operand_shapes.size() == 1) { + return *operand_shapes[0]; + } else if (operand_shapes.size() == 2) { + if (!ShapeUtil::SameDimensions(*operand_shapes[0], + *operand_shapes[1])) { + return InvalidArgument( + "Sort keys and values dimensions must match. 
" + "Keys shape is: %s\n, Values shape is: %s", + ShapeUtil::HumanString(*operand_shapes[0]), + ShapeUtil::HumanString(*operand_shapes[1])); + } + return ShapeUtil::MakeTupleShape( + {*operand_shapes[0], *operand_shapes[1]}); + } + return InvalidArgument("Unexpected number of operands for sort"); + } default: - return InvalidArgument("Unknown operation %s.", - VariadicOperation_Name(operation).c_str()); + return InvalidArgument("Unknown operation %s.", HloOpcodeString(opcode)); } } /* static */ StatusOr ShapeInference::InferMapShape( - tensorflow::gtl::ArraySlice arg_shapes, - const ProgramShape& to_apply, - tensorflow::gtl::ArraySlice dimensions) { + absl::Span arg_shapes, const ProgramShape& to_apply, + absl::Span dimensions) { if (arg_shapes.empty()) { return InvalidArgument("Map expects at least one argument."); } @@ -1121,15 +1058,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // All arguments must have the same shape. const Shape* arg_shape = arg_shapes[0]; for (size_t i = 1; i < arg_shapes.size(); ++i) { - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(*arg_shapes[i], "operand of map")); + TF_RETURN_IF_ERROR(ExpectArray(*arg_shapes[i], "operand of map")); if (ShapeUtil::CompatibleIgnoringFpPrecision(*arg_shapes[i], *arg_shape)) { continue; } - if (!ShapeUtil::IsTuple(*arg_shapes[i]) && - !ShapeUtil::IsTuple(*arg_shape) && - ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shapes[i], + if (ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shapes[i], *arg_shape)) { if (ShapeUtil::IsScalar(*arg_shapes[i])) { continue; @@ -1147,7 +1081,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Map operation requires all operands to have the same shape; got: " "%s.", - Join(pieces, ", ").c_str()); + StrJoin(pieces, ", ")); } // Check that dimensions.size == arg_shape.dimensions_size() (we currently @@ -1155,7 +1089,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (dimensions.size() != 
arg_shape->dimensions_size()) { return InvalidArgument( "Map applied to a subset of dimensions currently not supported: " - "arg_dimension_size: %d, requested_map_dimensions_size: %zu.", + "arg_dimension_size: %d, requested_map_dimensions_size: %u.", arg_shape->dimensions_size(), dimensions.size()); } @@ -1164,7 +1098,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (dimensions[i] != i) { return InvalidArgument( "Map requires monotonically increasing dimension numbers; got: %s.", - Join(dimensions, ", ").c_str()); + StrJoin(dimensions, ", ")); } } @@ -1172,7 +1106,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (arg_shapes.size() != to_apply.parameters_size()) { return InvalidArgument( "Map applied function arity must match number of arguments; got: " - "arity: %d, arguments: %zu.", + "arity: %d, arguments: %u.", to_apply.parameters_size(), arg_shapes.size()); } @@ -1181,7 +1115,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (!ShapeUtil::IsScalar(output_shape)) { return InvalidArgument( "Mapped computation's result has to be a scalar; got: %s.", - ShapeUtil::HumanString(output_shape).c_str()); + ShapeUtil::HumanString(output_shape)); } for (int i = 0; i < to_apply.parameters_size(); ++i) { @@ -1191,7 +1125,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Mapped computation's parameter has to be a scalar; " "got parameter %d shape: %s.", - i, ShapeUtil::HumanString(parameter_shape).c_str()); + i, ShapeUtil::HumanString(parameter_shape)); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(parameter_shape, @@ -1199,8 +1133,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Mapped computation's parameter type has to match argument element " "type; got parameter %d shape: %s, argument shape: %s.", - i, ShapeUtil::HumanString(parameter_shape).c_str(), - ShapeUtil::HumanString(*arg_shape).c_str()); + i, ShapeUtil::HumanString(parameter_shape), + 
ShapeUtil::HumanString(*arg_shape)); } } @@ -1212,11 +1146,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& operand_shape, const Shape& scale_shape, const Shape& offset_shape, int64 feature_index) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm training")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - offset_shape, "offset input of batch norm training")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - scale_shape, "scale input of batch norm training")); + ExpectArray(operand_shape, "operand of batch norm training")); + TF_RETURN_IF_ERROR( + ExpectArray(offset_shape, "offset input of batch norm training")); + TF_RETURN_IF_ERROR( + ExpectArray(scale_shape, "scale input of batch norm training")); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) == Status::OK()); @@ -1229,35 +1163,35 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Expected feature_index of batch-norm-training to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (feature_index < 0) { return InvalidArgument( "Expected feature_index of batch-norm-training to " - "be a non-negative number, got %lld.", + "be a non-negative number, got %d.", feature_index); } if (ShapeUtil::Rank(operand_shape) < 1) { return InvalidArgument( "Expected the rank of operand to " - "batch-norm-training to be at least 1; got %lld.", + "batch-norm-training to be at least 1; got %d.", ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(offset_shape) != 1) { return InvalidArgument( "Offset input of batch-norm-training must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(offset_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-training must have" - " rank 1, but has rank %lld.", + " rank 
1, but has rank %d.", ShapeUtil::Rank(scale_shape)); } @@ -1265,7 +1199,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "The operand to batch-norm-training must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape, @@ -1274,8 +1208,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-training, " "but the shape of offset factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(offset_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(offset_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1284,8 +1218,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-training, " "but the shape of scale factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1295,16 +1229,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) { return InvalidArgument( "The size of offset factor should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(offset_shape, 0), feature_count); } if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return 
InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } @@ -1318,11 +1252,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& offset_shape, const Shape& mean_shape, const Shape& variance_shape, int64 feature_index) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm inference")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - offset_shape, "offset input of batch norm inference")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - scale_shape, "scale input of batch norm inference")); + ExpectArray(operand_shape, "operand of batch norm inference")); + TF_RETURN_IF_ERROR( + ExpectArray(offset_shape, "offset input of batch norm inference")); + TF_RETURN_IF_ERROR( + ExpectArray(scale_shape, "scale input of batch norm inference")); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) == Status::OK()); @@ -1339,35 +1273,35 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Expected feature_index of batch-norm-inference to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (feature_index < 0) { return InvalidArgument( "Expected feature_index of batch-norm-inference to " - "be a non-negative number, got %lld.", + "be a non-negative number, got %d.", feature_index); } if (ShapeUtil::Rank(operand_shape) < 1) { return InvalidArgument( "Expected the rank of operand to " - "batch-norm-inference to be at least 1; got %lld.", + "batch-norm-inference to be at least 1; got %d.", ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(offset_shape) != 1) { return InvalidArgument( "Offset input of batch-norm-inference 
must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(offset_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-inference must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(scale_shape)); } @@ -1375,7 +1309,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "The operand to batch-norm-inference must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape, @@ -1385,8 +1319,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "batch-norm-inference, " "but the shape of offset factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(offset_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(offset_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1396,8 +1330,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "batch-norm-inference, " "but the shape of scale factor is %s " "and the shape of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape, @@ -1407,8 +1341,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "batch-norm-inference, " "but the shape of mean is %s " "and the shape of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + 
PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(variance_shape, @@ -1418,8 +1352,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "batch-norm-inference, " "but the shape of variance is %s " "and the shape of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(variance_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(variance_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1429,32 +1363,32 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) { return InvalidArgument( "The size of offset factor should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(offset_shape, 0), feature_count); } if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) { return InvalidArgument( "The size of mean should be the same as feature count," - "but the size of mean is %lld " - "and the feature count is %lld.", + "but the size of mean is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(mean_shape, 0), feature_count); } if (ShapeUtil::GetDimension(variance_shape, 0) != feature_count) { return InvalidArgument( "The size of variance should be the same as feature count," - "but the size of variance is %lld " - "and the feature count is %lld.", + "but the size of variance is %d " + 
"and the feature count is %d.", ShapeUtil::GetDimension(variance_shape, 0), feature_count); } @@ -1465,16 +1399,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& operand_shape, const Shape& scale_shape, const Shape& mean_shape, const Shape& var_shape, const Shape& output_grad_shape, int64 feature_index) { + TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of batch norm grad")); TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm grad")); - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(scale_shape, "scale input of batch norm grad")); - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(mean_shape, "mean input of batch norm grad")); + ExpectArray(scale_shape, "scale input of batch norm grad")); + TF_RETURN_IF_ERROR(ExpectArray(mean_shape, "mean input of batch norm grad")); + TF_RETURN_IF_ERROR(ExpectArray(var_shape, "var input of batch norm grad")); TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(var_shape, "var input of batch norm grad")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - output_grad_shape, "output_grad input of batch norm grad")); + ExpectArray(output_grad_shape, "output_grad input of batch norm grad")); TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape)); TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(mean_shape)); @@ -1487,36 +1418,36 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Expected feature_index of batch-norm-grad to be " "smaller than the rank of operand_shape; " - "got feature_index %lld, and rank %lld.", + "got feature_index %d, and rank %d.", feature_index, ShapeUtil::Rank(operand_shape)); } if (ShapeUtil::Rank(operand_shape) != ShapeUtil::Rank(output_grad_shape)) { return InvalidArgument( "Expected operand_shape of batch-norm-grad to have the same rank as" - " output_grad_shape; got rank(oprand_shape) %lld, and" - " rank(output_grad_shape) %lld.", + " output_grad_shape; got rank(oprand_shape) %d, and" + " 
rank(output_grad_shape) %d.", ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape)); } if (ShapeUtil::Rank(mean_shape) != 1) { return InvalidArgument( "Mean input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(mean_shape)); } if (ShapeUtil::Rank(scale_shape) != 1) { return InvalidArgument( "Scale input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(scale_shape)); } if (ShapeUtil::Rank(var_shape) != 1) { return InvalidArgument( "Var input of batch-norm-grad must have" - " rank 1, but has rank %lld.", + " rank 1, but has rank %d.", ShapeUtil::Rank(var_shape)); } @@ -1524,14 +1455,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "The operand to batch-norm-grad must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::ElementIsFloating(output_grad_shape)) { return InvalidArgument( "The output_grad to batch-norm-grad must have a floating point " "element type, but the shape is %s.", - PrimitiveType_Name(output_grad_shape.element_type()).c_str()); + PrimitiveType_Name(output_grad_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(output_grad_shape, @@ -1540,8 +1471,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-grad, " "but the element type of output_grad is %s " "and the element type of operand is %s.", - PrimitiveType_Name(output_grad_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(output_grad_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape, @@ -1550,8 +1481,8 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-grad, " "but the element type of scale factor is %s " "and the element type of operand is %s.", - PrimitiveType_Name(scale_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(scale_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape, @@ -1560,8 +1491,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-grad, " "but the element type of mean is %s " "and the element type of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(var_shape, @@ -1570,8 +1501,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "The inputs should have the same element type for batch-norm-grad, " "but the element type of mean is %s " "and the element type of operand is %s.", - PrimitiveType_Name(mean_shape.element_type()).c_str(), - PrimitiveType_Name(operand_shape.element_type()).c_str()); + PrimitiveType_Name(mean_shape.element_type()), + PrimitiveType_Name(operand_shape.element_type())); } const int64 feature_count = operand_shape.dimensions(feature_index); @@ -1582,24 +1513,24 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) { return InvalidArgument( "The size of mean should be the same as feature count," - "but the size of offset factor is %lld " - "and the feature count is %lld.", + "but the size of offset factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(mean_shape, 0), feature_count); } if 
(ShapeUtil::GetDimension(scale_shape, 0) != feature_count) { return InvalidArgument( "The size of scale factor should be the same as feature count," - "but the size of scale factor is %lld " - "and the feature count is %lld.", + "but the size of scale factor is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(scale_shape, 0), feature_count); } if (ShapeUtil::GetDimension(var_shape, 0) != feature_count) { return InvalidArgument( "The size of variance should be the same as feature count," - "but the size of variance is %lld " - "and the feature count is %lld.", + "but the size of variance is %d " + "and the feature count is %d.", ShapeUtil::GetDimension(var_shape, 0), feature_count); } @@ -1609,8 +1540,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( ShapeUtil::GetDimension(output_grad_shape, i)) { return InvalidArgument( "The bounds of operand shape should be the same as output_grad's," - "but the bound of operand_shape at dimension %lld is %lld " - "and the bound of output_grad_shape is %lld.", + "but the bound of operand_shape at dimension %d is %d " + "and the bound of output_grad_shape is %d.", i, ShapeUtil::GetDimension(operand_shape, i), ShapeUtil::GetDimension(output_grad_shape, i)); } @@ -1622,22 +1553,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferConvolveShape( const Shape& lhs, const Shape& rhs, const Window& window, - const ConvolutionDimensionNumbers& dnums) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of convolution")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of convolution")); + const ConvolutionDimensionNumbers& dnums, int64 feature_group_count) { + TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of convolution")); + TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of convolution")); if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) { return InvalidArgument( "Convolution with different element types: %s and %s.", - 
ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str()); + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs)); } if (dnums.input_spatial_dimensions_size() != dnums.kernel_spatial_dimensions_size()) { return InvalidArgument( "Both arguments to convolution must have same number of dimensions.\n" "Window: %s", - window.DebugString().c_str()); + window.DebugString()); } const int num_spatial_dims = dnums.input_spatial_dimensions_size(); @@ -1645,19 +1575,19 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Window must have same number of dimensions as dimension numbers.\n" "Window: %s\nDimension numbers: %s.", - window.DebugString().c_str(), dnums.DebugString().c_str()); + window.DebugString(), dnums.DebugString()); } const int num_dims = num_spatial_dims + 2; if (ShapeUtil::Rank(lhs) != num_dims) { return InvalidArgument( "The LHS argument to a convolution should have rank %d; lhs: %s.", - num_dims, ShapeUtil::HumanString(lhs).c_str()); + num_dims, ShapeUtil::HumanString(lhs)); } if (ShapeUtil::Rank(rhs) != num_dims) { return InvalidArgument( "The RHS argument to a convolution should have rank %d; lhs: %s.", - num_dims, ShapeUtil::HumanString(lhs).c_str()); + num_dims, ShapeUtil::HumanString(lhs)); } TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -1694,26 +1624,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) { return InvalidArgument( "A dimension number is out of range in convolution: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if (input_dnums != expected_dnums) { return InvalidArgument( "Input dimensions of convolution must contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if (window_dnums != expected_dnums) { return InvalidArgument( "Window dimensions of convolution must 
contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } if (output_dnums != expected_dnums) { return InvalidArgument( "Output dimensions of convolution must contain each dimension exactly " "once: %s.", - dnums.DebugString().c_str()); + dnums.DebugString()); } std::vector input_spatial_dims(num_spatial_dims); @@ -1732,14 +1662,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 kernel_output_features = rhs.dimensions(dnums.kernel_output_feature_dimension()); - if (input_features != kernel_input_features) { + if (input_features != kernel_input_features * feature_group_count) { return InvalidArgument( - "Expected LHS feature dimension (value %lld) to match RHS " - "input feature dimension (value %lld); got (%s, %s)\n" + "Expected LHS feature dimension (value %d) to match RHS " + "input feature dimension * feature_group_count (value %d); " + "got (%s, %s)\n" "Dimension numbers: {%s}.", - input_features, kernel_input_features, - ShapeUtil::HumanString(lhs).c_str(), - ShapeUtil::HumanString(rhs).c_str(), dnums.DebugString().c_str()); + input_features, kernel_input_features * feature_group_count, + ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs), + dnums.DebugString()); } std::vector window_dims(num_spatial_dims); for (int i = 0; i < num_spatial_dims; ++i) { @@ -1751,8 +1682,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "RHS shape: %s\n\t" "Window: {%s}\n\t" "Dimension numbers: {%s}.", - ShapeUtil::HumanString(rhs).c_str(), window.ShortDebugString().c_str(), - dnums.ShortDebugString().c_str()); + ShapeUtil::HumanString(rhs), window.ShortDebugString(), + dnums.ShortDebugString()); } Shape base_shape = @@ -1775,32 +1706,32 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferFftShape( const Shape& in, const FftType fft_type, - const tensorflow::gtl::ArraySlice fft_length) { + const absl::Span fft_length) { const int64 fft_rank = 
fft_length.size(); if (fft_rank < 1 || fft_rank > 3) { - return InvalidArgument("FFT only supports ranks 1-3; got %lld.", fft_rank); + return InvalidArgument("FFT only supports ranks 1-3; got %d.", fft_rank); } -#define RET_CHECK_RANK(x) \ - if (x.dimensions_size() < fft_rank) { \ - return InvalidArgument( \ - "FFT of rank %lld requires input of at least " \ - "same rank; got input of rank %d", \ - fft_rank, x.dimensions_size()); \ +#define RET_CHECK_RANK(x) \ + if (x.dimensions_size() < fft_rank) { \ + return InvalidArgument( \ + "FFT of rank %d requires input of at least " \ + "same rank; got input of rank %d", \ + fft_rank, x.dimensions_size()); \ } switch (fft_type) { case FFT: case IFFT: if (in.element_type() != C64) { return InvalidArgument("%s requires C64 input type, found %s.", - FftType_Name(fft_type).c_str(), - PrimitiveType_Name(in.element_type()).c_str()); + FftType_Name(fft_type), + PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); return in; case RFFT: { if (in.element_type() != F32) { return InvalidArgument("RFFT requires F32 input type, found %s.", - PrimitiveType_Name(in.element_type()).c_str()); + PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); for (int i = 0; i < fft_rank; i++) { @@ -1808,7 +1739,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( fft_length[i]) { return InvalidArgument( "RFFT requires innermost dimensions match fft_length but " - "dimension %lld is %lld and should be %lld.", + "dimension %d is %d and should be %d.", in.dimensions_size() - fft_rank + i, in.dimensions(in.dimensions_size() - fft_rank + i), fft_length[i]); @@ -1822,7 +1753,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( case IRFFT: { if (in.element_type() != C64) { return InvalidArgument("IRFFT requires C64 input type, found %s.", - PrimitiveType_Name(in.element_type()).c_str()); + PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); Shape result = ShapeUtil::ComplexComponentShape(in); @@ -1831,7 +1762,7 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape( fft_length[i]) { return InvalidArgument( "IRFFT requires all but one innermost dimensions match " - "fft_length, but dimension %lld is %lld and should be %lld.", + "fft_length, but dimension %d is %d and should be %d.", in.dimensions_size() - fft_rank + i, in.dimensions(in.dimensions_size() - fft_rank + i), fft_length[i]); @@ -1841,7 +1772,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( fft_length[fft_rank - 1] / 2 + 1) { return InvalidArgument( "IRFFT requires innermost dimension matches fft_length/2+1, but " - "dimension %d is %lld and should be %lld.", + "dimension %d is %d and should be %d.", in.dimensions_size() - 1, in.dimensions(in.dimensions_size() - 1), fft_length[fft_rank - 1] / 2 + 1); } @@ -1856,10 +1787,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } /* static */ StatusOr ShapeInference::InferCrossReplicaSumShape( - tensorflow::gtl::ArraySlice operand_shapes) { + absl::Span operand_shapes) { for (const Shape* operand_shape : operand_shapes) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(*operand_shape, "operand of cross replica sum")); + ExpectArray(*operand_shape, "operand of cross replica sum")); } if (operand_shapes.size() == 1) { return *operand_shapes[0]; @@ -1871,20 +1802,102 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return ShapeUtil::MakeTupleShape(operand_shape_values); } +/* static */ StatusOr ShapeInference::InferAllToAllShape( + const Shape& shape, int64 split_dimension, int64 concat_dimension, + int64 split_count) { + TF_RET_CHECK(split_count > 0); + if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) { + return InvalidArgument( + "AllToAll split_dimension %d is out-of-bounds in shape %s.", + split_dimension, ShapeUtil::HumanString(shape)); + } + if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) { + return InvalidArgument( + "AllToAll concat_dimension %d is out-of-bounds in shape %s.", + 
concat_dimension, ShapeUtil::HumanString(shape)); + } + if (shape.dimensions(split_dimension) % split_count != 0) { + return InvalidArgument( + "AllToAll split dimension size %d must be dividable by split_count " + "%d.", + shape.dimensions(split_dimension), split_count); + } + std::vector new_dimensions(shape.dimensions().begin(), + shape.dimensions().end()); + new_dimensions[split_dimension] /= split_count; + new_dimensions[concat_dimension] *= split_count; + return ShapeUtil::MakeShape(shape.element_type(), new_dimensions); +} + +/* static */ StatusOr ShapeInference::InferAllToAllTupleShape( + absl::Span operand_shapes) { + // An Alltoall HLO instruction receives N operands (with the same shape) and + // returns a tuple that contains N array shapes. + TF_RET_CHECK(!operand_shapes.empty()); + for (int i = 0; i < operand_shapes.size(); i++) { + if (!ShapeUtil::Equal(*operand_shapes[0], *operand_shapes[i])) { + return InvalidArgument( + "HLO all-to-all has operands with different shapes: the 0th " + "operand shape %s, but the %dth operand has shape %s.", + ShapeUtil::HumanString(*operand_shapes[0]), i, + ShapeUtil::HumanString(*operand_shapes[i])); + } + } + + return InferVariadicOpShape(HloOpcode::kTuple, operand_shapes); +} + +/* static */ StatusOr ShapeInference::InferCollectivePermuteShape( + const Shape& shape) { + TF_RET_CHECK(ShapeUtil::IsArray(shape)); + return shape; +} + /* static */ StatusOr ShapeInference::InferReduceShape( - const Shape& arg, const Shape& init_value, - tensorflow::gtl::ArraySlice dimensions_to_reduce, + absl::Span arg_shapes, + absl::Span dimensions_to_reduce, const ProgramShape& to_apply) { - // Check that the dimension to reduce are in-bounds for the given shape. 
+ if (arg_shapes.empty()) { + return InvalidArgument("Reduce must have at least 2 arguments, has 0"); + } + if (arg_shapes.size() % 2) { + return InvalidArgument( + "Reduce must have an even number of arguments, has %lu", + arg_shapes.size()); + } + int64 num_reduced_args = arg_shapes.size() / 2; + + auto reduced_args = arg_shapes.subspan(0, num_reduced_args); + // Check that all of the reduced tensors have the same dimensions. The element + // types may be different. + for (int64 i = 1; i < num_reduced_args; ++i) { + if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) { + return InvalidArgument( + "All reduced tensors must have the sime dimension. Tensor 0 has " + "shape %s, Tensor %d has shape %s", + ShapeUtil::HumanString(*reduced_args[0]), i, + ShapeUtil::HumanString(*reduced_args[i])); + } + } + + // Check that the dimensions to reduce are in-bounds for the given shape. + // We've already verified all reduced tensors have the same dimensions, so it + // doesn't matter which one we choose. 
+ const Shape& arg = *reduced_args[0]; for (int64 dimension : dimensions_to_reduce) { if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) { - return InvalidArgument( - "Reducing out-of-bounds dimension %lld in shape %s.", dimension, - ShapeUtil::HumanString(arg).c_str()); + return InvalidArgument("Reducing out-of-bounds dimension %d in shape %s.", + dimension, ShapeUtil::HumanString(arg)); } } - TF_RETURN_IF_ERROR( - VerifyReducerShape(to_apply, init_value, arg.element_type())); + + auto init_values = arg_shapes.subspan(num_reduced_args, arg_shapes.size()); + std::vector element_types; + for (const Shape* arg : reduced_args) { + element_types.push_back(arg->element_type()); + } + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply, init_values, element_types, + num_reduced_args)); std::set dimensions_to_reduce_set(dimensions_to_reduce.begin(), dimensions_to_reduce.end()); @@ -1895,16 +1908,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } } - return ShapeUtil::MakeShape(to_apply.result().element_type(), new_dimensions); + if (ShapeUtil::IsScalar(to_apply.result())) { + return ShapeUtil::MakeShape(to_apply.result().element_type(), + new_dimensions); + } else { + std::vector result_subshapes; + for (const Shape& subshape : to_apply.result().tuple_shapes()) { + result_subshapes.push_back( + ShapeUtil::MakeShape(subshape.element_type(), new_dimensions)); + } + return ShapeUtil::MakeTupleShape(result_subshapes); + } } /* static */ StatusOr ShapeInference::InferReduceWindowShape( const Shape& operand_shape, const Shape& init_value_shape, const Window& window, const ProgramShape& to_apply_shape) { - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of reduce-window")); - TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, init_value_shape, - operand_shape.element_type())); + TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of reduce-window")); + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, {&init_value_shape}, + 
{operand_shape.element_type()}, + /*inputs=*/1)); return InferWindowOutputShape(operand_shape, window, init_value_shape.element_type(), /*allow_negative_padding=*/false); @@ -1915,7 +1938,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Window& window, const Shape& source_shape, const Shape& init_value_shape, const ProgramShape& scatter_shape) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of select-and-scatter")); + ExpectArray(operand_shape, "operand of select-and-scatter")); // Check if the select function has a proper shape of (T,T) -> PRED. if (select_shape.parameters_size() != 2) { @@ -1936,21 +1959,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Select function's first parameter shape currently must " "match the operand element shape, but got %s vs %s.", - ShapeUtil::HumanString(select_shape.parameters(0)).c_str(), - ShapeUtil::HumanString(operand_element_shape).c_str()); + ShapeUtil::HumanString(select_shape.parameters(0)), + ShapeUtil::HumanString(operand_element_shape)); } if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape, select_shape.parameters(1))) { return InvalidArgument( "Select function's second parameter shape currently must " "match the operand element shape, but got %s vs %s.", - ShapeUtil::HumanString(select_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(operand_element_shape).c_str()); + ShapeUtil::HumanString(select_shape.parameters(1)), + ShapeUtil::HumanString(operand_element_shape)); } // Check if the scatter function has a proper shape as a reduction. - TF_RETURN_IF_ERROR(VerifyReducerShape(scatter_shape, init_value_shape, - source_shape.element_type())); + TF_RETURN_IF_ERROR(VerifyReducerShape(scatter_shape, {&init_value_shape}, + {source_shape.element_type()}, + /*inputs=*/1)); // Check if the result shape of window operation matches the source shape. 
TF_ASSIGN_OR_RETURN(const Shape& window_result_shape, @@ -1962,43 +1986,40 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Source shape does not match the shape of window-reduced operand: " "source(%s), window-reduced operand(%s).", - ShapeUtil::HumanString(source_shape).c_str(), - ShapeUtil::HumanString(window_result_shape).c_str()); + ShapeUtil::HumanString(source_shape), + ShapeUtil::HumanString(window_result_shape)); } return operand_shape; } /* static */ StatusOr ShapeInference::InferSliceShape( - const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits, - tensorflow::gtl::ArraySlice strides) { + const Shape& arg, absl::Span starts, + absl::Span limits, absl::Span strides) { auto error = [&](const string& message) { return InvalidArgument( "%s in slice operation; argument shape: %s; starts: {%s}; limits: " "{%s}; strides: {%s}.", - message.c_str(), ShapeUtil::HumanString(arg).c_str(), - Join(starts, ",").c_str(), Join(limits, ",").c_str(), - Join(strides, ",").c_str()); + message, ShapeUtil::HumanString(arg), StrJoin(starts, ","), + StrJoin(limits, ","), StrJoin(strides, ",")); }; - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); - VLOG(2) << tensorflow::strings::Printf( - "slicing shape %s starts={%s} limits={%s}", - ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(), - Join(limits, ", ").c_str()); + TF_RETURN_IF_ERROR(ExpectArray(arg, "operand of slice")); + VLOG(2) << StrFormat("slicing shape %s starts={%s} limits={%s}", + ShapeUtil::HumanString(arg), StrJoin(starts, ", "), + StrJoin(limits, ", ")); if (starts.size() != limits.size()) { - return error(Printf("slice start and limit sizes differ: %zu vs %zu", - starts.size(), limits.size())); + return error(StrFormat("slice start and limit sizes differ: %u vs %u", + starts.size(), limits.size())); } if (starts.size() != strides.size()) { - return error(Printf("slice start and strides sizes differ: %zu vs 
%zu", - starts.size(), strides.size())); + return error(StrFormat("slice start and strides sizes differ: %u vs %u", + starts.size(), strides.size())); } if (starts.size() != ShapeUtil::Rank(arg)) { return InvalidArgument( - "Slice index count does not match argument rank: %zu vs %lld.", + "Slice index count does not match argument rank: %u vs %d.", starts.size(), ShapeUtil::Rank(arg)); } @@ -2008,27 +2029,24 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( int64 limit_index = limits[dimension]; int64 stride = strides[dimension]; if (start_index < 0) { - return InvalidArgument("Negative start index to slice: %lld.", - start_index); + return InvalidArgument("Negative start index to slice: %d.", start_index); } if (limit_index > arg.dimensions(dimension)) { return error( - Printf("limit index (%lld) must be less than or equal to dimension " - "size (%lld)", - limit_index, arg.dimensions(dimension))); - } - VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, - start_index); - VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, - limit_index); + StrFormat("limit index (%d) must be less than or equal to dimension " + "size (%d)", + limit_index, arg.dimensions(dimension))); + } + VLOG(2) << StrFormat("starts[%d] = %d", dimension, start_index); + VLOG(2) << StrFormat("limits[%d] = %d", dimension, limit_index); if (start_index > limit_index) { return error( - Printf("limit index (%lld) must be greater or equal to " - "start index (%lld) in slice with positive stride", - limit_index, start_index)); + StrFormat("limit index (%d) must be greater or equal to " + "start index (%d) in slice with positive stride", + limit_index, start_index)); } if (stride <= 0) { - return InvalidArgument("Stride (%lld) must be positive.", stride); + return InvalidArgument("Stride (%d) must be positive.", stride); } sizes.push_back((limit_index - start_index + stride - 1) / stride); } @@ -2038,21 +2056,19 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferDynamicSliceShape( const Shape& operand_shape, const Shape& start_indices_shape, - tensorflow::gtl::ArraySlice slice_sizes) { + absl::Span slice_sizes) { + TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of dynamic slice")); TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of dynamic slice")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(start_indices_shape, - "start indices of dynamic slice")); + ExpectArray(start_indices_shape, "start indices of dynamic slice")); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << StrFormat( "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", - ShapeUtil::HumanString(operand_shape).c_str(), - ShapeUtil::HumanString(start_indices_shape).c_str(), - Join(slice_sizes, ", ").c_str()); + ShapeUtil::HumanString(operand_shape), + ShapeUtil::HumanString(start_indices_shape), StrJoin(slice_sizes, ", ")); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( - "Dynamic slice start indices of rank %lld must be rank1.", + "Dynamic slice start indices of rank %d must be rank1.", ShapeUtil::Rank(start_indices_shape)); } @@ -2064,16 +2080,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 start_num_dims = start_indices_shape.dimensions(0); if (ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "Dynamic slice start number of dimensions %lld (%s) must match rank " - "%lld of slice input (%s).", - start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::Rank(operand_shape), - ShapeUtil::HumanString(operand_shape).c_str()); + "Dynamic slice start number of dimensions %d (%s) must match rank " + "%d of slice input (%s).", + start_num_dims, ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape)); } if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) 
{ return InvalidArgument( - "Dynamic slice index count does not match argument rank: %zu vs %lld.", + "Dynamic slice index count does not match argument rank: %u vs %d.", slice_sizes.size(), ShapeUtil::Rank(operand_shape)); } @@ -2081,16 +2096,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 input_dim_size = operand_shape.dimensions(dim); const int64 slice_dim_size = slice_sizes[dim]; if (slice_dim_size < 0) { - return InvalidArgument("Negative size index to dynamic slice: %lld.", + return InvalidArgument("Negative size index to dynamic slice: %d.", slice_dim_size); } if (slice_dim_size > input_dim_size) { return InvalidArgument( - "Slice dim size %lld greater than dynamic slice dimension: %lld.", + "Slice dim size %d greater than dynamic slice dimension: %d.", slice_dim_size, input_dim_size); } - VLOG(2) << tensorflow::strings::Printf("slice_sizes[%lld] = %lld", dim, - slice_dim_size); + VLOG(2) << StrFormat("slice_sizes[%d] = %d", dim, slice_dim_size); } return ShapeUtil::MakeShape(operand_shape.element_type(), slice_sizes); @@ -2100,22 +2114,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& operand_shape, const Shape& update_shape, const Shape& start_indices_shape) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of dynamic update slice")); + ExpectArray(operand_shape, "operand of dynamic update slice")); TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(update_shape, "update of dynamic update slice")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - start_indices_shape, "start indices of dynamic update slice")); + ExpectArray(update_shape, "update of dynamic update slice")); + TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape, + "start indices of dynamic update slice")); - VLOG(2) << tensorflow::strings::Printf( + VLOG(2) << StrFormat( "updating slice of shape %s at dynamic start_indices %s with update " "shape %s", - ShapeUtil::HumanString(operand_shape).c_str(), - 
ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::HumanString(update_shape).c_str()); + ShapeUtil::HumanString(operand_shape), + ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::HumanString(update_shape)); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( - "Dynamic update slice start indices of rank %lld must be rank1.", + "Dynamic update slice start indices of rank %d must be rank1.", ShapeUtil::Rank(start_indices_shape)); } @@ -2127,17 +2141,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 start_num_dims = start_indices_shape.dimensions(0); if (ShapeUtil::Rank(operand_shape) != start_num_dims) { return InvalidArgument( - "Dynamic update slice start number of dimensions %lld (%s) must match " - "rank %lld of slice input (%s).", - start_num_dims, ShapeUtil::HumanString(start_indices_shape).c_str(), - ShapeUtil::Rank(operand_shape), - ShapeUtil::HumanString(operand_shape).c_str()); + "Dynamic update slice start number of dimensions %d (%s) must match " + "rank %d of slice input (%s).", + start_num_dims, ShapeUtil::HumanString(start_indices_shape), + ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape)); } if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) { return InvalidArgument( "Dynamic update slice update rank does not match argument rank: " - "%lld vs %lld.", + "%d vs %d.", ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape)); } @@ -2146,8 +2159,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Dynamic update slice update element type does not match argument. 
" "operand.element_type: %s vs update.element_type: %s.", - PrimitiveType_Name(operand_shape.element_type()).c_str(), - PrimitiveType_Name(update_shape.element_type()).c_str()); + PrimitiveType_Name(operand_shape.element_type()), + PrimitiveType_Name(update_shape.element_type())); } for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) { @@ -2155,33 +2168,31 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const int64 update_dim_size = update_shape.dimensions(dim); if (update_dim_size < 0) { return InvalidArgument( - "Size index %lld to dynamic update slice must be >= 0.", + "Size index %d to dynamic update slice must be >= 0.", update_dim_size); } if (update_dim_size > input_dim_size) { return InvalidArgument( - "Update dim size %lld greater than dynamic slice dimension: %lld.", + "Update dim size %d greater than dynamic slice dimension: %d.", update_dim_size, input_dim_size); } - VLOG(2) << tensorflow::strings::Printf("update_sizes[%lld] = %lld", dim, - update_dim_size); + VLOG(2) << StrFormat("update_sizes[%d] = %d", dim, update_dim_size); } return operand_shape; } /*static */ StatusOr ShapeInference::InferReverseShape( - const Shape& operand_shape, tensorflow::gtl::ArraySlice dimensions) { - TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(operand_shape, "operand of reverse")); + const Shape& operand_shape, absl::Span dimensions) { + TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of reverse")); if (!AllUnique(dimensions)) { return InvalidArgument("a dimension number is duplicated in reverse"); } for (int64 dimension : dimensions) { if (dimension >= ShapeUtil::Rank(operand_shape) || dimension < 0) { return InvalidArgument( - "One of the reverse dimensions (%lld) is out-of-bounds in shape %s.", - dimension, ShapeUtil::HumanString(operand_shape).c_str()); + "One of the reverse dimensions (%d) is out-of-bounds in shape %s.", + dimension, ShapeUtil::HumanString(operand_shape)); } } return operand_shape; @@ -2192,14 +2203,14 @@ 
ShapeInference::InferDegenerateDimensionBroadcastShape( if (!ShapeUtil::IsTuple(arg)) { return InvalidArgument( "Cannot infer shape: attempting to index into non-tuple: %s.", - ShapeUtil::HumanString(arg).c_str()); + ShapeUtil::HumanString(arg)); } if (index >= arg.tuple_shapes_size()) { return InvalidArgument( - "Cannot infer shape: attempt to index out of tuple bounds: %lld " + "Cannot infer shape: attempt to index out of tuple bounds: %d " ">= %d in shape %s.", - index, arg.tuple_shapes_size(), ShapeUtil::HumanString(arg).c_str()); + index, arg.tuple_shapes_size(), ShapeUtil::HumanString(arg)); } return arg.tuple_shapes(index); @@ -2219,17 +2230,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } auto shape_string = [&]() { - return tensorflow::strings::Printf( - "Condition: %s; body: %s; init: %s.", - ShapeUtil::HumanString(condition).c_str(), - ShapeUtil::HumanString(body).c_str(), - ShapeUtil::HumanString(init).c_str()); + return StrFormat( + "Condition: %s; body: %s; init: %s.", ShapeUtil::HumanString(condition), + ShapeUtil::HumanString(body), ShapeUtil::HumanString(init)); }; // Check the shapes of computation parameters and return types. 
if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) { return InvalidArgument("Condition must return a boolean; got %s.", - shape_string().c_str()); + shape_string()); } if (!ShapeUtil::Compatible(body.result(), condition.parameters(0)) || !ShapeUtil::Compatible(body.result(), body.parameters(0)) || @@ -2237,7 +2246,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "The parameter of condition and body, the result of the body, and init " "must all have the same shape; got %s.", - shape_string().c_str()); + shape_string()); } return init; @@ -2249,7 +2258,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const ProgramShape& false_computation) { if (!ShapeUtil::ShapeIs(predicate, PRED, {})) { return InvalidArgument("Predicate must be a boolean; got %s.", - ShapeUtil::HumanString(predicate).c_str()); + ShapeUtil::HumanString(predicate)); } if (true_computation.parameters_size() != 1) { @@ -2258,15 +2267,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } if (!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) { auto true_shape_string = [&]() { - return tensorflow::strings::Printf( - "true_operand: %s; true_computation: %s", - ShapeUtil::HumanString(true_operand).c_str(), - ShapeUtil::HumanString(true_computation).c_str()); + return StrFormat("true_operand: %s; true_computation: %s", + ShapeUtil::HumanString(true_operand), + ShapeUtil::HumanString(true_computation)); }; return InvalidArgument( "true_operand must match the shape of the only parameter of " "true_computation: got %s.", - true_shape_string().c_str()); + true_shape_string()); } if (false_computation.parameters_size() != 1) { @@ -2275,38 +2283,37 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) { auto false_shape_string = [&]() { - return tensorflow::strings::Printf( - "false_operand: %s; false_computation: %s", - 
ShapeUtil::HumanString(false_operand).c_str(), - ShapeUtil::HumanString(false_computation).c_str()); + return StrFormat("false_operand: %s; false_computation: %s", + ShapeUtil::HumanString(false_operand), + ShapeUtil::HumanString(false_computation)); }; return InvalidArgument( "false_operand must match the shape of the only parameter of " "false_computation: got %s.", - false_shape_string().c_str()); + false_shape_string()); } if (!ShapeUtil::Compatible(true_computation.result(), false_computation.result())) { auto shape_string = [&]() { - return tensorflow::strings::Printf( + return StrFormat( "true_computation result: %s; false_computation result: %s.", - ShapeUtil::HumanString(true_computation.result()).c_str(), - ShapeUtil::HumanString(false_computation.result()).c_str()); + ShapeUtil::HumanString(true_computation.result()), + ShapeUtil::HumanString(false_computation.result())); }; return InvalidArgument( "the result of true_computation and false_computation must have the " "same shape: got %s.", - shape_string().c_str()); + shape_string()); } return true_computation.result(); } /* static */ StatusOr ShapeInference::InferBroadcastShape( - const Shape& operand, tensorflow::gtl::ArraySlice broadcast_sizes) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "operand of broadcast")); + const Shape& operand, absl::Span broadcast_sizes) { + TF_RETURN_IF_ERROR(ExpectArray(operand, "operand of broadcast")); for (int64 size : broadcast_sizes) { if (size < 0) { - return InvalidArgument("Broadcast with negative dimension size %lld.", + return InvalidArgument("Broadcast with negative dimension size %d.", size); } } @@ -2320,9 +2327,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } /* static */ StatusOr ShapeInference::InferReshapeShape( - const Shape& operand, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "reshape")); + const Shape& operand, absl::Span dimensions, + 
absl::Span new_sizes) { + TF_RETURN_IF_ERROR(ExpectArray(operand, "reshape")); Shape inferred_shape = ShapeUtil::MakeShape(operand.element_type(), new_sizes); @@ -2331,11 +2338,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (ShapeUtil::ElementsIn(operand) != ShapeUtil::ElementsIn(inferred_shape)) { return InvalidArgument( - "Reshape operation has mismatched element counts: from=%lld (%s) " - "to=%lld (%s).", - ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand).c_str(), + "Reshape operation has mismatched element counts: from=%d (%s) " + "to=%d (%s).", + ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand), ShapeUtil::ElementsIn(inferred_shape), - ShapeUtil::HumanString(inferred_shape).c_str()); + ShapeUtil::HumanString(inferred_shape)); } std::vector indices(ShapeUtil::Rank(operand)); @@ -2346,15 +2353,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Reshape dimensions [%s] are not a permutation of the operand " "dimensions (operand shape is %s).", - Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str()); + StrJoin(dimensions, ","), ShapeUtil::HumanString(operand)); } return inferred_shape; } /* static */ StatusOr ShapeInference::InferTransposeShape( - const Shape& operand, tensorflow::gtl::ArraySlice dimensions) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "transpose")); + const Shape& operand, absl::Span dimensions) { + TF_RETURN_IF_ERROR(ExpectArray(operand, "transpose")); std::vector indices(ShapeUtil::Rank(operand)); std::iota(indices.begin(), indices.end(), 0); @@ -2375,15 +2382,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // "degenerate" cases, as with binary elementwise ops. 
/* static */ StatusOr ShapeInference::InferClampShape( const Shape& min, const Shape& operand, const Shape& max) { - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(min, "clamp min")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "clamp operand")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(max, "clamp max")); + TF_RETURN_IF_ERROR(ExpectArray(min, "clamp min")); + TF_RETURN_IF_ERROR(ExpectArray(operand, "clamp operand")); + TF_RETURN_IF_ERROR(ExpectArray(max, "clamp max")); if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(min, operand) || !ShapeUtil::SameElementTypeIgnoringFpPrecision(max, operand)) { return InvalidArgument("Clamp with different operand types: %s, %s, %s.", - ShapeUtil::HumanString(min).c_str(), - ShapeUtil::HumanString(operand).c_str(), - ShapeUtil::HumanString(max).c_str()); + ShapeUtil::HumanString(min), + ShapeUtil::HumanString(operand), + ShapeUtil::HumanString(max)); } if (((ShapeUtil::CompatibleIgnoringFpPrecision(min, operand) || ShapeUtil::IsScalar(min)) && @@ -2400,9 +2407,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return ShapeUtil::ChangeElementType(min, operand.element_type()); } } - return Unimplemented( - "%s, %s %s is not implemented.", min.ShortDebugString().c_str(), - max.ShortDebugString().c_str(), operand.ShortDebugString().c_str()); + return Unimplemented("%s, %s %s is not implemented.", + min.ShortDebugString(), max.ShortDebugString(), + operand.ShortDebugString()); } // TODO(b/36794510): Make broadcast semantics more consistent, by supporting @@ -2410,27 +2417,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // broadcast from all operands, not just the predicate. /* static */ StatusOr ShapeInference::InferSelectShape( const Shape& pred, const Shape& on_true, const Shape& on_false) { - bool compatible; - if (ShapeUtil::IsTuple(on_true)) { - // Select only defines the top-level buffer, so if it's a tuple, the two - // input must match exactly. 
- compatible = ShapeUtil::Compatible(on_true, on_false); - } else { - compatible = ShapeUtil::CompatibleIgnoringFpPrecision(on_true, on_false); - } - if (!compatible) { + if (!ShapeUtil::CompatibleIgnoringFpPrecision(on_true, on_false)) { return InvalidArgument( "Operands to select must be the same shape; got %s and %s.", - ShapeUtil::HumanString(on_true).c_str(), - ShapeUtil::HumanString(on_false).c_str()); + ShapeUtil::HumanString(on_true), ShapeUtil::HumanString(on_false)); } if (pred.element_type() != PRED) { return InvalidArgument( "Select's pred operand must have PRED element type; got %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } if (ShapeUtil::CompatibleIgnoringElementType(pred, on_true) || - ShapeUtil::Rank(pred) == 0) { + ShapeUtil::IsScalar(pred)) { // By this stage we know that pred's element type is PRED. Therefore, this // check restricts pred to be a PRED scalar, or a PRED array with the same // dimensions as on_true and on_false. @@ -2440,26 +2438,47 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Select operation with non-scalar predicate with dimensionality " " different from the other operands: %s.", - ShapeUtil::HumanString(pred).c_str()); + ShapeUtil::HumanString(pred)); } } +/* static */ StatusOr ShapeInference::InferTupleSelectShape( + const Shape& pred, const Shape& on_true, const Shape& on_false) { + // Select only defines the top-level buffer, so if it's a tuple, the two + // input must match exactly. 
+ if (!ShapeUtil::Compatible(on_true, on_false)) { + return InvalidArgument( + "Operands to tuple-select must be the same shape; got %s and %s.", + ShapeUtil::HumanString(on_true), ShapeUtil::HumanString(on_false)); + } + if (pred.element_type() != PRED) { + return InvalidArgument( + "TupleSelect's pred operand must have PRED element type; got %s.", + ShapeUtil::HumanString(pred)); + } + if (!ShapeUtil::IsScalar(pred)) { + return InvalidArgument( + "TupleSelect operation with non-scalar predicate: %s.", + ShapeUtil::HumanString(pred)); + } + return on_true; +} + /* static */ StatusOr ShapeInference::InferCallShape( - tensorflow::gtl::ArraySlice arg_shapes, - const ProgramShape& to_apply) { + absl::Span arg_shapes, const ProgramShape& to_apply) { // The applied function's arity equals the number of arguments. if (arg_shapes.size() != to_apply.parameters_size()) { string computation_signature = ShapeUtil::HumanString(to_apply); string argument_shapes = - Join(arg_shapes, ", ", [](string* out, const Shape* shape) { - tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); + StrJoin(arg_shapes, ", ", [](string* out, const Shape* shape) { + absl::StrAppend(out, ShapeUtil::HumanString(*shape)); }); return InvalidArgument( "Call applied function arity must match number of arguments; got: " - "arity: %d, arguments: %zu; computation signature: %s; argument " + "arity: %d, arguments: %u; computation signature: %s; argument " "shapes: [%s].", - to_apply.parameters_size(), arg_shapes.size(), - computation_signature.c_str(), argument_shapes.c_str()); + to_apply.parameters_size(), arg_shapes.size(), computation_signature, + argument_shapes); } // All arguments must be compatible with the program shape. 
@@ -2470,8 +2489,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Call parameter must match argument; got parameter %d shape: %s, " "argument shape: %s.", - i, ShapeUtil::HumanString(param_shape).c_str(), - ShapeUtil::HumanString(arg_shape).c_str()); + i, ShapeUtil::HumanString(param_shape), + ShapeUtil::HumanString(arg_shape)); } } @@ -2479,202 +2498,198 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( } static Status ValidateGatherDimensionNumbers( - const Shape& input_shape, - tensorflow::gtl::ArraySlice gather_indices_shape, + const Shape& input_shape, absl::Span start_indices_shape, const GatherDimensionNumbers& dim_numbers) { - if (!c_is_sorted(dim_numbers.output_window_dims())) { + if (!absl::c_is_sorted(dim_numbers.offset_dims())) { return InvalidArgument( "Output window dimensions in gather op must be ascending; got: %s.", - Join(dim_numbers.output_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.offset_dims(), ", ")); } - if (c_adjacent_find(dim_numbers.output_window_dims()) != - dim_numbers.output_window_dims().end()) { + if (absl::c_adjacent_find(dim_numbers.offset_dims()) != + dim_numbers.offset_dims().end()) { return InvalidArgument( "Output window dimensions in gather op must not repeat; got: %s.", - Join(dim_numbers.output_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.offset_dims(), ", ")); } - const int64 output_window_dim_count = dim_numbers.output_window_dims_size(); + const int64 output_offset_dim_count = dim_numbers.offset_dims_size(); const int64 output_shape_rank = - output_window_dim_count + gather_indices_shape.size() - 1; + output_offset_dim_count + start_indices_shape.size() - 1; - for (int i = 0; i < dim_numbers.output_window_dims_size(); ++i) { - int64 window_index = dim_numbers.output_window_dims(i); - if (window_index < 0 || window_index >= output_shape_rank) { + for (int i = 0; i < dim_numbers.offset_dims_size(); ++i) { + int64 offset_dim = dim_numbers.offset_dims(i); + if 
(offset_dim < 0 || offset_dim >= output_shape_rank) { return InvalidArgument( - "Window index %d in gather op is out of bounds; got %lld, but should " - "have been in [0,%lld).", - i, window_index, output_shape_rank); + "Offset dimension %d in gather op is out of bounds; got %d, but " + "should " + "have been in [0,%d).", + i, offset_dim, output_shape_rank); } } - if (dim_numbers.gather_dims_to_operand_dims_size() != - gather_indices_shape[dim_numbers.index_vector_dim()]) { + if (dim_numbers.start_index_map_size() != + start_indices_shape[dim_numbers.index_vector_dim()]) { return InvalidArgument( - "Gather op has %d elements in gather_dims_to_operand_dims and the " - "bound of dimension index_vector_dim=%lld of gather_indices is " - "%lld. These two numbers must be equal.", - dim_numbers.gather_dims_to_operand_dims_size(), - dim_numbers.index_vector_dim(), - gather_indices_shape[dim_numbers.index_vector_dim()]); + "Gather op has %d elements in start_index_map and the " + "bound of dimension index_vector_dim=%d of start_indices is " + "%d. 
These two numbers must be equal.", + dim_numbers.start_index_map_size(), dim_numbers.index_vector_dim(), + start_indices_shape[dim_numbers.index_vector_dim()]); } - for (int i = 0; i < dim_numbers.gather_dims_to_operand_dims_size(); i++) { - int64 gather_dim_to_input_dim = dim_numbers.gather_dims_to_operand_dims(i); - if (gather_dim_to_input_dim < 0 || - gather_dim_to_input_dim >= input_shape.dimensions_size()) { + for (int i = 0; i < dim_numbers.start_index_map_size(); i++) { + int64 operand_dim_for_start_index_i = dim_numbers.start_index_map(i); + if (operand_dim_for_start_index_i < 0 || + operand_dim_for_start_index_i >= input_shape.dimensions_size()) { return InvalidArgument( - "Invalid gather_dims_to_operand_dims mapping; domain is [0, %d), " - "got: %d->%lld.", - input_shape.dimensions_size(), i, gather_dim_to_input_dim); + "Invalid start_index_map; domain is [0, %d), got: %d->%d.", + input_shape.dimensions_size(), i, operand_dim_for_start_index_i); } } - std::vector sorted_gather_dims_to_operand_dims( - dim_numbers.gather_dims_to_operand_dims().begin(), - dim_numbers.gather_dims_to_operand_dims().end()); + std::vector sorted_start_index_map( + dim_numbers.start_index_map().begin(), + dim_numbers.start_index_map().end()); - c_sort(sorted_gather_dims_to_operand_dims); + absl::c_sort(sorted_start_index_map); - if (c_adjacent_find(sorted_gather_dims_to_operand_dims) != - sorted_gather_dims_to_operand_dims.end()) { + if (absl::c_adjacent_find(sorted_start_index_map) != + sorted_start_index_map.end()) { return InvalidArgument( - "Repeated dimensions are not allowed in gather_dims_to_operand_dims; " + "Repeated dimensions are not allowed in start_index_map; " "got: %s.", - Join(dim_numbers.gather_dims_to_operand_dims(), ", ").c_str()); + StrJoin(dim_numbers.start_index_map(), ", ")); } - for (int64 elided_dim : dim_numbers.elided_window_dims()) { - if (elided_dim < 0 || elided_dim >= input_shape.dimensions_size()) { + for (int64 collapsed_dim : 
dim_numbers.collapsed_slice_dims()) { + if (collapsed_dim < 0 || collapsed_dim >= input_shape.dimensions_size()) { return InvalidArgument( - "Invalid elided_window_dims set in gather op; valid range is [0, " - "%d), got: %lld.", - input_shape.dimensions_size(), elided_dim); + "Invalid collapsed_slice_dims set in gather op; valid range is [0, " + "%d), got: %d.", + input_shape.dimensions_size(), collapsed_dim); } } - if (!c_is_sorted(dim_numbers.elided_window_dims())) { + if (!absl::c_is_sorted(dim_numbers.collapsed_slice_dims())) { return InvalidArgument( - "elided_window_dims in gather op must be sorted; got: %s", - Join(dim_numbers.elided_window_dims(), ", ").c_str()); + "collapsed_slice_dims in gather op must be sorted; got: %s", + StrJoin(dim_numbers.collapsed_slice_dims(), ", ")); } - if (c_adjacent_find(dim_numbers.elided_window_dims()) != - dim_numbers.elided_window_dims().end()) { + if (absl::c_adjacent_find(dim_numbers.collapsed_slice_dims()) != + dim_numbers.collapsed_slice_dims().end()) { return InvalidArgument( - "Repeated dimensions not allowed in elided_window_dims in gather op; " + "Repeated dimensions not allowed in collapsed_slice_dims in gather op; " "got: %s.", - Join(dim_numbers.elided_window_dims(), ", ").c_str()); + StrJoin(dim_numbers.collapsed_slice_dims(), ", ")); } return Status::OK(); } /*static*/ StatusOr ShapeInference::InferGatherShape( - const Shape& input_shape, const Shape& gather_indices_shape, + const Shape& input_shape, const Shape& start_indices_shape, const GatherDimensionNumbers& gather_dim_numbers, - tensorflow::gtl::ArraySlice window_bounds) { + absl::Span slice_sizes) { TF_RETURN_IF_ERROR( - ExpectNotTupleOrOpaque(input_shape, "input tensor operand gather op")); - TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( - gather_indices_shape, "gather indices operand of gather op")); + ExpectArray(input_shape, "input tensor operand gather op")); + TF_RETURN_IF_ERROR( + ExpectArray(start_indices_shape, "gather indices operand of gather 
op")); - if (!ShapeUtil::ElementIsIntegral(gather_indices_shape)) { + if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) { return InvalidArgument( "Gather indices parameter must be an integral tensor; got %s.", - ShapeUtil::HumanString(gather_indices_shape).c_str()); + ShapeUtil::HumanString(start_indices_shape)); } // We implicitly reshape gather indices of shape P[A,B,C] to P[A,B,C,1] if // index_vector_dim is rank(P). The bounds of this expanded shape is - // stored in expanded_gather_indices_shape. + // stored in expanded_start_indices_shape. - if (gather_indices_shape.dimensions_size() < + if (start_indices_shape.dimensions_size() < gather_dim_numbers.index_vector_dim() || gather_dim_numbers.index_vector_dim() < 0) { return InvalidArgument( - "Gather index leaf dimension must be within [0, rank(gather_indices) + " - "1). rank(gather_indices) is %d and gather index leaf dimension is " - "%lld.", - gather_indices_shape.dimensions_size(), + "Gather index leaf dimension must be within [0, rank(start_indices) + " + "1). 
rank(start_indices) is %d and gather index leaf dimension is " + "%d.", + start_indices_shape.dimensions_size(), gather_dim_numbers.index_vector_dim()); } - std::vector expanded_gather_indices_shape; - expanded_gather_indices_shape.reserve(gather_indices_shape.dimensions_size()); - c_copy(gather_indices_shape.dimensions(), - std::back_inserter(expanded_gather_indices_shape)); - if (expanded_gather_indices_shape.size() == + std::vector expanded_start_indices_shape; + expanded_start_indices_shape.reserve(start_indices_shape.dimensions_size()); + absl::c_copy(start_indices_shape.dimensions(), + std::back_inserter(expanded_start_indices_shape)); + if (expanded_start_indices_shape.size() == gather_dim_numbers.index_vector_dim()) { - expanded_gather_indices_shape.push_back(1); + expanded_start_indices_shape.push_back(1); } TF_RETURN_IF_ERROR(ValidateGatherDimensionNumbers( - input_shape, expanded_gather_indices_shape, gather_dim_numbers)); + input_shape, expanded_start_indices_shape, gather_dim_numbers)); - if (window_bounds.size() != input_shape.dimensions_size()) { + if (slice_sizes.size() != input_shape.dimensions_size()) { return InvalidArgument( - "Gather op must have one window bound for every input dimension; got: " - "len(window_bounds)=%lu, input_shape.rank=%d.", - window_bounds.size(), input_shape.dimensions_size()); + "Gather op must have one slice size for every input dimension; got: " + "len(slice_sizes)=%lu, input_shape.rank=%d.", + slice_sizes.size(), input_shape.dimensions_size()); } - if (window_bounds.size() != - gather_dim_numbers.output_window_dims_size() + - gather_dim_numbers.elided_window_dims_size()) { + if (slice_sizes.size() != + gather_dim_numbers.offset_dims_size() + + gather_dim_numbers.collapsed_slice_dims_size()) { return InvalidArgument( - "All components of the window index in a gather op must either be a " - "output window index or explicitly elided; got len(window_bounds)=%lu, " - "output_window_bounds=%s, elided_window_bounds=%s.", - 
window_bounds.size(), - Join(gather_dim_numbers.output_window_dims(), ",").c_str(), - Join(gather_dim_numbers.elided_window_dims(), ",").c_str()); + "All components of the offset index in a gather op must either be a " + "offset dimension or explicitly collapsed; got len(slice_sizes)=%lu, " + "output_slice_sizes=%s, collapsed_slice_dims=%s.", + slice_sizes.size(), StrJoin(gather_dim_numbers.offset_dims(), ","), + StrJoin(gather_dim_numbers.collapsed_slice_dims(), ",")); } - for (int i = 0; i < window_bounds.size(); i++) { - int64 window_bound = window_bounds[i]; - int64 corresponding_input_bound = input_shape.dimensions(i); - if (window_bound < 0 || window_bound > corresponding_input_bound) { + for (int i = 0; i < slice_sizes.size(); i++) { + int64 slice_size = slice_sizes[i]; + int64 corresponding_input_size = input_shape.dimensions(i); + if (slice_size < 0 || slice_size > corresponding_input_size) { return InvalidArgument( - "Window bound at index %d in gather op is out of range, must be " - "within " - "[0, %lld), got %lld.", - i, corresponding_input_bound + 1, window_bound); + "Slice size at index %d in gather op is out of range, must be " + "within [0, %d), got %d.", + i, corresponding_input_size + 1, slice_size); } } - for (int i = 0; i < gather_dim_numbers.elided_window_dims_size(); i++) { - if (window_bounds[gather_dim_numbers.elided_window_dims(i)] != 1) { + for (int i = 0; i < gather_dim_numbers.collapsed_slice_dims_size(); i++) { + if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] != 1) { return InvalidArgument( - "Gather op can only elide window indices with bound 1, but bound is " - "%lld for index %lld at position %d.", - window_bounds[gather_dim_numbers.elided_window_dims(i)], - gather_dim_numbers.elided_window_dims(i), i); + "Gather op can only collapse slice dims with bound 1, but bound is " + "%d for index %d at position %d.", + slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)], + gather_dim_numbers.collapsed_slice_dims(i), i); } } 
- int64 result_rank = gather_dim_numbers.output_window_dims_size() + - (expanded_gather_indices_shape.size() - 1); - int64 window_dims_seen = 0; + int64 result_rank = gather_dim_numbers.offset_dims_size() + + (expanded_start_indices_shape.size() - 1); + int64 offset_dims_seen = 0; int64 gather_dims_seen = 0; std::vector output_dim_bounds; output_dim_bounds.reserve(result_rank); for (int64 i = 0; i < result_rank; i++) { int64 current_bound; bool is_window_index = - c_binary_search(gather_dim_numbers.output_window_dims(), i); + absl::c_binary_search(gather_dim_numbers.offset_dims(), i); if (is_window_index) { - while (c_binary_search(gather_dim_numbers.elided_window_dims(), - window_dims_seen)) { - window_dims_seen++; + while (absl::c_binary_search(gather_dim_numbers.collapsed_slice_dims(), + offset_dims_seen)) { + offset_dims_seen++; } - current_bound = window_bounds[window_dims_seen++]; + current_bound = slice_sizes[offset_dims_seen++]; } else { if (gather_dims_seen == gather_dim_numbers.index_vector_dim()) { gather_dims_seen++; } - current_bound = expanded_gather_indices_shape[gather_dims_seen++]; + current_bound = expanded_start_indices_shape[gather_dims_seen++]; } output_dim_bounds.push_back(current_bound); @@ -2683,4 +2698,193 @@ static Status ValidateGatherDimensionNumbers( return ShapeUtil::MakeShape(input_shape.element_type(), output_dim_bounds); } +namespace { + +Status ValidateScatterDimensionNumbers( + const Shape& operand_shape, absl::Span scatter_indices_shape, + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + // Validate update_window_dims in ScatterDimensionNumbers. 
+ if (!absl::c_is_sorted(dim_numbers.update_window_dims())) { + return InvalidArgument( + "update_window_dims in scatter op must be sorted; got: %s.", + StrJoin(dim_numbers.update_window_dims(), ", ")); + } + if (absl::c_adjacent_find(dim_numbers.update_window_dims()) != + dim_numbers.update_window_dims().end()) { + return InvalidArgument( + "update_window_dims in scatter op must not repeat; got: %s.", + StrJoin(dim_numbers.update_window_dims(), ", ")); + } + const int64 updates_rank = ShapeUtil::Rank(updates_shape); + for (int64 window_dim : dim_numbers.update_window_dims()) { + if (window_dim < 0 || window_dim >= updates_rank) { + return InvalidArgument( + "Invalid update_window_dims set in scatter op; valid range is [0, " + "%d). got: %d.", + updates_rank, window_dim); + } + } + + // Validate inserted_window_dims in ScatterDimensionNumbers. + if (!absl::c_is_sorted(dim_numbers.inserted_window_dims())) { + return InvalidArgument( + "inserted_window_dims in scatter op must be sorted; got: %s.", + StrJoin(dim_numbers.inserted_window_dims(), ", ")); + } + if (absl::c_adjacent_find(dim_numbers.inserted_window_dims()) != + dim_numbers.inserted_window_dims().end()) { + return InvalidArgument( + "inserted_window_dims in scatter op must not repeat; got: %s.", + StrJoin(dim_numbers.inserted_window_dims(), ", ")); + } + for (int64 inserted_dim : dim_numbers.inserted_window_dims()) { + if (inserted_dim < 0 || inserted_dim >= operand_shape.dimensions_size()) { + return InvalidArgument( + "Invalid inserted_window_dims set in scatter op; valid range is [0, " + "%d), got: %d.", + operand_shape.dimensions_size(), inserted_dim); + } + } + + // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers. 
+ if (dim_numbers.scatter_dims_to_operand_dims_size() != + scatter_indices_shape[dim_numbers.index_vector_dim()]) { + return InvalidArgument( + "Scatter op has %d elements in scatter_dims_to_operand_dims and the " + "bound of dimension index_vector_dim=%d of scatter_indices is %d. " + "These two numbers must be equal.", + dim_numbers.scatter_dims_to_operand_dims_size(), + dim_numbers.index_vector_dim(), + scatter_indices_shape[dim_numbers.index_vector_dim()]); + } + for (int i = 0; i < dim_numbers.scatter_dims_to_operand_dims_size(); ++i) { + int64 scatter_dim_to_operand_dim = + dim_numbers.scatter_dims_to_operand_dims(i); + if (scatter_dim_to_operand_dim < 0 || + scatter_dim_to_operand_dim >= operand_shape.dimensions_size()) { + return InvalidArgument( + "Invalid scatter_dims_to_operand_dims mapping; domain is [0, %d), " + "got: %d->%d.", + operand_shape.dimensions_size(), i, scatter_dim_to_operand_dim); + } + } + std::vector sorted_scatter_dims_to_operand_dims( + dim_numbers.scatter_dims_to_operand_dims().begin(), + dim_numbers.scatter_dims_to_operand_dims().end()); + absl::c_sort(sorted_scatter_dims_to_operand_dims); + if (absl::c_adjacent_find(sorted_scatter_dims_to_operand_dims) != + sorted_scatter_dims_to_operand_dims.end()) { + return InvalidArgument( + "Repeated dimensions not allowed in scatter_dims_to_operand_dims; " + "got: %s.", + StrJoin(dim_numbers.scatter_dims_to_operand_dims(), ", ")); + } + + return Status::OK(); +} + +} // namespace + +/*static*/ StatusOr ShapeInference::InferScatterShape( + const Shape& operand_shape, const Shape& scatter_indices_shape, + const Shape& updates_shape, const ProgramShape& to_apply_shape, + const ScatterDimensionNumbers& scatter_dim_numbers) { + TF_RETURN_IF_ERROR( + ExpectArray(operand_shape, "operand tensor of scatter op")); + TF_RETURN_IF_ERROR( + ExpectArray(scatter_indices_shape, "scatter indices of scatter op")); + TF_RETURN_IF_ERROR(ExpectArray(updates_shape, "updates of scatter op")); + + if 
(!ShapeUtil::ElementIsIntegral(scatter_indices_shape)) { + return InvalidArgument( + "Scatter indices parameter must be an integral tensor; got %s.", + ShapeUtil::HumanString(scatter_indices_shape)); + } + + if (scatter_indices_shape.dimensions_size() < + scatter_dim_numbers.index_vector_dim() || + scatter_dim_numbers.index_vector_dim() < 0) { + return InvalidArgument( + "Scatter index leaf dimension must be within [0, rank(scatter_indices)" + " + 1). rank(scatter_indices) is %d and scatter index leaf dimension " + "is %d.", + scatter_indices_shape.dimensions_size(), + scatter_dim_numbers.index_vector_dim()); + } + + // Check if the update computation has a proper shape as a reduction. + const Shape init_value_shape = + ShapeUtil::MakeShape(operand_shape.element_type(), {}); + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, {&init_value_shape}, + {updates_shape.element_type()}, + /*inputs=*/1)); + + std::vector expanded_scatter_indices_shape = + ArraySliceToVector(AsInt64Slice(scatter_indices_shape.dimensions())); + if (expanded_scatter_indices_shape.size() == + scatter_dim_numbers.index_vector_dim()) { + expanded_scatter_indices_shape.push_back(1); + } + + int64 expected_updates_rank = expanded_scatter_indices_shape.size() - 1 + + scatter_dim_numbers.update_window_dims_size(); + if (ShapeUtil::Rank(updates_shape) != expected_updates_rank) { + return InvalidArgument("Updates tensor must be of rank %d; got %d.", + expected_updates_rank, + ShapeUtil::Rank(updates_shape)); + } + + TF_RETURN_IF_ERROR(ValidateScatterDimensionNumbers( + operand_shape, expanded_scatter_indices_shape, updates_shape, + scatter_dim_numbers)); + + int64 inserted_dims_seen = 0; + std::vector max_update_slice_sizes; + for (int i = 0; i < operand_shape.dimensions_size(); ++i) { + if (inserted_dims_seen < scatter_dim_numbers.inserted_window_dims_size() && + scatter_dim_numbers.inserted_window_dims(inserted_dims_seen) == i) { + ++inserted_dims_seen; + } else { + 
max_update_slice_sizes.push_back(operand_shape.dimensions(i)); + } + } + for (int i = 0; i < scatter_dim_numbers.update_window_dims_size(); ++i) { + auto update_window_dim = scatter_dim_numbers.update_window_dims(i); + if (updates_shape.dimensions(update_window_dim) > + max_update_slice_sizes[i]) { + return InvalidArgument( + "Bounds of the window dimensions of updates must not exceed the " + "bounds of the corresponding dimensions of operand. For dimension " + "%d, updates bound is %d, operand bound is %d.", + update_window_dim, updates_shape.dimensions(update_window_dim), + max_update_slice_sizes[i]); + } + } + + int64 scatter_dims_seen = 0; + for (int64 i = 0; i < ShapeUtil::Rank(updates_shape); ++i) { + bool is_update_window_dim = + absl::c_binary_search(scatter_dim_numbers.update_window_dims(), i); + if (is_update_window_dim) { + continue; + } + if (scatter_dims_seen == scatter_dim_numbers.index_vector_dim()) { + ++scatter_dims_seen; + } + if (updates_shape.dimensions(i) != + expanded_scatter_indices_shape[scatter_dims_seen]) { + return InvalidArgument( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices. For " + "scatter dimension %d, updates bound is %d, scatter_indices " + "bound is %d.", + i, updates_shape.dimensions(i), + expanded_scatter_indices_shape[scatter_dims_seen]); + } + ++scatter_dims_seen; + } + + return operand_shape; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 9da2c99b4177f08ece8daabaf2922ddd7e947a1b..a28345acefb8fca1c8b6444f431f932c23c57ce4 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -21,12 +21,12 @@ limitations under the License. 
#include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -46,8 +46,6 @@ class ShapeInference { public: // Infers the shape produced by applying the given unary operation to the // given input shape. - static StatusOr InferUnaryOpShape(UnaryOperation operation, - const Shape& arg); static StatusOr InferUnaryOpShape(HloOpcode opcode, const Shape& shape); static StatusOr InferUnaryOpShape(HloOpcode opcode, @@ -55,21 +53,15 @@ class ShapeInference { // Infers the shape produced by applying the given binary operation to the // given input shapes. - static StatusOr InferBinaryOpShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions); static StatusOr InferBinaryOpShape( HloOpcode opcode, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions); + absl::Span broadcast_dimensions); static StatusOr InferBinaryOpShape(HloOpcode opcode, const HloInstruction* lhs, const HloInstruction* rhs); // Infers the shape produced by applying the given ternary operation to the // given input shapes. - static StatusOr InferTernaryOpShape(TernaryOperation operation, - const Shape& lhs, const Shape& rhs, - const Shape& ehs); static StatusOr InferTernaryOpShape(HloOpcode opcode, const Shape& lhs, const Shape& rhs, const Shape& ehs); @@ -81,21 +73,15 @@ class ShapeInference { // Infers the shape produced by applying the given variadic operation to the // given input operand shapes. 
static StatusOr InferVariadicOpShape( - VariadicOperation operation, - tensorflow::gtl::ArraySlice operand_shapes); - static StatusOr InferVariadicOpShape( - HloOpcode opcode, - tensorflow::gtl::ArraySlice operand_shapes); + HloOpcode opcode, absl::Span operand_shapes); static StatusOr InferVariadicOpShape( - HloOpcode opcode, - tensorflow::gtl::ArraySlice operands); + HloOpcode opcode, absl::Span operands); // Infers the shape produced by applying the given mapping computation shape // to the given operand shapes. static StatusOr InferMapShape( - tensorflow::gtl::ArraySlice arg_shapes, - const ProgramShape& to_apply, - tensorflow::gtl::ArraySlice dimensions); + absl::Span arg_shapes, const ProgramShape& to_apply, + absl::Span dimensions); // Infers the shape produced by InferBatchNormTraining with the given // operands. @@ -123,17 +109,31 @@ class ShapeInference { // filter (rhs) to lhs in the way specified by the fields on window. static StatusOr InferConvolveShape( const Shape& lhs, const Shape& rhs, const Window& window, - const ConvolutionDimensionNumbers& dimension_numbers); + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count = 1); // Infers the shape produced by the given FFT type on the given operand. - static StatusOr InferFftShape( - const Shape& in, FftType fft_type, - tensorflow::gtl::ArraySlice fft_length); + static StatusOr InferFftShape(const Shape& in, FftType fft_type, + absl::Span fft_length); - // Infers the shape produced a cross replica sum with the given operand + // Infers the shape produced by a cross replica sum with the given operand // shapes. static StatusOr InferCrossReplicaSumShape( - tensorflow::gtl::ArraySlice operand_shapes); + absl::Span operand_shapes); + + // Infers final shape of an Alltoall operation that is created by the xla + // builder. 
+ static StatusOr InferAllToAllShape(const Shape& shape, + int64 split_dimension, + int64 concat_dimension, + int64 split_count); + + // Infers the shape of an HLO all-to-all instruction. + static StatusOr InferAllToAllTupleShape( + absl::Span operand_shapes); + + // Infers the shape of a collective permute operation. + static StatusOr InferCollectivePermuteShape(const Shape& shape); // Infers the shape produced by applying the given reduction computation // shape to the given input operand shape. @@ -142,8 +142,8 @@ class ShapeInference { // index as the leading parameter, and the program shape should match // accordingly (or an error will result). static StatusOr InferReduceShape( - const Shape& arg, const Shape& init_value, - tensorflow::gtl::ArraySlice dimensions_to_reduce, + absl::Span arg_shapes, + absl::Span dimensions_to_reduce, const ProgramShape& to_apply); // Infers the shape produced by applying the given computation to the operand @@ -161,24 +161,23 @@ class ShapeInference { // Infers the shape produced by a reverse operation that reverses the order // of the elements in the given dimensions. - static StatusOr InferReverseShape( - const Shape& operand_shape, - tensorflow::gtl::ArraySlice dimensions); + static StatusOr InferReverseShape(const Shape& operand_shape, + absl::Span dimensions); // Infers the shape produced by a slice operation spanning from the starts to // the limits in the original shape's dimensions. // // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16] - static StatusOr InferSliceShape( - const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits, - tensorflow::gtl::ArraySlice strides); + static StatusOr InferSliceShape(const Shape& arg, + absl::Span starts, + absl::Span limits, + absl::Span strides); // Infers the shape produced by a dynamic slice operation of size specified // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'. 
static StatusOr InferDynamicSliceShape( const Shape& operand_shape, const Shape& start_indices_shape, - tensorflow::gtl::ArraySlice slice_sizes); + absl::Span slice_sizes); // Infers the shape produced by a dynamic update slice operation based // on the shape of operand and update. @@ -209,23 +208,30 @@ class ShapeInference { // Infers the shape produced by a broadcast operation. static StatusOr InferBroadcastShape( - const Shape& operand, tensorflow::gtl::ArraySlice broadcast_sizes); + const Shape& operand, absl::Span broadcast_sizes); // Infers the shape produced by a reshape operation from the element type of // its operand and the new dimension sizes specified. - static StatusOr InferReshapeShape( - const Shape& operand, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes); + static StatusOr InferReshapeShape(const Shape& operand, + absl::Span dimensions, + absl::Span new_sizes); // Infers the shape produced by a transpose operation from the element type of // its operand and its dimensions field. static StatusOr InferTransposeShape( - const Shape& operand, tensorflow::gtl::ArraySlice dimensions); + const Shape& operand, absl::Span dimensions); // Helper that infers the shape produced by performing a concatenate operation // with the given operand shapes. static StatusOr InferConcatOpShape( - tensorflow::gtl::ArraySlice arg_shapes, int64 dimension); + absl::Span arg_shapes, int64 dimension); + + // Infers the shape produced by a kAfterAll. Trivially this shape is always a + // TOKEN shape. However, ShapeInference serves two purposes: inferring shapes + // and checking operand shapes. This method verifies that the operand shapes + // are all TOKENs. 
+ static StatusOr InferAfterAllShape( + absl::Span arg_shapes); // Helper that validates the given operand shape can be converted to the // target output_shape via a convert instruction -- the requirement is that @@ -255,8 +261,7 @@ class ShapeInference { // Helper that validates the given arg_shapes are compatible with the shape of // the to_apply parameters, and returns the to_apply result shape. static StatusOr InferCallShape( - tensorflow::gtl::ArraySlice arg_shapes, - const ProgramShape& to_apply); + absl::Span arg_shapes, const ProgramShape& to_apply); // Helper that infers the shape produced by performing a dot operation with // the given LHS and RHS shapes. @@ -268,9 +273,17 @@ class ShapeInference { // with the given input shape, gather indices shape and gather dimension // numbers. static StatusOr InferGatherShape( - const Shape& input_shape, const Shape& gather_indices_shape, + const Shape& input_shape, const Shape& start_indices_shape, const GatherDimensionNumbers& gather_dim_numbers, - tensorflow::gtl::ArraySlice window_bounds); + absl::Span slice_sizes); + + // Helper that validates the given input shape, scatter indices shape, updates + // shape, and scatter dimension numbers that constitute a scatter operation, + // and returns the result shape of the scatter operation. + static StatusOr InferScatterShape( + const Shape& operand_shape, const Shape& scatter_indices_shape, + const Shape& updates_shape, const ProgramShape& to_apply_shape, + const ScatterDimensionNumbers& scatter_dim_numbers); private: // Helper that infers the shape produced by performing an element-wise binary @@ -279,8 +292,8 @@ class ShapeInference { // the LHS and a single element in the RHS to produce a single output element, // even in the presence of broadcasting of one of the operands over the other. 
static StatusOr InferElementwiseBinaryOpShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions); + HloOpcode operation, const Shape& lhs, const Shape& rhs, + absl::Span broadcast_dimensions); // Helper for inferring the shape of Clamp ops. static StatusOr InferClampShape(const Shape& min, const Shape& operand, @@ -290,12 +303,16 @@ class ShapeInference { static StatusOr InferSelectShape(const Shape& pred, const Shape& on_true, const Shape& on_false); + // Helper for inferring the shape of TupleSelect ops. + static StatusOr InferTupleSelectShape(const Shape& pred, + const Shape& on_true, + const Shape& on_false); // Helper for inferring shapes of binary operations which use degenerate // dimension broadcasting (a dimension of size 1 in one operand is broadcast // up to match the size of the dimension in the other operand). static StatusOr InferDegenerateDimensionBroadcastShape( - BinaryOperation operation, const Shape& lhs, const Shape& rhs); + HloOpcode operation, const Shape& lhs, const Shape& rhs); // Helper for inferring shapes of binary operations using "InDim" // broadcasting. This is the broadcasting used in the *InDim binary operations @@ -303,9 +320,8 @@ class ShapeInference { // lower-rank shape than larger_shape. Returns the shape that the // smaller_shape is broadcast to. 
static StatusOr InferInDimBroadcastShape( - BinaryOperation operation, const Shape& smaller_shape, - const Shape& larger_shape, - tensorflow::gtl::ArraySlice broadcast_dimensions); + const Shape& smaller_shape, const Shape& larger_shape, + absl::Span broadcast_dimensions); TF_DISALLOW_COPY_AND_ASSIGN(ShapeInference); }; diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 0e61994a786b53a295ef9c9c2287b28fbf754d9b..cc92e58ef867ee716714fff4fdab07b9cb836d00 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -17,17 +17,17 @@ limitations under the License. #include +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace { -using ::tensorflow::gtl::ArraySlice; using ::testing::ContainsRegex; using ::testing::HasSubstr; @@ -57,12 +57,12 @@ class ReduceShapeInferenceTest : public ShapeInferenceTest { // Helper that runs reduce shape inference with the input 'arg' and given // dimensions to reduce, and checks the inferred shape is as expected. The // element type here is hard-coded to F32. 
- void ExpectInferredReduceShape( - const Shape& expected_inferred_shape, const Shape& arg, - tensorflow::gtl::ArraySlice dimensions_to_reduce) { + void ExpectInferredReduceShape(const Shape& expected_inferred_shape, + const Shape& arg, + absl::Span dimensions_to_reduce) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); auto inferred_status = ShapeInference::InferReduceShape( - arg, f32_, dimensions_to_reduce, to_apply); + {&arg, &f32_}, dimensions_to_reduce, to_apply); EXPECT_IS_OK(inferred_status.status()); EXPECT_TRUE(ShapeUtil::Equal(expected_inferred_shape, inferred_status.ValueOrDie())); @@ -101,8 +101,8 @@ class SelectAndScatterShapeInferenceTest : public ShapeInferenceTest { TEST_F(ShapeInferenceTest, UnaryNegateMatrix) { Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); - auto inferred_status = ShapeInference::InferUnaryOpShape( - UnaryOperation::UNOP_NEGATE, matrix_shape); + auto inferred_status = + ShapeInference::InferUnaryOpShape(HloOpcode::kNegate, matrix_shape); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_shape, inferred_status.ValueOrDie())); } @@ -110,14 +110,14 @@ TEST_F(ShapeInferenceTest, UnaryNegateMatrix) { TEST_F(ShapeInferenceTest, SelectScalarPredBetweenTuples) { Shape tuple = ShapeUtil::MakeTupleShape({s32_, f32_}); auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, pred_, tuple, tuple); + HloOpcode::kSelect, pred_, tuple, tuple); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(tuple, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_64_48_); + HloOpcode::kSelect, pred_, matrix_64_48_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } @@ -125,34 +125,34 @@ 
TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) { TEST_F(ShapeInferenceTest, SelectArrayPredBetweenArrays) { auto predarray = ShapeUtil::MakeShape(PRED, {64, 48}); auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, predarray, matrix_64_48_, matrix_64_48_); + HloOpcode::kSelect, predarray, matrix_64_48_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, SelectBadShapes) { auto inferred_status_error1 = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_32_64_); + HloOpcode::kSelect, pred_, matrix_64_48_, matrix_32_64_); ASSERT_FALSE(inferred_status_error1.ok()); ASSERT_THAT(inferred_status_error1.status().error_message(), HasSubstr("Operands to select must be the same shape")); auto inferred_status_error2 = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, s32_, matrix_64_48_, matrix_64_48_); + HloOpcode::kSelect, s32_, matrix_64_48_, matrix_64_48_); ASSERT_FALSE(inferred_status_error2.ok()); ASSERT_THAT(inferred_status_error2.status().error_message(), HasSubstr("pred operand must have PRED")); auto inferred_status_error3 = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeShape(PRED, {64}), - matrix_64_48_, matrix_64_48_); + HloOpcode::kSelect, ShapeUtil::MakeShape(PRED, {64}), matrix_64_48_, + matrix_64_48_); ASSERT_FALSE(inferred_status_error3.ok()); ASSERT_THAT(inferred_status_error3.status().error_message(), HasSubstr("with non-scalar predicate with dimensionality")); // Tuples have a TUPLE element type and cannot be the pred of a select. 
auto inferred_status_error4 = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeTupleShape({pred_, pred_}), + HloOpcode::kSelect, ShapeUtil::MakeTupleShape({pred_, pred_}), ShapeUtil::MakeTupleShape({f32_, f32_}), ShapeUtil::MakeTupleShape({f32_, f32_})); ASSERT_FALSE(inferred_status_error4.ok()); @@ -162,102 +162,98 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) { TEST_F(ShapeInferenceTest, ClampAllMatrix) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, - matrix_64_48_); + HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampAllScalar) { - auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, f32_, f32_); + auto inferred_status = + ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_, f32_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampMinScalar) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, matrix_64_48_); + HloOpcode::kClamp, f32_, matrix_64_48_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampMaxScalar) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, f32_); + HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, f32_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampOperandScalar) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, 
matrix_64_48_, f32_, matrix_64_48_); + HloOpcode::kClamp, matrix_64_48_, f32_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampMinMatrix) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, f32_); + HloOpcode::kClamp, matrix_64_48_, f32_, f32_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampMaxMatrix) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, f32_, matrix_64_48_); + HloOpcode::kClamp, f32_, f32_, matrix_64_48_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampOperandMatrix) { auto inferred_status = ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, f32_); + HloOpcode::kClamp, f32_, matrix_64_48_, f32_); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie())); } TEST_F(ShapeInferenceTest, ClampBadShapes) { // Type mismatch - ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, s32_, f32_, f32_) - .ok()); - ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, s32_, f32_) - .ok()); - ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, f32_, s32_) - .ok()); - // Dimension mismatch ASSERT_FALSE( - ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, - vector_64_, vector_32_, vector_32_) + ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, s32_, f32_, f32_) .ok()); ASSERT_FALSE( - ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, - vector_32_, vector_64_, vector_32_) + 
ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, s32_, f32_) .ok()); ASSERT_FALSE( - ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP, - vector_32_, vector_32_, vector_64_) + ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_, s32_) .ok()); - // Dimension mismatch, where one operand is a scalar + // Dimension mismatch ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, vector_64_, vector_32_, f32_) + HloOpcode::kClamp, vector_64_, vector_32_, vector_32_) .ok()); ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, vector_64_, f32_, vector_32_) + HloOpcode::kClamp, vector_32_, vector_64_, vector_32_) .ok()); ASSERT_FALSE(ShapeInference::InferTernaryOpShape( - TernaryOperation::TRIOP_CLAMP, f32_, vector_64_, vector_32_) + HloOpcode::kClamp, vector_32_, vector_32_, vector_64_) + .ok()); + // Dimension mismatch, where one operand is a scalar + ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, + vector_64_, vector_32_, f32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, + vector_64_, f32_, vector_32_) + .ok()); + ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, + vector_64_, vector_32_) .ok()); } TEST_F(ShapeInferenceTest, Complex) { auto complex_shape = [&](const Shape& lhs, const Shape& rhs, - const tensorflow::gtl::ArraySlice& bcast) { - return ShapeInference::InferBinaryOpShape(BinaryOperation::BINOP_COMPLEX, - lhs, rhs, bcast); + const absl::Span& bcast) { + return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs, + bcast); }; // Inputs must be FP. 
ASSERT_FALSE(complex_shape(s32_, s32_, {}).ok()); @@ -292,8 +288,8 @@ TEST_F(ShapeInferenceTest, Complex) { } TEST_F(ShapeInferenceTest, VariadicOpTuplify) { - StatusOr result = ShapeInference::InferVariadicOpShape( - VariadicOperation::VAROP_TUPLE, {&s32_, &f32_}); + StatusOr result = + ShapeInference::InferVariadicOpShape(HloOpcode::kTuple, {&s32_, &f32_}); ASSERT_IS_OK(result.status()); ASSERT_TRUE(ShapeUtil::Equal(result.ValueOrDie(), ShapeUtil::MakeTupleShape({s32_, f32_}))); @@ -706,11 +702,99 @@ TEST_F(ReduceShapeInferenceTest, ReduceCubeAmongAllDimensions) { /*dimensions_to_reduce=*/{0, 1, 2}); } +TEST_F(ReduceShapeInferenceTest, ReduceMultiOutput) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_IS_OK(inferred_status.status()); + EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeTupleShape({f32_, s32_}), + inferred_status.ValueOrDie())); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput1) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = + ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_, f32_, s32_}, + ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("must take 4 parameters, but takes 6 parameter(s)")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput2) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + 
ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + HasSubstr( + "parameter shape differs from the result shape: s32[] vs f32[]")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput3) { + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape({}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("must have at least 2 arguments, has 0")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput1) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = + ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_}, f32_); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + HasSubstr("must produce a tuple with 2 elements, but produces a scalar")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput2) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + 
HasSubstr("must produce a tuple with 2 elements, but has 3 elements")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerBoth) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, s32_, s32_}, ShapeUtil::MakeTupleShape({s32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("accumulator shape at index 0 differs from the " + "init_value shape: s32[] vs f32[]")); +} + TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = ShapeInference::InferReduceShape( - ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{3, 4}, - to_apply); + {&arg_shape, &f32_}, + /*dimensions_to_reduce=*/{3, 4}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), HasSubstr("out-of-bounds dimension")); @@ -718,8 +802,9 @@ TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) { TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_, f32_}, f32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = - ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, + ShapeInference::InferReduceShape({&arg_shape, &f32_}, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), @@ -728,12 +813,13 @@ TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) { ProgramShape to_apply = 
ShapeUtil::MakeProgramShape({f32_, f32_}, s32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = - ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, + ShapeInference::InferReduceShape({&arg_shape, &f32_}, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), - HasSubstr("first parameter shape differs")); + HasSubstr("0-th parameter shape differs")); } TEST_F(ShapeInferenceTest, InferSliceShapeRank2) { @@ -804,8 +890,8 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) { TEST_F(ShapeInferenceTest, InferPowShape) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); - auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_POW, ten_floats, f32_, {}); + auto inferred_status = ShapeInference::InferBinaryOpShape( + HloOpcode::kPower, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ten_floats, inferred_status.ValueOrDie())); } @@ -813,7 +899,7 @@ TEST_F(ShapeInferenceTest, InferPowShape) { TEST_F(ShapeInferenceTest, InferCompareShapeEq) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_EQ, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kEq, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -822,7 +908,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeEq) { TEST_F(ShapeInferenceTest, InferCompareShapeGe) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_GE, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kGe, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -831,7 +917,7 @@ 
TEST_F(ShapeInferenceTest, InferCompareShapeGe) { TEST_F(ShapeInferenceTest, InferCompareShapeGt) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_GT, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kGt, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -840,7 +926,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeGt) { TEST_F(ShapeInferenceTest, InferCompareShapeLe) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_LE, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kLe, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -849,7 +935,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeLe) { TEST_F(ShapeInferenceTest, InferCompareShapeLt) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_LT, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kLt, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -858,7 +944,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeLt) { TEST_F(ShapeInferenceTest, InferCompareShapeNe) { auto ten_floats = ShapeUtil::MakeShape(F32, {10}); auto inferred_status = - ShapeInference::InferBinaryOpShape(BINOP_NE, ten_floats, f32_, {}); + ShapeInference::InferBinaryOpShape(HloOpcode::kNe, ten_floats, f32_, {}); ASSERT_IS_OK(inferred_status.status()); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), inferred_status.ValueOrDie())); @@ -1111,22 +1197,22 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) { const Shape vec8 = 
ShapeUtil::MakeShape(F32, {8}); const Shape vec16 = ShapeUtil::MakeShape(F32, {16}); - auto inferred_status_match = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, mat, vec8, {1}); + auto inferred_status_match = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {1}); ASSERT_IS_OK(inferred_status_match.status()); ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), mat)); - auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, mat, vec8, {0}); + auto inferred_status_mismatch = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {0}); ASSERT_FALSE(inferred_status_mismatch.ok()); - inferred_status_match = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, mat, vec16, {0}); + inferred_status_match = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {0}); ASSERT_IS_OK(inferred_status_match.status()); ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), mat)); - inferred_status_mismatch = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, mat, vec16, {1}); + inferred_status_mismatch = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {1}); ASSERT_FALSE(inferred_status_mismatch.ok()); } @@ -1138,17 +1224,17 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastCubeMatrix) { const Shape matrix16_8 = ShapeUtil::MakeShape(F32, {16, 8}); auto inferred_status_match = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, cube, matrix8_4, {1, 2}); + HloOpcode::kAdd, cube, matrix8_4, {1, 2}); ASSERT_IS_OK(inferred_status_match.status()); ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube)); inferred_status_match = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, cube, matrix16_4, {0, 2}); + HloOpcode::kAdd, cube, matrix16_4, {0, 2}); ASSERT_IS_OK(inferred_status_match.status()); ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube)); 
inferred_status_match = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, cube, matrix16_8, {0, 1}); + HloOpcode::kAdd, cube, matrix16_8, {0, 1}); ASSERT_IS_OK(inferred_status_match.status()); ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube)); } @@ -1162,43 +1248,43 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) { const Shape matrix8_8 = ShapeUtil::MakeShape(F32, {8, 8}); // "magical" broadcast rejected - auto inferred_status_error1 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, vec8, {}); + auto inferred_status_error1 = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {}); ASSERT_FALSE(inferred_status_error1.ok()); ASSERT_THAT(inferred_status_error1.status().error_message(), HasSubstr("Automatic")); // broadcast_dimension out of bounds for tensor's rank - auto inferred_status_error2 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, vec8, {3}); + auto inferred_status_error2 = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {3}); ASSERT_FALSE(inferred_status_error2.ok()); ASSERT_THAT(inferred_status_error2.status().error_message(), ContainsRegex("Broadcast dimension number .* too large")); // broadcast_dimension doesn't match corresponding dimension - auto inferred_status_error3 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, vec8, {0}); + auto inferred_status_error3 = + ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {0}); ASSERT_FALSE(inferred_status_error3.ok()); ASSERT_THAT(inferred_status_error3.status().error_message(), HasSubstr("Broadcast dimension 0 mismatch")); // broadcast_dimensions list too long auto inferred_status_error4 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, matrix8_4, {0, 1, 2}); + HloOpcode::kAdd, tensor, matrix8_4, {0, 1, 2}); ASSERT_FALSE(inferred_status_error4.ok()); 
ASSERT_THAT(inferred_status_error4.status().error_message(), HasSubstr("broadcast_dimensions has to match")); // there's a dimension above the rank of the tensor auto inferred_status_error5 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, matrix8_4, {3, 0}); + HloOpcode::kAdd, tensor, matrix8_4, {3, 0}); ASSERT_FALSE(inferred_status_error5.ok()); ASSERT_THAT(inferred_status_error5.status().error_message(), ContainsRegex("dimension number .* too large")); // broadcasting dimensions don't match in this order auto inferred_status_error6 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor, matrix8_4, {2, 1}); + HloOpcode::kAdd, tensor, matrix8_4, {2, 1}); ASSERT_FALSE(inferred_status_error6.ok()); ASSERT_THAT(inferred_status_error6.status().error_message(), HasSubstr("dimension 0 mismatch")); @@ -1207,13 +1293,13 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) { // in a proper (strictly increasing) order, even if the lower-rank array // matches the higher-rank array in many different ways. 
auto inferred_status_error7 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {0, 0}); + HloOpcode::kAdd, tensor8_8_8, matrix8_8, {0, 0}); ASSERT_FALSE(inferred_status_error7.ok()); ASSERT_THAT(inferred_status_error7.status().error_message(), HasSubstr("dimensions order is wrong")); auto inferred_status_error8 = ShapeInference::InferBinaryOpShape( - BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {1, 0}); + HloOpcode::kAdd, tensor8_8_8, matrix8_8, {1, 0}); ASSERT_FALSE(inferred_status_error8.ok()); ASSERT_THAT(inferred_status_error8.status().error_message(), HasSubstr("dimensions order is wrong")); @@ -1315,7 +1401,7 @@ TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) { ASSERT_FALSE(inferred_status_error4.ok()); ASSERT_THAT( inferred_status_error4.status().error_message(), - HasSubstr("Expected non-tuple argument for operand of concatenation")); + HasSubstr("Expected array argument for operand of concatenation")); const Shape vector_s32 = ShapeUtil::MakeShape(S32, {32}); auto inferred_status_error5 = ShapeInference::InferConcatOpShape( @@ -1391,7 +1477,7 @@ TEST_F(ShapeInferenceTest, ReverseInvalidDimension) { ShapeInference::InferReverseShape(tuple_shape, {0}); ASSERT_FALSE(inferred_status_error3.ok()); ASSERT_THAT(inferred_status_error3.status().error_message(), - HasSubstr("Expected non-tuple argument")); + HasSubstr("Expected array argument")); } TEST_F(ShapeInferenceTest, Call) { @@ -1527,7 +1613,19 @@ TEST_F(ShapeInferenceTest, BadSlice) { << statusor.status(); } -class GatherShapeInferenceTest : public ShapeInferenceTest { +TEST_F(ShapeInferenceTest, BadSort) { + auto keys = ShapeUtil::MakeShape(F32, {4}); + auto values = ShapeUtil::MakeShape(F32, {5}); + StatusOr statusor = + ShapeInference::InferVariadicOpShape(HloOpcode::kSort, {&keys, &values}); + ASSERT_FALSE(statusor.ok()); + + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("dimensions must match")) + << statusor.status(); +} + +class 
ScatterGatherShapeInferenceTest : public ShapeInferenceTest { protected: const Shape s64_scalar_ = ShapeUtil::MakeShape(S64, {}); const Shape s64_vector_5_ = ShapeUtil::MakeShape(S64, {5}); @@ -1544,81 +1642,85 @@ class GatherShapeInferenceTest : public ShapeInferenceTest { ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( {s64_4d_tensor_10_9_8_7_1_, s64_4d_tensor_10_9_8_7_1_}); + const ProgramShape to_apply_ = + ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); }; -TEST_F(GatherShapeInferenceTest, TensorFlowGather) { - TF_ASSERT_OK_AND_ASSIGN( - Shape gather_shape, - ShapeInference::InferGatherShape(matrix_64_48_, s64_vector_32_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{0}, - /*elided_window_dims=*/{1}, - /*gather_dims_to_operand_dims=*/{1}, - /*index_vector_dim=*/1), - /*window_bounds=*/{64, 1})); +// Shape inference tests for Gather. + +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGather) { + TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, + ShapeInference::InferGatherShape( + matrix_64_48_, s64_vector_32_, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0}, + /*collapsed_slice_dims=*/{1}, + /*start_index_map=*/{1}, + /*index_vector_dim=*/1), + /*slice_sizes=*/{64, 1})); EXPECT_TRUE( ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {64, 32}))) << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) { - TF_ASSERT_OK_AND_ASSIGN( - Shape gather_shape, - ShapeInference::InferGatherShape(matrix_64_48_, s64_vector_32_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{1}, - /*elided_window_dims=*/{0}, - /*gather_dims_to_operand_dims=*/{0}, - /*index_vector_dim=*/1), - /*window_bounds=*/{1, 48})); +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherV2) { + TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, + ShapeInference::InferGatherShape( + matrix_64_48_, s64_vector_32_, + 
HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{1}, + /*collapsed_slice_dims=*/{0}, + /*start_index_map=*/{0}, + /*index_vector_dim=*/1), + /*slice_sizes=*/{1, 48})); EXPECT_TRUE( ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {32, 48}))) << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) { - TF_ASSERT_OK_AND_ASSIGN( - Shape gather_shape, - ShapeInference::InferGatherShape(matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4}, - /*elided_window_dims=*/{0}, - /*gather_dims_to_operand_dims=*/{0}, - /*index_vector_dim=*/4), - /*window_bounds=*/{1, 48})); +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherNd) { + TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, + ShapeInference::InferGatherShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4}, + /*collapsed_slice_dims=*/{0}, + /*start_index_map=*/{0}, + /*index_vector_dim=*/4), + /*slice_sizes=*/{1, 48})); EXPECT_TRUE(ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48}))) << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) { +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowBatchDynamicSlice) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26})); + /*slice_sizes=*/{30, 29, 28, 27, 26})); EXPECT_TRUE(ShapeUtil::Equal( gather_shape, ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}))) << 
ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/2), - /*window_bounds=*/{30, 29, 28, 27, 26})); + /*slice_sizes=*/{30, 29, 28, 27, 26})); EXPECT_TRUE(ShapeUtil::Equal( gather_shape, @@ -1626,17 +1728,17 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/0), - /*window_bounds=*/{30, 29, 28, 27, 26})); + /*slice_sizes=*/{30, 29, 28, 27, 26})); EXPECT_TRUE(ShapeUtil::Equal( gather_shape, @@ -1644,94 +1746,96 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) { +TEST_F(ScatterGatherShapeInferenceTest, NoOutputGatherDims) { // This is equivalent to a dynamic slice. 
- TF_ASSERT_OK_AND_ASSIGN( - Shape gather_shape, - ShapeInference::InferGatherShape( - f32_5d_tensor_50_49_48_47_46_, s64_vector_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{0, 1, 2, 3, 4}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, - /*index_vector_dim=*/0), - /*window_bounds=*/{30, 29, 28, 27, 26})); + TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, + ShapeInference::InferGatherShape( + f32_5d_tensor_50_49_48_47_46_, s64_vector_5_, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0, 1, 2, 3, 4}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/0), + /*slice_sizes=*/{30, 29, 28, 27, 26})); EXPECT_TRUE(ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26}))) << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) { +TEST_F(ScatterGatherShapeInferenceTest, ScalarGatherIndices) { // The gather indices "tensor" is a scalar S here that's used to slice out // [S,0,0,0,0]..[S,30,29,28,27] into a [30,29,28,27] shaped result. 
TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_scalar_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{0, 1, 2, 3}, - /*elided_window_dims=*/{0}, - /*gather_dims_to_operand_dims=*/{0}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0, 1, 2, 3}, + /*collapsed_slice_dims=*/{0}, + /*start_index_map=*/{0}, /*index_vector_dim=*/0), - /*window_bounds=*/{1, 30, 29, 28, 27})); + /*slice_sizes=*/{1, 30, 29, 28, 27})); EXPECT_TRUE(ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {30, 29, 28, 27}))) << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) { +TEST_F(ScatterGatherShapeInferenceTest, TupleShapedTensorInput) { StatusOr statusor = ShapeInference::InferGatherShape( tuple_shape_, s64_vector_32_, - HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0}, - /*elided_window_dims=*/{1}, - /*gather_dims_to_operand_dims=*/{1}, - /*index_vector_dim=*/1), - /*window_bounds=*/{64, 1}); + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0}, + /*collapsed_slice_dims=*/{1}, + /*start_index_map=*/{1}, + /*index_vector_dim=*/1), + /*slice_sizes=*/{64, 1}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Expected non-tuple argument for input")) + HasSubstr("Expected array argument for input")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) { +TEST_F(ScatterGatherShapeInferenceTest, TupleShapedGatherIndicesInput) { StatusOr statusor = ShapeInference::InferGatherShape( s64_vector_32_, tuple_shape_, - HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0}, - /*elided_window_dims=*/{1}, - /*gather_dims_to_operand_dims=*/{1}, - /*index_vector_dim=*/0), - /*window_bounds=*/{64, 1}); + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0}, + /*collapsed_slice_dims=*/{1}, + /*start_index_map=*/{1}, + 
/*index_vector_dim=*/0), + /*slice_sizes=*/{64, 1}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Expected non-tuple argument for gather indices")) + HasSubstr("Expected array argument for gather indices")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) { +TEST_F(ScatterGatherShapeInferenceTest, FloatingPointGatherIndicesInput) { StatusOr statusor = ShapeInference::InferGatherShape( s64_vector_32_, vector_32_, - HloInstruction::MakeGatherDimNumbers(/*output_window_dims=*/{0}, - /*elided_window_dims=*/{1}, - /*gather_dims_to_operand_dims=*/{1}, - /*index_vector_dim=*/0), - /*window_bounds=*/{64, 1}); + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{0}, + /*collapsed_slice_dims=*/{1}, + /*start_index_map=*/{1}, + /*index_vector_dim=*/0), + /*slice_sizes=*/{64, 1}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), HasSubstr("Gather indices parameter must be an integral tensor")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_NonAscendingWindowIndices) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 8, 7}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 8, 7}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( statusor.status().error_message(), @@ -1739,16 +1843,16 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedWindowIndices) { StatusOr 
statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 7}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 7}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( statusor.status().error_message(), @@ -1756,227 +1860,792 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowIndexOutOfBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 99, 100, 101}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 99, 100, 101}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Window index 2 in gather op is out of bounds")) + HasSubstr("Offset dimension 2 in gather op is out of bounds")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowIndexBarelyOutOfBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 9}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + 
HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 9}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Window index 4 in gather op is out of bounds")) + HasSubstr("Offset dimension 4 in gather op is out of bounds")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingElidedWindowDims) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{4}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{4}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( statusor.status().error_message(), - HasSubstr("All components of the window index in a gather op must either " - "be a output window index or explicitly elided")) + HasSubstr("All components of the offset index in a gather op must either " + "be a offset dimension or explicitly collapsed")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_OutOfBoundsWindowToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{0, 1, 2, 3, 19}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 
5, 6, 7, 8}, + /*collapsed_slice_dims=*/{0, 1, 2, 3, 19}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Invalid elided_window_dims set in gather op; valid " + HasSubstr("Invalid collapsed_slice_dims set in gather op; valid " "range is [0, 5), got: 19")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedWindowToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{0, 1, 2, 3, 3}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{0, 1, 2, 3, 3}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT( - statusor.status().error_message(), - HasSubstr( - "Repeated dimensions not allowed in elided_window_dims in gather op")) + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Repeated dimensions not allowed in " + "collapsed_slice_dims in gather op")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + 
/*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT( - statusor.status().error_message(), - HasSubstr("Gather op has 4 elements in gather_dims_to_operand_dims and " - "the bound of dimension index_vector_dim=4 of " - "gather_indices is 5. These two numbers must be equal.")) + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Gather op has 4 elements in start_index_map and " + "the bound of dimension index_vector_dim=4 of " + "start_indices is 5. These two numbers must be equal.")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_OutOfBoundsGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 7}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 7}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT( - statusor.status().error_message(), - HasSubstr("Invalid gather_dims_to_operand_dims mapping; domain is " - "[0, 5), got: 4->7")) + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid start_index_map; domain is [0, 5), got: 4->7")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - 
/*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 3}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 3}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( statusor.status().error_message(), - HasSubstr( - "Repeated dimensions are not allowed in gather_dims_to_operand_dims")) + HasSubstr("Repeated dimensions are not allowed in start_index_map")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_NonAscendingElidedWindowDims) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{2, 1}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{2, 1}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{1, 1, 28, 27, 26}); + /*slice_sizes=*/{1, 1, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("elided_window_dims in gather op must be sorted")) + HasSubstr("collapsed_slice_dims in gather op must be sorted")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) { +TEST_F(ScatterGatherShapeInferenceTest, + InvalidGatherDimNumbers_WindowBoundsTooLarge) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7}, - /*elided_window_dims=*/{2}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + 
/*offset_dims=*/{4, 5, 6, 7}, + /*collapsed_slice_dims=*/{2}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 1, 300, 26}); + /*slice_sizes=*/{30, 29, 1, 300, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Window bound at index 3 in gather op is out of range, " - "must be within [0, 48), got 300")) + HasSubstr("Slice size at index 3 in gather op is out of range, " + "must be within [0, 48), got 300.")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingNumberOfWindowBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 26}); + /*slice_sizes=*/{30, 29, 28, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT( statusor.status().error_message(), - HasSubstr( - "Gather op must have one window bound for every input dimension")) + HasSubstr("Gather op must have one slice size for every input dimension")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsNot1ForElidedDim) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7}, - /*elided_window_dims=*/{1}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7}, + /*collapsed_slice_dims=*/{1}, + /*start_index_map=*/{0, 1, 2, 3, 4}, 
/*index_vector_dim=*/4), - /*window_bounds=*/{30, 29, 28, 26, 20}); + /*slice_sizes=*/{30, 29, 28, 26, 20}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Gather op can only elide window indices with bound 1, " - "but bound is 29 for index 1 at position 0")) + HasSubstr("Gather op can only collapse slice dims with bound 1, " + "but bound is 29 for index 1 at position 0.")) << statusor.status(); } -TEST_F(GatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) { +TEST_F(ScatterGatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_, - HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/{4, 5, 6, 7, 8}, - /*elided_window_dims=*/{}, - /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + HloGatherInstruction::MakeGatherDimNumbers( + /*offset_dims=*/{4, 5, 6, 7, 8}, + /*collapsed_slice_dims=*/{}, + /*start_index_map=*/{0, 1, 2, 3, 4}, /*index_vector_dim=*/32), - /*window_bounds=*/{30, 29, 28, 27, 26}); + /*slice_sizes=*/{30, 29, 28, 27, 26}); ASSERT_FALSE(statusor.ok()); EXPECT_THAT(statusor.status().error_message(), HasSubstr("Gather index leaf dimension must be within [0, " - "rank(gather_indices) + 1)")) + "rank(start_indices) + 1)")) + << statusor.status(); +} + +// Shape inference tests for Scatter. 
+ +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithFullUpdates) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {64, 32}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithFullUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {32, 48}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithPartialUpdates) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {10, 32}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithPartialUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {32, 8}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + 
/*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithUpdatesBiggerThanInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {65, 32}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithUpdatesBiggerThanInputV2) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {32, 49}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterWithUpdatesNotMatchingIndices) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {64, 31}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + 
"bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterWithUpdatesNotMatchingIndicesV2) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {31, 48}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithFullUpdates) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithFullUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 64}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithPartialUpdates) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + 
ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 10}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithPartialUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 12}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithUpdatesBiggerThanInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 65}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterNdWithUpdatesNotMatchingIndices) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {9, 9, 8, 7, 64}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + 
/*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfBatchDynamicUpdateSlice) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultScatterIndicesLeafDim) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_, + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/2))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultScatterIndicesLeafDimV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_, + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 
8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NoUpdateScatterDims) { + // This is equivalent to a dynamic update slice. + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_vector_5_, + ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0, 1, 2, 3, 4}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScalarScatterIndices) { + // The scalar indices "tensor" is a scalar S here that's used to update a + // [30,29,28,27] shaped tensor within the operand at position S. 
+ TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_scalar_, + ShapeUtil::MakeShape(F32, {30, 29, 28, 27}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0, 1, 2, 3}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScatterWithTupleShapedTensorInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + tuple_shape_, s64_vector_32_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for operand")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + ScatterWithTupleShapedScatterIndicesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, tuple_shape_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for scatter indices")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScatterWithTupleShapedUpdatesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, s64_vector_32_, tuple_shape_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + 
EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for updates")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, FloatingPointScatterIndicesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, vector_32_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Scatter indices parameter must be an integral tensor")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, OutOfBoundsScatterIndicesLeafDim) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/10)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Scatter index leaf dimension must be within [0, " + "rank(scatter_indices) + 1)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, InvalidUpdates) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 50}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Updates tensor must be of rank 7; got 8.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, 
InvalidUpdateComputation) { + const ProgramShape invalid_update_computation = + ShapeUtil::MakeProgramShape({f32_}, f32_); + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), + invalid_update_computation, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Reduction function must take 2 parameters, but takes 1")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_NonAscendingUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 8, 7}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("update_window_dims in scatter op must be sorted")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 7}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("update_window_dims in scatter op must not 
repeat")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 9}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid update_window_dims set in scatter op; valid " + "range is [0, 9)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_NonAscendingInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{2, 1}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("inserted_window_dims in scatter op must be sorted")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 1}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("inserted_window_dims in scatter op 
must not repeat")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 5}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid inserted_window_dims set in scatter op; valid " + "range is [0, 5)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_MismatchingScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Scatter op has 4 elements in scatter_dims_to_operand_dims and " + "the bound of dimension index_vector_dim=4 of scatter_indices " + "is 5. 
These two numbers must be equal")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 10}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid scatter_dims_to_operand_dims mapping; domain " + "is [0, 5), got: 4->10")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedValuesInScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 2, 3}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Repeated dimensions not allowed in scatter_dims_to_operand_dims")) << statusor.status(); } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 7d7dcac10b65933d1c81b8aca77465932694bfdb..921a984589bb4fb64058a2a56adfe84fe14af69b 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -18,20 +18,19 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { -using ::tensorflow::strings::Appendf; - ShapedBuffer::ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, const se::Platform* platform, int device_ordinal) @@ -76,7 +75,7 @@ void ShapedBuffer::clear() { } string ShapedBuffer::ToString() const { - string s = tensorflow::strings::StrCat( + string s = absl::StrCat( "ShapedBuffer(", platform_->Name(), ":", device_ordinal(), "), on-host shape=" + ShapeUtil::HumanStringWithLayout(on_host_shape()), ", on-device shape=" + @@ -92,9 +91,9 @@ string ShapedBuffer::ToString() const { shape_str = ShapeUtil::HumanStringWithLayout(subshape); } const se::DeviceMemoryBase& memory = buffer(index); - Appendf(&s, " %s%p (%lld bytes) : %s\n", - string(index.size() * 2, ' ').c_str(), memory.opaque(), - memory.size(), shape_str.c_str()); + absl::StrAppendFormat(&s, " %s%p (%d bytes) : %s\n", + string(index.size() * 2, ' '), memory.opaque(), + memory.size(), shape_str); }); return s; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index 905a7e82e621f2bf4588b71be5dbab20f892cafe..e1d26da4a20c0105be304b1a34c81515fcdc6b7f 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -20,11 +20,11 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -84,6 +84,14 @@ class ShapedBuffer { *buffers_.mutable_element(index) = buffer; } + // Sets all buffers. + // + // Precondition: buffers.shape == on_device_shape_ + void set_buffers(ShapeTree buffers) { + CHECK(ShapeUtil::Equal(buffers.shape(), on_device_shape_)); + buffers_ = std::move(buffers); + } + // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. const ShapeTree& buffers() const { return buffers_; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index 0fc243667911651c788e3c1e5f1d39d86170f1ad..d69e6362e91e4696dab3c46d99a981c67b593a1c 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/shaped_buffer.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -34,7 +35,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { xla::StreamExecutorMemoryAllocator allocator(platform, executors); const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {}); const int kDeviceOrdinal = 0; - auto scoped_buffer = tensorflow::MakeUnique( + auto scoped_buffer = absl::make_unique( shape, shape, &allocator, kDeviceOrdinal); std::unique_ptr buffer = std::move(scoped_buffer); buffer = nullptr; diff --git a/tensorflow/compiler/xla/service/source_map_util.cc b/tensorflow/compiler/xla/service/source_map_util.cc index 8cbaac7b3760717bcacb57adc8782a5755c0aa6d..dd53c7531bea4273b5f8dc1c993e7720eb1afeb2 100644 --- a/tensorflow/compiler/xla/service/source_map_util.cc +++ b/tensorflow/compiler/xla/service/source_map_util.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/source_map_util.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -26,11 +27,10 @@ Status InvalidParameterArgumentV(const OpMetadata& op_metadata, string message; tensorflow::strings::Appendv(&message, format, args); if (!op_metadata.source_file().empty()) { - tensorflow::strings::Appendf(&message, " (%s:%d)", - op_metadata.source_file().c_str(), - op_metadata.source_line()); + absl::StrAppendFormat(&message, " (%s:%d)", op_metadata.source_file(), + op_metadata.source_line()); } - return InvalidArgument("%s", message.c_str()); + return InvalidArgument("%s", message); } } // namespace diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h index 18e2651abb1600a7b9ffb79de887b8795717e55e..c5a7e17cb44c2b3b5ef145da0d66b4b3160f9531 100644 --- a/tensorflow/compiler/xla/service/source_map_util.h +++ b/tensorflow/compiler/xla/service/source_map_util.h @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_ -#define TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_ +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/core/platform/macros.h" @@ -23,6 +24,19 @@ limitations under the License. namespace xla { namespace source_map_util { +// Creates an INVALID_ARGUMENT status with the given format string. +template +Status InvalidParameterArgument(const OpMetadata& op_metadata, + const absl::FormatSpec& format, + const Args&... 
args) { + string message = absl::StrFormat(format, args...); + if (!op_metadata.source_file().empty()) { + absl::StrAppendFormat(&message, " (%s:%d)", op_metadata.source_file(), + op_metadata.source_line()); + } + return InvalidArgument("%s", message); +} + // Creates an INVALID_ARGUMENT status with the given format string. // // Also, attempts to extract the OpMetadata for parameter_number on executable @@ -30,17 +44,21 @@ namespace source_map_util { // // executable may be nullptr, but parameter_number should not be out of bounds // or a CHECK-failure may occur. +template Status InvalidParameterArgument(Executable* executable, int parameter_number, - const char* format, ...) - TF_PRINTF_ATTRIBUTE(3, 4); - -// As above, but takes the parameter metadata directly instead of extracting it -// from the executable. -Status InvalidParameterArgument(const OpMetadata& op_metadata, - const char* format, ...) - TF_PRINTF_ATTRIBUTE(2, 3); + const absl::FormatSpec& format, + const Args&... args) { + if (executable != nullptr && executable->has_module()) { + const HloModule& module = executable->module(); + const HloComputation& computation = *module.entry_computation(); + HloInstruction* param = computation.parameter_instruction(parameter_number); + const OpMetadata& metadata = param->metadata(); + return InvalidParameterArgument(metadata, format, args...); + } + return InvalidArgument(format, args...); +} } // namespace source_map_util } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/stream_pool.cc b/tensorflow/compiler/xla/service/stream_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d1cd1c4422a10e3b9e6ce6fac2c83594bb58b30 --- /dev/null +++ b/tensorflow/compiler/xla/service/stream_pool.cc @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/stream_pool.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) { + std::unique_ptr stream; + { + tensorflow::mutex_lock lock(mu_); + if (!streams_.empty()) { + // Re-use an existing stream from the pool. + stream = std::move(streams_.back()); + streams_.pop_back(); + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool reusing existing stream"; + } + } + + if (!stream) { + // Create a new stream. + stream = absl::make_unique(executor); + stream->Init(); + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool created new stream"; + } + + // Return the stream wrapped in Ptr, which has our special deleter semantics. + PtrDeleter deleter = {this}; + return Ptr(stream.release(), deleter); +} + +void StreamPool::ReturnStream(se::Stream* stream) { + if (stream->ok()) { + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool returning ok stream"; + tensorflow::mutex_lock lock(mu_); + streams_.emplace_back(stream); + } else { + // If the stream has encountered any errors, all subsequent operations on it + // will fail. So just delete the stream, and rely on new streams to be + // created in the future. 
+ VLOG(1) << stream->DebugStreamPointers() + << " StreamPool deleting !ok stream"; + delete stream; + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/stream_pool.h b/tensorflow/compiler/xla/service/stream_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..7221d323a61593ac4b203a81b6046d81a5beaaf0 --- /dev/null +++ b/tensorflow/compiler/xla/service/stream_pool.h @@ -0,0 +1,64 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_ + +#include +#include + +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { + +// Pool of stream_executor::Streams, which are created as needed and +// destroyed when the pool is destroyed. +class StreamPool { + public: + struct PtrDeleter { + void operator()(se::Stream* stream) { pool->ReturnStream(stream); } + StreamPool* pool; + }; + + // Stream pointer type returned by BorrowStream, which returns the + // stream to the pool on destruction. + using Ptr = std::unique_ptr; + + StreamPool() {} + + // Returns a pointer to a stream in the pool, creating a new stream + // if none are available in the pool. 
The returned smart pointer + // returns the stream to the pool on destruction. + // + // This method is thread-safe. + Ptr BorrowStream(se::StreamExecutor* executor); + + private: + // Puts a pointer to a stream back into the pool, leaving it free + // for future use. Streams that have previously encountered errors + // are deleted, and not returned to the pool. + // + // This method is thread-safe. + void ReturnStream(se::Stream* stream); + + tensorflow::mutex mu_; + std::vector> streams_ GUARDED_BY(mu_); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_ diff --git a/tensorflow/compiler/xla/service/stream_pool_test.cc b/tensorflow/compiler/xla/service/stream_pool_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..aaf5c37b0d250f78cb57639255ac9b59e1b462f7 --- /dev/null +++ b/tensorflow/compiler/xla/service/stream_pool_test.cc @@ -0,0 +1,136 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/stream_pool.h" + +#include + +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace { + +class StreamPoolTest : public ::testing::Test { + protected: + std::unique_ptr NewStreamExecutor() { + se::Platform* platform = + se::MultiPlatformManager::PlatformWithName("Host").ConsumeValueOrDie(); + se::StreamExecutorConfig config(/*ordinal=*/0); + return platform->GetUncachedExecutor(config).ConsumeValueOrDie(); + } +}; + +TEST_F(StreamPoolTest, EmptyPool) { StreamPool pool; } + +TEST_F(StreamPoolTest, OneStreamPool) { + std::unique_ptr executor = NewStreamExecutor(); + StreamPool pool; + + // Borrow and return a stream. + StreamPool::Ptr stream1 = pool.BorrowStream(executor.get()); + se::Stream* stream1_ptr = stream1.get(); + EXPECT_TRUE(stream1->ok()); + stream1 = nullptr; + + // Borrow and return another stream. + StreamPool::Ptr stream2 = pool.BorrowStream(executor.get()); + se::Stream* stream2_ptr = stream2.get(); + EXPECT_TRUE(stream2->ok()); + stream2 = nullptr; + + // The underlying streams should be the same, since stream1 was the + // only stream available in the pool when stream2 was borrowed. + EXPECT_EQ(stream1_ptr, stream2_ptr); +} + +TEST_F(StreamPoolTest, TwoStreamPool) { + std::unique_ptr executor = NewStreamExecutor(); + StreamPool pool; + + // Borrow two streams. + StreamPool::Ptr stream1 = pool.BorrowStream(executor.get()); + se::Stream* stream1_ptr = stream1.get(); + EXPECT_TRUE(stream1->ok()); + StreamPool::Ptr stream2 = pool.BorrowStream(executor.get()); + se::Stream* stream2_ptr = stream2.get(); + EXPECT_TRUE(stream2->ok()); + + // The underlying streams should be different, since we haven't + // returned either of them yet. + EXPECT_NE(stream1_ptr, stream2_ptr); + + // Return stream1 and borrow stream3. 
+ stream1 = nullptr; + StreamPool::Ptr stream3 = pool.BorrowStream(executor.get()); + se::Stream* stream3_ptr = stream3.get(); + EXPECT_TRUE(stream3->ok()); + + // stream1 and stream3 should be the same. + EXPECT_EQ(stream1_ptr, stream3_ptr); + EXPECT_NE(stream2_ptr, stream3_ptr); + + // Return stream2, and borrow stream4. + stream2 = nullptr; + StreamPool::Ptr stream4 = pool.BorrowStream(executor.get()); + se::Stream* stream4_ptr = stream4.get(); + EXPECT_TRUE(stream4->ok()); + + // Stream2 and stream4 should be the same. + EXPECT_EQ(stream2_ptr, stream4_ptr); + EXPECT_NE(stream3_ptr, stream4_ptr); +} + +TEST_F(StreamPoolTest, BadStreamDiscarded) { + std::unique_ptr executor = NewStreamExecutor(); + StreamPool pool; + + // Borrow a stream. + StreamPool::Ptr stream1 = pool.BorrowStream(executor.get()); + EXPECT_TRUE(stream1->ok()); + + // Force an error on the stream; here we call a method that requires + // DNN support, which we know the Host platform doesn't support. + stream1->ThenDepthConcatenate({}, {}, nullptr); + EXPECT_FALSE(stream1->ok()); + + // Return stream1 and borrow stream2. + stream1 = nullptr; + StreamPool::Ptr stream2 = pool.BorrowStream(executor.get()); + se::Stream* stream2_ptr = stream2.get(); + EXPECT_TRUE(stream2->ok()); + + // The underlying streams should be different. They would have been + // the same, but since we forced an error on stream1, it cannot be + // put back into the pool. Sadly we can't just check: + // EXPECT_NE(stream1_ptr, stream2_ptr); + // + // The above should hold logically, but it may fail if the new + // stream instance allocated for stream2 happens to reside in the + // same memory address as stream1, which has been deleted. + // + // The check that stream2->ok() serves as a good-enough check. + + // Return stream2 and borrow stream3. The previous error on stream1 + // has no effect on these streams, and they are the same. 
+ stream2 = nullptr; + StreamPool::Ptr stream3 = pool.BorrowStream(executor.get()); + se::Stream* stream3_ptr = stream3.get(); + EXPECT_TRUE(stream3->ok()); + EXPECT_EQ(stream2_ptr, stream3_ptr); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index c4d01562c4e32225ebb984d8fcd93ec3fa86e403..b8d2d546e5d4dc67e3f314dfc6dcd4e8df5451c5 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -18,12 +18,18 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/notification.h" + +using absl::StrCat; namespace xla { /* static */ tensorflow::mutex @@ -36,8 +42,104 @@ TransferManager::GetPlatformTransferManagers() { return r; } +StatusOr> TransferManager::TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer) { + StatusOr> ret; + + se::Stream* substream = stream->GetOrCreateSubStream(); + substream->ThenWaitFor(stream); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + + tensorflow::Notification n; + Status s; + Literal literal(device_buffer.on_host_shape()); + TransferLiteralFromDevice(substream, device_buffer, literal, + [&](Status status) { + s = status; + n.Notify(); + }); + n.WaitForNotification(); + if (!s.ok()) { + return s; + } + return absl::make_unique(std::move(literal)); +} + +Status TransferManager::TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer, + const 
MutableBorrowingLiteral& literal) { + se::Stream* substream = stream->GetOrCreateSubStream(); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + + Status ret; + tensorflow::Notification n; + TransferLiteralFromDevice(substream, device_buffer, literal, + [&](Status status) { + ret = status; + n.Notify(); + }); + n.WaitForNotification(); + return ret; +} + +Status TransferManager::TransferLiteralToDevice( + se::Stream* stream, const LiteralSlice& literal, + const ShapedBuffer& device_buffer) { + // Implement the synchronous version by waiting on the asynchronous version. + // Use a substream so that if we are called from a HostCallback we don't + // deadlock. + se::Stream* substream = stream->GetOrCreateSubStream(); + substream->ThenWaitFor(stream); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + TF_RETURN_IF_ERROR( + TransferLiteralToDeviceAsync(substream, literal, device_buffer)); + return substream->BlockHostUntilDone(); +} + +StatusOr> TransferManager::TransferArrayFromDevice( + se::Stream* stream, const Shape& shape, + const se::DeviceMemoryBase& source) { + StatusOr> ret; + // Implement the synchronous version by waiting on the asynchronous version. + // Use a substream so that if we are called from a HostCallback we don't + // deadlock. 
+ se::Stream* substream = stream->GetOrCreateSubStream(); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + + tensorflow::Notification n; + Literal literal(shape); + Status s; + TransferArrayFromDevice(substream, shape, source, literal, + [&](Status status) { + s = status; + n.Notify(); + }); + n.WaitForNotification(); + if (!s.ok()) { + return s; + } + return absl::make_unique(std::move(literal)); +} + Status TransferManager::TransferArrayToDevice( - se::StreamExecutor* executor, const LiteralSlice& literal, + se::Stream* stream, const LiteralSlice& literal, + const se::DeviceMemoryBase& dest) { + // Implement the synchronous version by waiting on the asynchronous version. + // Use a substream so that if we are called from a HostCallback we don't + // deadlock. + se::Stream* substream = stream->GetOrCreateSubStream(); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + TF_RETURN_IF_ERROR(TransferArrayToDeviceAsync(substream, literal, dest)); + return substream->BlockHostUntilDone(); +} + +Status TransferManager::TransferArrayToDeviceAsync( + se::Stream* stream, const LiteralSlice& literal, const se::DeviceMemoryBase& dest) { const Shape on_device_shape = HostShapeToDeviceShape(literal.shape()); TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape)) @@ -47,32 +149,37 @@ Status TransferManager::TransferArrayToDevice( if (dest.size() < GetByteSizeRequirement(on_device_shape)) { return FailedPrecondition( "Allocation on device not large enough for array: " - "%lld < %lld", + "%d < %d", dest.size(), GetByteSizeRequirement(on_device_shape)); } ShapedBuffer shaped_buffer(/*on_host_shape=*/literal.shape(), on_device_shape, - executor->platform(), executor->device_ordinal()); + stream->parent()->platform(), + stream->parent()->device_ordinal()); shaped_buffer.set_buffer(dest, /*index=*/{}); - return TransferLiteralToDevice(executor, literal, shaped_buffer); + return 
TransferLiteralToDevice(stream, literal, shaped_buffer); } -StatusOr> TransferManager::TransferArrayFromDevice( - se::StreamExecutor* executor, const Shape& shape, - const se::DeviceMemoryBase& source) { - TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) - << "Shape " << ShapeUtil::HumanString(shape) - << " has a differently shaped representation on-device: " - << ShapeUtil::HumanString(HostShapeToDeviceShape(shape)); +void TransferManager::TransferArrayFromDevice( + se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source, + const MutableBorrowingLiteral& literal, std::function done) { + if (!ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) { + auto error = StrCat("Shape ", ShapeUtil::HumanString(shape), + " has a differently shaped representation on-device: ", + ShapeUtil::HumanString(HostShapeToDeviceShape(shape))); + return done(FailedPrecondition("%s", error)); + } if (source.size() < GetByteSizeRequirement(shape)) { - return FailedPrecondition( - "Allocation on device not large enough for array: " - "%lld < %lld", - source.size(), GetByteSizeRequirement(shape)); + return done( + FailedPrecondition("Allocation on device not large enough for array: " + "%d < %d", + source.size(), GetByteSizeRequirement(shape))); } ShapedBuffer shaped_buffer(/*on_host_shape=*/shape, shape, - executor->platform(), executor->device_ordinal()); + stream->parent()->platform(), + stream->parent()->device_ordinal()); shaped_buffer.set_buffer(source, /*index=*/{}); - return TransferLiteralFromDevice(executor, shaped_buffer); + return TransferLiteralFromDevice(stream, shaped_buffer, literal, + std::move(done)); } /* static */ void TransferManager::RegisterTransferManager( @@ -96,7 +203,7 @@ StatusOr> TransferManager::TransferArrayFromDevice( return NotFound( "could not find registered transfer manager for platform %s -- check " "target linkage", - platform->Name().c_str()); + platform->Name()); } if (it->second.manager == nullptr) { @@ 
-108,10 +215,14 @@ StatusOr> TransferManager::TransferArrayFromDevice( } Status TransferManager::WriteTupleIndexTables( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer) { - VLOG(2) << "Writing tuple index tables for " << device_buffer; + se::Stream* stream, const ShapedBuffer& device_buffer) { + TF_RETURN_IF_ERROR(WriteTupleIndexTablesAsync(stream, device_buffer)); + return stream->BlockHostUntilDone(); +} - TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); +Status TransferManager::WriteTupleIndexTablesAsync( + se::Stream* stream, const ShapedBuffer& device_buffer) { + VLOG(2) << "Writing tuple index tables for " << device_buffer; return ShapeUtil::ForEachSubshapeWithStatus( device_buffer.on_device_shape(), @@ -129,7 +240,7 @@ Status TransferManager::WriteTupleIndexTables( elements.push_back(device_buffer.buffer(element_index)); element_index.pop_back(); } - return WriteSingleTupleIndexTable(executor, elements, device_subshape, + return WriteSingleTupleIndexTable(stream, elements, device_subshape, &device_memory); } @@ -138,40 +249,28 @@ Status TransferManager::WriteTupleIndexTables( } Status TransferManager::TransferBufferFromDevice( - se::StreamExecutor* executor, const se::DeviceMemoryBase& source, - int64 size, void* destination) { + se::Stream* stream, const se::DeviceMemoryBase& source, int64 size, + void* destination) { if (source.size() < size) { return FailedPrecondition( "Source allocation on device not large enough for data tranfer: " - "%lld < %lld", + "%d < %d", source.size(), size); } - auto copy_status = executor->SynchronousMemcpyD2H(source, size, destination); - if (!copy_status.ok()) { - return AddStatus( - Status(static_cast(copy_status.code()), - copy_status.error_message()), - "failed transfer from device to buffer"); - } + stream->ThenMemcpy(destination, source, size); return Status::OK(); } Status TransferManager::TransferBufferToDevice( - se::StreamExecutor* executor, int64 size, const void* 
source, + se::Stream* stream, int64 size, const void* source, se::DeviceMemoryBase* destination) { if (destination->size() < size) { return FailedPrecondition( "Destination allocation on device not large enough for data tranfer: " - "%lld < %lld", + "%d < %d", destination->size(), size); } - auto copy_status = executor->SynchronousMemcpyH2D(source, size, destination); - if (!copy_status.ok()) { - return AddStatus( - Status(static_cast(copy_status.code()), - copy_status.error_message()), - "failed transfer of buffer to device"); - } + stream->ThenMemcpy(destination, source, size); return Status::OK(); } @@ -179,9 +278,8 @@ StatusOr TransferManager::AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { - return InvalidArgument( - "Shape must have a layout: %s", - ShapeUtil::HumanStringWithLayout(on_host_shape).c_str()); + return InvalidArgument("Shape must have a layout: %s", + ShapeUtil::HumanStringWithLayout(on_host_shape)); } TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(on_host_shape)); const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 43a8092b06fba0e2495bce0ee1a309c85a908273..21725946b3629a4495d8ad6cc1529d712d22e0af 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -20,12 +20,12 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -52,30 +52,69 @@ class TransferManager { return host_shape; } - // Returns a literal containing the data held in the given ShapedBuffer. - // using the provided executor. The optional literal_shape will be the shape - // for the literal. The shape of the ShapedBuffer and - // DeviceShape(literal_shape) must be compatible, but need not have the same - // layout. + // Returns a literal containing the data held in the given ShapedBuffer + // using the provided executor. This operation is performed synchronously + // without waiting for any other operation on a stream to complete. + // + // This function should be avoided in favor of the asynchronous version below. virtual StatusOr> TransferLiteralFromDevice( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0; + se::Stream* stream, const ShapedBuffer& device_buffer); + virtual Status TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer, + const MutableBorrowingLiteral& literal); + + // Begins transferring a literal containing the data held in the given + // ShapedBuffer using the provided executor. + // + // This operation is performed asynchronously on the given stream. It returns + // once the transfer is enqueued. 'done' is invoked with the result when + // complete. + // + // device_buffer is copied by reference and must live at least until done() is + // invoked. 
+ virtual void TransferLiteralFromDevice(se::Stream* stream, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal, + std::function done) = 0; // Transfers the given literal into the previously allocated device memory // represented by the given ShapedBuffer using the given executor. The shape // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible, - // but need not have the same layout - virtual Status TransferLiteralToDevice(se::StreamExecutor* executor, + // but need not have the same layout. + // + // This operation is performed synchronously without waiting for any other + // operation on a stream to complete. This function should be avoided in favor + // of the asynchronous version below. + virtual Status TransferLiteralToDevice(se::Stream* stream, const LiteralSlice& literal, - const ShapedBuffer& device_buffer) = 0; + const ShapedBuffer& device_buffer); + + // Transfers the given literal into the previously allocated device memory + // represented by the given ShapedBuffer using the given executor. The shape + // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible, + // but need not have the same layout. + // + // This operation is performed asynchronously on the given stream. It returns + // once the transfer is enqueued. + virtual Status TransferLiteralToDeviceAsync( + se::Stream* stream, const LiteralSlice& literal, + const ShapedBuffer& device_buffer) = 0; // Convenience methods for transferring an array to or from the device at a // known address. This avoids having to construct a ShapedBuffer just to // transfer an array at a known address. 
- Status TransferArrayToDevice(se::StreamExecutor* executor, - const LiteralSlice& literal, + Status TransferArrayToDevice(se::Stream* stream, const LiteralSlice& literal, const se::DeviceMemoryBase& dest); + void TransferArrayFromDevice(se::Stream* stream, const Shape& shape, + const se::DeviceMemoryBase& source, + const MutableBorrowingLiteral& literal, + std::function done); + + Status TransferArrayToDeviceAsync(se::Stream* stream, + const LiteralSlice& literal, + const se::DeviceMemoryBase& dest); StatusOr> TransferArrayFromDevice( - se::StreamExecutor* executor, const Shape& shape, + se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source); // Transfers the given literal into the Infeed interface of the device, @@ -85,19 +124,21 @@ class TransferManager { // Transfers the given literal from the Outfeed interface of the device, // using the given executor. - virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, - const Shape& literal_shape, - Literal* literal) = 0; + virtual Status TransferLiteralFromOutfeed( + se::StreamExecutor* executor, const Shape& literal_shape, + MutableBorrowingLiteral literal) = 0; // Resets the devices associated with this transfer manager. virtual Status ResetDevices( - tensorflow::gtl::ArraySlice executor) = 0; + absl::Span executor) = 0; // Given an allocated ShapedBuffer, constructs the tuple index table(s) in // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the // ShapedBuffer is array-shaped this method does nothing. - Status WriteTupleIndexTables(se::StreamExecutor* executor, + Status WriteTupleIndexTables(se::Stream* stream, const ShapedBuffer& device_buffer); + Status WriteTupleIndexTablesAsync(se::Stream* stream, + const ShapedBuffer& device_buffer); // Determines the byte size requirement for the given shape on the underlying // architecture. 
This will be used to allocate an appropriately sized memory @@ -111,6 +152,26 @@ class TransferManager { const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal); + // The given ShapedBuffer holds a handle to allocated memory, but it is not + // in the general case legal to immediately copy or access that allocated + // memory because queued operations on the device may alias that memory. + // Memory ordering is enforced by the Stream's happens-before relationship + // which allows eager deallocation and reallocation of buffers host-side even + // if the device hasn't finished with them. + // + // In certain cases, it can be known that a ShapedBuffer does not have any + // conflicting accesses on the device and thus is eligible to be accessed at + // any time from the host. + // + // This function returns true if device_buffer can be accessed immediately + // without waiting for the Stream's previously enqueued items. This only + // returns true if all subbuffers in device_buffer can be accessed + // immediately. + virtual bool CanShapedBufferBeAccessedNow( + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) const { + return false; + } + ///// // The TransferManager class also serves as a point to register objects for // the various platforms. @@ -130,21 +191,11 @@ class TransferManager { const se::Platform* platform); protected: - // Transfer a memory block of the given size from 'source' buffer to the - // Infeed interface of the device using the given executor. - // - // size is the size to transfer from source in bytes. - // - // source is the source data that must be in the target-dependent layout that - // the Infeed HLO used in the computation expects. - virtual Status TransferBufferToInfeed(se::StreamExecutor* executor, - int64 size, const void* source) = 0; - // Transfer a memory block of the given size from the device source into the // 'destination' buffer. 
// // size is the size to transfer to destination in bytes. - virtual Status TransferBufferFromDevice(se::StreamExecutor* executor, + virtual Status TransferBufferFromDevice(se::Stream* stream, const se::DeviceMemoryBase& source, int64 size, void* destination); @@ -152,16 +203,15 @@ class TransferManager { // destination of the device. // // size is the size to transfer from source in bytes. - virtual Status TransferBufferToDevice(se::StreamExecutor* executor, - int64 size, const void* source, + virtual Status TransferBufferToDevice(se::Stream* stream, int64 size, + const void* source, se::DeviceMemoryBase* destination); // Writes the given device-memory pointers in 'elements' to the given region // to construct a tuple index table in the platform-specific tuple // representation. virtual Status WriteSingleTupleIndexTable( - se::StreamExecutor* executor, - tensorflow::gtl::ArraySlice elements, + se::Stream* stream, absl::Span elements, const Shape& shape, se::DeviceMemoryBase* region) = 0; private: diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index ba16dc640e2d2974eab4fc8b134a6e33c03e3b85..530f40e4b2f9c7c19fa29dad28a077b9d4d68a71 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -109,6 +109,7 @@ Status FoldTransposeIntoDot(InstructionOperandsPair pair) { std::unique_ptr new_dot = HloInstruction::CreateDot( dot->shape(), new_lhs, new_rhs, new_dim_numbers); + new_dot->set_precision_config(dot->precision_config()); return dot->parent()->ReplaceWithNewInstruction(dot, std::move(new_dot)); } @@ -178,7 +179,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) { auto new_conv = HloInstruction::CreateConvolve( convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums); - convolution.SetupDerivedInstruction(new_conv.get()); + new_conv->set_precision_config(convolution.precision_config()); 
TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction( &convolution, std::move(new_conv))); diff --git a/tensorflow/compiler/xla/service/transpose_folding.h b/tensorflow/compiler/xla/service/transpose_folding.h index 71e8446452f072c22bb730cbda65a1743a95cd4c..3e5aa2db60ee31d9fbccf8f7256b15c1b8465335 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.h +++ b/tensorflow/compiler/xla/service/transpose_folding.h @@ -49,7 +49,7 @@ class TransposeFolding : public HloPassInterface { explicit TransposeFolding( TransposableGemmOperandsFn transposable_gemm_operands, TransposableConvOperandsFn transposable_conv_operands); - tensorflow::StringPiece name() const override { return "transpose-folding"; } + absl::string_view name() const override { return "transpose-folding"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index f73f1227aaf1630a9e7c43bb508732c5518ef929..58f767e913fbc0023e0c45a4f0e82ecefeeef2d6 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -19,20 +19,20 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" @@ -69,7 +69,7 @@ ENTRY entry_computation { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); FoldTranspose(module.get()); @@ -91,7 +91,7 @@ ENTRY entry_computation { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TransposeFolding transpose_folding( [](const HloInstruction& dot, @@ -119,7 +119,7 @@ ENTRY entry_computation { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TransposeFolding transpose_folding( [](const HloInstruction& dot, @@ -147,7 +147,7 @@ ENTRY entry_computation { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); FoldTranspose(module.get()); @@ -160,11 +160,11 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { auto builder = 
HloComputation::Builder("entry"); // (1.0 + 2.0) * (2.0 - 3.0) HloInstruction* const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); HloInstruction* const2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); HloInstruction* const3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary( const1->shape(), HloOpcode::kAdd, const1, const2)); HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary( @@ -176,7 +176,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { HloComputation* entry_computation = module->AddEntryComputation(builder.Build(mul)); HloInstruction* call = module->OutlineExpressionFromComputation( - {add, sub, mul}, "", entry_computation); + {add, sub, mul}, "entry", entry_computation); EXPECT_EQ(call, entry_computation->root_instruction()); HloComputation* callee_computation = call->to_apply(); // The arguments to the call should be const1, const2, and const3. @@ -205,7 +205,7 @@ ENTRY entry_computation { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); FoldTranspose(module.get()); const HloComputation* callee = module->GetComputationWithName("callee"); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index bb634e6573ffceeaa66e0ac9141fe7e3a39ed602..6fed7c76d04ad5d8236fecd07aa27f1eda221ea7 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -19,23 +19,24 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { string BufferAlias::ToString() const { - return tensorflow::strings::StrCat("BufferAlias(", instruction_->name(), "[", - tensorflow::str_util::Join(index_, ","), - "])"); + return absl::StrCat("BufferAlias(", instruction_->name(), "[", + absl::StrJoin(index_, ","), "])"); } std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias) { @@ -121,7 +122,6 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index, } namespace { - // Gather fusion instructions from 'instruction' into 'fusion_instructions'. void GatherFusionInstructions( HloInstruction* instruction, @@ -232,8 +232,7 @@ Status TuplePointsToAnalysis::HandleGetTupleElement( // Copy the points-to set (and tuple sources) at index {element_index} of the // operand to the points-to set for this GetTupleElement instruction. points_to_set.ForEachMutableElement( - [&, this](const ShapeIndex& target_index, - PointsToSet::BufferList* points_to) { + [&](const ShapeIndex& target_index, PointsToSet::BufferList* points_to) { // Construct an index into the operand by prepending element_index to // the index for the GetTupleElement instruction's points-to set. 
ShapeIndex src_index; @@ -292,22 +291,29 @@ Status TuplePointsToAnalysis::HandleSlice(HloInstruction* slice) { } Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) { - // RecvDone aliases its input (Recv) tuple element {0} to its output. + // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its + // output. The other indices ({} and {1}) define their own buffers. PointsToSet& points_to_set = CreateEmptyPointsToSet(recv_done); + points_to_set.AddPointedToBuffer( + logical_buffer_analysis_->GetBuffer(recv_done, /*index=*/{}), + /*index=*/{}); + points_to_set.AddPointedToBuffer( + logical_buffer_analysis_->GetBuffer(recv_done, /*index=*/{1}), + /*index=*/{1}); + const PointsToSet& operand_points_to_set = GetPointsToSet(recv_done->operand(0)); - // Recursively copy the points to set of the operand tuple {0}. + // Recursively copy the points to set of the operand tuple {0} to the output + // element {0}. points_to_set.ForEachMutableElement( - [this, &points_to_set, &operand_points_to_set]( + [&points_to_set, &operand_points_to_set]( const ShapeIndex& index, PointsToSet::BufferList* buffers) { - ShapeIndex src_index({0}); - for (auto element : index) { - src_index.push_back(element); + if (index.empty() || index[0] != 0) { + return; } - *buffers = operand_points_to_set.element(src_index); - for (auto& tuple_source : - operand_points_to_set.tuple_sources(src_index)) { + *buffers = operand_points_to_set.element(index); + for (auto& tuple_source : operand_points_to_set.tuple_sources(index)) { points_to_set.add_tuple_source(index, tuple_source); } }); @@ -315,7 +321,7 @@ Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) { } Status TuplePointsToAnalysis::HandleSend(HloInstruction* send) { - // Send creates a tuple of {aliased operand, U32 context}. + // Send creates a tuple of {aliased operand, U32 context, token}. 
PointsToSet& points_to_set = CreateEmptyPointsToSet(send); // Creates the points to set for the tuple and its element at {1}. @@ -328,6 +334,10 @@ Status TuplePointsToAnalysis::HandleSend(HloInstruction* send) { context_buffer->push_back( &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({1}))); + auto token_buffer = points_to_set.mutable_element(ShapeIndex({2})); + token_buffer->push_back( + &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({2}))); + // Recursively copy the points to set of the operand to output tuple {0}. const PointsToSet& operand_points_to_set = GetPointsToSet(send->operand(0)); operand_points_to_set.ForEachElement( @@ -350,7 +360,7 @@ Status TuplePointsToAnalysis::HandleSend(HloInstruction* send) { } Status TuplePointsToAnalysis::HandleTuple(HloInstruction* tuple) { - tensorflow::gtl::ArraySlice operands(tuple->operands()); + absl::Span operands(tuple->operands()); PointsToSet& points_to_set = CreateEmptyPointsToSet(tuple); points_to_set.AddPointedToBuffer( logical_buffer_analysis_->GetBuffer(tuple, /*index=*/{}), @@ -388,7 +398,7 @@ Status TuplePointsToAnalysis::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select) { +Status TuplePointsToAnalysis::HandleTupleSelect(HloInstruction* tuple_select) { // Select allocates a new buffer and then shallow copies the on_true or // on_false buffer into this new buffer. Which side is chosen cannot be // determined statically so conservatively set the points-to set to the union @@ -396,9 +406,9 @@ Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select) { // // First create a copy of the on_true points-to set (and tuple sources), then // add in elements of the on_false points-to set (tuple sources). 
- auto on_true = select->operand(1); - auto on_false = select->operand(2); - PointsToSet& points_to_set = CreateCopiedPointsToSet(select, on_true); + auto on_true = tuple_select->operand(1); + auto on_false = tuple_select->operand(2); + PointsToSet& points_to_set = CreateCopiedPointsToSet(tuple_select, on_true); const PointsToSet& false_points_to_set = *PerInst(on_false)->points_to_set; points_to_set.ForEachMutableElement( [&](const ShapeIndex& index, PointsToSet::BufferList* buffers) { @@ -416,7 +426,7 @@ Status TuplePointsToAnalysis::HandleSelect(HloInstruction* select) { // respective element in the points-to set should contain only itself. points_to_set.mutable_element({})->clear(); points_to_set.AddPointedToBuffer( - logical_buffer_analysis_->GetBuffer(select, /*index=*/{}), + logical_buffer_analysis_->GetBuffer(tuple_select, /*index=*/{}), /*index=*/{}); return Status::OK(); } @@ -431,7 +441,7 @@ PointsToSet& TuplePointsToAnalysis::CreateEmptyPointsToSet( PerInstruction* pi = PerInst(instruction); CHECK(pi->points_to_set == nullptr) << "instruction should not have been present in the map."; - auto set = MakeUnique(&instruction->shape()); + auto set = absl::make_unique(&instruction->shape()); pi->points_to_set = std::move(set); // Return *set using the iterator returned by emplace. 
return *pi->points_to_set; @@ -452,21 +462,20 @@ Status TuplePointsToAnalysis::VerifyBuffer(const LogicalBuffer& buffer) const { return FailedPrecondition( "LogicalBuffer %s is ill-defined: instruction %s does not define a " "buffer at that index", - buffer.ToString().c_str(), buffer.instruction()->name().c_str()); + buffer.ToString(), buffer.instruction()->name()); } } if (buffer.id() < 0 || buffer.id() >= logical_buffer_analysis_->num_logical_buffers()) { - return FailedPrecondition( - "LogicalBuffer %s is ill-defined: invalid id %lld", - buffer.ToString().c_str(), buffer.id()); + return FailedPrecondition("LogicalBuffer %s is ill-defined: invalid id %d", + buffer.ToString(), buffer.id()); } if (GetBuffer(buffer.id()).instruction() != buffer.instruction() || GetBuffer(buffer.id()).index() != buffer.index()) { return FailedPrecondition( "LogicalBuffer %s is ill-defined: buffer with same id differs: %s", - buffer.ToString().c_str(), GetBuffer(buffer.id()).ToString().c_str()); + buffer.ToString(), GetBuffer(buffer.id()).ToString()); } return Status::OK(); @@ -485,8 +494,7 @@ StatusOr TuplePointsToAnalysis::GetBufferDefinedAt( if (buffers.size() != 1 || buffers[0]->instruction() != instruction) { return FailedPrecondition( "instruction %s does not define buffer at index {%s}", - instruction->name().c_str(), - tensorflow::str_util::Join(index, ",").c_str()); + instruction->name(), absl::StrJoin(index, ",")); } return buffers[0]; } @@ -506,7 +514,7 @@ Status TuplePointsToAnalysis::GatherBuffersDefinedByInstruction( const HloInstruction* instruction, TuplePointsToAnalysis::BufferDefinitionVector* buffers) { GetPointsToSet(instruction) - .ForEachElement([this, buffers, instruction]( + .ForEachElement([buffers, instruction]( const ShapeIndex& index, const PointsToSet::BufferList& source_buffers) { // Add buffers which 'instruction' is the source of. 
@@ -536,7 +544,7 @@ PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( PointsToSet& dst_points_to_set = CreateEmptyPointsToSet(instruction); const PointsToSet& src_points_to_set = GetPointsToSet(src); dst_points_to_set.ForEachMutableElement( - [this, &dst_points_to_set, &src_points_to_set]( + [&dst_points_to_set, &src_points_to_set]( const ShapeIndex& index, PointsToSet::BufferList* buffers) { *buffers = src_points_to_set.element(index); for (auto& tuple_source : src_points_to_set.tuple_sources(index)) { @@ -547,13 +555,12 @@ PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( } string TuplePointsToAnalysis::ToString() const { - string output = tensorflow::strings::Printf( - "TuplePointsToSet for module %s:\n", module_->name().c_str()); + string output = + absl::StrFormat("TuplePointsToSet for module %s:\n", module_->name()); for (const auto* computation : module_->MakeNonfusionComputations()) { const char* entry = computation == module_->entry_computation() ? "entry " : ""; - tensorflow::strings::StrAppend(&output, entry, "computation ", - computation->name(), ":\n"); + absl::StrAppend(&output, entry, "computation ", computation->name(), ":\n"); for (const HloInstruction* instruction : computation->MakeInstructionPostOrder()) { InstructionToString(instruction, &output); @@ -565,12 +572,11 @@ string TuplePointsToAnalysis::ToString() const { } } - tensorflow::strings::StrAppend(&output, "LogicalBuffers:\n"); + absl::StrAppend(&output, "LogicalBuffers:\n"); for (const auto& b : logical_buffer_analysis_->logical_buffers()) { - tensorflow::strings::StrAppend(&output, " buffer ", b->ToString(), ":\n"); + absl::StrAppend(&output, " buffer ", b->ToString(), ":\n"); for (const BufferAlias& alias : logical_buffer_aliases_.at(b->id())) { - tensorflow::strings::StrAppend(&output, " alias ", alias.ToString(), - "\n"); + absl::StrAppend(&output, " alias ", alias.ToString(), "\n"); } } return output; @@ -579,20 +585,18 @@ string 
TuplePointsToAnalysis::ToString() const { void TuplePointsToAnalysis::InstructionToString( const HloInstruction* instruction, string* output) const { const string prefix = instruction->IsFused() ? " " : ""; - tensorflow::strings::StrAppend(output, prefix, " instruction ", - instruction->ToShortString(), ":\n"); + absl::StrAppend(output, prefix, " instruction ", + instruction->ToShortString(), ":\n"); const PointsToSet& points_to_set = GetPointsToSet(instruction); points_to_set.ForEachElement([&prefix, &output]( const ShapeIndex& index, const PointsToSet::BufferList& points_to) { - tensorflow::strings::StrAppend( - output, prefix, " {", tensorflow::str_util::Join(index, ","), "}: ", - tensorflow::str_util::Join( - points_to, ", ", - [](string* out, const LogicalBuffer* source) { - out->append(source->ToString()); - }), - "\n"); + absl::StrAppend(output, prefix, " {", absl::StrJoin(index, ","), "}: ", + absl::StrJoin(points_to, ", ", + [](string* out, const LogicalBuffer* source) { + out->append(source->ToString()); + }), + "\n"); }); } @@ -707,6 +711,7 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt( // root at operand 0 or 1. Or... // (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index // 0. +// (5) The 'user' of 'operand' is Sort, and it is the only user. // // (2) and (3) can only be determined if points-to analysis is available. bool TuplePointsToAnalysis::CanShareOperandBufferWithUser( @@ -723,15 +728,22 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser( return false; } if (user->opcode() == HloOpcode::kFusion) { - if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && - user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - // Loop fusion with kDynamicUpdateSlice fused root. - // - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root at operand - // index 0. 
- return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0); + if (user->fusion_kind() == HloInstruction::FusionKind::kLoop || + user->fusion_kind() == HloInstruction::FusionKind::kInput) { + if (user->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice) { + // Loop fusion with kDynamicUpdateSlice fused root. + // + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root at operand + // index 0. + return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0); + } else { + HloInstruction* fusion_param = + user->fused_parameter(user->operand_index(operand)); + return HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple( + fusion_param); + } } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && user->fused_expression_root()->opcode() == HloOpcode::kAdd) { // Output fusion with kAdd fused root. @@ -765,6 +777,21 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser( std::vector operand_indices = user->OperandIndices(operand); return operand_indices.size() == 1 && operand_indices[0] == 0; } + if (user->opcode() == HloOpcode::kSort) { + // Only valid if there are no other users. + if (operand->users().size() != 1) { + return false; + } + // If we only sort keys, the output of sort is not a tuple, so we can always + // share the buffer. + if (user->operand_count() == 1) { + return true; + } + CHECK(!user_index.empty()); + // Only share with the right tuple element buffer. + std::vector operand_indices = user->OperandIndices(operand); + return operand_indices.size() == 1 && user_index[0] == operand_indices[0]; + } if (user->opcode() == HloOpcode::kCall) { // TODO(b/62548313): Remove when buffer assignment is module scoped and // does not assign buffers to calls. 
@@ -789,8 +816,12 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser( return param_uses.size() == 1 && param_uses[0].first == callee_root && callee_root->IsElementwiseOnOperand(param_uses[0].second); } - // Check if 'user' is element-wise. - return user->IsElementwise(); + // Loop fusions that contain transposing copies won't reach here as they have + // different layouts, which fails the check in the beginning of this function. + // + // Multi-output fusion will fail the check here as tuples are not considered + // an elementwise operation. + return user->IsElementwiseOnOperand(user->operand_index(operand)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index c0d82414806d9a6ff57aec59d077f444137fec9a..a9e8a51e0923362162c6b8a2e97fc334e56d4329 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -23,6 +23,8 @@ limitations under the License. #include #include +#include "absl/container/inlined_vector.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -33,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/compactptrset.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" @@ -109,7 +110,7 @@ class PointsToSet { // Add a tuple source instruction for the given index. 
void add_tuple_source(const ShapeIndex& index, HloInstruction* tuple); - using BufferList = tensorflow::gtl::InlinedVector; + using BufferList = absl::InlinedVector; // Return the list of logical buffers for the subshape at index. const BufferList& element(const ShapeIndex& index) const { @@ -203,7 +204,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // logical buffer The buffer alias set is the inverse of the points-to set. // That is, LogicalBuffer B is in the points-to set of instruction I at index // N iff instruction I, index N is a BufferAlias of B. - using BufferAliasVector = tensorflow::gtl::InlinedVector; + using BufferAliasVector = absl::InlinedVector; const BufferAliasVector& GetBufferAliases(const LogicalBuffer& buffer) const; // Returns the number of logical buffers in the module @@ -226,8 +227,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { // instructions produce a single buffer (the top-level buffer), some produce // no buffers (eg bitcast), and some produce more than one buffer (eg, // tuple-shaped parameters). 
- using BufferDefinitionVector = - tensorflow::gtl::InlinedVector; + using BufferDefinitionVector = absl::InlinedVector; const BufferDefinitionVector& GetBuffersDefinedByInstruction( const HloInstruction* instruction) const; @@ -253,7 +253,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { Status HandleCopy(HloInstruction* copy) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; - Status HandleSelect(HloInstruction* select) override; + Status HandleTupleSelect(HloInstruction* tuple_select) override; string ToString() const; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index f558316b05b168a6f100e8ef69adfd9dbc023102..a32d1f9026e8beae77b5b40241995707ff62231e 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -72,9 +72,8 @@ class TuplePointsToAnalysisTest : public HloTestBase { // Checks that the given points-to set contains exactly (unordered) the given // LogicalBuffers. - void ExpectHasBuffers( - const PointsToSet::BufferList& points_to_set, - tensorflow::gtl::ArraySlice buffers) { + void ExpectHasBuffers(const PointsToSet::BufferList& points_to_set, + absl::Span buffers) { std::vector vec(buffers.begin(), buffers.end()); EXPECT_THAT(points_to_set, UnorderedElementsAreArray(vec)); } @@ -83,7 +82,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { // top-level buffers of the given instructions. 
void ExpectHasTopLevelBuffers( const PointsToSet::BufferList& points_to_set, - tensorflow::gtl::ArraySlice instructions) { + absl::Span instructions) { PointsToSet::BufferList buffers; for (auto instruction : instructions) { buffers.push_back(GetBuffer(instruction, /*index=*/{})); @@ -94,7 +93,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { // Overload which takes a set instead of a vector. void ExpectHasTopLevelBuffers( const PointsToSet::BufferSet& points_to_set, - tensorflow::gtl::ArraySlice instructions) { + absl::Span instructions) { ExpectHasTopLevelBuffers( PointsToSet::BufferList(points_to_set.begin(), points_to_set.end()), instructions); @@ -104,8 +103,7 @@ class TuplePointsToAnalysisTest : public HloTestBase { // aliases which are exactly (unordered) the given instruction/index pairs. void ExpectHasBufferAliases( const HloInstruction* instruction, const ShapeIndex& index, - tensorflow::gtl::ArraySlice> - expected) { + absl::Span> expected) { const LogicalBuffer* buffer = points_to_analysis_->GetBufferDefinedAt(instruction, index) .ValueOrDie(); @@ -124,9 +122,9 @@ class TuplePointsToAnalysisTest : public HloTestBase { TEST_F(TuplePointsToAnalysisTest, SimpleTuple) { auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); @@ -177,14 +175,14 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) { // tuple. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto inner_tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({inner_tuple, constant3})); @@ -238,14 +236,14 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) { // tuple. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto inner_tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto constant3 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({inner_tuple, constant3})); @@ -270,7 +268,7 @@ TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) { // Create a tuple which contains duplicate elements. 
auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant, constant, constant})); @@ -291,9 +289,9 @@ TEST_F(TuplePointsToAnalysisTest, TupleCopy) { // the same. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto copy = builder.AddInstruction( @@ -317,9 +315,10 @@ TEST_F(TuplePointsToAnalysisTest, SendAndSendDone) { // Send forwards its operand to the output tuple at {0}. auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto send = builder.AddInstruction( - HloInstruction::CreateSend(constant, /*channel_id=*/0)); + HloInstruction::CreateSend(constant, token, /*channel_id=*/0)); auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); BuildModuleAndRunAnalysis(builder.Build()); @@ -342,8 +341,9 @@ TEST_F(TuplePointsToAnalysisTest, SendAndSendDone) { TEST_F(TuplePointsToAnalysisTest, RecvAndRecvDone) { // RecvDone forwards its operand tuple element at {0} to the output. 
auto builder = HloComputation::Builder(TestName()); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); auto recv = builder.AddInstruction(HloInstruction::CreateRecv( - ShapeUtil::MakeShape(F32, {1, 2, 3}), /*channel_id=*/0)); + ShapeUtil::MakeShape(F32, {1, 2, 3}), token, /*channel_id=*/0)); auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); BuildModuleAndRunAnalysis(builder.Build()); @@ -355,7 +355,7 @@ TEST_F(TuplePointsToAnalysisTest, RecvAndRecvDone) { ExpectHasTopLevelBuffers( points_to_analysis_->GetPointsToSet(recv).element({}), {recv}); - ExpectHasBufferAliases(recv, {0}, {{recv, {0}}, {recv_done, {}}}); + ExpectHasBufferAliases(recv, {0}, {{recv, {0}}, {recv_done, {0}}}); } TEST_F(TuplePointsToAnalysisTest, TupleSelect) { @@ -363,18 +363,18 @@ TEST_F(TuplePointsToAnalysisTest, TupleSelect) { // set containing the union of both sides. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto tuple2 = builder.AddInstruction( HloInstruction::CreateTuple({constant2, constant2})); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); BuildModuleAndRunAnalysis(builder.Build()); @@ -401,9 +401,9 @@ TEST_F(TuplePointsToAnalysisTest, SelectTupleParameters) { auto param1 = builder.AddInstruction( 
HloInstruction::CreateParameter(1, tuple_shape, "param1")); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, param0, param1)); + tuple_shape, HloOpcode::kTupleSelect, pred, param0, param1)); auto copy = builder.AddInstruction( HloInstruction::CreateUnary(tuple_shape, HloOpcode::kCopy, select)); @@ -441,18 +441,18 @@ TEST_F(TuplePointsToAnalysisTest, UnambiguousTupleSelect) { // Select from two identical tuples. The result should not be ambiguous. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto tuple2 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); BuildModuleAndRunAnalysis(builder.Build()); @@ -472,9 +472,9 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) { // the right values. 
auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto inner_tuple1 = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto inner_tuple2 = builder.AddInstruction( @@ -486,9 +486,9 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) { builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple2})); auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); auto select = builder.AddInstruction(HloInstruction::CreateTernary( - tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2)); + tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2)); BuildModuleAndRunAnalysis(builder.Build()); @@ -519,9 +519,9 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) { // have the operand of the bitcast in its points-to set. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( constant2->shape(), HloOpcode::kBitcast, constant2)); auto tuple = @@ -555,9 +555,10 @@ TEST_F(TuplePointsToAnalysisTest, PointsToTupleConstantElements) { // Construct a tuple constant and kCopy it. Verify the points-to set of the // copy correctly correctly points into the nested elements of the constant. 
auto builder = HloComputation::Builder(TestName()); - auto tuple_constant = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::MakeTuple({Literal::CreateR2({{1.0}, {2.0}}).get(), - Literal::CreateR1({2.0, 42}).get()}))); + auto tuple_constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0}, {2.0}}).get(), + LiteralUtil::CreateR1({2.0, 42}).get()}))); auto copy = builder.AddInstruction(HloInstruction::CreateUnary( tuple_constant->shape(), HloOpcode::kCopy, tuple_constant)); @@ -577,9 +578,9 @@ TEST_F(TuplePointsToAnalysisTest, BufferAliases) { // times. Verify buffer alias sets. auto builder = HloComputation::Builder(TestName()); auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto inner_tuple = builder.AddInstruction( HloInstruction::CreateTuple({constant1, constant2})); auto tuple = builder.AddInstruction( @@ -618,7 +619,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest { auto tuple_element1 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(update_shape, tuple_param0, 1)); auto ones = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.f, 1.f, 1.f, 1.f}))); + LiteralUtil::CreateR1({1.f, 1.f, 1.f, 1.f}))); // Create 'update' = Add(GetTupleElement(tuple_param0, 1), ones) auto update = builder.AddInstruction(HloInstruction::CreateBinary( update_shape, HloOpcode::kAdd, tuple_element1, ones)); @@ -866,9 +867,9 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { // Create a DynamicUpdateSlice instruction of tuple element 1. 
auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); @@ -960,9 +961,9 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { // Create a DynamicUpdateSlice instruction of tuple element 1. auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); + LiteralUtil::CreateR1({2.f, 2.f, 2.f}))); auto dynamic_update_slice = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( data_shape, gte1, update, starts)); @@ -1009,14 +1010,56 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { points_to_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {})); } +TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {8}); + auto keys = builder.AddInstruction( + HloInstruction::CreateParameter(0, keys_shape, "keys")); + auto sort = + builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) { + auto builder = HloComputation::Builder(TestName()); + + Shape keys_shape = ShapeUtil::MakeShape(F32, {8}); + Shape values_shape = ShapeUtil::MakeShape(F32, {8}); + auto keys = builder.AddInstruction( + 
HloInstruction::CreateParameter(0, keys_shape, "keys")); + auto values = builder.AddInstruction( + HloInstruction::CreateParameter(1, values_shape, "values")); + auto sort = builder.AddInstruction(HloInstruction::CreateSort( + ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // The buffer for the keys can be shared with the first tuple entry. + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {0})); + // The buffer for the values can be shared with the second tuple entry. + EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(values, {}, + sort, {1})); + // Verify that the buffers are not shared with the "wrong" tuple entry. + EXPECT_FALSE( + points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {1})); + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(values, {}, + sort, {0})); +} + TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { auto builder = HloComputation::Builder(TestName()); Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); auto a = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); + LiteralUtil::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); auto b = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); @@ -1025,7 +1068,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { HloInstruction::CreateDot(data_shape, a, b, dot_dnums)); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto add_operand = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape, one, {1})); @@ -1047,7 +1090,7 @@ TEST_F(CanShareOperandBufferWithUserTest, 
OutputFusionCantAliasOperandBuffer) { Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto operand = builder.AddInstruction( HloInstruction::CreateBroadcast(data_shape, one, {1})); @@ -1055,7 +1098,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { HloInstruction::CreateReverse(data_shape, operand, {0, 1})); auto two = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + LiteralUtil::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); auto add = builder.AddInstruction( HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); @@ -1073,7 +1116,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto make_cond = [this, &data_shape]() { + auto make_cond = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Cond"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -1082,7 +1125,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - auto make_body = [this, &data_shape]() { + auto make_body = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Body"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -1120,7 +1163,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { auto sub_param = sub_builder.AddInstruction( HloInstruction::CreateParameter(0, shape, "sub_param")); auto one = sub_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto ones = sub_builder.AddInstruction( 
HloInstruction::CreateBroadcast(shape, one, {1})); auto add = sub_builder.AddInstruction( @@ -1148,5 +1191,30 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { call, {})); } +TEST_F(CanShareOperandBufferWithUserTest, LoopFusionWithElementwiseOperand) { + Shape full_shape = ShapeUtil::MakeShape(F32, {16, 32}); + Shape broadcast_shape = ShapeUtil::MakeShape(F32, {16}); + + auto builder = HloComputation::Builder(TestName() + "_fusion"); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, full_shape, "full")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, broadcast_shape, "small")); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(full_shape, param1, {0})); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + full_shape, HloOpcode::kAdd, param0, broadcast)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, broadcast}, HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(param0, {}, + fusion, {})); + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param1, {}, + fusion, {})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index d668855084a884518b338cdf396a9330b9f43a2b..77bdcc9de0d830991208a1db271d009bccaf550e 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -30,10 +30,17 @@ limitations under the License. namespace xla { +TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) : + exclude_entry_computation_(exclude_entry_computation) {} + StatusOr TupleSimplifier::Run(HloModule* module) { // Initially add all GTE and Tuple instructions to the worklist. 
std::queue worklist; for (auto* computation : module->computations()) { + if (exclude_entry_computation_ && + computation == module->entry_computation()) { + continue; + } for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kTuple || instruction->opcode() == HloOpcode::kGetTupleElement) { @@ -69,7 +76,6 @@ StatusOr TupleSimplifier::Run(HloModule* module) { // Tuple // HloInstruction* top_tuple = nullptr; - HloInstruction* first_gte = nullptr; bool can_simplify = true; for (int64 operand_number = 0; operand_number < instruction->operand_count(); ++operand_number) { @@ -79,17 +85,10 @@ StatusOr TupleSimplifier::Run(HloModule* module) { can_simplify = false; break; } - if (first_gte == nullptr) { - first_gte = operand; - } else if (!first_gte->has_compatible_sharding(operand)) { - can_simplify = false; - break; - } if (top_tuple == nullptr) { top_tuple = operand->mutable_operand(0); if (!ShapeUtil::Compatible(top_tuple->shape(), - instruction->shape()) || - !instruction->has_compatible_sharding(top_tuple)) { + instruction->shape())) { can_simplify = false; break; } @@ -118,14 +117,12 @@ StatusOr TupleSimplifier::Run(HloModule* module) { HloInstruction* element_source = instruction->mutable_operand(0)->mutable_operand( instruction->tuple_index()); - if (instruction->has_compatible_sharding(element_source)) { - changed = true; - TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); - for (HloInstruction* user : element_source->users()) { - if (user->opcode() == HloOpcode::kTuple || - user->opcode() == HloOpcode::kGetTupleElement) { - worklist.push(user); - } + changed = true; + TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); + for (HloInstruction* user : element_source->users()) { + if (user->opcode() == HloOpcode::kTuple || + user->opcode() == HloOpcode::kGetTupleElement) { + worklist.push(user); } } } diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h 
b/tensorflow/compiler/xla/service/tuple_simplifier.h index e5e9b10b5bf3f452d1bfec476b8d5c7d74c4f4e8..8c91d6e69de637d58fa2ffc1a32ea65f09d3b6d8 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.h +++ b/tensorflow/compiler/xla/service/tuple_simplifier.h @@ -27,13 +27,20 @@ namespace xla { // the module. class TupleSimplifier : public HloPassInterface { public: - TupleSimplifier() {} + TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {} + explicit TupleSimplifier(bool exclude_entry_computation); ~TupleSimplifier() override {} - tensorflow::StringPiece name() const override { return "tuple-simplifier"; } + absl::string_view name() const override { return "tuple-simplifier"; } // Run tuple simplification on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + private: + // When set, this pipeline stage will perform optimization of all computations + // apart from the module's entry computation. This is used by Graphcore's + // backend. + bool exclude_entry_computation_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc index ca9ae91281fce5ee061d066fc3e538dbbc09f6b3..39b693872da6bd985d95c2abc9519662c838a3f5 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase { TF_ASSERT_OK(changed_status.status()); EXPECT_EQ(change_expected, changed_status.ValueOrDie()); } + void Run(HloModule* module, bool change_expected, bool exclude_entry) { + TupleSimplifier simplifier(exclude_entry); + auto changed_status = simplifier.Run(module); + TF_ASSERT_OK(changed_status.status()); + EXPECT_EQ(change_expected, changed_status.ValueOrDie()); + } const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( @@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) { EXPECT_THAT(computation->root_instruction(), tuple); } +TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) { + // Verify that the root computation can be excluded + auto module = CreateNewModule(); + + HloInstruction* p0; + HloInstruction* p1; + HloComputation* c0; + HloComputation* c1; + HloComputation* entry; + + { + HloComputation::Builder builder(TestName() + "_1"); + p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c0 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_2"); + p1 = builder.AddInstruction( + HloInstruction::CreateParameter(0, 
tuple_shape_, "param")); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1)); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2})); + + c1 = module->AddEmbeddedComputation(builder.Build()); + } + { + HloComputation::Builder builder(TestName() + "_Entry"); + HloInstruction* tuple_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape_, "param")); + HloInstruction* call0 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0)); + HloInstruction* call1 = builder.AddInstruction( + HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1)); + HloInstruction* gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0)); + HloInstruction* gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1)); + HloInstruction* tuple0 = + builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1})); + HloInstruction* gte2 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0)); + HloInstruction* gte3 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1)); + + builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3})); + + entry = module->AddEntryComputation(builder.Build()); + } + + Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true); + + EXPECT_THAT(c0->root_instruction(), p0); + EXPECT_THAT(c1->root_instruction(), p1); + EXPECT_THAT(entry->instruction_count(), 9); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_util.cc b/tensorflow/compiler/xla/service/tuple_util.cc index 
4a530bb0b20582b303f4af969514748b46fd5064..cfb0c787d09557fd1aec3517eb9698cfec323369 100644 --- a/tensorflow/compiler/xla/service/tuple_util.cc +++ b/tensorflow/compiler/xla/service/tuple_util.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/tuple_util.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -40,7 +40,7 @@ namespace xla { /*static*/ HloInstruction* TupleUtil::AppendSuffix( HloInstruction* input_tuple, - tensorflow::gtl::ArraySlice trailing_values) { + absl::Span trailing_values) { CHECK(ShapeUtil::IsTuple(input_tuple->shape())); HloComputation* computation = input_tuple->parent(); diff --git a/tensorflow/compiler/xla/service/tuple_util.h b/tensorflow/compiler/xla/service/tuple_util.h index e5ff9aaa8357fe8e4777d6dee37bbec72e144c06..bc5aac09f270c01515b1f3a704af6949f24cb218 100644 --- a/tensorflow/compiler/xla/service/tuple_util.h +++ b/tensorflow/compiler/xla/service/tuple_util.h @@ -38,7 +38,7 @@ class TupleUtil { // `input_tuple`. static HloInstruction* AppendSuffix( HloInstruction* input_tuple, - tensorflow::gtl::ArraySlice trailing_values); + absl::Span trailing_values); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_util_test.cc b/tensorflow/compiler/xla/service/tuple_util_test.cc index 754fd8ef169231827eeb5bfd72aeb596644ca767..d33d5bb8f30c8504aa323d461e5f59709b48e1fc 100644 --- a/tensorflow/compiler/xla/service/tuple_util_test.cc +++ b/tensorflow/compiler/xla/service/tuple_util_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/tuple_util.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -37,7 +37,7 @@ ENTRY entry { )"; TF_ASSIGN_OR_RETURN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); *entry_computation = module->entry_computation(); *param0 = (*entry_computation)->parameter_instruction(0); diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc deleted file mode 100644 index 9e62d0acfb98946f1e693fc0310098b4ec99750b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ /dev/null @@ -1,3557 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/user_computation.h" - -#include -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace xla { -namespace { - -HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) { - switch (unop) { - case UNOP_ABS: - return HloOpcode::kAbs; - case UNOP_CEIL: - return HloOpcode::kCeil; - case UNOP_CLZ: - return HloOpcode::kClz; - case UNOP_COS: - return HloOpcode::kCos; - case UNOP_EXP: - return HloOpcode::kExp; - case UNOP_EXPM1: - return HloOpcode::kExpm1; - case UNOP_FLOOR: - return HloOpcode::kFloor; - case UNOP_IMAG: - return HloOpcode::kImag; - case UNOP_IS_FINITE: - return HloOpcode::kIsFinite; - case UNOP_LOG: - return HloOpcode::kLog; - case UNOP_LOG1P: - return HloOpcode::kLog1p; - case UNOP_NOT: - return HloOpcode::kNot; - case UNOP_NEGATE: - return HloOpcode::kNegate; - case UNOP_REAL: - return HloOpcode::kReal; - case UNOP_ROUND_NEAREST_AFZ: - return HloOpcode::kRoundNearestAfz; - case UNOP_SIGN: - return HloOpcode::kSign; - case UNOP_SIN: - return HloOpcode::kSin; - case 
UNOP_SORT: - return HloOpcode::kSort; - case UNOP_TANH: - return HloOpcode::kTanh; - default: - LOG(FATAL) << "unhandled operation " << unop; - } -} - -HloOpcode BinaryOperationToHloOpcode(BinaryOperation binop) { - switch (binop) { - case BINOP_ATAN2: - return HloOpcode::kAtan2; - case BINOP_COMPLEX: - return HloOpcode::kComplex; - case BINOP_MUL: - return HloOpcode::kMultiply; - case BINOP_ADD: - return HloOpcode::kAdd; - case BINOP_SUB: - return HloOpcode::kSubtract; - case BINOP_DIV: - return HloOpcode::kDivide; - case BINOP_EQ: - return HloOpcode::kEq; - case BINOP_GE: - return HloOpcode::kGe; - case BINOP_GT: - return HloOpcode::kGt; - case BINOP_LE: - return HloOpcode::kLe; - case BINOP_LT: - return HloOpcode::kLt; - case BINOP_NE: - return HloOpcode::kNe; - case BINOP_MAX: - return HloOpcode::kMaximum; - case BINOP_MIN: - return HloOpcode::kMinimum; - case BINOP_POW: - return HloOpcode::kPower; - case BINOP_REM: - return HloOpcode::kRemainder; - case BINOP_OR: - return HloOpcode::kOr; - case BINOP_AND: - return HloOpcode::kAnd; - case BINOP_SHIFT_LEFT: - return HloOpcode::kShiftLeft; - case BINOP_SHIFT_RIGHT_ARITHMETIC: - return HloOpcode::kShiftRightArithmetic; - case BINOP_SHIFT_RIGHT_LOGICAL: - return HloOpcode::kShiftRightLogical; - default: - LOG(FATAL) << "unhandled operation " << binop; - } -} - -HloOpcode TernaryOperationToHloOpcode(TernaryOperation triop) { - switch (triop) { - case TRIOP_CLAMP: - return HloOpcode::kClamp; - case TRIOP_SELECT: - return HloOpcode::kSelect; - default: - LOG(FATAL) << "unhandled operation " << triop; - } -} - -HloOpcode VariadicOperationToHloOpcode(VariadicOperation varop) { - switch (varop) { - case VAROP_TUPLE: - return HloOpcode::kTuple; - default: - LOG(FATAL) << "unhandled operation " << varop; - } -} - -} // namespace - -/* static */ StatusOr> -UserComputation::MakeWithRemapping( - const SessionComputation& session_computation, - const ComputationHandle& handle, - const std::map& old_to_new) { - auto 
user_computation = - MakeUnique(session_computation.name(), handle); - { - tensorflow::mutex_lock lock(user_computation->mutex_); - user_computation->session_computation_ = session_computation; - user_computation->next_handle_value_ = - std::max_element(session_computation.requests().begin(), - session_computation.requests().end(), - [](const std::pair& lhs, - const std::pair& rhs) { - return lhs.first < rhs.first; - }) - ->first + - 1; - TF_RETURN_IF_ERROR(user_computation->RemapEmbeddedComputations(old_to_new)); - } - - return std::move(user_computation); -} - -UserComputation::UserComputation(const string& name, - const ComputationHandle& handle) - : name_(name), next_handle_value_(1) { - *session_computation_.mutable_computation_handle() = handle; - session_computation_.set_name(name); - - VLOG(1) << "New UserComputation \"" << name - << "\", handle: " << handle.handle(); -} - -ComputationDataHandle UserComputation::CreateComputationDataHandle() { - ComputationDataHandle handle; - handle.set_handle(next_handle_value_); - // Handles are used as Version values and *must* be assigned consecutively for - // computation versioning to work. 
- next_handle_value_++; - return handle; -} - -StatusOr UserComputation::AddParameterInstruction( - const ParameterRequest& parameter_request) { - tensorflow::mutex_lock lock(mutex_); - - int64 parameter_number = parameter_request.parameter(); - if (parameters_.count(parameter_number) != 0) { - return InvalidArgument("parameter %lld already registered", - parameter_number); - } - ComputationDataHandle handle = CreateComputationDataHandle(); - - const Shape& validated_shape = parameter_request.shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_parameter_request() = parameter_request; - - parameters_[parameter_number] = &request; - - VLOG(1) << "AddParameterInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << parameter_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddSendInstruction( - const SendRequest& send_request) { - tensorflow::mutex_lock lock(mutex_); - - // Check if the operand of the instruction is valid. - TF_RETURN_IF_ERROR(LookUpRequest(send_request.operand()).status()); - - // No handle is returned, but a handle must be assigned to this instruction - // for computation versioning. 
- ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = ShapeUtil::MakeNil(); - *request.mutable_request()->mutable_send_request() = send_request; - - VLOG(1) << "AddSendInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << send_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddRecvInstruction( - const RecvRequest& recv_request) { - tensorflow::mutex_lock lock(mutex_); - - const Shape& shape = recv_request.shape(); - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_recv_request() = recv_request; - - VLOG(1) << "AddRecvInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << recv_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddPadInstruction( - const PadRequest& pad_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(pad_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* padding_value, - LookUpRequest(pad_request.padding_value())); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferPadShape( - operand->output_shape(), - padding_value->output_shape(), - pad_request.padding_config())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = 
inferred_shape; - *request.mutable_request()->mutable_pad_request() = pad_request; - - VLOG(1) << "AddPadInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << pad_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConstantInstruction( - const ConstantRequest& constant_request) { - const Shape& validated_shape = constant_request.literal().shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - tensorflow::mutex_lock lock(mutex_); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_constant_request() = constant_request; - - VLOG(1) << "AddConstantInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle(); - return handle; -} - -StatusOr UserComputation::AddGatherInstruction( - const GatherRequest& gather_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* input_request, - LookUpRequest(gather_request.input())); - TF_ASSIGN_OR_RETURN(const OperationRequest* gather_indices_request, - LookUpRequest(gather_request.gather_indices())); - - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferGatherShape( - input_request->output_shape(), gather_indices_request->output_shape(), - gather_request.dimension_numbers(), - AsInt64Slice(gather_request.window_bounds()))); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_gather_request() = gather_request; - - VLOG(1) << "AddGatherInstruction (" << 
GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << gather_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddGetTupleElementInstruction( - const GetTupleElementRequest& get_tuple_element_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(get_tuple_element_request.operand())); - if (!ShapeUtil::IsTuple(operand->output_shape())) { - return InvalidArgument( - "Operand to GetTupleElement() is not a tuple; got %s", - ShapeUtil::HumanString(operand->output_shape()).c_str()); - } - Shape element_shape = ShapeUtil::GetTupleElementShape( - operand->output_shape(), get_tuple_element_request.index()); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = element_shape; - *request.mutable_request()->mutable_get_tuple_element_request() = - get_tuple_element_request; - - VLOG(1) << "AddGetTupleElementInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << get_tuple_element_request.ShortDebugString(); - return handle; -} - -Status UserComputation::AddTraceInstruction(const TraceRequest& trace_request) { - tensorflow::mutex_lock lock(mutex_); - - // Verify that the operand index is valid. 
- TF_RETURN_IF_ERROR(LookUpRequest(trace_request.operand()).status()); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = ShapeUtil::MakeNil(); - *request.mutable_request()->mutable_trace_request() = trace_request; - - VLOG(1) << "AddTraceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << trace_request.ShortDebugString(); - return Status::OK(); -} - -StatusOr UserComputation::AddRngInstruction( - const RngRequest& rng_request) { - tensorflow::mutex_lock lock(mutex_); - - // Check the number of parameters per RNG distribution. - switch (rng_request.distribution()) { - case RandomDistribution::RNG_NORMAL: - case RandomDistribution::RNG_UNIFORM: - if (rng_request.parameter_size() != 2) { - return InvalidArgument( - "RNG distribution (%s) expects 2 parameters, but got %d", - RandomDistribution_Name(rng_request.distribution()).c_str(), - rng_request.parameter_size()); - } - break; - default: - LOG(FATAL) << "unhandled distribution " << rng_request.distribution(); - } - - // Verify that the parameter indices are valid; - for (const ComputationDataHandle& param : rng_request.parameter()) { - TF_RETURN_IF_ERROR(LookUpRequest(param).status()); - } - const Shape& validated_shape = rng_request.shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_rng_request() = rng_request; - - VLOG(1) << "AddRngInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << 
rng_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddMapInstruction( - const MapRequest& map_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : map_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferMapShape(operand_shapes, *to_apply_program_shape, - AsInt64Slice(map_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_map_request() = map_request; - - VLOG(1) << "AddMapInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << map_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReduceInstruction( - const ReduceRequest& reduce_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(reduce_request.init_value())); - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - 
to_apply_computation.ComputeProgramShape(to_apply_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReduceShape( - operand->output_shape(), init_value->output_shape(), - AsInt64Slice(reduce_request.dimensions()), *to_apply_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_reduce_request() = reduce_request; - - VLOG(1) << "AddReduceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reduce_request.ShortDebugString(); - return handle; -} - -StatusOr -UserComputation::AddBatchNormTrainingInstruction( - const BatchNormTrainingRequest& batch_norm_training_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(batch_norm_training_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_training_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* offset, - LookUpRequest(batch_norm_training_request.offset())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferBatchNormTrainingShape( - operand->output_shape(), scale->output_shape(), - offset->output_shape(), batch_norm_training_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_training_request() = - batch_norm_training_request; - - VLOG(1) << "AddBatchNormTrainingInstruction (" << 
GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << batch_norm_training_request.ShortDebugString(); - - return handle; -} - -StatusOr -UserComputation::AddBatchNormInferenceInstruction( - const BatchNormInferenceRequest& batch_norm_inference_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(batch_norm_inference_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_inference_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* offset, - LookUpRequest(batch_norm_inference_request.offset())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* mean, - LookUpRequest(batch_norm_inference_request.mean())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* variance, - LookUpRequest(batch_norm_inference_request.variance())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferBatchNormInferenceShape( - operand->output_shape(), scale->output_shape(), - offset->output_shape(), mean->output_shape(), - variance->output_shape(), - batch_norm_inference_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_inference_request() = - batch_norm_inference_request; - - VLOG(1) << "AddBatchNormInferenceInstruction (" - << GetVersionedHandleInternal() << "), data handle " - << handle.handle() << ": " - << batch_norm_inference_request.ShortDebugString(); - - return handle; -} - -StatusOr UserComputation::AddBatchNormGradInstruction( - const BatchNormGradRequest& batch_norm_grad_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - 
LookUpRequest(batch_norm_grad_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_grad_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* mean, - LookUpRequest(batch_norm_grad_request.mean())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* variance, - LookUpRequest(batch_norm_grad_request.variance())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* grad_output, - LookUpRequest(batch_norm_grad_request.grad_output())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferBatchNormGradShape( - operand->output_shape(), scale->output_shape(), mean->output_shape(), - variance->output_shape(), grad_output->output_shape(), - batch_norm_grad_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_grad_request() = - batch_norm_grad_request; - - VLOG(1) << "AddBatchNormGradInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << batch_norm_grad_request.ShortDebugString(); - - return handle; -} - -StatusOr UserComputation::AddReduceWindowInstruction( - const ReduceWindowRequest& reduce_window_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_window_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(reduce_window_request.init_value())); - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - - TF_ASSIGN_OR_RETURN( - Shape 
inferred_shape, - ShapeInference::InferReduceWindowShape( - operand->output_shape(), init_value->output_shape(), - reduce_window_request.window(), *to_apply_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_reduce_window_request() = - reduce_window_request; - - VLOG(1) << "AddReduceWindowInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reduce_window_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddSelectAndScatterInstruction( - const SelectAndScatterRequest& select_and_scatter_request, - const UserComputation& select_computation, - const UserComputation& scatter_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(select_and_scatter_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* source, - LookUpRequest(select_and_scatter_request.source())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(select_and_scatter_request.init_value())); - - VersionedComputationHandle::Version select_version = - select_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr select_program_shape, - select_computation.ComputeProgramShape(select_version)); - VersionedComputationHandle::Version scatter_version = - scatter_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr scatter_program_shape, - scatter_computation.ComputeProgramShape(scatter_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferSelectAndScatterShape( - operand->output_shape(), *select_program_shape, - select_and_scatter_request.window(), source->output_shape(), 
- init_value->output_shape(), *scatter_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(select_version); - request.add_embedded_computation_versions(scatter_version); - *request.mutable_request()->mutable_select_and_scatter_request() = - select_and_scatter_request; - - VLOG(1) << "AddSelectAndScatterInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << select_and_scatter_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReverseInstruction( - const ReverseRequest& reverse_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reverse_request.operand())); - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReverseShape( - operand->output_shape(), AsInt64Slice(reverse_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_reverse_request() = reverse_request; - VLOG(1) << "AddReverseInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reverse_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddWhileInstruction( - const WhileRequest& while_request, - const UserComputation& condition_computation, - const UserComputation& body_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* init, - LookUpRequest(while_request.init())); - - VersionedComputationHandle::Version 
condition_version = - condition_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr condition_program_shape, - condition_computation.ComputeProgramShape(condition_version)); - - VersionedComputationHandle::Version body_version = body_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr body_program_shape, - body_computation.ComputeProgramShape(body_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferWhileShape( - *condition_program_shape, *body_program_shape, init->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(condition_version); - request.add_embedded_computation_versions(body_version); - *request.mutable_request()->mutable_while_request() = while_request; - - VLOG(1) << "AddWhileInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << while_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConditionalInstruction( - const ConditionalRequest& conditional_request, - const UserComputation& true_computation, - const UserComputation& false_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* pred, - LookUpRequest(conditional_request.predicate())); - TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand, - LookUpRequest(conditional_request.true_operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand, - LookUpRequest(conditional_request.false_operand())); - - VersionedComputationHandle::Version true_computation_version = - true_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr true_computation_shape, - true_computation.ComputeProgramShape(true_computation_version)); - - 
VersionedComputationHandle::Version false_computation_version = - false_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr false_computation_shape, - false_computation.ComputeProgramShape(false_computation_version)); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferConditionalShape( - pred->output_shape(), true_operand->output_shape(), - false_operand->output_shape(), - *true_computation_shape, *false_computation_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(true_computation_version); - request.add_embedded_computation_versions(false_computation_version); - *request.mutable_request()->mutable_conditional_request() = - conditional_request; - - VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << conditional_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBroadcastInstruction( - const BroadcastRequest& broadcast_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. 
- TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(broadcast_request.operand())); - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferBroadcastShape( - operand->output_shape(), - AsInt64Slice(broadcast_request.broadcast_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_broadcast_request() = broadcast_request; - - VLOG(1) << "AddBroadcastInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << broadcast_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReshapeInstruction( - const ReshapeRequest& reshape_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reshape_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReshapeShape( - operand->output_shape(), AsInt64Slice(reshape_request.dimensions()), - AsInt64Slice(reshape_request.new_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_reshape_request() = reshape_request; - - VLOG(1) << "AddReshapeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reshape_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddTransposeInstruction( - const TransposeRequest& transpose_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. 
- TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(transpose_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferTransposeShape( - operand->output_shape(), - AsInt64Slice(transpose_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_transpose_request() = transpose_request; - - VLOG(1) << "AddTransposeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << transpose_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddSliceInstruction( - const SliceRequest& slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(slice_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferSliceShape( - operand->output_shape(), AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()), - AsInt64Slice(slice_request.strides()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_slice_request() = slice_request; - - VLOG(1) << "AddSliceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << slice_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddDynamicSliceInstruction( - const DynamicSliceRequest& dynamic_slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - 
LookUpRequest(dynamic_slice_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* start_indices, - LookUpRequest(dynamic_slice_request.start_indices())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferDynamicSliceShape( - operand->output_shape(), start_indices->output_shape(), - AsInt64Slice(dynamic_slice_request.slice_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_dynamic_slice_request() = - dynamic_slice_request; - - VLOG(1) << "AddDynamicSliceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << dynamic_slice_request.ShortDebugString(); - return handle; -} - -StatusOr -UserComputation::AddDynamicUpdateSliceInstruction( - const DynamicUpdateSliceRequest& dynamic_update_slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(dynamic_update_slice_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* update, - LookUpRequest(dynamic_update_slice_request.update())); - - TF_ASSIGN_OR_RETURN( - const OperationRequest* start_indices, - LookUpRequest(dynamic_update_slice_request.start_indices())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, - ShapeInference::InferDynamicUpdateSliceShape( - operand->output_shape(), update->output_shape(), - start_indices->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_dynamic_update_slice_request() = - dynamic_update_slice_request; - - VLOG(1) << 
"AddDynamicUpdateSliceInstruction (" - << GetVersionedHandleInternal() << "), data handle " - << handle.handle() << ": " - << dynamic_update_slice_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConcatenateInstruction( - const ConcatenateRequest& concatenate_request) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : concatenate_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - TF_ASSIGN_OR_RETURN(Shape new_shape, - ShapeInference::InferConcatOpShape( - operand_shapes, concatenate_request.dimension())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_concatenate_request() = - concatenate_request; - - VLOG(1) << "AddConcatenateInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << concatenate_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConvertInstruction( - const ConvertRequest& convert_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(convert_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape( - operand->output_shape(), - convert_request.new_element_type())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_convert_request() = convert_request; - - VLOG(1) << "AddConvertInstruction (" 
<< GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convert_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBitcastConvertInstruction( - const ConvertRequest& convert_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(convert_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape( - operand->output_shape(), - convert_request.new_element_type())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_bitcast_convert_request() = - convert_request; - - VLOG(1) << "AddBitcastConvertInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convert_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReducePrecisionInstruction( - const ReducePrecisionRequest& reduce_precision_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_precision_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferReducePrecisionShape( - operand->output_shape(), reduce_precision_request.exponent_bits(), - reduce_precision_request.mantissa_bits())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_reduce_precision_request() = - reduce_precision_request; - - VLOG(1) << "AddReducePrecisionInstruction (" << GetVersionedHandleInternal() - << "), data handle " << 
handle.handle() << ": " - << reduce_precision_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConvolveInstruction( - const ConvolveRequest& convolve_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(convolve_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(convolve_request.rhs())); - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvolveShape( - lhs->output_shape(), rhs->output_shape(), - convolve_request.window(), - convolve_request.dimension_numbers())); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_convolve_request() = convolve_request; - - VLOG(1) << "AddConvolveInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convolve_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddFftInstruction( - const FftRequest& fft_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(fft_request.operand())); - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferFftShape( - operand->output_shape(), fft_request.fft_type(), - AsInt64Slice(fft_request.fft_length()))); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_fft_request() = fft_request; - - VLOG(1) << "AddFftInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << fft_request.ShortDebugString(); - return 
handle; -} - -StatusOr UserComputation::AddCrossReplicaSumInstruction( - const CrossReplicaSumRequest& cross_replica_sum_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(cross_replica_sum_request.operand())); - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape( - {&operand->output_shape()})); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_cross_replica_sum_request() = - cross_replica_sum_request; - - VLOG(1) << "AddCrossreplicaSumInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << cross_replica_sum_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddInfeedInstruction( - const InfeedRequest& infeed_request) { - tensorflow::mutex_lock lock(mutex_); - - const Shape& shape = infeed_request.shape(); - if (!LayoutUtil::HasLayout(shape)) { - return InvalidArgument("Given shape to Infeed must have a layout"); - } - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_infeed_request() = infeed_request; - - VLOG(1) << "AddInfeedInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << infeed_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddOutfeedInstruction( - const OutfeedRequest& outfeed_request) { - tensorflow::mutex_lock lock(mutex_); - - const Shape& shape = outfeed_request.shape(); - if (!LayoutUtil::HasLayout(shape)) { - return InvalidArgument("Given 
shape to Outfeed must have a layout"); - } - - // Verify that operand is valid. - TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status()); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_outfeed_request() = outfeed_request; - - VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << outfeed_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddCallInstruction( - const CallRequest& call_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : call_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferCallShape(operand_shapes, *to_apply_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_call_request() = call_request; - - VLOG(1) << "AddCallInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << call_request.ShortDebugString(); - return handle; -} - -StatusOr 
UserComputation::AddCustomCallInstruction( - const CustomCallRequest& custom_call_request) { - tensorflow::mutex_lock lock(mutex_); - - for (const ComputationDataHandle& handle : custom_call_request.operands()) { - TF_RETURN_IF_ERROR(LookUpRequest(handle).status()); - } - - if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(), - "$")) { - return InvalidArgument( - "Invalid custom_call_target \"%s\": Call targets that start with '$' " - "are reserved for internal use.", - custom_call_request.call_target_name().c_str()); - } - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = custom_call_request.shape(); - *request.mutable_request()->mutable_custom_call_request() = - custom_call_request; - - VLOG(1) << "AddCustomCallInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << custom_call_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddHostComputeInstruction( - const HostComputeRequest& host_compute_request) { - tensorflow::mutex_lock lock(mutex_); - - for (const ComputationDataHandle& handle : host_compute_request.operands()) { - TF_RETURN_IF_ERROR(LookUpRequest(handle).status()); - } - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = host_compute_request.shape(); - *request.mutable_request()->mutable_host_compute_request() = - host_compute_request; - - VLOG(1) << "AddHostComputeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << host_compute_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddDotInstruction( - const 
DotRequest& dot_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(dot_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(dot_request.rhs())); - - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape( - lhs->output_shape(), rhs->output_shape(), - dot_request.dimension_numbers())); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_dot_request() = dot_request; - - VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << dot_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddUnaryInstruction( - const UnaryOpRequest& unary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(unary_request.operand())); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferUnaryOpShape(unary_request.unop(), - operand->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_unary_op_request() = unary_request; - - VLOG(1) << "AddUnaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << unary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBinaryInstruction( - const BinaryOpRequest& binary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(binary_request.lhs())); - 
TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(binary_request.rhs())); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferBinaryOpShape( - binary_request.binop(), lhs->output_shape(), rhs->output_shape(), - AsInt64Slice(binary_request.broadcast_dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_binary_op_request() = binary_request; - - VLOG(1) << "AddBinaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << binary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddTernaryInstruction( - const TernaryOpRequest& ternary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(ternary_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(ternary_request.rhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* ehs, - LookUpRequest(ternary_request.ehs())); - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferTernaryOpShape( - ternary_request.triop(), lhs->output_shape(), - rhs->output_shape(), ehs->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_ternary_op_request() = ternary_request; - - VLOG(1) << "AddTernaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << ternary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddVariadicInstruction( - const VariadicOpRequest& variadic_request) { - 
tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : variadic_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferVariadicOpShape( - variadic_request.varop(), operand_shapes)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_variadic_op_request() = variadic_request; - - VLOG(1) << "AddVariadicInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << variadic_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::GetShape(const ComputationDataHandle& handle) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - return operand->output_shape(); -} - -Status UserComputation::SetOpMetadata(const ComputationDataHandle& handle, - const OpMetadata& metadata) { - tensorflow::mutex_lock lock(mutex_); - - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { - return InvalidArgument("Invalid handle in SetOpMetadata (%lld)", - handle_value); - } - *session_computation_.mutable_requests() - ->at(handle_value) - .mutable_request() - ->mutable_metadata() = metadata; - return Status::OK(); -} - -Status UserComputation::SetOpSharding(const ComputationDataHandle& handle, - const OpSharding& sharding) { - tensorflow::mutex_lock lock(mutex_); - - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { - return InvalidArgument("Invalid handle in SetOpSharding (%lld)", - handle_value); - } - 
*session_computation_.mutable_requests() - ->at(handle_value) - .mutable_request() - ->mutable_sharding() = sharding; - return Status::OK(); -} - -Status UserComputation::SetReturnValue(const ComputationDataHandle& handle) { - tensorflow::mutex_lock lock(mutex_); - - if (!(handle.handle() > 0 && handle.handle() < next_handle_value_)) { - return InvalidArgument("Invalid handle in SetReturnValue"); - } - - handle_to_return_ = handle; - - VLOG(1) << "SetReturnValue of computation \"" << name() << "\" fixed to " - << GetVersionedHandleInternal(); - - return Status::OK(); -} - -VersionedComputationHandle UserComputation::GetVersionedHandle() const { - tensorflow::mutex_lock lock(mutex_); - return GetVersionedHandleInternal(); -} - -VersionedComputationHandle UserComputation::GetVersionedHandleInternal() const { - VersionedComputationHandle versioned_handle; - versioned_handle.handle = session_computation_.computation_handle(); - - if (handle_to_return_.handle() > 0) { - // A specific handle has been requested for the result of the computation. - versioned_handle.version = handle_to_return_.handle(); - } else { - // A version value is simply the most recently assigned - // ComputationDataHandle value, ie the handle value of the root of the - // computation. - versioned_handle.version = next_handle_value_ - 1; - } - - return versioned_handle; -} - -VersionedComputationHandle UserComputation::GetVersionedHandleAtOperation( - const ComputationDataHandle& operation) const { - tensorflow::mutex_lock lock(mutex_); - - // The version at which an operation was added is simply the handle value of - // the ComputationDataHandle. 
- VersionedComputationHandle versioned_handle; - versioned_handle.handle = session_computation_.computation_handle(); - versioned_handle.version = operation.handle(); - return versioned_handle; -} - -VersionedComputationHandle::Version UserComputation::version() const { - return GetVersionedHandle().version; -} - -namespace { - -// Returns true if the operation type corresponding to the given opcase can be -// the root of the computation. -bool CanBeRoot(const OpRequest::OpCase& op_case) { - switch (op_case) { - case OpRequest::kTraceRequest: - case OpRequest::kSendRequest: - case OpRequest::kOutfeedRequest: - return false; - default: - return true; - } -} - -// Returns a pointer to the operation with the given data handle value in the -// given SessionComputation. -StatusOr LookUpRequest( - int64 handle_value, const SessionComputation& session_computation) { - if (session_computation.requests().count(handle_value) == 0) { - return InvalidArgument("no ComputationDataHandle value %lld", handle_value); - } - return &session_computation.requests().at(handle_value); -} - -// Returns the OperationRequest corresponding to the root (result) of the -// session computation. -StatusOr GetRoot( - VersionedComputationHandle::Version version, - const SessionComputation& session_computation) { - TF_RET_CHECK(version > 0); - // Not all instructions can be roots. Walk backwards from the operation - // indicated by this version until a valid root is found. 
- const OperationRequest* root_request = nullptr; - while (version > 0) { - TF_ASSIGN_OR_RETURN(root_request, - LookUpRequest(version, session_computation)); - if (CanBeRoot(root_request->request().op_case())) { - break; - } - version--; - } - if (version == 0) { - return InternalError("Computation contains no root operation"); - } - return root_request; -} - -} // namespace - -StatusOr> -UserComputation::ComputeProgramShape( - VersionedComputationHandle::Version version) const { - tensorflow::mutex_lock lock(mutex_); - - TF_RET_CHECK(version > 0 && version < next_handle_value_); - - if (program_shape_ == nullptr || program_shape_version_ != version) { - // ProgramShape has not been computed yet, or is for different - // version. Compute it now. - TF_RETURN_IF_ERROR(CheckParametersAreContiguous(version)); - - auto program_shape = MakeUnique(); - for (int64 request_num = 1; request_num <= version; ++request_num) { - const OperationRequest& request = - session_computation_.requests().at(request_num); - if (request.request().op_case() == OpRequest::kParameterRequest) { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - int64 param_no = parameter_request.parameter(); - // Parameters may be out of order so expand ProgramShape parameters - // until it is at least large enough to hold the current parameter - // number. - while (program_shape->parameters_size() <= param_no) { - program_shape->add_parameters(); - program_shape->add_parameter_names(); - } - *program_shape->mutable_parameters(param_no) = request.output_shape(); - *program_shape->mutable_parameter_names(param_no) = - parameter_request.name(); - } - } - - // The root determines the output shape. 
- TF_ASSIGN_OR_RETURN(const OperationRequest* root_request, - GetRoot(version, session_computation_)); - *program_shape->mutable_result() = root_request->output_shape(); - if (ShapeUtil::IsOpaque(program_shape->result())) { - return Unimplemented("Computation results cannot be opaque"); - } - - program_shape_ = std::move(program_shape); - program_shape_version_ = version; - } - - return program_shape_; -} - -namespace { - -// A visitor which checks whether an operation is pure functional meaning that -// it doesn't depend on any parameter with an index higher then num_parameters. -// The visitor walks the computation starting at a given operation and sets -// is_functional to false iff a parameter or RNG operation is encountered. -void PureFunctionalVisitor(const SessionComputation& session_computation, - const ComputationDataHandle& handle, - int64 num_parameters, std::set* visited, - bool* is_functional) { - if (visited->count(handle.handle()) != 0 || !*is_functional) { - return; - } - - const OperationRequest& request = - session_computation.requests().at(handle.handle()); - switch (request.request().op_case()) { - case OpRequest::kRngRequest: - *is_functional = false; - break; - - case OpRequest::kConstantRequest: - break; - - case OpRequest::kGetTupleElementRequest: { - const GetTupleElementRequest& get_tuple_element_request = - request.request().get_tuple_element_request(); - PureFunctionalVisitor(session_computation, - get_tuple_element_request.operand(), num_parameters, - visited, is_functional); - break; - } - - case OpRequest::kSliceRequest: { - const SliceRequest& slice_request = request.request().slice_request(); - PureFunctionalVisitor(session_computation, slice_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kDynamicSliceRequest: { - const DynamicSliceRequest& dynamic_slice_request = - request.request().dynamic_slice_request(); - PureFunctionalVisitor(session_computation, - 
dynamic_slice_request.operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - dynamic_slice_request.start_indices(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kDynamicUpdateSliceRequest: { - const DynamicUpdateSliceRequest& dynamic_update_slice_request = - request.request().dynamic_update_slice_request(); - PureFunctionalVisitor(session_computation, - dynamic_update_slice_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - dynamic_update_slice_request.update(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - dynamic_update_slice_request.start_indices(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kConcatenateRequest: { - const ConcatenateRequest& concatenate_request = - request.request().concatenate_request(); - for (const ComputationDataHandle& handle : - concatenate_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - break; - } - - case OpRequest::kConvolveRequest: { - const ConvolveRequest& convolve_request = - request.request().convolve_request(); - PureFunctionalVisitor(session_computation, convolve_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, convolve_request.rhs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kFftRequest: { - const FftRequest& fft_request = request.request().fft_request(); - PureFunctionalVisitor(session_computation, fft_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kCrossReplicaSumRequest: { - // TODO(b/33009255): Implmement constant folding for cross replica sum. 
- *is_functional = false; - break; - } - - case OpRequest::kInfeedRequest: { - *is_functional = false; - break; - } - - case OpRequest::kOutfeedRequest: { - *is_functional = false; - break; - } - - case OpRequest::kHostComputeRequest: { - *is_functional = false; - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - for (const ComputationDataHandle& handle : call_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - // TODO(b/32495713): We aren't checking the to_apply computation itself, - // so we conservatively say that computations containing the Call op - // cannot be constant. We cannot set is_functional=false in other similar - // cases since we're already relying on IsConstant to return true. - *is_functional = false; - break; - } - - case OpRequest::kCustomCallRequest: { - *is_functional = false; - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - PureFunctionalVisitor(session_computation, dot_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, dot_request.rhs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kSendRequest: { - *is_functional = false; - break; - } - - case OpRequest::kRecvRequest: { - *is_functional = false; - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& map_request = request.request().map_request(); - for (const ComputationDataHandle& handle : map_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - // TODO(b/32495713): We aren't checking the to_apply computation itself. 
- break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - PureFunctionalVisitor(session_computation, reduce_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, reduce_request.init_value(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the to_apply computation itself. - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - PureFunctionalVisitor(session_computation, - reduce_window_request.operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - reduce_window_request.init_value(), num_parameters, - visited, is_functional); - // TODO(b/32495713): We aren't checking the to_apply computation itself. - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.source(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.init_value(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the select and scatter - // computations themselves. 
- break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - PureFunctionalVisitor(session_computation, broadcast_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - PureFunctionalVisitor(session_computation, reshape_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - PureFunctionalVisitor(session_computation, reverse_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kPadRequest: { - const PadRequest& pad_request = request.request().pad_request(); - PureFunctionalVisitor(session_computation, pad_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, pad_request.padding_value(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kParameterRequest: { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - if (parameter_request.parameter() >= num_parameters) { - *is_functional = false; - } - break; - } - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - PureFunctionalVisitor(session_computation, convert_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - PureFunctionalVisitor(session_computation, convert_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - 
PureFunctionalVisitor(session_computation, while_request.init(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the condition and body - // computations themselves. - *is_functional = false; - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - PureFunctionalVisitor(session_computation, - conditional_request.predicate(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - conditional_request.true_operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - conditional_request.false_operand(), num_parameters, - visited, is_functional); - // TODO(b/32495713): We aren't checking the true and false computations - // themselves. - break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - PureFunctionalVisitor(session_computation, ternary_op_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, ternary_op_request.rhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, ternary_op_request.ehs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - PureFunctionalVisitor(session_computation, transpose_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& 
unary_op_request = - request.request().unary_op_request(); - PureFunctionalVisitor(session_computation, unary_op_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.scale(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.offset(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.scale(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.offset(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.mean(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.variance(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.scale(), num_parameters, - visited, is_functional); - 
PureFunctionalVisitor(session_computation, batch_norm_grad_request.mean(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.variance(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.grad_output(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBinaryOpRequest: { - const BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - PureFunctionalVisitor(session_computation, binary_op_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, binary_op_request.rhs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kGatherRequest: { - PureFunctionalVisitor(session_computation, - request.request().gather_request().input(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - request.request().gather_request().gather_indices(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } - if (!*is_functional) { - VLOG(1) << "Non-functional: " << request.request().DebugString(); - } - visited->insert(handle.handle()); -} - -} // namespace - -StatusOr UserComputation::IsConstant(const ComputationDataHandle& handle, - int64 num_parameters) { - tensorflow::mutex_lock lock(mutex_); - - // Verify that the handle is valid. 
- auto operation_status = LookUpRequest(handle); - if (!operation_status.ok()) { - return operation_status.status(); - } - - bool is_constant = true; - std::set visited; - PureFunctionalVisitor(session_computation_, handle, num_parameters, &visited, - &is_constant); - - return is_constant; -} - -std::vector -UserComputation::GetEmbeddedComputations( - VersionedComputationHandle::Version version) const { - tensorflow::mutex_lock lock(mutex_); - - VLOG(1) - << "GetEmbeddedComputations(" << name() << " " - << VersionedComputationHandle{session_computation_.computation_handle(), - version} - << ")"; - XLA_VLOG_LINES(3, session_computation_.DebugString()); - - std::vector computations; - std::vector sorted_handles; - for (const auto& handle_request : session_computation_.requests()) { - sorted_handles.push_back(handle_request.first); - } - std::sort(sorted_handles.begin(), sorted_handles.end()); - for (int64 handle : sorted_handles) { - const auto& handle_request = session_computation_.requests().find(handle); - CHECK(handle_request != session_computation_.requests().end()); - int64 handle_value = handle_request->first; - if (handle_value <= version) { - const OperationRequest& request = handle_request->second; - switch (request.request().op_case()) { - case OpRequest::kCallRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const CallRequest& call_request = request.request().call_request(); - const VersionedComputationHandle versioned_handle = { - call_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kMapRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const MapRequest& map_request = request.request().map_request(); - const VersionedComputationHandle versioned_handle = { - map_request.to_apply(), request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kReduceRequest: { 
- CHECK_EQ(1, request.embedded_computation_versions_size()); - const ReduceRequest& reduce_request = - request.request().reduce_request(); - const VersionedComputationHandle versioned_handle = { - reduce_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kReduceWindowRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - const VersionedComputationHandle versioned_handle = { - reduce_window_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - CHECK_EQ(2, request.embedded_computation_versions_size()); - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - const VersionedComputationHandle select_versioned_handle = { - select_and_scatter_request.select(), - request.embedded_computation_versions(0)}; - computations.push_back(select_versioned_handle); - const VersionedComputationHandle scatter_versioned_handle = { - select_and_scatter_request.scatter(), - request.embedded_computation_versions(1)}; - computations.push_back(scatter_versioned_handle); - break; - } - - case OpRequest::kWhileRequest: { - CHECK_EQ(2, request.embedded_computation_versions_size()); - const WhileRequest& while_request = request.request().while_request(); - const VersionedComputationHandle condition_versioned_handle = { - while_request.condition(), - request.embedded_computation_versions(0)}; - computations.push_back(condition_versioned_handle); - const VersionedComputationHandle body_versioned_handle = { - while_request.body(), request.embedded_computation_versions(1)}; - computations.push_back(body_versioned_handle); - break; - } - - case OpRequest::kConditionalRequest: { - CHECK_EQ(2, 
request.embedded_computation_versions_size()); - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - const VersionedComputationHandle true_computation_versioned_handle = { - conditional_request.true_computation(), - request.embedded_computation_versions(0)}; - computations.push_back(true_computation_versioned_handle); - const VersionedComputationHandle false_computation_versioned_handle = - {conditional_request.false_computation(), - request.embedded_computation_versions(1)}; - computations.push_back(false_computation_versioned_handle); - break; - } - - default: - // No embedded computation. - break; - } - } - } - VLOG(2) << "Embedded computations: " - << tensorflow::str_util::Join( - computations, ", ", - [](string* out, const VersionedComputationHandle& h) { - out->append(h.ToString()); - }); - return computations; -} - -StatusOr -UserComputation::LookUpRequestForErrorReporting( - const ComputationDataHandle& handle) const { - tensorflow::mutex_lock lock(mutex_); - return LookUpRequest(handle); -} - -tensorflow::gtl::optional UserComputation::ParameterMetadata( - int parameter_number) const { - tensorflow::mutex_lock lock(mutex_); - auto it = parameters_.find(parameter_number); - if (it == parameters_.end()) { - return tensorflow::gtl::nullopt; - } - OperationRequest* op = it->second; - return &op->request().metadata(); -} - -Status UserComputation::RemapEmbeddedComputations( - const std::map& old_to_new) { - auto update = [&old_to_new](ComputationHandle* to_update) -> Status { - int64 old = to_update->handle(); - auto it = old_to_new.find(old); - if (it == old_to_new.end()) { - string mapping = tensorflow::str_util::Join( - old_to_new, ", ", - [](string* out, std::pair element) { - tensorflow::strings::Appendf(out, "%lld:%lld", element.first, - element.second.handle()); - }); - return NotFound( - "could not find referenced (old) computation handle in mapping: " - "%lld; mapping: {%s}", - old, mapping.c_str()); - } - 
VLOG(2) << "remapping " << old << " to " << it->second.handle(); - *to_update = it->second; - return Status::OK(); - }; - TF_RETURN_IF_ERROR(update(session_computation_.mutable_computation_handle())); - for (auto& handle_request : *session_computation_.mutable_requests()) { - OperationRequest& request = handle_request.second; - switch (request.request().op_case()) { - case OpRequest::kCallRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - CallRequest* call_request = - request.mutable_request()->mutable_call_request(); - TF_RETURN_IF_ERROR(update(call_request->mutable_to_apply())); - break; - } - case OpRequest::kMapRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - MapRequest* map_request = - request.mutable_request()->mutable_map_request(); - TF_RETURN_IF_ERROR(update(map_request->mutable_to_apply())); - break; - } - case OpRequest::kReduceRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - ReduceRequest* reduce_request = - request.mutable_request()->mutable_reduce_request(); - TF_RETURN_IF_ERROR(update(reduce_request->mutable_to_apply())); - break; - } - case OpRequest::kReduceWindowRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - ReduceWindowRequest* reduce_window_request = - request.mutable_request()->mutable_reduce_window_request(); - TF_RETURN_IF_ERROR(update(reduce_window_request->mutable_to_apply())); - break; - } - case OpRequest::kSelectAndScatterRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - SelectAndScatterRequest* select_and_scatter_request = - request.mutable_request()->mutable_select_and_scatter_request(); - TF_RETURN_IF_ERROR( - update(select_and_scatter_request->mutable_select())); - TF_RETURN_IF_ERROR( - update(select_and_scatter_request->mutable_scatter())); - break; - } - case OpRequest::kWhileRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - WhileRequest* 
while_request = - request.mutable_request()->mutable_while_request(); - TF_RETURN_IF_ERROR(update(while_request->mutable_condition())); - TF_RETURN_IF_ERROR(update(while_request->mutable_body())); - break; - } - case OpRequest::kConditionalRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - ConditionalRequest* conditional_request = - request.mutable_request()->mutable_conditional_request(); - TF_RETURN_IF_ERROR( - update(conditional_request->mutable_true_computation())); - TF_RETURN_IF_ERROR( - update(conditional_request->mutable_false_computation())); - break; - } - default: - // No embedded computation. - TF_RET_CHECK(0 == request.embedded_computation_versions_size()); - break; - } - } - return Status::OK(); -} - -SessionComputation UserComputation::CloneSessionComputation( - VersionedComputationHandle::Version version) const { - tensorflow::mutex_lock lock(mutex_); - SessionComputation result = session_computation_; - // Erase all the requests that exceed the version specified. - // There's no lower_bound method on tensorflow::protobuf::Map so we iterate - // all the elements. - auto it = result.mutable_requests()->begin(); - while (it != result.mutable_requests()->end()) { - if (it->first > version) { - it = result.mutable_requests()->erase(it); - } else { - ++it; - } - } - return result; -} - -StatusOr UserComputation::LookUpRequest( - const ComputationDataHandle& handle) const { - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { - return InvalidArgument("no ComputationDataHandle value %lld", handle_value); - } - return &session_computation_.requests().at(handle_value); -} - -Status UserComputation::CheckParametersAreContiguous( - VersionedComputationHandle::Version version) const { - TF_RET_CHECK(version > 0 && version < next_handle_value_); - - // Determine number of parameter inputs at the given version. 
- std::map parameter_requests; - for (int64 request_num = 1; request_num <= version; ++request_num) { - const OperationRequest& request = - session_computation_.requests().at(request_num); - - if (request.request().op_case() == OpRequest::kParameterRequest) { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - // Duplicate parameters should be checked when parameter requests are - // added. - TF_RET_CHECK(0 == - parameter_requests.count(parameter_request.parameter())); - parameter_requests[parameter_request.parameter()] = ¶meter_request; - } - } - - for (int64 i = 0; i < parameter_requests.size(); ++i) { - auto it = parameter_requests.find(i); - if (it == parameter_requests.end()) { - return FailedPrecondition( - "computation %s does not have all its parameters populated " - "sequentially, missing parameter %lld", - name_.c_str(), i); - } - } - - return Status::OK(); -} - -namespace { - -// Helper class which builds an HLO computation from a SessionComputation. To -// construct the HLO computation, the SessionComputation graph is walked in -// DFS order lowering each OperationRequest to an HLO instruction. 
-class ComputationLowerer { - public: - static StatusOr> Lower( - const string& computation_name, - const SessionComputation& session_computation, - VersionedComputationHandle::Version version, - UserComputation::HloComputationResolver hlo_resolver, - const DebugOptions& debug_options, - bool include_unreachable_instructions) { - ComputationLowerer lowerer(computation_name, session_computation, version, - std::move(hlo_resolver), debug_options, - include_unreachable_instructions); - return lowerer.Lower(); - } - - private: - ComputationLowerer(const string& computation_name, - const SessionComputation& session_computation, - VersionedComputationHandle::Version version, - UserComputation::HloComputationResolver hlo_resolver, - const DebugOptions& debug_options, - bool include_unreachable_instructions) - : hlo_builder_(computation_name), - session_computation_(session_computation), - version_(version), - hlo_resolver_(std::move(hlo_resolver)), - debug_options_(debug_options), - include_unreachable_instructions_(include_unreachable_instructions) {} - - // Build an HLO computation from the SessionComputation at the given - // version. - StatusOr> Lower(); - - private: - // Traverses the computation 'root' using a DFS, calling 'visit' in postorder. - void TraversePostorder( - const ComputationDataHandle& root, - std::unordered_map* visited, - const std::function& visit); - - // DFS visitor of the UserComputation operations which lowers the operations - // to HLO instructions. - void Visit(const ComputationDataHandle& handle, - std::unordered_map* instructions); - - // Resolves a ComputationHandle and Version to a previously lowered - // HloComputation using the hlo_resolver_ function. 
- HloComputation* ResolveComputation( - const ComputationHandle& handle, - VersionedComputationHandle::Version version); - - // This function takes an input value which is being implicitly broadcast into - // an output shape and figures out the right kBroadcast instruction(s) - // necessary to replicate the implicit broadcast semantics explicitly. - HloInstruction* ImplicitBroadcastToExplicitBroadcast( - HloInstruction* operand, const Shape& output_shape); - - HloComputation::Builder hlo_builder_; - const SessionComputation& session_computation_; - const VersionedComputationHandle::Version version_; - const UserComputation::HloComputationResolver hlo_resolver_; - const DebugOptions& debug_options_; - const bool include_unreachable_instructions_; -}; - -// Calls 'apply' on each operand of 'request'. -static void ForEachOperand( - const OperationRequest& request, - const std::function& apply) { - switch (request.request().op_case()) { - case OpRequest::kRngRequest: { - const RngRequest& rng_request = request.request().rng_request(); - for (const ComputationDataHandle& param : rng_request.parameter()) { - apply(param); - } - break; - } - - case OpRequest::kConstantRequest: - break; - case OpRequest::kGetTupleElementRequest: { - const GetTupleElementRequest& get_tuple_element_request = - request.request().get_tuple_element_request(); - apply(get_tuple_element_request.operand()); - break; - } - - case OpRequest::kSliceRequest: { - const SliceRequest& slice_request = request.request().slice_request(); - apply(slice_request.operand()); - break; - } - - case OpRequest::kDynamicSliceRequest: { - const DynamicSliceRequest& dynamic_slice_request = - request.request().dynamic_slice_request(); - apply(dynamic_slice_request.operand()); - apply(dynamic_slice_request.start_indices()); - break; - } - - case OpRequest::kDynamicUpdateSliceRequest: { - const DynamicUpdateSliceRequest& dynamic_update_slice_request = - request.request().dynamic_update_slice_request(); - 
apply(dynamic_update_slice_request.operand()); - apply(dynamic_update_slice_request.update()); - apply(dynamic_update_slice_request.start_indices()); - break; - } - - case OpRequest::kConcatenateRequest: { - const ConcatenateRequest& concatenate_request = - request.request().concatenate_request(); - for (const ComputationDataHandle& handle : - concatenate_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kConvolveRequest: { - const ConvolveRequest& convolve_request = - request.request().convolve_request(); - apply(convolve_request.lhs()); - apply(convolve_request.rhs()); - break; - } - - case OpRequest::kFftRequest: { - const FftRequest& fft_request = request.request().fft_request(); - apply(fft_request.operand()); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - - apply(batch_norm_training_request.operand()); - apply(batch_norm_training_request.scale()); - apply(batch_norm_training_request.offset()); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - - apply(batch_norm_inference_request.operand()); - apply(batch_norm_inference_request.scale()); - apply(batch_norm_inference_request.offset()); - apply(batch_norm_inference_request.mean()); - apply(batch_norm_inference_request.variance()); - break; - } - - case OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - - apply(batch_norm_grad_request.operand()); - apply(batch_norm_grad_request.scale()); - apply(batch_norm_grad_request.mean()); - apply(batch_norm_grad_request.variance()); - apply(batch_norm_grad_request.grad_output()); - break; - } - - case OpRequest::kCrossReplicaSumRequest: { - const CrossReplicaSumRequest& cross_replica_sum_request = - 
request.request().cross_replica_sum_request(); - apply(cross_replica_sum_request.operand()); - break; - } - - case OpRequest::kInfeedRequest: - break; - - case OpRequest::kOutfeedRequest: { - const OutfeedRequest& outfeed_request = - request.request().outfeed_request(); - apply(outfeed_request.operand()); - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& map_request = request.request().map_request(); - for (const ComputationDataHandle& handle : map_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - apply(reduce_request.operand()); - apply(reduce_request.init_value()); - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - apply(reduce_window_request.operand()); - apply(reduce_window_request.init_value()); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - apply(select_and_scatter_request.operand()); - apply(select_and_scatter_request.source()); - apply(select_and_scatter_request.init_value()); - - break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - apply(broadcast_request.operand()); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - apply(reshape_request.operand()); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - apply(transpose_request.operand()); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - apply(reverse_request.operand()); - break; - } - - case 
OpRequest::kPadRequest: { - const PadRequest& pad_request = request.request().pad_request(); - apply(pad_request.operand()); - apply(pad_request.padding_value()); - break; - } - - case OpRequest::kRecvRequest: - case OpRequest::kParameterRequest: - break; - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - apply(convert_request.operand()); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - apply(convert_request.operand()); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - apply(while_request.init()); - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - apply(conditional_request.predicate()); - apply(conditional_request.true_operand()); - apply(conditional_request.false_operand()); - break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - apply(ternary_op_request.lhs()); - apply(ternary_op_request.rhs()); - apply(ternary_op_request.ehs()); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - for (const ComputationDataHandle& handle : call_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kCustomCallRequest: { - const CustomCallRequest& cc_request = - request.request().custom_call_request(); - for (const ComputationDataHandle& operand : cc_request.operands()) { - apply(operand); - } - break; - } - - case 
OpRequest::kHostComputeRequest: { - const HostComputeRequest& hc_request = - request.request().host_compute_request(); - for (const ComputationDataHandle& operand : hc_request.operands()) { - apply(operand); - } - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - apply(dot_request.rhs()); - apply(dot_request.lhs()); - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& unary_op_request = - request.request().unary_op_request(); - apply(unary_op_request.operand()); - break; - } - - case OpRequest::kBinaryOpRequest: { - const BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - apply(binary_op_request.rhs()); - apply(binary_op_request.lhs()); - break; - } - - case OpRequest::kReducePrecisionRequest: { - const ReducePrecisionRequest& reduce_precision_request = - request.request().reduce_precision_request(); - apply(reduce_precision_request.operand()); - break; - } - - case OpRequest::kTraceRequest: { - const TraceRequest& trace_request = request.request().trace_request(); - apply(trace_request.operand()); - break; - } - - case OpRequest::kSendRequest: { - const SendRequest& send_request = request.request().send_request(); - apply(send_request.operand()); - break; - } - - case OpRequest::kGatherRequest: { - const GatherRequest& gather_request = request.request().gather_request(); - apply(gather_request.input()); - apply(gather_request.gather_indices()); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } -} - -void ComputationLowerer::TraversePostorder( - const ComputationDataHandle& root, - std::unordered_map* visited, - const std::function& visit) { - // Stack containing {handle, enter} pairs. The 'enter' value describes whether - // we are entering or leaving 'handle'. 
- std::stack> work; - work.push({root, true}); - while (!work.empty()) { - ComputationDataHandle handle; - bool enter; - std::tie(handle, enter) = work.top(); - work.pop(); - - if (enter) { - // We are entering 'handle'. The first time we enter 'handle', we add it - // to 'visited' with a nullptr value. If 'handle' is already in 'visited', - // we do not visit it again. This algorithm only uses the presence of - // a handle in 'visited', but we use a map so we can use the same data - // structure to store the HloInstruction outputs. - if (visited->emplace(handle.handle(), nullptr).second) { - const OperationRequest& request = - session_computation_.requests().at(handle.handle()); - // Push the corresponding 'leave' action onto the stack, followed by - // the operands. - work.push({handle, false}); - ForEachOperand(request, [&work](const ComputationDataHandle& child) { - work.push({child, true}); - }); - } - } else { - // We are leaving 'handle'. We have visited the operands of 'handle', and - // now can visit the 'handle' itself. - visit(handle); - } - } -} - -StatusOr> ComputationLowerer::Lower() { - // Map from ComputationDataHandle to HLO instruction. Serves as a record of - // which operations have been visited as well as a cache for looking up - // ComputationDataHandles as HloInstructions. - std::unordered_map instructions; - - TF_ASSIGN_OR_RETURN(const OperationRequest* root_request, - GetRoot(version_, session_computation_)); - - auto visit = [&](const ComputationDataHandle& handle) { - Visit(handle, &instructions); - }; - TraversePostorder(root_request->output_handle(), &instructions, visit); - HloInstruction* hlo_root = - instructions.at(root_request->output_handle().handle()); - - if (include_unreachable_instructions_) { - // Iterate through all computation data handles, and visit any unvisited - // operations. 
- for (int64 request_num = 1; request_num <= version_; ++request_num) { - TF_ASSIGN_OR_RETURN(const OperationRequest* request, - LookUpRequest(request_num, session_computation_)); - TraversePostorder(request->output_handle(), &instructions, visit); - } - } - - return hlo_builder_.Build(hlo_root); -} - -HloComputation* ComputationLowerer::ResolveComputation( - const ComputationHandle& handle, - VersionedComputationHandle::Version version) { - const VersionedComputationHandle checked_handle = {handle, version}; - return hlo_resolver_(checked_handle); -} - -HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast( - HloInstruction* operand, const Shape& output_shape) { - auto fadd = [this](std::unique_ptr x) { - return hlo_builder_.AddInstruction(std::move(x)); - }; - return fadd( - HloInstruction::CreateBroadcastSequence(output_shape, operand, fadd)); -} - -void ComputationLowerer::Visit( - const ComputationDataHandle& handle, - std::unordered_map* instructions) { - CHECK_LE(handle.handle(), version_); - CHECK(instructions->at(handle.handle()) == nullptr); - const OperationRequest& request = - session_computation_.requests().at(handle.handle()); - auto add_instruction = [&](std::unique_ptr instruction) { - HloInstruction* hlo_instruction = - hlo_builder_.AddInstruction(std::move(instruction)); - hlo_instruction->set_metadata(request.request().metadata()); - if (request.request().has_sharding()) { - OpSharding op_sharding = request.request().sharding(); - hlo_instruction->set_sharding( - HloSharding::FromProto(op_sharding).ValueOrDie()); - } - return hlo_instruction; - }; - auto lookup_instruction = [&](const ComputationDataHandle& handle) { - return instructions->at(handle.handle()); - }; - HloInstruction* hlo_instruction; - switch (request.request().op_case()) { - case OpRequest::kRngRequest: { - const RngRequest& rng_request = request.request().rng_request(); - std::vector parameters; - for (const ComputationDataHandle& param : 
rng_request.parameter()) { - parameters.push_back(lookup_instruction(param)); - } - hlo_instruction = add_instruction(HloInstruction::CreateRng( - request.output_shape(), rng_request.distribution(), parameters)); - break; - } - - case OpRequest::kConstantRequest: { - const ConstantRequest& constant_request = - request.request().constant_request(); - hlo_instruction = add_instruction(HloInstruction::CreateConstant( - Literal::CreateFromProto(constant_request.literal()) - .ConsumeValueOrDie())); - break; - } - - case OpRequest::kGetTupleElementRequest: { - const GetTupleElementRequest& get_tuple_element_request = - request.request().get_tuple_element_request(); - HloInstruction* operand = - lookup_instruction(get_tuple_element_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement( - request.output_shape(), operand, get_tuple_element_request.index())); - break; - } - - case OpRequest::kSliceRequest: { - const SliceRequest& slice_request = request.request().slice_request(); - HloInstruction* operand = lookup_instruction(slice_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateSlice( - request.output_shape(), operand, - AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()), - AsInt64Slice(slice_request.strides()))); - break; - } - - case OpRequest::kDynamicSliceRequest: { - const DynamicSliceRequest& dynamic_slice_request = - request.request().dynamic_slice_request(); - HloInstruction* operand = - lookup_instruction(dynamic_slice_request.operand()); - HloInstruction* start_indices = - lookup_instruction(dynamic_slice_request.start_indices()); - - hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice( - request.output_shape(), operand, start_indices, - AsInt64Slice(dynamic_slice_request.slice_sizes()))); - break; - } - - case OpRequest::kDynamicUpdateSliceRequest: { - const DynamicUpdateSliceRequest& dynamic_update_slice_request = - 
request.request().dynamic_update_slice_request(); - HloInstruction* operand = - lookup_instruction(dynamic_update_slice_request.operand()); - HloInstruction* update = - lookup_instruction(dynamic_update_slice_request.update()); - HloInstruction* start_indices = - lookup_instruction(dynamic_update_slice_request.start_indices()); - hlo_instruction = - add_instruction(HloInstruction::CreateDynamicUpdateSlice( - request.output_shape(), operand, update, start_indices)); - break; - } - - case OpRequest::kConcatenateRequest: { - const ConcatenateRequest& concatenate_request = - request.request().concatenate_request(); - std::vector operands; - for (const ComputationDataHandle& handle : - concatenate_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - hlo_instruction = add_instruction(HloInstruction::CreateConcatenate( - request.output_shape(), operands, concatenate_request.dimension())); - break; - } - - case OpRequest::kConvolveRequest: { - const ConvolveRequest& convolve_request = - request.request().convolve_request(); - HloInstruction* lhs = lookup_instruction(convolve_request.lhs()); - HloInstruction* rhs = lookup_instruction(convolve_request.rhs()); - hlo_instruction = add_instruction(HloInstruction::CreateConvolve( - request.output_shape(), lhs, rhs, convolve_request.window(), - convolve_request.dimension_numbers())); - break; - } - - case OpRequest::kFftRequest: { - const FftRequest& fft_request = request.request().fft_request(); - HloInstruction* operand = lookup_instruction(fft_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateFft( - request.output_shape(), operand, fft_request.fft_type(), - AsInt64Slice(fft_request.fft_length()))); - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - HloInstruction* lhs = lookup_instruction(dot_request.lhs()); - HloInstruction* rhs = lookup_instruction(dot_request.rhs()); - 
hlo_instruction = add_instruction(HloInstruction::CreateDot( - request.output_shape(), lhs, rhs, dot_request.dimension_numbers())); - break; - } - - case OpRequest::kCrossReplicaSumRequest: { - const CrossReplicaSumRequest& cross_replica_sum_request = - request.request().cross_replica_sum_request(); - HloInstruction* operand = - lookup_instruction(cross_replica_sum_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum( - request.output_shape(), {operand})); - break; - } - - case OpRequest::kInfeedRequest: { - const InfeedRequest& infeed_request = request.request().infeed_request(); - hlo_instruction = add_instruction(HloInstruction::CreateInfeed( - request.output_shape(), infeed_request.config())); - break; - } - - case OpRequest::kOutfeedRequest: { - const OutfeedRequest& outfeed_request = - request.request().outfeed_request(); - HloInstruction* operand = lookup_instruction(outfeed_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateOutfeed( - outfeed_request.shape(), operand, outfeed_request.outfeed_config())); - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& map_request = request.request().map_request(); - std::vector operands; - for (const ComputationDataHandle& handle : map_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version map_version = - request.embedded_computation_versions(0); - HloComputation* map_computation = - ResolveComputation(map_request.to_apply(), map_version); - hlo_instruction = add_instruction(HloInstruction::CreateMap( - request.output_shape(), operands, map_computation)); - break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - HloInstruction* operand = lookup_instruction(reduce_request.operand()); - HloInstruction* init_value = - 
lookup_instruction(reduce_request.init_value()); - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version reduce_version = - request.embedded_computation_versions(0); - HloComputation* reduce_computation = - ResolveComputation(reduce_request.to_apply(), reduce_version); - hlo_instruction = add_instruction(HloInstruction::CreateReduce( - request.output_shape(), operand, init_value, - AsInt64Slice(reduce_request.dimensions()), reduce_computation)); - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - HloInstruction* operand = - lookup_instruction(reduce_window_request.operand()); - HloInstruction* init_value = - lookup_instruction(reduce_window_request.init_value()); - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version reduce_window_version = - request.embedded_computation_versions(0); - HloComputation* reduce_window_computation = ResolveComputation( - reduce_window_request.to_apply(), reduce_window_version); - hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow( - request.output_shape(), operand, init_value, - reduce_window_request.window(), reduce_window_computation)); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - HloInstruction* operand = - lookup_instruction(select_and_scatter_request.operand()); - HloInstruction* source = - lookup_instruction(select_and_scatter_request.source()); - HloInstruction* init_value = - lookup_instruction(select_and_scatter_request.init_value()); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version select_version = - request.embedded_computation_versions(0); - VersionedComputationHandle::Version scatter_version = - request.embedded_computation_versions(1); - 
HloComputation* select_computation = ResolveComputation( - select_and_scatter_request.select(), select_version); - HloComputation* scatter_computation = ResolveComputation( - select_and_scatter_request.scatter(), scatter_version); - hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter( - request.output_shape(), operand, select_computation, - select_and_scatter_request.window(), source, init_value, - scatter_computation)); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - HloInstruction* operand = - lookup_instruction(batch_norm_training_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_training_request.scale()); - HloInstruction* offset = - lookup_instruction(batch_norm_training_request.offset()); - - hlo_instruction = add_instruction(HloInstruction::CreateBatchNormTraining( - request.output_shape(), operand, scale, offset, - batch_norm_training_request.epsilon(), - batch_norm_training_request.feature_index())); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - HloInstruction* operand = - lookup_instruction(batch_norm_inference_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_inference_request.scale()); - HloInstruction* offset = - lookup_instruction(batch_norm_inference_request.offset()); - HloInstruction* mean = - lookup_instruction(batch_norm_inference_request.mean()); - HloInstruction* variance = - lookup_instruction(batch_norm_inference_request.variance()); - - hlo_instruction = - add_instruction(HloInstruction::CreateBatchNormInference( - request.output_shape(), operand, scale, offset, mean, variance, - batch_norm_inference_request.epsilon(), - batch_norm_inference_request.feature_index())); - break; - } - - case 
OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - - HloInstruction* operand = - lookup_instruction(batch_norm_grad_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_grad_request.scale()); - HloInstruction* mean = lookup_instruction(batch_norm_grad_request.mean()); - HloInstruction* variance = - lookup_instruction(batch_norm_grad_request.variance()); - HloInstruction* grad_output = - lookup_instruction(batch_norm_grad_request.grad_output()); - - hlo_instruction = add_instruction(HloInstruction::CreateBatchNormGrad( - request.output_shape(), operand, scale, mean, variance, grad_output, - batch_norm_grad_request.epsilon(), - batch_norm_grad_request.feature_index())); - break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - HloInstruction* operand = lookup_instruction(broadcast_request.operand()); - std::vector broadcast_dimensions; - // The client-level broadcast instruction just appends dimensions on the - // left (adds lowest numbered dimensions). The HLO broadcast op is more - // flexible and can add new dimensions anywhere. The broadcast_dimensions - // maps operand dimensions to dimensions in the broadcast output, so - // to append dimensions on the left the broadcast_dimensions should just - // be the n highest dimension numbers of the output shape where n is - // the number of input dimensions. 
- broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape())); - for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) { - broadcast_dimensions.push_back(i + - ShapeUtil::Rank(request.output_shape()) - - ShapeUtil::Rank(operand->shape())); - } - hlo_instruction = add_instruction(HloInstruction::CreateBroadcast( - request.output_shape(), operand, broadcast_dimensions)); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - HloInstruction* operand = lookup_instruction(reshape_request.operand()); - HloInstruction* transposed; - if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) { - transposed = operand; - } else { - transposed = add_instruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions( - InversePermutation(AsInt64Slice(reshape_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(reshape_request.dimensions()))); - } - hlo_instruction = add_instruction( - HloInstruction::CreateReshape(request.output_shape(), transposed)); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - HloInstruction* operand = lookup_instruction(transpose_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions( - InversePermutation(AsInt64Slice(transpose_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(transpose_request.dimensions()))); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - HloInstruction* operand = lookup_instruction(reverse_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateReverse( - request.output_shape(), operand, - AsInt64Slice(reverse_request.dimensions()))); - break; - } - - case OpRequest::kPadRequest: { - const PadRequest& pad_request = 
request.request().pad_request(); - HloInstruction* operand = lookup_instruction(pad_request.operand()); - HloInstruction* padding_value = - lookup_instruction(pad_request.padding_value()); - hlo_instruction = add_instruction(HloInstruction::CreatePad( - request.output_shape(), operand, padding_value, - pad_request.padding_config())); - break; - } - - case OpRequest::kRecvRequest: { - const RecvRequest& recv_request = request.request().recv_request(); - HloInstruction* recv = add_instruction(HloInstruction::CreateRecv( - request.output_shape(), recv_request.channel_handle().handle())); - hlo_instruction = add_instruction(HloInstruction::CreateRecvDone(recv)); - break; - } - - case OpRequest::kParameterRequest: { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - hlo_instruction = add_instruction(HloInstruction::CreateParameter( - parameter_request.parameter(), request.output_shape(), - parameter_request.name())); - break; - } - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - HloInstruction* operand = lookup_instruction(convert_request.operand()); - hlo_instruction = add_instruction( - HloInstruction::CreateConvert(request.output_shape(), operand)); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - HloInstruction* operand = lookup_instruction(convert_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateBitcastConvert( - request.output_shape(), operand)); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version condition_version = - request.embedded_computation_versions(0); - HloComputation* condition = - ResolveComputation(while_request.condition(), condition_version); - 
VersionedComputationHandle::Version body_version = - request.embedded_computation_versions(1); - HloComputation* body = - ResolveComputation(while_request.body(), body_version); - HloInstruction* init = lookup_instruction(while_request.init()); - hlo_instruction = add_instruction(HloInstruction::CreateWhile( - request.output_shape(), condition, body, init)); - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version true_computation_version = - request.embedded_computation_versions(0); - HloComputation* true_computation = ResolveComputation( - conditional_request.true_computation(), true_computation_version); - VersionedComputationHandle::Version false_computation_version = - request.embedded_computation_versions(1); - HloComputation* false_computation = ResolveComputation( - conditional_request.false_computation(), false_computation_version); - HloInstruction* predicate = - lookup_instruction(conditional_request.predicate()); - HloInstruction* true_operand = - lookup_instruction(conditional_request.true_operand()); - HloInstruction* false_operand = - lookup_instruction(conditional_request.false_operand()); - hlo_instruction = add_instruction(HloInstruction::CreateConditional( - request.output_shape(), predicate, true_operand, true_computation, - false_operand, false_computation)); - break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - HloInstruction* lhs = lookup_instruction(ternary_op_request.lhs()); - HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs()); - HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs()); - auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop()); - if (debug_options_.xla_eliminate_hlo_implicit_broadcast() && - 
!ShapeUtil::IsTuple(request.output_shape())) { - if (!ShapeUtil::IsTuple(lhs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) { - // lhs side is being implicitly broadcast. Change to explicit. - lhs = - ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape()); - } - - if (!ShapeUtil::IsTuple(rhs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) { - rhs = - ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape()); - } - - if (!ShapeUtil::IsTuple(ehs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) { - ehs = - ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape()); - } - } - - hlo_instruction = add_instruction(HloInstruction::CreateTernary( - request.output_shape(), hlo_opcode, lhs, rhs, ehs)); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - std::vector operands; - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - auto hlo_opcode = - VariadicOperationToHloOpcode(variadic_op_request.varop()); - hlo_instruction = add_instruction(HloInstruction::CreateVariadic( - request.output_shape(), hlo_opcode, operands)); - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - std::vector operands; - for (const ComputationDataHandle& handle : call_request.operands()) { - operands.push_back(lookup_instruction(handle)); - } - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version call_version = - request.embedded_computation_versions(0); - HloComputation* call_computation = - ResolveComputation(call_request.to_apply(), call_version); - hlo_instruction = add_instruction(HloInstruction::CreateCall( - request.output_shape(), operands, 
call_computation)); - break; - } - - case OpRequest::kCustomCallRequest: { - const CustomCallRequest& cc_request = - request.request().custom_call_request(); - std::vector operands; - for (const ComputationDataHandle& operand : cc_request.operands()) { - operands.push_back(lookup_instruction(operand)); - } - hlo_instruction = add_instruction(HloInstruction::CreateCustomCall( - cc_request.shape(), operands, cc_request.call_target_name())); - break; - } - - case OpRequest::kHostComputeRequest: { - const HostComputeRequest& host_compute_request = - request.request().host_compute_request(); - std::vector operands; - for (const ComputationDataHandle& operand : - host_compute_request.operands()) { - operands.push_back(lookup_instruction(operand)); - } - auto output_shape = host_compute_request.shape(); - auto channel_name = host_compute_request.channel_name(); - auto cost_estimate_ns = host_compute_request.cost_estimate_ns(); - hlo_instruction = add_instruction(HloInstruction::CreateHostCompute( - output_shape, operands, channel_name, cost_estimate_ns)); - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& unary_op_request = - request.request().unary_op_request(); - HloInstruction* operand = lookup_instruction(unary_op_request.operand()); - auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop()); - hlo_instruction = add_instruction(HloInstruction::CreateUnary( - request.output_shape(), hlo_opcode, operand)); - break; - } - - case OpRequest::kBinaryOpRequest: { - const BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - HloInstruction* lhs = lookup_instruction(binary_op_request.lhs()); - HloInstruction* rhs = lookup_instruction(binary_op_request.rhs()); - auto hlo_opcode = BinaryOperationToHloOpcode(binary_op_request.binop()); - if (binary_op_request.broadcast_dimensions_size() > 0 && - ShapeUtil::Rank(lhs->shape()) != ShapeUtil::Rank(rhs->shape())) { - // Emit a broadcast instruction to perform the 
"broadcast in dimension" - // operation. - HloInstruction* operand_to_broadcast = - ShapeUtil::Rank(lhs->shape()) < ShapeUtil::Rank(rhs->shape()) ? lhs - : rhs; - CHECK_EQ(ShapeUtil::Rank(operand_to_broadcast->shape()), - binary_op_request.broadcast_dimensions().size()); - - // Construct the bounds of the shape of the kBroadcast instruction - // responsible for the in-dimension broadcast. - std::vector output_dimensions; - for (int64 size : request.output_shape().dimensions()) { - output_dimensions.push_back(size); - } - for (int64 operand_dim = 0; - operand_dim < ShapeUtil::Rank(operand_to_broadcast->shape()); - ++operand_dim) { - int64 output_dim = - binary_op_request.broadcast_dimensions()[operand_dim]; - output_dimensions[output_dim] = - operand_to_broadcast->shape().dimensions(operand_dim); - } - - Shape broadcast_shape = ShapeUtil::MakeShape( - operand_to_broadcast->shape().element_type(), output_dimensions); - - // The broadcast semantics of a client-level binary op broadcast is - // identical to the HLO broadcast semantics so the broadcast_dimensions - // field can just be passed to the instruction builder. - HloInstruction* broadcasted_operand = - add_instruction(HloInstruction::CreateBroadcast( - broadcast_shape, operand_to_broadcast, - AsInt64Slice(binary_op_request.broadcast_dimensions()))); - - lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs; - rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs; - } - if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) { - if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) { - // lhs side is being implicitly broadcast. Change to explicit. 
- lhs = - ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape()); - } - - if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) { - rhs = - ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape()); - } - } - hlo_instruction = add_instruction(HloInstruction::CreateBinary( - request.output_shape(), hlo_opcode, lhs, rhs)); - break; - } - - case OpRequest::kReducePrecisionRequest: { - const ReducePrecisionRequest& reduce_precision_request = - request.request().reduce_precision_request(); - HloInstruction* operand = - lookup_instruction(reduce_precision_request.operand()); - auto exponent_bits = reduce_precision_request.exponent_bits(); - auto mantissa_bits = reduce_precision_request.mantissa_bits(); - hlo_instruction = add_instruction(HloInstruction::CreateReducePrecision( - request.output_shape(), operand, exponent_bits, mantissa_bits)); - break; - } - - case OpRequest::kTraceRequest: { - const TraceRequest& trace_request = request.request().trace_request(); - HloInstruction* operand = lookup_instruction(trace_request.operand()); - hlo_instruction = add_instruction( - HloInstruction::CreateTrace(trace_request.tag(), operand)); - break; - } - - case OpRequest::kSendRequest: { - const SendRequest& send_request = request.request().send_request(); - HloInstruction* operand = lookup_instruction(send_request.operand()); - HloInstruction* send = add_instruction(HloInstruction::CreateSend( - operand, send_request.channel_handle().handle())); - hlo_instruction = add_instruction(HloInstruction::CreateSendDone(send)); - break; - } - - case OpRequest::kGatherRequest: { - const GatherRequest& gather_request = request.request().gather_request(); - HloInstruction* input_operand = - lookup_instruction(gather_request.input()); - HloInstruction* gather_indices_operand = - lookup_instruction(gather_request.gather_indices()); - std::vector window_bounds; - c_copy(gather_request.window_bounds(), std::back_inserter(window_bounds)); - hlo_instruction = 
add_instruction(HloInstruction::CreateGather( - request.output_shape(), input_operand, gather_indices_operand, - gather_request.dimension_numbers(), window_bounds)); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } - (*instructions)[handle.handle()] = hlo_instruction; -} // NOLINT(readability/fn_size) - -} // namespace - -StatusOr> UserComputation::BuildHloComputation( - VersionedComputationHandle::Version version, - HloComputationResolver hlo_resolver, const DebugOptions& debug_options, - bool include_unreachable_instructions) const { - tensorflow::mutex_lock lock(mutex_); - - VLOG(2) << "Building HloComputation from UserComputation " << name_ - << " at version " << version; - XLA_VLOG_LINES(3, session_computation_.DebugString()); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_computation, - ComputationLowerer::Lower( - tensorflow::strings::StrCat(name(), ".v", version), - session_computation_, version, std::move(hlo_resolver), debug_options, - include_unreachable_instructions)); - - return std::move(hlo_computation); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h deleted file mode 100644 index 5544c868fe905c1ca7e6cab32738440add2e3b4f..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/user_computation.h +++ /dev/null @@ -1,413 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -// A UserComputation is the built-up computation that users create via the -// XLA Service interface. -// -// The XLA service adds instructions to a user computation via this -// interface. The state of the computation is stored as a SessionComputation -// proto which holds a record of all operation-building requests received by the -// XLA service. -// -// UserComputations are lowered to HloComputations which are passed to the high -// level compiler interface. -class UserComputation { - public: - // Factory used when restoring a computation from serialized session - // computation (computation snapshot) data. Remaps any references to - // computation handle via the old_to_new mapping. 
- // - // An error will occur if the old_to_new mapping cannot resolve a reference to - // a computation that is present in session_computation. - static StatusOr> MakeWithRemapping( - const SessionComputation& session_computation, - const ComputationHandle& handle, - const std::map& old_to_new); - - // Creates an empty computation with the given name and computation handle. - explicit UserComputation(const string& name, const ComputationHandle& handle); - - // Enqueues a parameter-retrieving instruction onto this user computation. - // Returns an error status if the parameter number is already registered with - // different values. - StatusOr AddParameterInstruction( - const ParameterRequest& parameter_request); - - // Enqueues a pad instruction onto this user computation. - StatusOr AddPadInstruction( - const PadRequest& pad_request); - - // Enqueues a tracing instruction onto this user computation. - // Returns an error status if the operand cannot be resolved. - Status AddTraceInstruction(const TraceRequest& trace_request); - - // Enqueues a random number generation instruction onto this user computation. - StatusOr AddRngInstruction( - const RngRequest& rng_request); - - // Enqueues a unary instruction onto this user computation. - // Returns an error status if the operand index is out of bounds. - StatusOr AddUnaryInstruction( - const UnaryOpRequest& unary_request); - - // Enqueues a batch norm training instruction onto this user computation. - StatusOr AddBatchNormTrainingInstruction( - const BatchNormTrainingRequest& batch_norm_training_request); - - // Enqueues a batch norm inference instruction onto this user computation. - StatusOr AddBatchNormInferenceInstruction( - const BatchNormInferenceRequest& batch_norm_inference_request); - - // Enqueues a batch norm grad instruction onto this user computation. 
- StatusOr AddBatchNormGradInstruction( - const BatchNormGradRequest& batch_norm_grad_request); - - // Enqueues a binary instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddBinaryInstruction( - const BinaryOpRequest& binary_request); - - // Enqueues a ternary instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddTernaryInstruction( - const TernaryOpRequest& ternary_request); - - // Enqueues a variadic instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddVariadicInstruction( - const VariadicOpRequest& variadic_request); - - // Enqueues a constant instruction onto this user computation. - StatusOr AddConstantInstruction( - const ConstantRequest& constant_request); - - // Enqueues a get tuple element instruction onto this user computation. - StatusOr AddGetTupleElementInstruction( - const GetTupleElementRequest& get_tuple_element_request); - - // Enqueues a map instruction onto this user computation. - StatusOr AddMapInstruction( - const MapRequest& map_request, - const UserComputation& to_apply_computation); - - // Enqueues a reduce-precision instruction onto this user computation. - StatusOr AddReducePrecisionInstruction( - const ReducePrecisionRequest& reduce_precision_request); - - // Enqueues a convolution instruction onto this user computation. - StatusOr AddConvolveInstruction( - const ConvolveRequest& convolve_request); - - // Enqueues an FFT instruction onto this user computation. - StatusOr AddFftInstruction( - const FftRequest& fft_request); - - // Enqueues a cross replica sum instruction onto this user computation. - StatusOr AddCrossReplicaSumInstruction( - const CrossReplicaSumRequest& cross_replica_sum_request); - - // Enqueues an infeed instruction onto this user computation. 
- StatusOr AddInfeedInstruction( - const InfeedRequest& infeed_request); - - // Enqueues an outfeed instruction onto this user computation. - StatusOr AddOutfeedInstruction( - const OutfeedRequest& outfeed_request); - - // Enqueues a host compute instruction onto this user computation. - StatusOr AddHostComputeInstruction( - const HostComputeRequest& host_compute_request); - - // Enqueues a call instruction onto this user computation. - StatusOr AddCallInstruction( - const CallRequest& call_request, - const UserComputation& to_apply_computation); - - // Enqueues a custom call instruction onto this user computation. - StatusOr AddCustomCallInstruction( - const CustomCallRequest& custom_call_request); - - // Enqueues a dot instruction onto this user computation. - StatusOr AddDotInstruction( - const DotRequest& dot_request); - - // Enqueues a broadcast instruction onto this user computation. - StatusOr AddBroadcastInstruction( - const BroadcastRequest& broadcast_request); - - // Enqueues a reshape instruction onto this user computation. - StatusOr AddReshapeInstruction( - const ReshapeRequest& reshape_request); - - // Enqueues a transpose instruction onto this user computation. - StatusOr AddTransposeInstruction( - const TransposeRequest& transpose_request); - - // Enqueues a slice instruction onto this user computation. - StatusOr AddSliceInstruction( - const SliceRequest& slice_request); - - // Enqueues a dynamic slice instruction onto this user computation. - StatusOr AddDynamicSliceInstruction( - const DynamicSliceRequest& dynamic_slice_request); - - // Enqueues a dynamic update slice instruction onto this user computation. - StatusOr AddDynamicUpdateSliceInstruction( - const DynamicUpdateSliceRequest& dynamic_update_slice_request); - - // Enqueues a concatenate instruction onto this user computation. - StatusOr AddConcatenateInstruction( - const ConcatenateRequest& concatenate_request); - - // Enqueues a convert instruction onto this user computation. 
- StatusOr AddConvertInstruction( - const ConvertRequest& convert_request); - - // Enqueues a bitcast element instruction onto this user computation. - StatusOr AddBitcastConvertInstruction( - const ConvertRequest& convert_request); - - // Enqueues a reduce instruction onto this user computation. - StatusOr AddReduceInstruction( - const ReduceRequest& reduce_request, - const UserComputation& to_apply_computation); - - // Enqueues a windowed reduce instruction onto this user computation. - StatusOr AddReduceWindowInstruction( - const ReduceWindowRequest& reduce_window_request, - const UserComputation& to_apply_computation); - - // Enqueues a select-and-scatter instruction onto this user - // computation. - StatusOr AddSelectAndScatterInstruction( - const SelectAndScatterRequest& select_and_scatter_request, - const UserComputation& select_computation, - const UserComputation& scatter_computation); - - // Enqueues a reverse instruction onto this user computation. - StatusOr AddReverseInstruction( - const ReverseRequest& reverse_request); - - // Enqueues a while instruction onto this user computation. - StatusOr AddWhileInstruction( - const WhileRequest& while_request, - const UserComputation& condition_computation, - const UserComputation& body_computation); - - // Enqueues a conditional instruction on this user computation. - StatusOr AddConditionalInstruction( - const ConditionalRequest& conditional_request, - const UserComputation& true_computation, - const UserComputation& false_computation); - - // Enqueues a Send instruction onto this user computation. - StatusOr AddSendInstruction( - const SendRequest& send_request); - - // Enqueues a Recv instruction onto this user computation. - StatusOr AddRecvInstruction( - const RecvRequest& recv_request); - - // Enqueues a Gather instruction onto this user computation. 
- StatusOr AddGatherInstruction( - const GatherRequest& gather_request); - - // Returns the user-provided name of this user computation, which is provided - // via the XLA computation-building API. - const string& name() const { return name_; } - - // Subsequent executions of this computation will compute the value - // represented by handle, rather than the last expression enqueued - // on the computation. - Status SetReturnValue(const ComputationDataHandle& handle); - - // Return a versioned handle for this computation. - VersionedComputationHandle GetVersionedHandle() const; - - // Return a versioned handle for this computation with a version equal to the - // point at which given operation was added to the computation. - VersionedComputationHandle GetVersionedHandleAtOperation( - const ComputationDataHandle& operation) const; - - // Return a version value representing the current state of the - // computation. - VersionedComputationHandle::Version version() const; - - // Computes and returns the program shape for the user computation -- gathers - // parameters and result type into a single proto. A shared_ptr is used - // because the returned pointer refers to an internally cached value which may - // be discarded by the UserComputation object. This avoid unnecessary copies. - // - // If the parameter space is not dense (i.e. there are holes in the parameter - // numbers provided) then an error status is returned. - StatusOr> ComputeProgramShape( - VersionedComputationHandle::Version version) const; - - // Returns true if the given data handle does not depend on any parameter with - // index higher then num_parameters. That is, the value can be computed at - // compile time if we know the first num_parameters arguments. - StatusOr IsConstant(const ComputationDataHandle& handle, - int64 num_parameters); - - // Returns the output shape of the operation indicated by the given handle. 
- StatusOr GetShape(const ComputationDataHandle& handle); - - // Sets metadata on the Hlo instruction referenced by the given handle. - Status SetOpMetadata(const ComputationDataHandle& handle, - const OpMetadata& metadata); - - // Sets the device assignment on the Hlo instruction referenced by 'handle'. - Status SetOpSharding(const ComputationDataHandle& handle, - const OpSharding& sharding); - - // Builds a HLO computation from the UserComputation. The parameter "resolver" - // is a function which returns a pointer to the HloComputation corresponding - // to the given ComputationHandle at the given version. The resolver is used - // for operations, such as map, which call other computations and need a - // pointer to the called HloComputation to construct the respective HLO - // instructions. If include_unreachable_instructions is true, then - // instructions which are not reachable from the root are lowered into - // HloInstructions. - using HloComputationResolver = - std::function; - StatusOr> BuildHloComputation( - VersionedComputationHandle::Version version, - HloComputationResolver hlo_resolver, const DebugOptions& debug_options, - bool include_unreachable_instructions = true) const; - - // Return a vector containing the embedded computations used by this - // UserComputation. Only embedded computations which are called directly by - // this UserComputation are included. That is, the transitive closure of - // embedded computations is not included. - std::vector GetEmbeddedComputations( - VersionedComputationHandle::Version version) const; - - // Returns the number of OperationRequest objects in this UserComputation. - // The 'version' of a computation is identical to the number of - // OperationRequests in the UserComputation. 
- int64 request_count(VersionedComputationHandle::Version version) const { - return version; - } - - // Returns a copy of the internal session state for this computation -- this - // is useful for serializing the guts of a user computation, though references - // to other handles (e.g. referred-to computations) must be handled with care - // in the serialization / de-serialization process. - SessionComputation CloneSessionComputation( - VersionedComputationHandle::Version version) const; - - // Warning: typically we don't want to look up computation data handles until - // the computation is finished being built, for consistency purposes. We - // expose this routine for error reporting purposes so that we can provide - // more meaningful error messages from the XLA service layer. - // - // Returns the operation request that the handle comes from. - StatusOr LookUpRequestForErrorReporting( - const ComputationDataHandle& handle) const; - - // Retrieves the parameter metadata for the given parameter number. - // - // If the parameter number is invalid for this computation, nullopt is - // returned. When the return value has_value(), nullptr will never be - // the held value. - tensorflow::gtl::optional ParameterMetadata( - int parameter_number) const; - - private: - // Warning: dangerous mutating operation that doesn't respect versioning. - // This is only used at initialization time when constructing from a - // SessionComputation a la MakeWithRemapping. - // - // Remaps references to old computations (with handle values in the keys of - // old_to_new) to the computation handle given in the values. This is useful - // when loading computations from snapshots, to finish initialization, before - // the user computation is released into the wild. - Status RemapEmbeddedComputations( - const std::map& old_to_new) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Returns the OperationRequest corresponding to the given handle. 
- StatusOr LookUpRequest( - const ComputationDataHandle& handle) const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Creates a new ComputationDataHandle with the next available handle value. - ComputationDataHandle CreateComputationDataHandle() - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Checks whether the parameter numbers of the parameter operations are - // contiguous starting from zero. Returns appropriate error status if not. - Status CheckParametersAreContiguous( - VersionedComputationHandle::Version version) const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - VersionedComputationHandle GetVersionedHandleInternal() const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Name of the computation. - string name_; - - mutable tensorflow::mutex mutex_; - - // State of the computation as a record of all operation-building requests. - SessionComputation session_computation_ GUARDED_BY(mutex_); - - // Mapping from parameter number to operation request containing the - // respective ParameterRequest. - std::map parameters_ GUARDED_BY(mutex_); - - // The next ComputationDataHandle value to assign. Handle values are assigned - // sequentially. - int64 next_handle_value_ GUARDED_BY(mutex_); - - // If handle_to_return_.has_handle() then an Execution of this Computation - // will compute the value represented by handle_to_return_, otherwise it will - // compute the value of (next_handle_value_ - 1). - ComputationDataHandle handle_to_return_ GUARDED_BY(mutex_); - - // Memoized ProgramShape and its version. A shared_ptr is used because - // references to this object are returned by ComputeProgramShape. 
- mutable int64 program_shape_version_ GUARDED_BY(mutex_) = 0; - mutable std::shared_ptr program_shape_ GUARDED_BY(mutex_); - - TF_DISALLOW_COPY_AND_ASSIGN(UserComputation); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc deleted file mode 100644 index 2fa163953f638c0038e9f6bb11ce2a3742e0558c..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/user_computation_test.cc +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/user_computation.h" - -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/status_test_util.h" - -namespace op = xla::testing::opcode_matchers; - -namespace xla { -namespace { - -using UserComputationTest = ::testing::Test; - -TEST_F(UserComputationTest, SimpleComputation) { - const Shape kScalarShape = ShapeUtil::MakeShape(F32, {}); - const Shape kVectorShape = ShapeUtil::MakeShape(F32, {2}); - - // Build a simple three operation computatation: - // - // %constant = Constant({123, 42}) - // %param = Param(0) - // %outfeed = Outfeed(%constant) - // - // Build the computation at two different versions and check invariants. 
- ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ConstantRequest constant_request; - *constant_request.mutable_literal() = - Literal::CreateR1({123.0f, 42.0f})->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle constant_handle, - computation.AddConstantInstruction(constant_request)); - - ParameterRequest param_request; - *param_request.mutable_shape() = kScalarShape; - param_request.set_parameter(0); - param_request.set_name("param0"); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle param_handle, - computation.AddParameterInstruction(param_request)); - OpMetadata metadata; - metadata.set_op_name("meta"); - TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata)); - - OutfeedRequest outfeed_request; - *outfeed_request.mutable_operand() = constant_handle; - *outfeed_request.mutable_shape() = kVectorShape; - outfeed_request.set_outfeed_config("abc"); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle, - computation.AddOutfeedInstruction(outfeed_request)); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - { - // Test the computation at the latest version. In this case, the most - // recently added operation is an outfeed. However, the outfeed is not the - // root because outfeeds cannot be the root of a computation. - VersionedComputationHandle latest_version = - computation.GetVersionedHandle(); - - // Program shape should have a single scalar parameter and scalar - // result. The outfeed instruction should not affect the program shape. - TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr program_shape, - computation.ComputeProgramShape(latest_version.version)); - ASSERT_EQ(1, program_shape->parameters_size()); - EXPECT_TRUE( - ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0))); - EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result())); - - // Build the HLO computation. 
- TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - DebugOptions())); - // There should be one HloInstruction per UserComputation operation. - EXPECT_EQ(3, hlo_computation->instruction_count()); - // The root of the instruction should be the parameter instruction (not the - // outfeed). - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - } - - { - // Test the computation at the version right after the parameter instruction - // is added. - VersionedComputationHandle version_at_param = - computation.GetVersionedHandleAtOperation(param_handle); - - // Program shape should have a single scalar parameter, and scalar result. - TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr program_shape, - computation.ComputeProgramShape(version_at_param.version)); - ASSERT_EQ(1, program_shape->parameters_size()); - EXPECT_TRUE( - ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0))); - EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result())); - - // There should be two instructions, one for the constant and one for the - // parameter. The outfeed instruction should not be included. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(version_at_param.version, hlo_resolver, - DebugOptions())); - EXPECT_EQ(2, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - } - { - // Test the computation at the latest version, but lowered with - // include_unreachable_instructions set to false. - VersionedComputationHandle latest_version = - computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation( - latest_version.version, hlo_resolver, DebugOptions(), - /*include_unreachable_instructions=*/false)); - // There is only one reachable instruction, the parameter. 
- EXPECT_EQ(1, hlo_computation->instruction_count()); - // The root of the instruction should be the parameter instruction (not the - // outfeed). - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(), - "meta"); - } -} - -TEST_F(UserComputationTest, EliminateScalarBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with scalar broadcast. - // - // %a = Constant({123, 42}) - // %b = Constant(1) - // %add = Add(%a, %b) - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ConstantRequest a_request; - *a_request.mutable_literal() = - Literal::CreateR1({123.0f, 42.0f})->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddConstantInstruction(a_request)); - - ConstantRequest b_request; - *b_request.mutable_literal() = Literal::CreateR0(1.0f)->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddConstantInstruction(b_request)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - // The binary operation has implicit scalar broadcast, should be converted - // to an explicit broadcast intruction and a binary instruction. 
- EXPECT_EQ(4, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Add()); - LOG(INFO) << hlo_computation->root_instruction()->ToString(); - const auto& operands = hlo_computation->root_instruction()->operands(); - ASSERT_EQ(2, operands.size()); - EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast || - operands[1]->opcode() == HloOpcode::kBroadcast); -} - -TEST_F(UserComputationTest, CheckImplicitBroadcastToExplicitBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with degenerate broadcast. - // - // %a = Param({1, 2, 3}); - // %b = Param({1, 2, 1}); - // %add = Add(%a, %b, {}); - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ParameterRequest a_request; - *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 3}); - a_request.set_name("a"); - a_request.set_parameter(0); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddParameterInstruction(a_request)); - - ParameterRequest b_request; - *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 1}); - b_request.set_name("b"); - b_request.set_parameter(1); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddParameterInstruction(b_request)); - - const int64 kDevice = 7; - OpSharding sharding; - sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL); - sharding.add_tile_assignment_dimensions(1); - sharding.add_tile_assignment_devices(kDevice); - - TF_EXPECT_OK(computation.SetOpSharding(b_handle, sharding)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = 
computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - - // b a - // | | - // reshape | - // | | - // broadcast | - // \ / - // add - EXPECT_EQ(5, hlo_computation->instruction_count()); - ASSERT_THAT( - hlo_computation->root_instruction(), - op::Add(op::Parameter(), op::Broadcast(op::Reshape(op::Parameter())))); - - const HloInstruction* broadcast = - hlo_computation->root_instruction()->operand(1); - EXPECT_TRUE(broadcast->has_sharding()); - - const HloInstruction* reshape = broadcast->operand(0); - EXPECT_TRUE(reshape->has_sharding()); -} - -TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with in-dim broadcast and degenerate broadcast. - // - // %a = Param({2, 3}); - // %b = Param({2, 1, 4}); - // %add = Add(%a, %b, {0, 1}); - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ParameterRequest a_request; - *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 3}); - a_request.set_name("a"); - a_request.set_parameter(0); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddParameterInstruction(a_request)); - - ParameterRequest b_request; - *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 1, 4}); - b_request.set_name("b"); - b_request.set_parameter(1); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddParameterInstruction(b_request)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - add.add_broadcast_dimensions(0); - add.add_broadcast_dimensions(1); - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = 
[](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - - // The binary operation has in-dim broadcast and degenerate broadcast, should - // first do the in-dim broadcast then convert the degnerate broadcast into a - // reshape and a broadcast. - // - // b a - // | | - // broadcast reshape - // | | - // | broadcast - // \ / - // add - EXPECT_EQ(6, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Add()); - const auto& operands = hlo_computation->root_instruction()->operands(); - ASSERT_EQ(2, operands.size()); - EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast && - operands[1]->opcode() == HloOpcode::kBroadcast); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.cc b/tensorflow/compiler/xla/service/versioned_computation_handle.cc deleted file mode 100644 index a693c4695f0e776cf297d0ecd28d6de53bd5c0c6..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/versioned_computation_handle.cc +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" - -#include "tensorflow/core/lib/strings/strcat.h" - -namespace xla { - -string VersionedComputationHandle::ToString() const { - return tensorflow::strings::StrCat(handle.handle(), ":v", version); -} - -std::ostream& operator<<(std::ostream& out, - const VersionedComputationHandle& versioned_handle) { - out << versioned_handle.ToString(); - return out; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.h b/tensorflow/compiler/xla/service/versioned_computation_handle.h deleted file mode 100644 index 5732a56caffa31dde52dff5c2775f9fde0cacfbd..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/versioned_computation_handle.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_ - -#include - -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" - -namespace xla { - -// A data structure encapsulating a ComputationHandle and version value of that -// computation. 
This object is used to unambiguously refer to a particular -// computation in the service. -struct VersionedComputationHandle { - // A version value unambiguously specifying the state of the computation at a - // particular point in time as it is being built. This value is the - // ComputationDataHandle of the current root instruction. - using Version = int64; - - ComputationHandle handle; - Version version; - - string ToString() const; - bool operator==(const VersionedComputationHandle& other) const { - return (handle.handle() == other.handle.handle()) && - (version == other.version); - } - bool operator<(const VersionedComputationHandle& other) const { - return ((handle.handle() < other.handle.handle()) || - ((handle.handle() == other.handle.handle()) && - (version < other.version))); - } -}; - -std::ostream& operator<<(std::ostream& out, - const VersionedComputationHandle& versioned_handle); - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3c2603c7eb58d3e57346d2ea1e0058f8e5d7fe8 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc @@ -0,0 +1,237 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { + +using absl::nullopt; +using absl::optional; + +// Finds and returns the non-constant operand in instr. +// +// CHECK-fails if instr doesn't have exactly one unique non-constant operand. +static const HloInstruction* NonConstantOperand(const HloInstruction* instr) { + const HloInstruction* result = nullptr; + for (const HloInstruction* operand : instr->operands()) { + if (!operand->IsConstant()) { + if (result != nullptr) { + CHECK_EQ(result, operand); + } + result = operand; + } + } + CHECK_NE(result, nullptr); + return result; +} + +// If all of instr's operands are either constants or have the form +// get-tuple-element(gte_operand, N) +// for the same value N, returns N. Otherwise, returns nullopt. +static optional GetGTEOperandIndex(const HloInstruction* instr, + const HloInstruction* gte_operand) { + VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", " + << gte_operand->ToString() << ")"; + optional tuple_idx; + for (const HloInstruction* operand : instr->operands()) { + if (operand->IsConstant()) { + continue; + } + // Look through copies. + // TODO(b/68830972): We wouldn't need this if for loop matching on the GPU + // would run before copy insertion. 
+ if (operand->opcode() == HloOpcode::kCopy) { + operand = operand->operand(0); + } + if (operand->opcode() != HloOpcode::kGetTupleElement) { + VLOG(2) << "instr uses something other than gte(gte_operand): " + << operand->ToString(); + return nullopt; + } + if (operand->operand(0) != gte_operand) { + VLOG(2) << "instr has gte whose operand is not gte_operand: " + << operand->ToString(); + return nullopt; + } + if (tuple_idx && tuple_idx != operand->tuple_index()) { + VLOG(2) << "instr has operands with conflicting gte indices, " + << *tuple_idx << " vs " << operand->tuple_index(); + return nullopt; + } + + tuple_idx = operand->tuple_index(); + } + return tuple_idx; +} + +// Tries to get the tuple index of the induction variable of a while loop. +// +// Checks that the loop condition and root both plumb the induction variable +// through the same tuple index, and that they both apply exactly one op to the +// induction variable before deciding whether to do another loop iteration (in +// the loop condition's case) or packing the induction variable into the result +// tuple (in the loop body's case). +// +// Specifically, checks that the loop condition has structure +// +// root = op(constants, get-tuple-elem(param0, N), constants) +// +// and the loop body has the structure +// +// inc = op(constants, get-tuple-elem(param0, N), constants) +// root = tuple(..., inc, ...) // inc is N'th operand of tuple(). +// +// If so, returns N. Otherwise, returns nullopt. +static optional GetLoopInductionVarTupleIdx( + const HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + VLOG(2) << "Finding induction variable for loop " + << while_op->ToShortString(); + + // The while_cond computation should have the form + // + // while_cond_root = + // op(constants, get-tuple-elem(while_cond_param, N), constants). + // + // If it does, set indvar_tuple_idx to N. 
+ auto* while_cond = while_op->while_condition(); + auto* while_cond_root = while_cond->root_instruction(); + auto* while_cond_param = while_cond->parameter_instruction(0); + optional indvar_tuple_idx = + GetGTEOperandIndex(while_cond_root, while_cond_param); + if (!indvar_tuple_idx) { + VLOG(2) << "Induction variable not found in loop condition: " + << while_cond->root_instruction()->ToString(); + return nullopt; + } + + // The while_body computation should have the form + // + // while_body_inc = + // op(constants, get-tuple-elem(while_body_param, N), constants) + // while_body_root = tuple(..., while_body_inc, ...) + // + // where while_body_inc is operand N of while_body_root. + auto* while_body = while_op->while_body(); + auto* while_body_root = while_body->root_instruction(); + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple instruction: " + << while_body_root->ToString(); + return nullopt; + } + + auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx); + auto* while_body_param = while_body->parameter_instruction(0); + optional while_body_indvar_tuple_idx = + GetGTEOperandIndex(while_body_inc, while_body_param); + if (!while_body_indvar_tuple_idx) { + VLOG(2) + << "Induction variable not found in while body increment instruction: " + << while_body_inc->ToString(); + return nullopt; + } + if (while_body_indvar_tuple_idx != indvar_tuple_idx) { + VLOG(2) << "Tuple index of induction variable does not match between loop " + "condition (" + << *indvar_tuple_idx << ") and while body (" + << *while_body_indvar_tuple_idx << ")"; + return nullopt; + } + + // Finally, check that the while loop's initial value is a tuple with enough + // elements. 
+ auto* while_init = while_op->operand(0); + if (while_init->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While init expected to be a tuple: " << while_init->ToString(); + return nullopt; + } + + VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx; + return indvar_tuple_idx; +} + +optional ComputeWhileLoopTripCount(HloInstruction* while_op, + int64 max_value_returned) { + VLOG(2) << "Getting trip count for loop " << while_op->ToString(); + + // The loop's induction variable is found at + // + // get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx), + // + // where comp is while_op->while_body() or while_op->while_condition(). + optional indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op); + if (!indvar_tuple_idx) { + return nullopt; + } + + // Now that we know the index of the induction variable, we can try to + // compute how many times the loop executes. Start by computing the induction + // variable's initial value. + HloEvaluator evaluator(/*max_loop_iterations=*/0); + auto* while_init = while_op->mutable_operand(0); + auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx); + StatusOr> indvar_init_result = + evaluator.Evaluate(indvar_init); + if (!indvar_init_result.ok()) { + VLOG(2) << "Couldn't evaluate induction variable init: " + << indvar_init_result.status(); + return nullopt; + } + + auto* while_body = while_op->while_body(); + auto* while_body_indvar_update = + while_body->root_instruction()->operand(*indvar_tuple_idx); + auto* while_body_indvar = NonConstantOperand(while_body_indvar_update); + + // The initial value of the induction variable. 
+ std::unique_ptr indvar_iter_val = + std::move(indvar_init_result).ValueOrDie(); + for (int64 trip_count = 0; trip_count != max_value_returned + 1; + ++trip_count) { + auto* while_cond = while_op->while_condition(); + auto* while_cond_root = while_cond->root_instruction(); + auto* while_cond_indvar = NonConstantOperand(while_cond_root); + StatusOr> result = + evaluator.EvaluateWithSubstitutions( + while_cond_root, {{while_cond_indvar, indvar_iter_val.get()}}); + if (!result.ok()) { + VLOG(2) << "Couldn't evaluate while cond: " << result.status(); + return nullopt; + } + if (result.ValueOrDie()->data() == absl::Span{false}) { + VLOG(2) << "Loop has static trip count of " << trip_count; + return trip_count; + } + + // Calculate the value of the induction variable after one iteration of the + // loop, and check whether the while condition is true with this new value. + StatusOr> indvar_next_result = + evaluator.EvaluateWithSubstitutions( + while_body_indvar_update, + {{while_body_indvar, indvar_iter_val.get()}}); + if (!indvar_next_result.ok()) { + VLOG(2) << "Couldn't evaluate induction variable update: " + << indvar_next_result.status(); + return nullopt; + } + indvar_iter_val = std::move(indvar_next_result).ValueOrDie(); + } + + VLOG(2) << "Loop has unknown trip count."; + return nullopt; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h new file mode 100644 index 0000000000000000000000000000000000000000..bf497f4892b95c927379411468a66d8961465413 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_analysis.h @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +namespace xla { + +// Returns the precise trip count of the loop if it's statically known, +// nullopt otherwise. max_value_returned limits the number of steps that are +// evaluated while trying to brute force a loop trip count, trip counts larger +// than max_value_returned result in nullopt. +absl::optional ComputeWhileLoopTripCount(HloInstruction *while_op, + int64 max_value_returned = 128); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index 10fc4958fae06414dbe7a3a0a798cb5c6e0f35c2..aab11806621746141f4302f39a780fcdbab99fc1 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" +#include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/service/while_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatmap.h" @@ -32,7 +33,7 @@ static Status ReplaceUsesWhileKeepingLoopInvariance( std::vector users; users.reserve(old_instr->user_count()); - c_copy(old_instr->users(), std::back_inserter(users)); + absl::c_copy(old_instr->users(), std::back_inserter(users)); for (auto* user : users) { for (int64 i = 0, e = user->operand_count(); i < e; i++) { @@ -61,6 +62,12 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) { int64 index = invariant_gte->tuple_index(); const HloInstruction& invariant_value = *init_value.operand(index); + + // Should have at least one user that's not while_body_root. + if (invariant_gte->user_count() <= 1) { + continue; + } + if (invariant_value.opcode() == HloOpcode::kConstant) { auto* constant_instr = while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); @@ -102,10 +109,10 @@ StatusOr WhileLoopConstantSinking::Run(HloModule* module) { // // This will let us sink the constant into the outer while first and then // into the inner while in a single run of this pass. 
- c_copy_if(comp->instructions(), std::back_inserter(while_instrs), - [](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kWhile; - }); + absl::c_copy_if(comp->instructions(), std::back_inserter(while_instrs), + [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); } for (HloInstruction* while_instr : while_instrs) { diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h index 21fb8568a84985692026e145c363500a154a1599..2dba7d7f7574742a301e3503e353bbe57d72a203 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h @@ -54,7 +54,7 @@ class WhileLoopConstantSinking : public HloPassInterface { public: ~WhileLoopConstantSinking() override = default; - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "while-loop-invariant-code-motion"; } diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc index 0d2288d8ea6ebb0ac4ac9468a211b161438fc5f1..0e7667de832c54f647d071e3c9563091d0f994aa 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -55,7 +55,7 @@ ENTRY entry { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TF_ASSERT_OK_AND_ASSIGN(bool changed, WhileLoopConstantSinking{}.Run(module.get())); @@ -95,7 +95,7 @@ ENTRY entry { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TF_ASSERT_OK_AND_ASSIGN(bool changed, WhileLoopConstantSinking{}.Run(module.get())); @@ -136,7 +136,7 @@ ENTRY entry { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TF_ASSERT_OK_AND_ASSIGN(bool changed, WhileLoopConstantSinking{}.Run(module.get())); @@ -184,7 +184,7 @@ ENTRY entry { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); TF_ASSERT_OK_AND_ASSIGN(bool changed, WhileLoopConstantSinking{}.Run(module.get())); @@ -196,5 +196,51 @@ ENTRY entry { op::GetTupleElement(op::Parameter(0)), op::GetTupleElement(op::Parameter(0)))); } + +TEST_F(WhileLoopConstantSinkingTest, DontCreateDeadConstant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[2],f32[2]) parameter(0) + p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0 + p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1 + + token = token[] after-all() + outfeed = token[] outfeed(p_body.0, token) + ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1) +} + +condition { + p_cond = (f32[2],f32[2]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = 
f32[2] constant({1, 2}) + const_1 = f32[2] constant({2, 1}) + while_init = (f32[2],f32[2]) tuple(const_0, const_1) + ROOT while = (f32[2],f32[2],f32[2]) while(while_init), condition=condition, + body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_body = module->GetComputationWithName("body"); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::GetTupleElement(), op::GetTupleElement(), + op::GetTupleElement())); + for (const HloInstruction* inst : while_body->instructions()) { + if (inst->opcode() == HloOpcode::kConstant) { + EXPECT_GT(inst->user_count(), 0); + } + } +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 09ddcffb22c2184262adf87d570870ec000c0e6f..e8fe33e62659ae0fffff1ad46e8ba77f715b76b2 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -14,18 +14,19 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/service/tuple_util.h" #include "tensorflow/compiler/xla/service/while_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" namespace xla { +using absl::InlinedVector; using tensorflow::gtl::FlatMap; using tensorflow::gtl::FlatSet; -using tensorflow::gtl::InlinedVector; // Copies `to_hoist` to the computation containing `while_instr`, hoisting its // operands as needed. All of its transitive operands are expected to be either @@ -65,8 +66,8 @@ static void CreateLoopInvariantCopy( }; InlinedVector new_operands; - c_transform(old_instruction->operands(), std::back_inserter(new_operands), - get_new_operand); + absl::c_transform(old_instruction->operands(), + std::back_inserter(new_operands), get_new_operand); HloInstruction* new_instruction = parent_of_while->AddInstruction(old_instruction->CloneWithNewOperands( @@ -109,6 +110,7 @@ bool WhileLoopInvariantCodeMotion::NotWorthHoistingIndividually( case HloOpcode::kBitcast: case HloOpcode::kBroadcast: + case HloOpcode::kIota: case HloOpcode::kReshape: case HloOpcode::kReverse: case HloOpcode::kSlice: @@ -197,7 +199,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( op->opcode() == HloOpcode::kConstant; }; - if (!c_all_of(instruction->operands(), is_invariant)) { + if (!absl::c_all_of(instruction->operands(), is_invariant)) { continue; } @@ -257,10 +259,10 @@ StatusOr WhileLoopInvariantCodeMotion::Run(HloModule* module) { bool changed = false; std::vector while_instrs; for (auto* comp : module->computations()) { - c_copy_if(comp->instructions(), std::back_inserter(while_instrs), - 
[](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kWhile; - }); + absl::c_copy_if(comp->instructions(), std::back_inserter(while_instrs), + [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); } for (HloInstruction* while_instr : while_instrs) { diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h index 8e6cc8787576e4f041229da5cf8dd2b09194eb2a..2cdf20ce80362c0aeb9d8324573e7e9826cc018c 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h @@ -38,7 +38,7 @@ class WhileLoopInvariantCodeMotion : public HloPassInterface { : hoist_constants_(hoist_constants) {} ~WhileLoopInvariantCodeMotion() override = default; - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "while-loop-invariant-code-motion"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc index e1ec12192f47bee714b612141731aaf3dad63e93..32e69c335b713c438bd7fcb2053709b0624f58ed 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc @@ -16,9 +16,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -53,7 +53,7 @@ HloComputation* WhileLoopInvariantCodeMotionTest::MakeAlwaysTrueComputation( builder.AddInstruction( HloInstruction::CreateParameter(0, param_shape, "param")); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); return module->AddEmbeddedComputation(builder.Build()); } @@ -125,7 +125,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) { builder.AddInstruction(HloInstruction::CreateUnary( scalar_s32, HloOpcode::kNegate, mul_result)); HloInstruction* constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(4))); HloInstruction* sub_result = builder.AddInstruction(HloInstruction::CreateBinary( scalar_s32, HloOpcode::kSubtract, negate_result, constant)); @@ -248,7 +248,9 @@ TEST_F(WhileLoopInvariantCodeMotionTest, TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); - Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32}); + auto token_shape = ShapeUtil::MakeTokenShape(); + Shape while_shape = + ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, token_shape}); HloComputation* while_body = [&]() { HloComputation::Builder builder(TestName() + ".while_body"); @@ -258,25 +260,32 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) { HloInstruction::CreateGetTupleElement(scalar_s32, param, 0)); 
HloInstruction* gte_1 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_s32, param, 1)); + HloInstruction* in_token = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(token_shape, param, 2)); + HloInstruction* out_token = builder.AddInstruction( + HloInstruction::CreateOutfeed(scalar_s32, gte_0, in_token, "")); builder.AddInstruction( - HloInstruction::CreateOutfeed(scalar_s32, gte_0, "")); - builder.AddInstruction(HloInstruction::CreateTuple({gte_0, gte_1})); + HloInstruction::CreateTuple({gte_0, gte_1, out_token})); return module().AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); + auto* scalar_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_s32, "param")); + auto* token = builder.AddInstruction(HloInstruction::CreateToken()); auto* init_value = builder.AddInstruction( - HloInstruction::CreateParameter(0, while_shape, "init_value")); + HloInstruction::CreateTuple({scalar_param, scalar_param, token})); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( while_shape, MakeAlwaysTrueComputation(while_shape, &module()), while_body, init_value)); - + builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0)); module().AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, WhileLoopInvariantCodeMotion{}.Run(&module())); - EXPECT_FALSE(simplified_loop); + ASSERT_FALSE(simplified_loop); EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Outfeed())); @@ -287,7 +296,9 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { // bitcast either. 
auto scalar_s32 = ShapeUtil::MakeShape(S32, {}); auto scalar_f32 = ShapeUtil::MakeShape(F32, {}); - Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32}); + auto token_shape = ShapeUtil::MakeTokenShape(); + Shape while_shape = + ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, token_shape}); HloComputation* while_body = [&]() { HloComputation::Builder builder(TestName() + ".while_body"); @@ -297,21 +308,29 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) { HloInstruction::CreateGetTupleElement(scalar_s32, param, 0)); HloInstruction* gte_1 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(scalar_s32, param, 1)); + HloInstruction* in_token = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(token_shape, param, 2)); HloInstruction* bitcast_inst = builder.AddInstruction( HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0)); + HloInstruction* out_token = builder.AddInstruction( + HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, in_token, "")); builder.AddInstruction( - HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, "")); - builder.AddInstruction(HloInstruction::CreateTuple({gte_0, gte_1})); + HloInstruction::CreateTuple({gte_0, gte_1, out_token})); return module().AddEmbeddedComputation(builder.Build()); }(); HloComputation::Builder builder(TestName()); + auto* scalar_param = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_s32, "param")); + auto* token = builder.AddInstruction(HloInstruction::CreateToken()); auto* init_value = builder.AddInstruction( - HloInstruction::CreateParameter(0, while_shape, "init_value")); + HloInstruction::CreateTuple({scalar_param, scalar_param, token})); auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile( while_shape, MakeAlwaysTrueComputation(while_shape, &module()), while_body, init_value)); + builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0)); 
module().AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index ec05a74e286c89dd8db5ae07580e461938d7c087..6a7bfe3f129d97866ccc54897d584fab0f7c683e 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -14,34 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/call_inliner.h" -#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/core/lib/gtl/flatmap.h" -#include "tensorflow/core/lib/gtl/optional.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { -using tensorflow::gtl::nullopt; -using tensorflow::gtl::optional; - -// Finds and returns the non-constant operand in instr. -// -// CHECK-fails if instr doesn't have exactly one unique non-constant operand. -static const HloInstruction* NonConstantOperand(const HloInstruction* instr) { - const HloInstruction* result = nullptr; - for (const HloInstruction* operand : instr->operands()) { - if (!operand->IsConstant()) { - if (result != nullptr) { - CHECK_EQ(result, operand); - } - result = operand; - } - } - CHECK_NE(result, nullptr); - return result; -} +using absl::optional; // Determines whether the given instruction is a send/recv node, or has a // subcomputation which contains a send/recv node. 
@@ -72,211 +54,6 @@ static bool IsOrContainsSendOrRecv(const HloInstruction* instr) { return false; } -// If all of instr's operands are either constants or have the form -// get-tuple-element(gte_operand, N) -// for the same value N, returns N. Otherwise, returns nullopt. -static optional GetGTEOperandIndex(const HloInstruction* instr, - const HloInstruction* gte_operand) { - VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", " - << gte_operand->ToString() << ")"; - optional tuple_idx; - for (const HloInstruction* operand : instr->operands()) { - if (operand->IsConstant()) { - continue; - } - if (operand->opcode() != HloOpcode::kGetTupleElement) { - VLOG(2) << "instr uses something other than gte(gte_operand): " - << operand->ToString(); - return nullopt; - } - if (operand->operand(0) != gte_operand) { - VLOG(2) << "instr has gte whose operand is not gte_operand: " - << operand->ToString(); - return nullopt; - } - if (tuple_idx && tuple_idx != operand->tuple_index()) { - VLOG(2) << "instr has operands with conflicting gte indices, " - << *tuple_idx << " vs " << operand->tuple_index(); - return nullopt; - } - - tuple_idx = operand->tuple_index(); - } - return tuple_idx; -} - -// Tries to get the tuple index of the induction variable of a while loop. -// -// Checks that the loop condition and root both plumb the induction variable -// through the same tuple index, and that they both apply exactly one op to the -// induction variable before deciding whether to do another loop iteration (in -// the loop condition's case) or packing the induction variable into the result -// tuple (in the loop body's case). -// -// Specifically, checks that the loop condition has structure -// -// root = op(constants, get-tuple-elem(param0, N), constants) -// -// and the loop body has the structure -// -// inc = op(constants, get-tuple-elem(param0, N), constants) -// root = tuple(..., inc, ...) // inc is N'th operand of tuple(). -// -// If so, returns N. 
Otherwise, returns nullopt. -static optional GetLoopInductionVarTupleIdx( - const HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - VLOG(2) << "Finding induction variable for loop " - << while_op->ToShortString(); - - // The while_cond computation should have the form - // - // while_cond_root = - // op(constants, get-tuple-elem(while_cond_param, N), constants). - // - // If it does, set indvar_tuple_idx to N. - auto* while_cond = while_op->while_condition(); - auto* while_cond_root = while_cond->root_instruction(); - auto* while_cond_param = while_cond->parameter_instruction(0); - optional indvar_tuple_idx = - GetGTEOperandIndex(while_cond_root, while_cond_param); - if (!indvar_tuple_idx) { - VLOG(2) << "Induction variable not found in loop condition: " - << while_cond->root_instruction()->ToString(); - return nullopt; - } - - // The while_body computation should have the form - // - // while_body_inc = - // op(constants, get-tuple-elem(while_body_param, N), constants) - // while_body_root = tuple(..., while_body_inc, ...) - // - // where while_body_inc is operand N of while_body_root. 
- auto* while_body = while_op->while_body(); - auto* while_body_root = while_body->root_instruction(); - if (while_body_root->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While body's root is not a tuple instruction: " - << while_body_root->ToString(); - return nullopt; - } - - auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx); - auto* while_body_param = while_body->parameter_instruction(0); - optional while_body_indvar_tuple_idx = - GetGTEOperandIndex(while_body_inc, while_body_param); - if (!while_body_indvar_tuple_idx) { - VLOG(2) - << "Induction variable not found in while body increment instruction: " - << while_body_inc->ToString(); - return nullopt; - } - if (while_body_indvar_tuple_idx != indvar_tuple_idx) { - VLOG(2) << "Tuple index of induction variable does not match between loop " - "condition (" - << *indvar_tuple_idx << ") and while body (" - << *while_body_indvar_tuple_idx << ")"; - return nullopt; - } - - // Finally, check that the while loop's initial value is a tuple with enough - // elements. - auto* while_init = while_op->operand(0); - if (while_init->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While init expected to be a tuple: " << while_init->ToString(); - return nullopt; - } - - VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx; - return indvar_tuple_idx; -} - -// Tries to determine the number of times the given loop executes. Currently -// simply returns 0, 1, or "can't tell" (nullopt). -static optional GetLoopTripCount(HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - VLOG(2) << "Getting trip count for loop " << while_op->ToString(); - - // The loop's induction variable is found at - // - // get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx), - // - // where comp is while_op->while_body() or while_op->while_condition(). 
- optional indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op); - if (!indvar_tuple_idx) { - return nullopt; - } - - VLOG(2) << "Induction variable is at index " << *indvar_tuple_idx - << " in input tuple."; - - // Now that we know the index of the induction variable, we can we can try to - // compute how many times the loop executes. Start by computing the induction - // variable's initial value. - HloEvaluator evaluator(/*max_loop_iterations=*/0); - auto* while_init = while_op->mutable_operand(0); - auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx); - StatusOr> indvar_init_result = - evaluator.Evaluate(indvar_init); - if (!indvar_init_result.ok()) { - VLOG(2) << "Couldn't evaluate induction variable init: " - << indvar_init_result.status(); - return nullopt; - } - - // Evaluates the while loop's condition, returning either "true" (continue - // looping), "false" (stop looping), or nullopt (can't evaluate). - auto evaluate_while_cond = [&](const Literal& indvar) -> optional { - auto* while_cond = while_op->while_condition(); - auto* while_cond_root = while_cond->root_instruction(); - auto* while_cond_indvar = NonConstantOperand(while_cond_root); - StatusOr> result = - evaluator.EvaluateWithSubstitutions(while_cond_root, - {{while_cond_indvar, &indvar}}); - if (!result.ok()) { - VLOG(2) << "Couldn't evaluate while cond: " << result.status(); - return nullopt; - } - return result.ValueOrDie()->data() == - tensorflow::gtl::ArraySlice{true}; - }; - - // The initial value of the induction variable. - const Literal& indvar_iter0_val = *indvar_init_result.ValueOrDie(); - - // Evaluate whether the while condition is true when seeded with - // indvar_iter0_val. 
- optional while_cond_iter0_val = evaluate_while_cond(indvar_iter0_val); - if (while_cond_iter0_val == false) { - VLOG(2) << "Loop has static trip count of 0."; - return 0; - } - - // Calculate the value of the induction variable after one iteration of the - // loop, and check whether the while condition is true with this new value. - auto* while_body = while_op->while_body(); - auto* while_body_indvar_update = - while_body->root_instruction()->operand(*indvar_tuple_idx); - auto* while_body_indvar = NonConstantOperand(while_body_indvar_update); - StatusOr> indvar_iter1_result = - evaluator.EvaluateWithSubstitutions( - while_body_indvar_update, {{while_body_indvar, &indvar_iter0_val}}); - if (!indvar_iter1_result.ok()) { - VLOG(2) << "Couldn't evaluate induction variable update: " - << indvar_iter1_result.status(); - return nullopt; - } - const Literal& indvar_iter1_val = *indvar_iter1_result.ValueOrDie(); - optional while_cond_iter1_val = evaluate_while_cond(indvar_iter1_val); - if (while_cond_iter1_val == false) { - VLOG(2) << "Determined that loop has static trip count of 1."; - return 1; - } - - VLOG(2) << "Loop has unknown trip count >= 1."; - return nullopt; -} - // Tries to remove elements in a while loop's tuple that aren't used within the // loop. 
// @@ -459,12 +236,11 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { << "Instruction " << user->ToString(print_no_metadata) << " should be unused (except by root of while body), but has " "users: {" - << tensorflow::str_util::Join( - user->users(), ", ", - [&](string* out, const HloInstruction* instr) { - tensorflow::strings::StrAppend( - out, instr->ToString(print_no_metadata)); - }) + << absl::StrJoin(user->users(), ", ", + [&](string* out, const HloInstruction* instr) { + absl::StrAppend( + out, instr->ToString(print_no_metadata)); + }) << "}"; replacements.emplace(user, nullptr); @@ -577,7 +353,9 @@ static StatusOr TryRemoveWhileLoop(HloInstruction* while_op) { } // Remove while loops with static trip count of 0. - optional trip_count = GetLoopTripCount(while_op); + optional trip_count = + ComputeWhileLoopTripCount(while_op, + /*max_value_returned=*/1); if (trip_count && *trip_count == 0) { // The loop never executes, so the value of the loop is the value of its // "init" operand. 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h index 3d3e1d60f294c3a2574513c1c2f071805a341ad1..78024f14dc89ff40a11bbc3602072fda1fe6f312 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.h +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h @@ -33,9 +33,7 @@ namespace xla { class WhileLoopSimplifier : public HloPassInterface { public: ~WhileLoopSimplifier() override {} - tensorflow::StringPiece name() const override { - return "simplify-while-loops"; - } + absl::string_view name() const override { return "simplify-while-loops"; } StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index 619e87caa5b6d0f6ec3c3b1489b0d4f50ef29963..1c892ba179ec67ccc9dbfe93d925551d6977ba15 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -15,11 +15,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/while_loop_simplifier.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace { @@ -64,10 +65,8 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) { } )"; - string hlo_string = tensorflow::str_util::StringReplace( - hlo_string_template, "{{LOOP_BOUND}}", - tensorflow::strings::StrCat(42 + num_iters), - /*replace_all=*/true); + string hlo_string = absl::StrReplaceAll( + hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}}); ParseAndVerifyModule(hlo_string); } @@ -103,10 +102,8 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound( } )"; - string hlo_string = tensorflow::str_util::StringReplace( - hlo_string_template, "{{LOOP_BOUND}}", - tensorflow::strings::StrCat(42 + num_iters), - /*replace_all=*/true); + string hlo_string = absl::StrReplaceAll( + hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}}); ParseAndVerifyModule(hlo_string); } @@ -157,7 +154,7 @@ TEST_F(WhileLoopSimplifierTest, auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* true_op = while_op->while_body()->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); TF_ASSERT_OK(true_op->AddControlDependencyTo( while_op->while_body()->root_instruction())); ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); @@ -175,9 +172,11 @@ TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) { auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = 
while_op->while_body(); + auto* token = while_body->AddInstruction(HloInstruction::CreateToken()); auto* send = while_body->AddInstruction(HloInstruction::CreateSend( while_body->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(true))), + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))), + token, /*channel_id=*/0)); while_body->AddInstruction(HloInstruction::CreateSendDone(send)); EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); @@ -190,8 +189,9 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) { auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = while_op->while_body(); + auto* token = while_body->AddInstruction(HloInstruction::CreateToken()); auto* recv = while_body->AddInstruction( - HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}), + HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0)); while_body->AddInstruction(HloInstruction::CreateRecvDone(recv)); EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); @@ -208,8 +208,9 @@ TEST_F(WhileLoopSimplifierTest, LoopWithInfeedNotSimplified) { auto* while_op = computation->root_instruction(); ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile); auto* while_body = while_op->while_body(); - while_body->AddInstruction( - HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config")); + auto token = while_body->AddInstruction(HloInstruction::CreateToken()); + while_body->AddInstruction(HloInstruction::CreateInfeed( + ShapeUtil::MakeShape(F32, {1}), token, "config")); EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie()); } diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index ed20b36292a7f24385603627d74fc72ba6b3b724..f90ac91f9d07aded8cafccf82dae894c9a149bd1 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ 
-14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/while_util.h" +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" #include "tensorflow/compiler/xla/service/tuple_util.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace xla { -using tensorflow::strings::StrCat; +using absl::StrCat; static StatusOr WidenWhileCondition( HloComputation* narrow_condition, const Shape& wide_shape) { @@ -38,7 +40,7 @@ static StatusOr WidenWhileCondition( // the root instruction later. We later change the root instruction to // something more appropriate. builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); return narrow_condition->parent()->AddEmbeddedComputation(builder.Build()); }(); @@ -92,7 +94,7 @@ WidenWhileBody(HloComputation* narrow_body, const Shape& wide_shape) { /*static*/ StatusOr WhileUtil::MakeInstructionsLiveIn( HloInstruction* while_instr, - tensorflow::gtl::ArraySlice instructions) { + absl::Span instructions) { CHECK(ShapeUtil::IsTuple(while_instr->shape())); int64 elements_in_old_while_shape = while_instr->shape().tuple_shapes_size(); @@ -117,9 +119,13 @@ WhileUtil::MakeInstructionsLiveIn( HloInstruction* new_while = containing_computation->AddInstruction( HloInstruction::CreateWhile(new_while_shape, new_while_condition, new_while_body, new_while_init)); - TF_RETURN_IF_ERROR(containing_computation->ReplaceInstruction( - while_instr, TupleUtil::ExtractPrefix( - new_while, while_instr->shape().tuple_shapes_size()))); + + // We want to get rid of the old while instruction even if it has side + // effecting operations so we do a manual 
HloComputation::RemoveInstruction + // instead of relying on HloComputation::ReplaceInstruction. + TF_RETURN_IF_ERROR(while_instr->ReplaceAllUsesWith(TupleUtil::ExtractPrefix( + new_while, while_instr->shape().tuple_shapes_size()))); + TF_RETURN_IF_ERROR(containing_computation->RemoveInstruction(while_instr)); HloInstruction* while_body_param = new_while_body->parameter_instruction(0); std::vector live_in_instructions; @@ -150,7 +156,7 @@ MakeCountedLoopConditionComputation(const Shape& loop_state_shape, {&loop_state_shape}, scalar_pred, "while_cond")); HloInstruction* trip_count_constant = cond_computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(trip_count))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(trip_count))); HloInstruction* param = cond_computation->parameter_instruction(0); TF_ASSIGN_OR_RETURN(HloInstruction * indvar, @@ -171,7 +177,7 @@ static StatusOr> MakeCountedLoopBodyComputation( CreateComputationWithSignature( {&loop_state_shape}, loop_state_shape, "while_body")); HloInstruction* one = body_computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); HloInstruction* param = body_computation->parameter_instruction(0); TF_ASSIGN_OR_RETURN(HloInstruction * indvar, MakeGetTupleElementHlo(param, 0)); @@ -199,9 +205,9 @@ static StatusOr MakeInitTupleFromInitValues( std::vector init_values_with_indvar; init_values_with_indvar.reserve(init_values.size() + 1); HloInstruction* zero = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); init_values_with_indvar.push_back(zero); - c_copy(init_values, std::back_inserter(init_values_with_indvar)); + absl::c_copy(init_values, std::back_inserter(init_values_with_indvar)); return computation->AddInstruction( HloInstruction::CreateTuple(init_values_with_indvar)); } @@ -210,8 +216,9 @@ static Shape 
MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) { std::vector loop_state_shape_components; loop_state_shape_components.reserve(init_values.size() + 1); loop_state_shape_components.push_back(ShapeUtil::MakeShape(S32, {})); - c_transform(init_values, std::back_inserter(loop_state_shape_components), - [](HloInstruction* instr) { return instr->shape(); }); + absl::c_transform(init_values, + std::back_inserter(loop_state_shape_components), + [](HloInstruction* instr) { return instr->shape(); }); return ShapeUtil::MakeTupleShape(loop_state_shape_components); } diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index 322d27b88cae60cb051f5fafdde70e2aafedbc1e..b1c4486887ae0ddbe2ba4e79f45a265689111017 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ b/tensorflow/compiler/xla/service/while_util.h @@ -38,20 +38,24 @@ class WhileUtil { }; // Replaces `while_instr` with a new while instruction that is equivalent to - // `while_instr`, except that it has all of the HLO instructions in + // `while_instr` except that it has all of the HLO instructions in // `instructions` as live-in, loop invariant values. These new live in values // are represented as new elements appended to the parameter of the while // loop, which must be of tuple shape. GetTupleElement instructions computing // each new live in value is returned in the `while_body_live_in_values` // vector. // - // Precondition: `while_instr` must have a tuple shaped state. + // Deletes `while_instr` after replacing it. // - // Every instruction in `instructions` must be contained in the computation - // that contains `while_instr`. + // Preconditions: + // + // `while_instr` must have a tuple shaped state. + // + // Every instruction in `instructions` must be contained in the computation + // that contains `while_instr`. 
static StatusOr MakeInstructionsLiveIn( HloInstruction* while_instr, - tensorflow::gtl::ArraySlice instructions); + absl::Span instructions); using LoopStateTy = std::vector; using LoopBodyGeneratorTy = std::function( diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc index 974bc542a34d0af6d41ed29f36df87f4c164a360..5e6941933330fde29bc9c779aae4bb3c36914660 100644 --- a/tensorflow/compiler/xla/service/while_util_test.cc +++ b/tensorflow/compiler/xla/service/while_util_test.cc @@ -15,9 +15,11 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_util.h" +#include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { namespace { @@ -49,7 +51,7 @@ ENTRY entry { )"; TF_ASSIGN_OR_RETURN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); *entry_computation = module->entry_computation(); *param0 = (*entry_computation)->parameter_instruction(0); @@ -150,7 +152,7 @@ ENTRY main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* while_body = module->GetComputationWithName("body"); @@ -163,5 +165,49 @@ ENTRY main { ASSERT_EQ(gte_list.size(), 1); EXPECT_EQ((*gte_list.begin())->name(), "gte.0"); } + +TEST(WhileUtilTest, AlwaysRemovePreviousWhileBody) { + const char* const hlo_string = R"( +HloModule WhileWithSideEffects + +body { + param.b = (s32[], s32[]) parameter(0) + gte.0 = s32[] get-tuple-element(param.b), index=0 + gte.1 = s32[] get-tuple-element(param.b), index=1 + add = s32[] add(gte.0, gte.1) + ROOT tuple = (s32[], s32[]) tuple(gte.0, add) +} + +cond { + param.c = (s32[], s32[]) parameter(0) + token = token[] after-all() 
+ infeed = (pred[], token[]) infeed(token) + ROOT condition = pred[] get-tuple-element(infeed), index=0 +} + +ENTRY main { + init = (s32[], s32[]) parameter(0) + to_make_live_in = f32[100] parameter(1) + ROOT while = (s32[], s32[]) while(init), condition=cond, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + HloComputation* main = module->GetComputationWithName("main"); + HloInstruction* while_instr = main->root_instruction(); + HloInstruction* to_make_live_in = main->parameter_instruction(1); + + TF_ASSERT_OK_AND_ASSIGN( + WhileUtil::MakeInstructionsLiveInResult make_live_in_result, + WhileUtil::MakeInstructionsLiveIn(while_instr, + /*instructions=*/{to_make_live_in})); + + auto is_while = [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }; + EXPECT_EQ(absl::c_count_if(main->instructions(), is_while), 1); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc index aa40b5cb264803097f52966d6f61f1f41b6b3017..83d696fe0915086c3c98b6d7cbdaeaeb4d9d0bdb 100644 --- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -32,11 +32,12 @@ StatusOr ZeroSizedHloElimination::Run(HloModule* module) { for (HloComputation* comp : module->MakeNonfusionComputations()) { for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) { if (instruction->HasSideEffect() || - ShapeUtil::IsTuple(instruction->shape())) { + !ShapeUtil::IsArray(instruction->shape()) || + instruction->opcode() == HloOpcode::kConstant) { continue; } if (comp->IsRemovable(instruction) && - ShapeUtil::HasZeroElements(instruction->shape())) { + ShapeUtil::IsZeroElementArray(instruction->shape())) { TF_RETURN_IF_ERROR(comp->ReplaceWithNewInstruction( instruction, HloInstruction::CreateConstant( Literal::CreateFromShape(instruction->shape())))); diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h index 8763e588c484011ba2ccbc7cad8f29817347a605..a7f0e207eb5a81b04bb28977d6f5e38864ad2d6a 100644 --- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h @@ -24,7 +24,7 @@ namespace xla { class ZeroSizedHloElimination : public HloPassInterface { public: StatusOr Run(HloModule* module) override; - tensorflow::StringPiece name() const override { + absl::string_view name() const override { return "zero_sized_hlo_elimination"; } }; diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc index f5331280ee9f252aa5717baab88f2c203be5c372..b9ef18892d7aa859f6b0b505db4c004e4f5c5066 100644 --- 
a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc @@ -19,7 +19,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -67,7 +67,16 @@ TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateParameter) { } TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateSideEffects) { - builder_.AddInstruction(HloInstruction::CreateSend(zero_sized_param_, 0)); + auto token = builder_.AddInstruction(HloInstruction::CreateToken()); + builder_.AddInstruction( + HloInstruction::CreateSend(zero_sized_param_, token, 0)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination()); + EXPECT_FALSE(changed); +} + +TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateConstant) { + builder_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR1({}))); TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination()); EXPECT_FALSE(changed); } diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h index 141347a792c23a2c542d7b564ab76c118409865d..14c35e7b84f07bebac33a9753ac26a8ee1418f1e 100644 --- a/tensorflow/compiler/xla/service_interface.h +++ b/tensorflow/compiler/xla/service_interface.h @@ -47,41 +47,22 @@ class ServiceInterface { virtual Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) = 0; - virtual Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* result) = 0; - - virtual Status Execute(const ExecuteRequest* arg, - ExecuteResponse* result) = 0; - virtual Status ExecuteGraph(const ExecuteGraphRequest* arg, ExecuteResponse* result) = 0; - virtual Status 
ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) = 0; - virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) = 0; - virtual Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) = 0; - virtual Status WaitForExecution(const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) = 0; virtual Status DeconstructTuple(const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) = 0; - virtual Status GetComputationStats(const ComputationStatsRequest* arg, - ComputationStatsResponse* result) = 0; - virtual Status GetComputationGraphStats( const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) = 0; - virtual Status GetComputationShape(const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) = 0; - virtual Status GetShape(const GetShapeRequest* arg, GetShapeResponse* result) = 0; @@ -91,31 +72,9 @@ class ServiceInterface { virtual Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, GetDeviceHandlesResponse* result) = 0; - // Methods used by ComputationBuilder. - virtual Status Computation(const ComputationRequest* arg, - ComputationResponse* result) = 0; - - virtual Status Op(const OpRequest* arg, OpResponse* result) = 0; - - virtual Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) = 0; - - virtual Status SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) = 0; - - virtual Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) = 0; - - virtual Status ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) = 0; - virtual Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) = 0; - // Methods used by Computation. 
- virtual Status SnapshotComputation(const SnapshotComputationRequest* ag, - SnapshotComputationResponse* result) = 0; - // Methods used by GlobalData. virtual Status Unregister(const UnregisterRequest* arg, UnregisterResponse* result) = 0; diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc index 7ee366b27a82bdbcb7a63a57ea80194db8ca7df4..d44db89d571891ecef554cd45c050017833982bb 100644 --- a/tensorflow/compiler/xla/shape_layout.cc +++ b/tensorflow/compiler/xla/shape_layout.cc @@ -25,8 +25,8 @@ namespace xla { Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { if (!ShapeUtil::Compatible(other_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", - ShapeUtil::HumanString(other_shape).c_str(), - ShapeUtil::HumanString(shape()).c_str()); + ShapeUtil::HumanString(other_shape), + ShapeUtil::HumanString(shape())); } shape_ = other_shape; return Status::OK(); @@ -35,8 +35,8 @@ Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const { if (!ShapeUtil::Compatible(*to_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", - ShapeUtil::HumanString(*to_shape).c_str(), - ShapeUtil::HumanString(shape()).c_str()); + ShapeUtil::HumanString(*to_shape), + ShapeUtil::HumanString(shape())); } *to_shape = shape_; return Status::OK(); @@ -67,6 +67,14 @@ void ShapeLayout::ResetLayout(const Layout& layout) { TF_CHECK_OK(ShapeUtil::ValidateShape(shape_)); } +void ShapeLayout::ResetLayout(const Layout& layout, + ShapeIndexView shape_index) { + CHECK(ShapeUtil::IsTuple(shape_)); + *ShapeUtil::GetMutableSubshape(&shape_, shape_index)->mutable_layout() = + layout; + TF_CHECK_OK(ShapeUtil::ValidateShape(shape_)); +} + bool ShapeLayout::operator==(const ShapeLayout& other) const { return ShapeUtil::Equal(shape_, other.shape_); } diff --git a/tensorflow/compiler/xla/shape_layout.h 
b/tensorflow/compiler/xla/shape_layout.h index 36806da599cc9b27286e67c128bb7f496f29c105..214cf98854938414c23c5031f4114016140ae9a7 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -72,6 +72,10 @@ class ShapeLayout { // tuple. void ResetLayout(const Layout& layout); + // Resets the layout on the shape at the provided ShapeIndex to the provided + // layout. Shape must be a tuple. + void ResetLayout(const Layout& layout, ShapeIndexView shape_index); + // Returns a string representation of this object. string ToString() const { return ShapeUtil::HumanStringWithLayout(shape_); } diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index 5b14953ebb243da7b9be6eafd46160db8bc62707..52c895e8d4b2aa55b55df41b7139b00c576d6e99 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -21,16 +21,16 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/iterator_range.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -44,8 +44,7 @@ struct ShapeTreeNode { // Data corresponding to this node. std::pair data; - // Children of this node, as indices into the container's nodes_ array. 
- std::vector children; + bool is_leaf = true; explicit ShapeTreeNode(ShapeIndex index) : ShapeTreeNode(std::move(index), T()) {} @@ -53,6 +52,20 @@ struct ShapeTreeNode { : data(std::move(index), std::move(data)) {} }; +// Internal representation of an index table entry. +struct IndexTableEntry { + // Index of the node in the ShapeTreeNode vector. + uint32 index; + // Index of the first child in a IndexTableEntry vector. In the index + // table all children entries for a given node will be placed next to each + // other. This allows us to use a single field to index them. + uint32 children_start; +#ifndef NDEBUG + // Number of children, used for bounds checking. + uint32 children_count; +#endif +}; + } // namespace internal template @@ -81,6 +94,7 @@ template class ShapeTree { public: using Node = internal::ShapeTreeNode; + using Index = internal::IndexTableEntry; // Default constructor creates a tree with a nil shape (i.e. an empty tuple). ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {} @@ -102,8 +116,8 @@ class ShapeTree { // Returns the data element associated with the array in the shape at the // given index (see ShapeUtil::GetSubshape for how indexes are defined). - const T& element(const ShapeIndex& index) const; - T* mutable_element(const ShapeIndex& index); + const T& element(ShapeIndexView index) const; + T* mutable_element(ShapeIndexView index); // Return the shape represented with this ShapeTree. const Shape& shape() const { return *shape_; } @@ -122,9 +136,7 @@ class ShapeTree { // Returns true if the node at the given index is a leaf node (an array // shape). - bool IsLeaf(const ShapeIndex& index) const { - return Lookup(index)->children.empty(); - } + bool IsLeaf(ShapeIndexView index) const { return Lookup(index)->is_leaf; } ShapeTree(const ShapeTree&) = default; ShapeTree& operator=(const ShapeTree&) = default; @@ -210,12 +222,12 @@ class ShapeTree { // Returns an iterator pointing to the given ShapeIndex. 
// REQUIRES: index must exist in the ShapeTree. - iterator find(const ShapeIndex& index) { + iterator find(ShapeIndexView index) { Node* element = Lookup(index); return iterator(&nodes_, typename std::vector::iterator(element), /*iterate_leaves_only=*/false); } - const_iterator find(const ShapeIndex& index) const { + const_iterator find(ShapeIndexView index) const { Node* element = Lookup(index); return iterator(&nodes_, typename std::vector::const_iterator(element), @@ -250,6 +262,25 @@ class ShapeTree { template Status ForEachMutableElementWithStatus(const Fn& func); + // Maps each element to generate a new tree with the same shape. + template + ShapeTree Map(const std::function& func) { + ShapeTree result(shape_storage_); + ForEachElement([&](const ShapeIndex& index, const T& t) { + *result.mutable_element(index) = func(t); + }); + return result; + } + + template + ShapeTree Map(const std::function& func) { + ShapeTree result(shape_storage_); + ForEachMutableElement([&](const ShapeIndex& index, T* t) { + *result.mutable_element(index) = func(t); + }); + return result; + } + // Copy the subtree of values from 'other' rooted at ShapeIndex // 'source_base_index' into the subtree of value in this ShapeTree rooted at // 'target_base_index'. @@ -266,11 +297,12 @@ class ShapeTree { private: // Initialize node->children based on 'shape'. All children are assigned the // the given 'init_value'. - void InitChildren(const Shape& shape, const T& init_value, Node* node); + void InitChildren(const Shape& shape, const T& init_value, Node* node, + Index* index); // Initialize node->children based on 'shape'. All children have // default-constructed data values. - void InitChildren(const Shape& shape, Node* node); + void InitChildren(const Shape& shape, Node* node, Index* index); // Returns the number of subshapes, including interior nodes, in shape. 
int64 CountSubshapes(const Shape& shape); @@ -284,12 +316,15 @@ class ShapeTree { static Status ForEachMutableHelper(const Fn& func, std::vector* nodes); // Return the tree node at the given index. - Node* Lookup(const ShapeIndex& index); - const Node* Lookup(const ShapeIndex& index) const; + Node* Lookup(ShapeIndexView index); + const Node* Lookup(ShapeIndexView index) const; // The nodes in this shape tree. std::vector nodes_; + // Index table for node lookups. + std::vector index_table_; + // If we own our Shape, this field contains it, and shape_ is a pointer into // here. Otherwise if we don't own our shape, this is nullptr. std::shared_ptr shape_storage_; @@ -311,16 +346,14 @@ class ShapeTreeIterator : nodes_(nodes), node_(std::move(node)), iterate_leaves_only_(iterate_leaves_only) { - while (iterate_leaves_only && node_ != nodes_->end() && - !node_->children.empty()) { + while (iterate_leaves_only && node_ != nodes_->end() && !node_->is_leaf) { ++node_; } } ShapeTreeIterator& operator++() { ++node_; - while (iterate_leaves_only_ && node_ != nodes_->end() && - !node_->children.empty()) { + while (iterate_leaves_only_ && node_ != nodes_->end() && !node_->is_leaf) { ++node_; } return *this; @@ -333,8 +366,7 @@ class ShapeTreeIterator ShapeTreeIterator& operator--() { --node_; - while (iterate_leaves_only_ && node_ > nodes_->begin() && - !node_->children.empty()) { + while (iterate_leaves_only_ && node_ > nodes_->begin() && !node_->is_leaf) { --node_; } return *this; @@ -358,7 +390,7 @@ class ShapeTreeIterator ContainerType* nodes_; IteratorType node_; // True if we should not include interior nodes in our walk. 
- bool iterate_leaves_only_; + const bool iterate_leaves_only_; }; template @@ -375,34 +407,74 @@ int64 ShapeTree::CountSubshapes(const Shape& shape) { template void ShapeTree::InitChildren(const Shape& shape, const T& init_value, - Node* node) { + Node* node, Index* index) { if (ShapeUtil::IsTuple(shape)) { const int64 size = ShapeUtil::TupleElementCount(shape); - node->children.reserve(size); +#ifndef NDEBUG + index->children_count = size; +#endif + node->is_leaf = false; ShapeIndex shape_index = node->data.first; shape_index.push_back(0); + + // At the end of the index_table, reserve a continuous space to hold the + // children of current node. In order to enforce the invariant that all + // children of a given node are placed together, we need to do the + // reservation before we recurse into any of its children. + int64 children_start_position = index_table_.size(); + index_table_.resize(index_table_.size() + size); + for (int i = 0; i < size; ++i) { shape_index[shape_index.size() - 1] = i; - node->children.push_back(nodes_.size()); + index_table_[children_start_position + i].index = nodes_.size(); + // The first child of the node in the index table is placed at the end of + // the table. 
+ index_table_[children_start_position + i].children_start = + index_table_.size(); nodes_.emplace_back(shape_index, init_value); - InitChildren(shape.tuple_shapes(i), init_value, &nodes_.back()); + InitChildren(shape.tuple_shapes(i), init_value, &nodes_.back(), + &index_table_[children_start_position + i]); } + } else { +#ifndef NDEBUG + index->children_count = 0; +#endif } } template -void ShapeTree::InitChildren(const Shape& shape, Node* node) { +void ShapeTree::InitChildren(const Shape& shape, Node* node, Index* index) { if (ShapeUtil::IsTuple(shape)) { const int64 size = ShapeUtil::TupleElementCount(shape); - node->children.reserve(size); +#ifndef NDEBUG + index->children_count = size; +#endif + node->is_leaf = false; ShapeIndex shape_index = node->data.first; shape_index.push_back(0); + + // At the end of the index_table, reserve a continuous space to hold the + // children of current node. In order to enforce the invariant that all + // children of a given node are placed together, we need to do the + // reservation before we recurse into any of its children. + int64 children_start_position = index_table_.size(); + index_table_.resize(index_table_.size() + size); + for (int i = 0; i < size; ++i) { shape_index[shape_index.size() - 1] = i; - node->children.push_back(nodes_.size()); + index_table_[children_start_position + i].index = nodes_.size(); + // The first child of the node in the index table is placed at the end of + // the table. 
+ index_table_[children_start_position + i].children_start = + index_table_.size(); nodes_.emplace_back(shape_index); - InitChildren(shape.tuple_shapes(i), &nodes_.back()); + InitChildren(shape.tuple_shapes(i), &nodes_.back(), + &index_table_[children_start_position + i]); } + } else { +#ifndef NDEBUG + index->children_count = 0; +#endif } } @@ -410,82 +482,103 @@ template ShapeTree::ShapeTree(Shape shape) : shape_storage_(std::make_shared(std::move(shape))), shape_(shape_storage_.get()) { - // The shape_ field is just used to hold the structure of the shape. - // It should not be relied upon to store layout information. - LayoutUtil::ClearLayout(shape_storage_.get()); - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}); - InitChildren(*shape_, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, &nodes_[0], &index_table_[0]); } template ShapeTree::ShapeTree(const Shape* shape) : shape_(shape) { - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}); - InitChildren(*shape_, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, &nodes_[0], &index_table_[0]); } template ShapeTree::ShapeTree(const std::shared_ptr& shape) : shape_storage_(shape), shape_(shape_storage_.get()) { - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}); - InitChildren(*shape_, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, &nodes_[0], &index_table_[0]); } template ShapeTree::ShapeTree(Shape shape, const T& init_value) : shape_storage_(std::make_shared(std::move(shape))), shape_(shape_storage_.get()) { - // The shape_ field is just 
used to hold the structure of the shape. - // It should not be relied upon to store layout information. - LayoutUtil::ClearLayout(shape_storage_.get()); - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}, init_value); - InitChildren(*shape_, init_value, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, init_value, &nodes_[0], &index_table_[0]); } template ShapeTree::ShapeTree(const Shape* shape, const T& init_value) : shape_(shape) { - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}, init_value); - InitChildren(*shape_, init_value, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, init_value, &nodes_[0], &index_table_[0]); } template ShapeTree::ShapeTree(const std::shared_ptr& shape, const T& init_value) : shape_storage_(shape), shape_(shape_storage_.get()) { - nodes_.reserve(CountSubshapes(*shape_)); + const int64 count = CountSubshapes(*shape_); + nodes_.reserve(count); nodes_.emplace_back(ShapeIndex{}, init_value); - InitChildren(*shape_, init_value, &nodes_[0]); + + index_table_.reserve(count); + index_table_.emplace_back(Index{0, 1}); + InitChildren(*shape_, init_value, &nodes_[0], &index_table_[0]); } template -const T& ShapeTree::element(const ShapeIndex& index) const { +const T& ShapeTree::element(ShapeIndexView index) const { return Lookup(index)->data.second; } template -T* ShapeTree::mutable_element(const ShapeIndex& index) { +T* ShapeTree::mutable_element(ShapeIndexView index) { return &Lookup(index)->data.second; } template -internal::ShapeTreeNode* ShapeTree::Lookup(const ShapeIndex& index) { - Node* node = &nodes_[0]; +internal::ShapeTreeNode* ShapeTree::Lookup(ShapeIndexView index) { + Index* iter = &index_table_[0]; for (const int64 i 
: index) { CHECK_GE(i, 0); - CHECK_LT(i, node->children.size()); - node = &nodes_[node->children[i]]; +#ifndef NDEBUG + CHECK_LT(i, iter->children_count); +#endif + iter = &index_table_[iter->children_start + i]; } - return node; + + return &nodes_[iter->index]; } template const internal::ShapeTreeNode* ShapeTree::Lookup( - const ShapeIndex& index) const { + ShapeIndexView index) const { return const_cast(this)->Lookup(index); } diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index dc5facf1581c07fbb74dfcee95025692938632bd..c8ff55e7845785d9292516b823fb591cc28cbfad 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_tree.h" +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -116,6 +117,11 @@ TEST_F(ShapeTreeTest, InitValueConstructor) { TestInitValueConstructor(nested_tuple_shape_, 10); } +TEST_F(ShapeTreeTest, EmptyTupleMustHaveNoLeaves) { + ShapeTree shape_tree{ShapeUtil::MakeTupleShape({})}; + EXPECT_EQ(0, shape_tree.leaf_count()); +} + TEST_F(ShapeTreeTest, ArrayShape) { ShapeTree shape_tree{array_shape_}; *shape_tree.mutable_element({}) = 42; @@ -167,7 +173,7 @@ TEST_F(ShapeTreeTest, TupleShape) { // Write zero to all data elements. 
shape_tree.ForEachMutableElement( - [&sum](const ShapeIndex& /*index*/, int* data) { *data = 0; }); + [](const ShapeIndex& /*index*/, int* data) { *data = 0; }); EXPECT_EQ(0, shape_tree.element({})); EXPECT_EQ(0, shape_tree.element({0})); EXPECT_EQ(0, shape_tree.element({1})); @@ -222,20 +228,22 @@ TEST_F(ShapeTreeTest, NestedTupleShape) { TEST_F(ShapeTreeTest, InvalidIndexingTuple) { ShapeTree shape_tree{tuple_shape_}; - +#ifndef NDEBUG EXPECT_DEATH(shape_tree.element({4}), ""); +#endif } TEST_F(ShapeTreeTest, InvalidIndexingNestedTuple) { ShapeTree shape_tree{nested_tuple_shape_}; - +#ifndef NDEBUG EXPECT_DEATH(shape_tree.element({0, 0}), ""); +#endif } TEST_F(ShapeTreeTest, ShapeTreeOfNonCopyableType) { ShapeTree> shape_tree{tuple_shape_}; EXPECT_EQ(shape_tree.element({2}).get(), nullptr); - *shape_tree.mutable_element({2}) = MakeUnique(42); + *shape_tree.mutable_element({2}) = absl::make_unique(42); EXPECT_EQ(*shape_tree.element({2}), 42); } @@ -597,12 +605,15 @@ void BM_Iterate(int iters, int depth, int fan_out) { } } -BENCHMARK(BM_Construct)->ArgPair(2, 8); -BENCHMARK(BM_ConstructUnowned)->ArgPair(2, 8); -BENCHMARK(BM_Copy)->ArgPair(2, 8); -BENCHMARK(BM_Move)->ArgPair(2, 8); -BENCHMARK(BM_ForEach)->ArgPair(2, 8); -BENCHMARK(BM_Iterate)->ArgPair(2, 8); +#define BENCHMARK_WITH_ARGS(name) \ + BENCHMARK(name)->ArgPair(2, 8)->ArgPair(1, 1000) + +BENCHMARK_WITH_ARGS(BM_Construct); +BENCHMARK_WITH_ARGS(BM_ConstructUnowned); +BENCHMARK_WITH_ARGS(BM_Copy); +BENCHMARK_WITH_ARGS(BM_Move); +BENCHMARK_WITH_ARGS(BM_ForEach); +BENCHMARK_WITH_ARGS(BM_Iterate); } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index e8a28d76e936720005a232d7de85195cad315baa..9772c06bce32cef0d79a036b525c3606ea60e31b 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -22,50 +22,42 @@ limitations under the License. 
#include #include +#include "absl/strings/ascii.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/overflow_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/iterator_range.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/regexp.h" namespace xla { -string ShapeIndex::ToString() const { - return tensorflow::strings::StrCat( - "{", tensorflow::str_util::Join(indices_, ","), "}"); -} +using absl::StrAppend; +using absl::StrCat; + +string ShapeIndex::ToString() const { return ShapeIndexView(*this).ToString(); } string ShapeIndexView::ToString() const { - return tensorflow::strings::StrCat( - "{", - tensorflow::str_util::Join(tensorflow::gtl::make_range(begin_, end_), - ","), - "}"); + return StrCat("{", absl::StrJoin(indices_, ","), "}"); } bool ShapeIndexView::operator==(const ShapeIndexView& other) const { - if (size() != other.size()) { - return false; - } - for (auto it = begin(), other_it = other.begin(); it != end(); - ++it, ++other_it) { - if (*it != *other_it) { - return false; - } - } - return true; + return indices_ == other.indices_; } bool 
ShapeIndexView::operator!=(const ShapeIndexView& other) const { @@ -84,18 +76,34 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) { namespace { +// Returns whether the given primitive type corresponds to an array shape. +bool IsArrayPrimitiveType(PrimitiveType primitive_type) { + return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE && + primitive_type != OPAQUE && primitive_type != TOKEN; +} + // Recursive helper for comparing the equality of two shapes. Returns true if // the shapes are the same. If compare_layouts is true, then layouts must also // match. -bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { - if (ShapeUtil::IsTuple(lhs) || ShapeUtil::IsTuple(rhs)) { - return ShapeUtil::IsTuple(lhs) && ShapeUtil::IsTuple(rhs) && - ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), - [=](const Shape& l, const Shape& r) { - return CompareShapes(l, r, compare_layouts); - }); - } else if (ShapeUtil::IsOpaque(lhs) || ShapeUtil::IsOpaque(rhs)) { - return ShapeUtil::IsOpaque(lhs) && ShapeUtil::IsOpaque(rhs); +bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts, + bool ignore_fp_precision) { + if ((ignore_fp_precision && + !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) || + (!ignore_fp_precision && !ShapeUtil::SameElementType(lhs, rhs))) { + VLOG(3) << "CompareShapes: lhs element type != rhs element type"; + return false; + } + + if (ShapeUtil::IsTuple(lhs)) { + return absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(), + [=](const Shape& l, const Shape& r) { + return CompareShapes(l, r, compare_layouts, + ignore_fp_precision); + }); + } else if (!ShapeUtil::IsArray(lhs)) { + // Non-tuple, non-array tupes such as opaque and token types are trivially + // the same. 
+ return true; } if (compare_layouts) { @@ -103,13 +111,13 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { return false; } if (LayoutUtil::IsDenseArray(lhs)) { - if (!ContainersEqual(LayoutUtil::MinorToMajor(lhs), - LayoutUtil::MinorToMajor(rhs))) { + if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs), + LayoutUtil::MinorToMajor(rhs))) { VLOG(3) << "CompareShapes: lhs layout != rhs layout"; return false; } - if (!ContainersEqual(lhs.layout().padded_dimensions(), - rhs.layout().padded_dimensions())) { + if (!absl::c_equal(lhs.layout().padded_dimensions(), + rhs.layout().padded_dimensions())) { VLOG(3) << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions"; return false; @@ -125,25 +133,21 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions"; return false; } - if (!ShapeUtil::SameElementType(lhs, rhs)) { - VLOG(3) << "CompareShapes: lhs element type != rhs element type"; - return false; - } return true; } // Constructs and returns the new shape with the given minor_to_major order in // its Layout. 
StatusOr MakeShapeWithLayoutInternal( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice minor_to_major) { + PrimitiveType element_type, absl::Span dimensions, + absl::Span minor_to_major) { if (dimensions.size() != minor_to_major.size()) { return InvalidArgument("Dimensions size is %ld, but layout size is %ld.", dimensions.size(), minor_to_major.size()); } if (element_type == OPAQUE || element_type == TUPLE) { return InvalidArgument("Unsupported element type: %s", - PrimitiveType_Name(element_type).c_str()); + PrimitiveType_Name(element_type)); } Shape shape = ShapeUtil::MakeShape(element_type, dimensions); auto min2maj = shape.mutable_layout()->mutable_minor_to_major(); @@ -161,7 +165,8 @@ StatusOr MakeShapeWithLayoutInternal( } // namespace /* static */ bool ShapeUtil::Equal(const Shape& lhs, const Shape& rhs) { - bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true); + bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true, + /*ignore_fp_precision=*/false); if (!equal && VLOG_IS_ON(3)) { VLOG(3) << "ShapeUtil::Equal differ: lhs = " << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString(); @@ -170,9 +175,21 @@ StatusOr MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualIgnoringFpPrecision(const Shape& lhs, + const Shape& rhs) { + bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true, + /*ignore_fp_precision=*/true); + if (!equal && VLOG_IS_ON(3)) { + VLOG(3) << "ShapeUtil::EqualIgnoringFpPrecision differ: lhs = " + << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString(); + } + + return equal; +} + /* static */ int64 ShapeUtil::Rank(const Shape& shape) { - CHECK(!ShapeUtil::IsTuple(shape)) - << "Tuples do not have a rank, shape: " << shape; + CHECK(ShapeUtil::IsArray(shape)) + << "Non-arrays do not have a rank, shape: " << shape; return shape.dimensions_size(); } @@ -197,34 +214,32 @@ StatusOr MakeShapeWithLayoutInternal( return program_shape; 
} -/* static */ Shape ShapeUtil::MakeShape( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions) { - DCHECK_NE(TUPLE, element_type); - DCHECK_NE(OPAQUE, element_type); +/* static */ Shape ShapeUtil::MakeShape(PrimitiveType element_type, + absl::Span dimensions) { + CHECK(IsArrayPrimitiveType(element_type)); Shape result; PopulateShape(element_type, dimensions, &result); return result; } /* static */ Shape ShapeUtil::MakeShapeWithLayout( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice minor_to_major) { + PrimitiveType element_type, absl::Span dimensions, + absl::Span minor_to_major) { return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major) .ValueOrDie(); } /* static */ Shape ShapeUtil::MakeShapeWithDescendingLayout( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions) { + PrimitiveType element_type, absl::Span dimensions) { std::vector layout(dimensions.size()); std::iota(layout.rbegin(), layout.rend(), static_cast(0)); return MakeShapeWithLayout(element_type, dimensions, layout); } /* static */ Shape ShapeUtil::MakeShapeWithSparseLayout( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, + PrimitiveType element_type, absl::Span dimensions, int64 max_sparse_elements) { - DCHECK_NE(TUPLE, element_type); - DCHECK_NE(OPAQUE, element_type); + CHECK(IsArrayPrimitiveType(element_type)); Shape shape = ShapeUtil::MakeShape(element_type, dimensions); *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements); TF_DCHECK_OK(ShapeUtil::ValidateShape(shape)); @@ -241,9 +256,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return MakeShapeWithDescendingLayout(shape.element_type(), dims); } -/* static */ void ShapeUtil::PopulateShape( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, - Shape* shape) { +/* static */ void ShapeUtil::PopulateShape(PrimitiveType element_type, + absl::Span dimensions, + 
Shape* shape) { shape->Clear(); shape->set_element_type(element_type); for (int64 dimension : dimensions) { @@ -253,10 +268,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( TF_DCHECK_OK(ValidateShape(*shape)); } -/* static */ Shape ShapeUtil::MakeTupleShape( - tensorflow::gtl::ArraySlice shapes) { +/* static */ Shape ShapeUtil::MakeTupleShape(absl::Span shapes) { Shape result; result.set_element_type(TUPLE); + result.mutable_tuple_shapes()->Reserve(shapes.size()); for (const auto& shape : shapes) { AppendShapeToTuple(shape, &result); } @@ -271,6 +286,13 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return result; } +/* static */ Shape ShapeUtil::MakeTokenShape() { + Shape result; + result.set_element_type(TOKEN); + TF_DCHECK_OK(ValidateShapeWithOptionalLayout(result)); + return result; +} + /* static */ void ShapeUtil::AppendShapeToTuple(const Shape& shape, Shape* tuple_shape) { TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape)); @@ -294,7 +316,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) { - if (shape.element_type() == TUPLE || shape.element_type() == OPAQUE) { + if (!IsArray(shape)) { return false; } return primitive_util::BitWidth(shape.element_type()) == bits; @@ -320,6 +342,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( case C64: case TUPLE: case OPAQUE: + case TOKEN: return false; default: @@ -335,6 +358,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return primitive_util::IsFloatingPointType(shape.element_type()); } +/* static */ bool ShapeUtil::IsArray(const Shape& shape) { + return IsArrayPrimitiveType(shape.element_type()); +} + /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) { return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(), IsTuple); @@ -345,7 +372,7 @@ 
ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ bool ShapeUtil::IsNil(const Shape& shape) { - return IsTuple(shape) ? IsEmptyTuple(shape) : HasZeroElements(shape); + return IsEmptyTuple(shape); } /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) { @@ -361,6 +388,13 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return shape.tuple_shapes(index); } +/* static */ int64 ShapeUtil::SubshapeCount(const Shape& shape) { + int64 n = 0; + ForEachSubshape(shape, [&](const Shape& literal_subshape, + const ShapeIndex& index) { ++n; }); + return n; +} + /* static */ Shape ShapeUtil::SliceTuple(const Shape& tuple, int64 start, int64 limit) { TF_DCHECK_OK(ValidateShapeWithOptionalLayout(tuple)); @@ -388,50 +422,44 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) { - CHECK(!IsTuple(shape)) << ShapeUtil::HumanString(shape); + CHECK(IsArray(shape)) << ShapeUtil::HumanString(shape); CHECK_EQ(shape.dimensions_size(), Rank(shape)); return std::accumulate( shape.dimensions().begin(), shape.dimensions().end(), 1LL, std::multiplies()); } -/* static */ bool ShapeUtil::HasZeroElements(const Shape& shape) { - return ElementsIn(shape) == 0; +/* static */ int64 ShapeUtil::ElementsInRecursive(const Shape& shape) { + CHECK(IsArray(shape) || IsTuple(shape)); + if (IsArray(shape)) { + return ElementsIn(shape); + } + int64 count = 0; + for (const Shape& element_shape : shape.tuple_shapes()) { + count += ElementsInRecursive(element_shape); + } + return count; } -/* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) { - return shape.element_type() == F32 && Rank(shape) == 0; +/* static */ bool ShapeUtil::IsZeroElementArray(const Shape& shape) { + return ShapeUtil::IsArray(shape) && ElementsIn(shape) == 0; } -/* static */ string ShapeUtil::HumanString(const Shape& shape) { - if (IsTuple(shape)) { - string text = "("; - const char* prefix = ""; - for 
(const Shape& elem_shape : shape.tuple_shapes()) { - tensorflow::strings::StrAppend(&text, prefix, HumanString(elem_shape)); - prefix = ", "; - } - text += ")"; - return text; - } else { - return tensorflow::strings::StrCat( - tensorflow::str_util::Lowercase( - PrimitiveType_Name(shape.element_type())), - "[", tensorflow::str_util::Join(shape.dimensions(), ","), "]"); - } +/* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) { + return shape.element_type() == F32 && Rank(shape) == 0; } namespace { // Class to memoize the computation of -// tensorflow::str_util::Lowercase(PrimitiveType_Name(p)) +// absl::AsciiStrToLower(PrimitiveType_Name(p)) // for all PrimitiveType values "p" class PrimitiveTypeNameGenerator { public: PrimitiveTypeNameGenerator() { for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) { if (PrimitiveType_IsValid(i)) { - lowercase_name_[i] = tensorflow::str_util::Lowercase( + lowercase_name_[i] = absl::AsciiStrToLower( PrimitiveType_Name(static_cast(i))); } } @@ -462,78 +490,84 @@ StatusOr StringToPrimitiveType(const string& name) { }(); auto found = name_to_type->find(name); if (found == name_to_type->end()) { - return InvalidArgument("Invalid element type string: \"%s\".", - name.c_str()); + return InvalidArgument("Invalid element type string: \"%s\".", name); } return found->second; } } // namespace -/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) { +/* static */ string ShapeUtil::HumanString(const Shape& shape) { if (IsTuple(shape)) { string text = "("; const char* prefix = ""; for (const Shape& elem_shape : shape.tuple_shapes()) { - tensorflow::strings::StrAppend(&text, prefix, - HumanStringWithLayout(elem_shape)); + StrAppend(&text, prefix, HumanString(elem_shape)); prefix = ", "; } text += ")"; return text; - } else { - string result = tensorflow::strings::StrCat( - LowercasePrimitiveTypeName(shape.element_type()), "["); - for (int i = 0; i < shape.dimensions().size(); i++) { - 
tensorflow::strings::StrAppend(&result, (i > 0) ? "," : "", - shape.dimensions(i)); + } + return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[", + absl::StrJoin(shape.dimensions(), ","), "]"); +} + +/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) { + if (IsTuple(shape)) { + string text = "("; + const char* prefix = ""; + for (const Shape& elem_shape : shape.tuple_shapes()) { + StrAppend(&text, prefix, HumanStringWithLayout(elem_shape)); + prefix = ", "; } - result += "]"; - if (!IsScalar(shape) && !IsOpaque(shape)) { - if (LayoutUtil::HasLayout(shape)) { - tensorflow::strings::StrAppend(&result, - LayoutUtil::HumanString(shape.layout())); - } + text += ")"; + return text; + } + string result = StrCat(LowercasePrimitiveTypeName(shape.element_type()), "["); + for (int i = 0; i < shape.dimensions().size(); i++) { + StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i)); + } + result += "]"; + if (!IsScalar(shape) && IsArray(shape)) { + if (LayoutUtil::HasLayout(shape)) { + StrAppend(&result, LayoutUtil::HumanString(shape.layout())); } - return result; } + return result; } /* static */ string ShapeUtil::HumanString(const ProgramShape& program_shape) { std::vector parameters; for (auto& shape : program_shape.parameters()) { const int i = parameters.size(); - parameters.push_back( - tensorflow::strings::StrCat(i < program_shape.parameter_names_size() - ? program_shape.parameter_names(i) - : "(unknown)", - ": ", HumanString(shape))); + parameters.push_back(StrCat(i < program_shape.parameter_names_size() + ? 
program_shape.parameter_names(i) + : "(unknown)", + ": ", HumanString(shape))); } - return tensorflow::strings::StrCat( - "(", tensorflow::str_util::Join(parameters, ", "), ") -> ", - HumanString(program_shape.result())); + return StrCat("(", absl::StrJoin(parameters, ", "), ") -> ", + HumanString(program_shape.result())); } namespace { // Parses shapes with simple recursive descent structure -- consumes from the // front of s and passes that view recursively as required. -StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { - tensorflow::str_util::RemoveLeadingWhitespace(s); +StatusOr ParseShapeStringInternal(absl::string_view* s) { + *s = StripLeadingAsciiWhitespace(*s); - if (tensorflow::str_util::ConsumePrefix(s, "(")) { // Tuple. + if (absl::ConsumePrefix(s, "(")) { // Tuple. std::vector shapes; bool must_end = false; while (true) { - if (tensorflow::str_util::ConsumePrefix(s, ")")) { + if (absl::ConsumePrefix(s, ")")) { break; } else if (must_end) { - return InvalidArgument("Expected end of tuple; got: \"%s\"", - std::string(*s).c_str()); + return InvalidArgument("Expected end of tuple; got: \"%s\"", *s); } shapes.emplace_back(); TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s)); - tensorflow::str_util::RemoveLeadingWhitespace(s); - must_end = !tensorflow::str_util::ConsumePrefix(s, ","); + *s = StripLeadingAsciiWhitespace(*s); + must_end = !absl::ConsumePrefix(s, ","); } return ShapeUtil::MakeTupleShape(shapes); } @@ -542,32 +576,30 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { string dimensions_string; string format_string; string layout_string; - // tensorflow::StringPiece is not compatible with internal RE2 StringPiece, so + // absl::string_view is not compatible with internal RE2 StringPiece, so // we convert in to the RE2-consumable type and then consume the corresponding - // amount from our StringPiece type. + // amount from our string_view type. 
+ static LazyRE2 shape_pattern = { + "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,]+)})?"}; tensorflow::RegexpStringPiece s_consumable(s->data(), s->size()); - if (RE2::Consume( - &s_consumable, - "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,]+)})?", - &element_type_string, &dimensions_string, &format_string, - &layout_string)) { + if (RE2::Consume(&s_consumable, *shape_pattern, &element_type_string, + &dimensions_string, &format_string, &layout_string)) { size_t consumed = s->size() - s_consumable.size(); s->remove_prefix(consumed); - auto string_to_int64 = [&s](const string& input) -> StatusOr { + auto string_to_int64 = [&s](absl::string_view input) -> StatusOr { int64 element; - if (!tensorflow::strings::safe_strto64(input.c_str(), &element)) { + if (!absl::SimpleAtoi(input, &element)) { return InvalidArgument( - "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", - input.c_str(), std::string(*s).c_str()); + "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", input, + *s); } return element; }; auto comma_list_to_int64s = - [&s, - string_to_int64](const string& input) -> StatusOr> { + [string_to_int64](const string& input) -> StatusOr> { std::vector results; - for (const string& piece : tensorflow::str_util::Split(input, ',')) { + for (const auto& piece : absl::StrSplit(input, ',', absl::SkipEmpty())) { TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece)); results.push_back(element); } @@ -581,14 +613,17 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { // Extract the primitive element type. 
TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type, StringToPrimitiveType(element_type_string)); - if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE || - primitive_type == OPAQUE) { + if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) { return InvalidArgument("Invalid element type string: \"%s\".", - element_type_string.c_str()); + element_type_string); } Shape result; - if (format_string.empty() && layout_string.empty()) { + if (primitive_type == OPAQUE) { + result = ShapeUtil::MakeOpaqueShape(); + } else if (primitive_type == TOKEN) { + result = ShapeUtil::MakeTokenShape(); + } else if (format_string.empty() && layout_string.empty()) { // Create a shape without a layout set. result = ShapeUtil::MakeShape(primitive_type, dimensions); } else if (format_string == "sparse") { @@ -610,17 +645,14 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { return std::move(result); } - return InvalidArgument("Invalid shape string to parse: \"%s\"", - std::string(*s).c_str()); + return InvalidArgument("Invalid shape string to parse: \"%s\"", *s); } } // namespace -/* static */ StatusOr ShapeUtil::ParseShapeString( - tensorflow::StringPiece s) { +/* static */ StatusOr ShapeUtil::ParseShapeString(absl::string_view s) { TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s)); if (!s.empty()) { - return InvalidArgument("Invalid shape string to parse: \"%s\"", - std::string(s).c_str()); + return InvalidArgument("Invalid shape string to parse: \"%s\"", s); } return shape; } @@ -629,47 +661,41 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { const Shape& rhs) { CHECK(ShapeUtil::IsArray(lhs)); CHECK(ShapeUtil::IsArray(rhs)); - return ContainersEqual(lhs.dimensions(), rhs.dimensions()); + return absl::c_equal(lhs.dimensions(), rhs.dimensions()); } /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { - return rhs.element_type() == TUPLE && - 
ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), Compatible); - } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; - } - return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs); + return CompareShapes(lhs, rhs, /*compare_layouts=*/false, + /*ignore_fp_precision=*/false); } /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { + if (IsArray(lhs)) { + return IsArray(rhs) && SameDimensions(lhs, rhs); + } else if (lhs.element_type() == TUPLE) { return rhs.element_type() == TUPLE && - ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), - CompatibleIgnoringElementType); - } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; + absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(), + CompatibleIgnoringElementType); + } else { + // Opaque, token, etc types are vacuously compatible. + return lhs.element_type() == rhs.element_type(); } - return ShapeUtil::IsArray(rhs) && SameDimensions(lhs, rhs); } /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { + if (IsArray(lhs)) { + return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) && + CompatibleIgnoringElementType(lhs, rhs); + } else if (lhs.element_type() == TUPLE) { return rhs.element_type() == TUPLE && - ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), - CompatibleIgnoringFpPrecision); - } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; - } - if (SameElementTypeIgnoringFpPrecision(lhs, rhs)) { - return CompatibleIgnoringElementType(lhs, rhs); + absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(), + CompatibleIgnoringFpPrecision); + } else { + // Opaque, token, etc types are vacuously compatible. 
+ return lhs.element_type() == rhs.element_type(); } - return false; } /* static */ int64 ShapeUtil::GetDimension(const Shape& shape, @@ -691,10 +717,6 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { switch (primitive_type) { case PRED: return sizeof(int8); - case TUPLE: - LOG(FATAL) << "tuples have no definitive size"; - case OPAQUE: - LOG(FATAL) << "opaque have no definitive size"; case S8: return sizeof(int8); case S16: @@ -721,6 +743,13 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { return sizeof(double); case C64: return sizeof(complex64); + case TOKEN: + // Tokens require no space. + return 0; + case TUPLE: + case OPAQUE: + LOG(FATAL) << PrimitiveType_Name(primitive_type) + << " primitive type has no definitive size"; default: LOG(FATAL) << "Unhandled primitive type " << primitive_type; } @@ -729,35 +758,39 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { /* static */ int64 ShapeUtil::ByteSizeOf(const Shape& shape, int64 pointer_size) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK_NE(OPAQUE, shape.element_type()); if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); + } else if (IsArray(shape)) { + int64 byte_size = ByteSizeOfElements(shape); + if (LayoutUtil::IsSparseArray(shape)) { + byte_size += ByteSizeOfSparseIndices(shape); + } + return byte_size; + } else if (shape.element_type() == TOKEN) { + return 0; } - int64 byte_size = ByteSizeOfElements(shape); - if (LayoutUtil::IsSparseArray(shape)) { - byte_size += ByteSizeOfSparseIndices(shape); - } - return byte_size; + LOG(FATAL) << PrimitiveType_Name(shape.element_type()) + << " primitive type has no definitive size"; } /* static */ int64 ShapeUtil::ByteSizeOfTupleIndexTable(const Shape& shape, int64 pointer_size) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK_EQ(TUPLE, shape.element_type()); + CHECK_EQ(TUPLE, shape.element_type()); CHECK_GT(pointer_size, 0); return pointer_size * shape.tuple_shapes_size(); } /* 
static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK(ShapeUtil::IsArray(shape)); + CHECK(ShapeUtil::IsArray(shape)); int64 allocated_element_count; if (LayoutUtil::IsSparseArray(shape)) { allocated_element_count = LayoutUtil::MaxSparseElements(shape.layout()); } else { - CHECK(LayoutUtil::IsDenseArray(shape)); - tensorflow::gtl::ArraySlice padded_dimensions = + CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ShortDebugString(); + absl::Span padded_dimensions = LayoutUtil::PaddedDimensions(shape); if (!padded_dimensions.empty()) { CHECK_EQ(Rank(shape), padded_dimensions.size()); @@ -775,13 +808,17 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { /* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK(LayoutUtil::IsSparseArray(shape)); + CHECK(LayoutUtil::IsSparseArray(shape)); return LayoutUtil::MaxSparseElements(shape.layout()) * ShapeUtil::Rank(shape) * sizeof(int64); } /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal( const Shape& shape) { + if (shape.element_type() == PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("shape has invalid element type: %s", + shape.ShortDebugString()); + } if (shape.element_type() == TUPLE) { if (shape.dimensions_size() != 0) { return InvalidArgument("tuples must not have dimensions specified"); @@ -797,13 +834,27 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { if (shape.tuple_shapes_size() > 0) { return InvalidArgument("non-tuple shape has tuple_shapes field"); } - if (shape.element_type() == PRIMITIVE_TYPE_INVALID) { - return InvalidArgument("shape has invalid element type: %s", - shape.ShortDebugString().c_str()); + + // Tokens and opaques can should not have layout or dimensions. 
+ if (shape.element_type() == TOKEN || shape.element_type() == OPAQUE) { + if (shape.dimensions_size() != 0) { + return InvalidArgument( + "shape has %s element type, but has dimensions field: %s", + LowercasePrimitiveTypeName(shape.element_type()), + shape.ShortDebugString()); + } + if (shape.has_layout()) { + return InvalidArgument( + "shape has %s element type, but has layout field: %s", + LowercasePrimitiveTypeName(shape.element_type()), + shape.ShortDebugString()); + } + return Status::OK(); } + if (Rank(shape) != shape.dimensions_size()) { return InvalidArgument( - "shape's rank is mismatched with dimension count; rank=%lld " + "shape's rank is mismatched with dimension count; rank=%d " "dimensions_size=%d", Rank(shape), shape.dimensions_size()); } @@ -811,12 +862,76 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { int64 dimension = shape.dimensions(i); if (dimension < 0) { return InvalidArgument( - "shape's dimensions must not be < 0; dimension at index %lld was " - "%lld", - i, dimension); + "shape's dimensions must not be < 0; dimension at index %d was %d", i, + dimension); + } + } + + TF_RETURN_IF_ERROR(ValidateShapeSize(shape)); + return Status::OK(); +} + +/* static */ Status ShapeUtil::ValidateShapeSize(const Shape& shape) { + VLOG(3) << "Validating shape size: " << ShapeUtil::HumanString(shape); + + if (!IsArray(shape)) { + return Status::OK(); + } + + int64 shape_size = [&shape]() { + if (LayoutUtil::IsSparseArray(shape)) { + int64 max_sparse_elements = LayoutUtil::MaxSparseElements(shape.layout()); + if (max_sparse_elements < 0) { + return max_sparse_elements; + } + int64 sparse_elements_size = MultiplyWithoutOverflow( + max_sparse_elements, ByteSizeOfPrimitiveType(shape.element_type())); + if (sparse_elements_size < 0) { + return sparse_elements_size; + } + int64 sparse_indices_size = + MultiplyWithoutOverflow(max_sparse_elements, ShapeUtil::Rank(shape)); + if (sparse_indices_size < 0) { + return sparse_indices_size; + } + 
sparse_indices_size = + MultiplyWithoutOverflow(sparse_indices_size, sizeof(int64)); + if (sparse_indices_size < 0) { + return sparse_indices_size; + } + // At this point, both sparse_indices_size and sparse_elements_size are + // non-negative, so we can easily check if adding them wraps. + if (static_cast(sparse_elements_size) + + static_cast(sparse_indices_size) > + INT64_MAX) { + return static_cast(-1); + } + } + + // This is intentionally unconditional: even if the shape is sparse, we want + // to verify the densified version has a reasonable size. + int64 dense_shape_size = 1; + if (shape.dimensions().empty()) { + return dense_shape_size; } + + for (int64 dim : shape.dimensions()) { + dense_shape_size = MultiplyWithoutOverflow(dense_shape_size, dim); + if (dense_shape_size < 0) { + return dense_shape_size; + } + } + dense_shape_size = MultiplyWithoutOverflow( + dense_shape_size, ByteSizeOfPrimitiveType(shape.element_type())); + return dense_shape_size; + }(); + + if (shape_size < 0) { + return InvalidArgument("Shape %s size may overflow int64.", + ShapeUtil::HumanString(shape)); } + VLOG(3) << "Shape size is valid: " << shape_size; return Status::OK(); } @@ -865,6 +980,21 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { return *return_shape; } +/* static */ StatusOr ShapeUtil::TryGetSubshape( + const Shape& shape, ShapeIndexView index) { + const Shape* return_shape = &shape; + for (auto i : index) { + if (!IsTuple(*return_shape) || i < 0 || + i >= return_shape->tuple_shapes_size()) { + return InvalidArgument( + "Shape index %s not a valid subshape index for tuple with shape %s", + index.ToString(), shape.DebugString()); + } + return_shape = &return_shape->tuple_shapes(i); + } + return return_shape; +} + /* static */ Shape* ShapeUtil::GetMutableSubshape(Shape* shape, ShapeIndexView index) { Shape* return_shape = shape; @@ -881,12 +1011,13 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) { } /* static */ int64 
ShapeUtil::GetLeafCount(const Shape& shape) { + if (!IsTuple(shape)) { + return 1; + } int64 count = 0; - ForEachSubshape(shape, [&](const Shape&, const ShapeIndex& index) { - if (IsLeafIndex(shape, index)) { - ++count; - } - }); + for (const Shape& subshape : shape.tuple_shapes()) { + count += GetLeafCount(subshape); + } return count; } @@ -901,64 +1032,9 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) { return leaves; } -/* static */ Shape ShapeUtil::StripDegenerateDimensions(const Shape& shape) { - std::vector dimension_sizes; - std::vector degenerate_dimensions; - for (int64 i = 0; i < shape.dimensions_size(); ++i) { - if (shape.dimensions(i) == 1) { - degenerate_dimensions.push_back(i); - } else { - dimension_sizes.push_back(shape.dimensions(i)); - } - } - - // Construct minor_to_major of stripped shape. The order of the non-degenerate - // dimensions should be preserved from the original shape. First, create - // vector of the non-degenerate dimensions from the original minor_to_major - // array. - std::vector minor_to_major; - for (int64 i : shape.layout().minor_to_major()) { - if (std::find(degenerate_dimensions.begin(), degenerate_dimensions.end(), - i) == degenerate_dimensions.end()) { - minor_to_major.push_back(i); - } - } - - // The dimensions in minor_to_major need to be renumbered to account for the - // degenerate dimensions which have removed. Decrement each dimension number - // once for each degenerate dimension which has a smaller number. 
- for (int i = 0; i < minor_to_major.size(); ++i) { - int adjustment = 0; - for (int64 dim : degenerate_dimensions) { - if (minor_to_major[i] > dim) { - adjustment++; - } - } - minor_to_major[i] -= adjustment; - } - - { - std::vector dims(minor_to_major.size()); - std::iota(dims.begin(), dims.end(), 0); - DCHECK(minor_to_major.size() == dims.size() && - std::is_permutation(minor_to_major.begin(), minor_to_major.end(), - dims.begin())); - } - Shape stripped_shape; - if (LayoutUtil::IsDenseArray(shape)) { - stripped_shape = MakeShapeWithLayout(shape.element_type(), dimension_sizes, - minor_to_major); - } else if (LayoutUtil::IsSparseArray(shape)) { - stripped_shape = - MakeShapeWithSparseLayout(shape.element_type(), dimension_sizes, - shape.layout().max_sparse_elements()); - } else { - stripped_shape = MakeShape(shape.element_type(), dimension_sizes); - } - - VLOG(10) << "Original_shape: " << HumanStringWithLayout(shape); - VLOG(10) << "Stripped_shape: " << HumanStringWithLayout(stripped_shape); - return stripped_shape; +/* static */ bool ShapeUtil::HasDegenerateDimensions(const Shape& shape) { + CHECK(ShapeUtil::IsArray(shape)); + return absl::c_linear_search(shape.dimensions(), 1); } namespace { @@ -1038,18 +1114,47 @@ Status ForEachMutableSubshapeHelper( } /* static */ Shape ShapeUtil::PermuteDimensions( - tensorflow::gtl::ArraySlice permutation, const Shape& shape) { + absl::Span permutation, const Shape& shape) { Shape new_shape = shape; new_shape.clear_dimensions(); for (auto dim : Permute(permutation, shape.dimensions())) { new_shape.add_dimensions(dim); } + + // If `shape` has a layout, by contract we choose a new layout such that the + // transpose defined by this permutation is a bitcast. + // + // Some formalism helps to understand the correct way to do this. We're going + // to do algebra in the group of permutations of the dimensions of `shape`. 
+ // + // Since the order of `shape`'s dimensions is not permuted relative to itself, + // `shape`'s list of dimensions is isomorphic to the identity I. + // + // Let `shape`'s layout be L. A layout is a permutation which maps a + // minor-to-major physical layout to the order of a shape's logical dims. + // Therefore inverse of a layout maps from logical to physical dims, and so + // the physical layout of I is simply L'.I = L', where L' is the inverse of L. + // + // Let the argument `permutation` be P. This is a permutation over `shape`'s + // dimensions, so our return value will be a shape with dims P.I = P. Our + // goal is to construct a layout permutation L* that we can apply to P such + // that that the physical dimension ordering of the returned shape is the same + // as that of the original shape, namely L'. + // + // Our returned shape has dims P and layout L*, so its in-memory layout is + // L*'.P. Setting this equal to L' and solving for L*, we get: + // + // L*'.P = L' => + // L*' = L'P' => + // L* = P.L + // if (shape.has_layout()) { CHECK(LayoutUtil::IsDenseArray(shape)); Layout* new_layout = new_shape.mutable_layout(); new_layout->set_format(DENSE); new_layout->clear_minor_to_major(); - for (auto index : Permute(permutation, shape.layout().minor_to_major())) { + for (auto index : ComposePermutations( + permutation, AsInt64Slice(shape.layout().minor_to_major()))) { new_layout->add_minor_to_major(index); } if (shape.layout().padded_dimensions_size() > 0) { @@ -1059,6 +1164,12 @@ Status ForEachMutableSubshapeHelper( new_layout->add_padded_dimensions(dim); } } + // The permutation accepted by TransposeIsBitcast is the inverse of the + // permutation here. 
+ CHECK(TransposeIsBitcast(shape, new_shape, InversePermutation(permutation))) + << "shape=" << HumanStringWithLayout(shape) + << ", new_shape=" << HumanStringWithLayout(new_shape) + << ", permutation={" << absl::StrJoin(permutation, ",") << "}"; } return new_shape; } @@ -1066,6 +1177,9 @@ Status ForEachMutableSubshapeHelper( /* static */ std::tuple, std::vector> ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre, const Shape& shape_post) { + CHECK(IsArray(shape_pre)); + CHECK(IsArray(shape_post)); + auto nil = std::make_tuple(false, std::vector(), std::vector()); std::vector deleted_indices; @@ -1123,6 +1237,9 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre, /* static */ std::vector> ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, const Shape& output_shape) { + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + // Unmodified dimensions are merely common factors of rank 1. auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()), AsInt64Slice(output_shape.dimensions())); @@ -1141,7 +1258,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ bool ShapeUtil::TransposeIsBitcast( const Shape& input_shape, const Shape& output_shape, - tensorflow::gtl::ArraySlice dimension_mapping) { + absl::Span dimension_mapping) { CHECK(LayoutUtil::HasLayout(input_shape) && LayoutUtil::HasLayout(output_shape)); @@ -1168,7 +1285,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, // apply(input_dimensions, I) = // apply((dimension_mapping * output_dimensions), I) // input_dimensions = dimension_mapping * output_dimensions - return ContainersEqual( + return absl::c_equal( ComposePermutations(dimension_mapping, AsInt64Slice(output_shape.layout().minor_to_major())), input_shape.layout().minor_to_major()); @@ -1176,8 +1293,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& 
input_shape, const Shape& output_shape) { - CHECK(LayoutUtil::HasLayout(input_shape) && - LayoutUtil::HasLayout(output_shape)); + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + CHECK(LayoutUtil::HasLayout(input_shape)); + CHECK(LayoutUtil::HasLayout(output_shape)); if (!SameElementType(input_shape, output_shape)) { return false; @@ -1337,8 +1456,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, check_input_unit_indices(output_shape, input_shape); } -/* static */ tensorflow::gtl::optional ShapeUtil::AlignLayouts( +/* static */ absl::optional ShapeUtil::AlignLayouts( const Shape& input_shape, const Shape& output_shape) { + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + int64 input_rank = Rank(input_shape); int64 output_rank = Rank(output_shape); @@ -1373,7 +1495,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, if (input_dimension_product < output_dimension_product || j == output_rank) { if (i == input_rank) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } dimension_to_alignment_index[i] = alignment.size() - 1; input_dimension_product *= input_shape.dimensions(i); @@ -1384,7 +1506,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, } } if (input_dimension_product != output_dimension_product) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } // We also need to store an end element so that we know where the last // alignment part ends. @@ -1428,7 +1550,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part; ++i, ++j) { if (i == input_rank) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } // Skip trivial dimensions with a bound of 1. 
if (input_shape.dimensions(input_dimension_numbers[i]) == 1) { @@ -1441,7 +1563,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, if (dimension_to_alignment_index[input_dimension_numbers[i]] != current_alignment_index || input_dimension_numbers[i] > current_dimension_number) { - return tensorflow::gtl::nullopt; + return absl::nullopt; } current_dimension_number = input_dimension_numbers[i]; } @@ -1473,6 +1595,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, Shape shape) { + CHECK(IsArray(shape)); shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); if (LayoutUtil::HasLayout(shape)) { Layout* layout = shape.mutable_layout(); @@ -1494,6 +1617,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { + CHECK(IsArray(shape)); std::vector dims_to_delete; for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) { if (!p(i)) { diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 9df31d5d21e4dba875427819da6e213d55e5c8c4..8234fcdd3f57978b94630d4e2880826dd678389f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -22,6 +22,9 @@ limitations under the License. #include #include +#include "absl/container/inlined_vector.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -30,8 +33,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" @@ -62,6 +63,8 @@ class ShapeIndex { public: ShapeIndex() = default; ShapeIndex(std::initializer_list init) : indices_(init) {} + template + ShapeIndex(InputIt start, InputIt end) : indices_(start, end) {} bool empty() const { return indices_.empty(); } size_t size() const { return indices_.size(); } @@ -71,10 +74,12 @@ class ShapeIndex { // push_front is O(n^2), but shapes don't usually have a ton of dimensions. void push_front(int64 value) { indices_.insert(indices_.begin(), value); } - std::vector::const_iterator begin() const { return indices_.begin(); } - std::vector::const_iterator end() const { return indices_.end(); } - std::vector::iterator begin() { return indices_.begin(); } - std::vector::iterator end() { return indices_.end(); } + using container_type = absl::InlinedVector; + + container_type::const_iterator begin() const { return indices_.begin(); } + container_type::const_iterator end() const { return indices_.end(); } + container_type::iterator begin() { return indices_.begin(); } + container_type::iterator end() { return indices_.end(); } const int64* data() const { return indices_.data(); } @@ -95,7 +100,7 @@ class ShapeIndex { string ToString() const; private: - std::vector indices_; + container_type indices_; }; // A view into a ShapeIndex as above, with the cheap/easy ability to consume the @@ -108,30 +113,33 @@ class ShapeIndex { class ShapeIndexView { public: ShapeIndexView(const ShapeIndex& shape_index, int64 offset = 0) - : ShapeIndexView(shape_index.data() + offset, - shape_index.data() + shape_index.size()) { + : indices_(shape_index.data() + offset, shape_index.size() - 
offset) { CHECK_LE(offset, shape_index.size()); } - ShapeIndexView(std::initializer_list indices) - : ShapeIndexView(indices.begin(), indices.end()) {} + ShapeIndexView(std::initializer_list indices) : indices_(indices) {} ShapeIndexView(const ShapeIndexView& other) = default; using iterator = const int64*; - iterator begin() const { return begin_; } - iterator end() const { return end_; } - int64 size() const { return std::distance(begin_, end_); } - bool empty() const { return begin_ == end_; } + iterator begin() const { return indices_.begin(); } + iterator end() const { return indices_.end(); } + int64 size() const { return indices_.size(); } + bool empty() const { return indices_.empty(); } int64 front() const { CHECK(!empty()); - return *begin_; + return indices_.front(); } ShapeIndexView ConsumeFront() const { - CHECK(!empty()); - auto new_begin = begin_; - ++new_begin; - return ShapeIndexView(new_begin, end_); + ShapeIndexView result = *this; + result.indices_.remove_prefix(1); + return result; } + ShapeIndexView ConsumeBack() const { + ShapeIndexView result = *this; + result.indices_.remove_suffix(1); + return result; + } + ShapeIndex ToShapeIndex() const { return ShapeIndex(begin(), end()); } bool operator==(const ShapeIndexView& other) const; bool operator!=(const ShapeIndexView& other) const; @@ -139,10 +147,7 @@ class ShapeIndexView { string ToString() const; private: - ShapeIndexView(iterator begin, iterator end) : begin_(begin), end_(end) {} - - iterator begin_; - iterator end_; + absl::Span indices_; }; std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index); @@ -169,24 +174,25 @@ class ShapeUtil { // may not actually be able to store this number of elements. See // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of // elements that can be stored in a sparse shape. 
- // Precondition: !IsTuple(shape) + // Precondition: IsArray(shape) static int64 ElementsIn(const Shape& shape); - // Returns true if 'shape' has zero elements. - static bool HasZeroElements(const Shape& shape); + // As ElementsIn(), but recurses through tuples. + static int64 ElementsInRecursive(const Shape& shape); + + // Returns true if 'shape' is an array with zero elements. + static bool IsZeroElementArray(const Shape& shape); // Returns the number of bytes required for an allocation of shape. The // |pointer_size| parameter is used for calculating the size of tuple // shapes. This includes only the size of the top-level buffer. For example, a // tuple is stored as an array of pointers to other buffers. In this case, // this method only returns the size of the pointer array. - // Precondition: (!ShapeUtil::IsTuple(shape) || pointer_size > 0) && - // !ShapeUtil::IsOpaque(shape) static int64 ByteSizeOf(const Shape& shape, int64 pointer_size = -1); // Returns the number of bytes used to store the primitive_type. // - // Precondition: !ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape) + // Precondition: ShapeUtil::IsArray(shape) static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type); // Returns the number of bytes required to store the tuple member pointers for @@ -222,7 +228,7 @@ class ShapeUtil { // Parses a ShapeUtil::HumanString-format shape string back into a shape // object. - static StatusOr ParseShapeString(tensorflow::StringPiece s); + static StatusOr ParseShapeString(absl::string_view s); // Returns whether the LHS and RHS shapes have the same dimensions; note: does // not check element type. @@ -245,7 +251,7 @@ class ShapeUtil { } // Returns the higher-precision element type if a and b are both floating - // point types; otherwise, checks that they have the same element type + // point types; otherwise, checks that that they have the same element type // and returns it. 
static PrimitiveType HigherPrecisionElementType(const Shape& a, const Shape& b) { @@ -276,6 +282,9 @@ class ShapeUtil { // Returns whether the lhs and rhs shapes are identical protobufs. static bool Equal(const Shape& lhs, const Shape& rhs); + // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. + static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); + // Returns the rank (number of dimensions) of the given shape. // Precondition: !IsTuple(shape) static int64 Rank(const Shape& shape); @@ -293,10 +302,10 @@ class ShapeUtil { // Scalar-specific static bool IsScalar(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape) && Rank(shape) == 0; + return IsArray(shape) && Rank(shape) == 0; } static bool IsEffectiveScalar(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape) && TrueRank(shape) == 0; + return IsArray(shape) && TrueRank(shape) == 0; } static bool IsScalarF32(const Shape& shape); @@ -319,19 +328,23 @@ class ShapeUtil { static Shape ChangeElementType(const Shape& original, PrimitiveType type); // Creates a tuple shape from a slice of element shapes within the tuple. - static Shape MakeTupleShape(tensorflow::gtl::ArraySlice shapes); + static Shape MakeTupleShape(absl::Span shapes); // Creates an opaque shape. These are generally used for threading a context // into a custom operation. static Shape MakeOpaqueShape(); + // Creates a token shape. Values of this shape are used for ordering + // side-effecting operations. + static Shape MakeTokenShape(); + // Appends a shape to the given tuple. static void AppendShapeToTuple(const Shape& shape, Shape* tuple_shape); // Appends a major dimension to the shape with the given bound. static void AppendMajorDimension(int bound, Shape* shape); - // Returns an empty tuple shape. Can be used to indicate side-effects. + // Returns an empty tuple shape. Can be used as a sentinel Shape value. 
static Shape MakeNil() { return MakeTupleShape({}); } // Checks whether the shape is initialized. @@ -342,31 +355,29 @@ class ShapeUtil { // Constructs a new shape with the given element type and sequence of // dimensions. static Shape MakeShape(PrimitiveType element_type, - tensorflow::gtl::ArraySlice dimensions); + absl::Span dimensions); // Creates a Shape with element type corresponding to T and the given // dimensions template - static Shape MakeShapeWithType( - tensorflow::gtl::ArraySlice dimensions) { + static Shape MakeShapeWithType(absl::Span dimensions) { return ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), dimensions); } // Constructs a new shape with the given minor_to_major order in its Layout. // Returns a value shape such that shape.has_layout(). - static Shape MakeShapeWithLayout( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice minor_to_major); + static Shape MakeShapeWithLayout(PrimitiveType element_type, + absl::Span dimensions, + absl::Span minor_to_major); - static Shape MakeShapeWithSparseLayout( - PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, - int64 max_sparse_elements); + static Shape MakeShapeWithSparseLayout(PrimitiveType element_type, + absl::Span dimensions, + int64 max_sparse_elements); // Constructs a new shape with major-first layout (i.e. {n, n-1, ..., 0}). static Shape MakeShapeWithDescendingLayout( - PrimitiveType element_type, - tensorflow::gtl::ArraySlice dimensions); + PrimitiveType element_type, absl::Span dimensions); // Returns a new Shape based on the given Shape with low-dimension-major // layout (i.e. {n, n-1, ..., 0}, like Fortran), and with the dimensions @@ -378,8 +389,7 @@ class ShapeUtil { // As MakeShape, but the object to write to is passed in. 
static void PopulateShape(PrimitiveType element_type, - tensorflow::gtl::ArraySlice dimensions, - Shape* shape); + absl::Span dimensions, Shape* shape); // Validates that the provided shape satisfies invariants. static Status ValidateShape(const Shape& shape); @@ -424,11 +434,15 @@ class ShapeUtil { return shape.element_type() == OPAQUE; } + // Returns whether the shape is a token value used for ordering + // side-effecting operations. + static bool IsToken(const Shape& shape) { + return shape.element_type() == TOKEN; + } + // Returns whether the shape is an array. Note that scalars are considered // arrays. - static bool IsArray(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape); - } + static bool IsArray(const Shape& shape); // Returns whether the shape is a tuple with at least one element which is // also a tuple. @@ -437,7 +451,7 @@ class ShapeUtil { // Returns true if shape is an empty tuple. static bool IsEmptyTuple(const Shape& shape); - // Returns true if shape is an empty tuple, or is an array with no elements. + // Returns true if shape is the nil shape (an empty tuple). static bool IsNil(const Shape& shape); // Returns the number of elements in the given tuple shape. @@ -448,6 +462,9 @@ class ShapeUtil { // Precondition: IsTuple(shape) && TupleElementCount(shape) > index static const Shape& GetTupleElementShape(const Shape& shape, int64 index); + // Returns the number of elements, recursively, in the given shape. + static int64 SubshapeCount(const Shape& shape); + // Slices tuple elements in the range [start, limit) and returns a new tuple // shape. E.g. a tuple like (f32, s32, u32) would slice via 1,3 to (s32, u32). static Shape SliceTuple(const Shape& tuple, int64 start, int64 limit); @@ -467,8 +484,11 @@ class ShapeUtil { static bool IndexIsValid(const Shape& shape, ShapeIndexView index); // GetSubshape and GetMutableSubshape return a particular nested Shape within - // the given Shape argument. + // the given Shape argument. 
The non-Try variants check fail if index is + // invalid. static const Shape& GetSubshape(const Shape& shape, ShapeIndexView index); + static StatusOr TryGetSubshape(const Shape& shape, + ShapeIndexView index); static Shape* GetMutableSubshape(Shape* shape, ShapeIndexView index); // Returns whether the given index in the given shape is a leaf element of the @@ -504,29 +524,19 @@ class ShapeUtil { static Status ForEachMutableSubshapeWithStatus( Shape* shape, const MutatingStatusVisitorFunction& func); - // Removes all degenerate dimensions (size one) from the given shape. The - // stripped minor_to_major preserves the relative ordering of non-degenerate - // dimensions. The stripped shape has the property that the underlying - // representation (bits in memory) for the stripped shape is the same as the - // original shape modulo padding. Examples: - // - // input shape: F32 [1, 2, 1], minor_to_major = {0, 1, 2} - // stripped shape: F32 [2], minor_to_major = {0} - // - // input shape: F32 [6, 1, 5], minor_to_major = {2, 0, 1} - // stripped shape: F32 [6, 5], minor_to_major = {1, 0} - // - // input shape: F32 [1, 7, 1, 6, 5, 1], minor_to_major = {0, 2, 5, 4, 3, 1} - // stripped shape: F32 [7, 6, 5], minor_to_major = {0, 2, 1} - // - // input shape: F32 [1, 1], minor_to_major = {0, 1} - // stripped shape: F32 [], minor_to_major = {} - // Precondition: !ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape) - static Shape StripDegenerateDimensions(const Shape& shape); + // Returns true if `shape` (which must be an array) has degenerate dimensions + // (dimensions with bound 1). + static bool HasDegenerateDimensions(const Shape& shape); // Permutes the dimensions by the given permutation, so - // return_value.dimensions[permutation[i]] = argument.dimensions[i] - static Shape PermuteDimensions(tensorflow::gtl::ArraySlice permutation, + // return_value.dimensions[permutation[i]] = argument.dimensions[i]. 
+ // + // Postcondition: For any valid permutation, + // + // !HasLayout(shape) || + // TransposeIsBitcast(shape, PermuteDimensions(permutation, shape), + // InversePermutation(permutation)). + static Shape PermuteDimensions(absl::Span permutation, const Shape& shape); // If we can go from `shape_pre` to `shape_post` by merely inserting or @@ -567,9 +577,9 @@ class ShapeUtil { // to its input and thus may be replaced with a bitcast. // // Precondition: Both input_shape and output_shape have explicit layouts. - static bool TransposeIsBitcast( - const Shape& input_shape, const Shape& output_shape, - tensorflow::gtl::ArraySlice dimension_mapping); + static bool TransposeIsBitcast(const Shape& input_shape, + const Shape& output_shape, + absl::Span dimension_mapping); // Returns whether a reshape from "input_shape" to "output_shape" is a // bitcast. @@ -584,8 +594,8 @@ class ShapeUtil { // layout). The layout of 'input_shape' is kept fixed. Returns // 'output_shape_with_layout' if such a layout can be found, and an error // otherwise. - static tensorflow::gtl::optional AlignLayouts( - const Shape& input_shape, const Shape& output_shape); + static absl::optional AlignLayouts(const Shape& input_shape, + const Shape& output_shape); // Returns a shape with the given dimension deleted. // For example: @@ -608,12 +618,12 @@ class ShapeUtil { // continue, or false otherwise. // // visitor_function must be a callable of type - // StatusOr(ArraySlice) or compatible. + // StatusOr(Span) or compatible. 
template static Status ForEachIndexWithStatus(const Shape& shape, - tensorflow::gtl::ArraySlice base, - tensorflow::gtl::ArraySlice count, - tensorflow::gtl::ArraySlice incr, + absl::Span base, + absl::Span count, + absl::Span incr, const FnType& visitor_function) { return ForEachIndexInternal(shape, base, count, incr, visitor_function); } @@ -635,13 +645,12 @@ class ShapeUtil { } template - static void ForEachIndex(const Shape& shape, - tensorflow::gtl::ArraySlice base, - tensorflow::gtl::ArraySlice count, - tensorflow::gtl::ArraySlice incr, + static void ForEachIndex(const Shape& shape, absl::Span base, + absl::Span count, + absl::Span incr, const FnType& visitor_function) { ForEachIndexWithStatus(shape, base, count, incr, - [&](tensorflow::gtl::ArraySlice indices) { + [&](absl::Span indices) { return StatusOr(visitor_function(indices)); }) .IgnoreError(); @@ -663,7 +672,7 @@ class ShapeUtil { template static void ForEachIndex(const Shape& shape, const FnType& visitor_function) { ForEachIndexWithStatus(shape, - [&](tensorflow::gtl::ArraySlice indices) { + [&](absl::Span indices) { return StatusOr(visitor_function(indices)); }) .IgnoreError(); @@ -674,18 +683,18 @@ class ShapeUtil { // matter. // // visitor_function must be a callable of type - // void(ArraySlice) or compatible. + // void(Span) or compatible. template static void ForEachIndexParallel(const Shape& shape, - tensorflow::gtl::ArraySlice base, - tensorflow::gtl::ArraySlice count, - tensorflow::gtl::ArraySlice incr, + absl::Span base, + absl::Span count, + absl::Span incr, const FnType& visitor_function) { // The parallel version of ForEachIndexInternal can never fail. 
CHECK(ForEachIndexInternal( shape, base, count, incr, - [&visitor_function](tensorflow::gtl::ArraySlice indexes) - -> StatusOr { + [&visitor_function]( + absl::Span indexes) -> StatusOr { visitor_function(indexes); return true; }, @@ -697,18 +706,22 @@ class ShapeUtil { static size_t Hash(const Shape& shape); private: + // Validates the shape size is sane. This makes sure it's safe to do + // calculations in int64 without overflowing. + static Status ValidateShapeSize(const Shape& shape); + // Validates all of the non-layout properties of the shape -- this is a helper // used by both the layout-optional and layout-required public method. static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape); template static Status ForEachIndexInternal(const Shape& shape, - tensorflow::gtl::ArraySlice base, - tensorflow::gtl::ArraySlice count, - tensorflow::gtl::ArraySlice incr, + absl::Span base, + absl::Span count, + absl::Span incr, const FnType& visitor_function, bool parallel = false) { - if (ShapeUtil::HasZeroElements(shape)) { + if (ShapeUtil::IsZeroElementArray(shape)) { return Status::OK(); } CHECK_EQ(Rank(shape), base.size()); @@ -720,13 +733,13 @@ class ShapeUtil { int64 n = -1; std::vector indexes(base.begin(), base.end()); const int kNumThreads = tensorflow::port::NumSchedulableCPUs(); - tensorflow::gtl::optional pool; + absl::optional pool; if (parallel) { pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads); } while (n < rank) { - if (pool != tensorflow::gtl::nullopt) { + if (pool != absl::nullopt) { pool->Schedule( [indexes, &visitor_function] { visitor_function(indexes); }); } else { diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index f7675e97da7b061bde063e5093256c2288f99c98..6ca4085aaf3bd1c181da3b94aa6c570e21172d0a 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -15,6 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" +#include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" @@ -28,6 +31,15 @@ namespace { using ::testing::ElementsAre; +TEST(ShapeUtilTest, ShapeIndexViewTest) { + ShapeIndex index = {1, 2, 3, 4}; + ShapeIndexView index_view(index, 1); + EXPECT_EQ(3, index_view.size()); + EXPECT_EQ(ShapeIndexView({2, 3, 4}), index_view); + EXPECT_EQ(ShapeIndexView({3, 4}), index_view.ConsumeFront()); + EXPECT_EQ(ShapeIndexView({2, 3}), index_view.ConsumeBack()); +} + TEST(ShapeUtilTest, GetDimensionHelperCanNegativeIndex) { Shape matrix = ShapeUtil::MakeShape(F32, {2, 3}); EXPECT_EQ(3, ShapeUtil::GetDimension(matrix, -1)); @@ -93,12 +105,14 @@ TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) { } TEST(ShapeUtilTest, ParseShapeStringNestedTuple) { - string shape_string = "(f32[1],(f32[2]), f32[3])"; + string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])"; TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString(shape_string)); Shape expected = ShapeUtil::MakeTupleShape({ ShapeUtil::MakeShape(F32, {1}), - ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2})}), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}), + ShapeUtil::MakeOpaqueShape(), ShapeUtil::MakeShape(F32, {3}), }); ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) @@ -136,6 +150,23 @@ TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) { << "actual: " << ShapeUtil::HumanString(actual); } +TEST(ShapeUtilTest, ParseOpaqueType) { + TF_ASSERT_OK_AND_ASSIGN(Shape actual, + ShapeUtil::ParseShapeString("opaque[]")); + Shape expected = ShapeUtil::MakeOpaqueShape(); + ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) + << "expected: " << ShapeUtil::HumanString(expected) + << "actual: " << ShapeUtil::HumanString(actual); +} + +TEST(ShapeUtilTest, 
ParseTokenType) { + TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString("token[]")); + Shape expected = ShapeUtil::MakeTokenShape(); + ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) + << "expected: " << ShapeUtil::HumanString(expected) + << "actual: " << ShapeUtil::HumanString(actual); +} + TEST(ShapeUtilTest, ParseInvalidShapeString) { string shape_strings[] = { "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}", @@ -153,6 +184,41 @@ TEST(ShapeUtilTest, CompatibleIdenticalShapes) { ASSERT_TRUE(ShapeUtil::Compatible(shape1, shape2)); } +TEST(ShapeUtilTest, TokenCompatibility) { + EXPECT_TRUE(ShapeUtil::Compatible(ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTokenShape())); + EXPECT_FALSE(ShapeUtil::Compatible(ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShape(F32, {}))); + EXPECT_FALSE(ShapeUtil::Compatible(ShapeUtil::MakeShape(F32, {}), + ShapeUtil::MakeTokenShape())); + EXPECT_TRUE(ShapeUtil::Compatible( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}), + ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}))); +} + +TEST(ShapeUtilTest, TokensEqualShapes) { + EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTokenShape())); + EXPECT_FALSE(ShapeUtil::Equal(ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShape(F32, {}))); + EXPECT_FALSE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {}), + ShapeUtil::MakeTokenShape())); + EXPECT_TRUE(ShapeUtil::Equal( + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1})}), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1})}))); + EXPECT_FALSE(ShapeUtil::Equal( + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1})}), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {1, 0})}))); +} + TEST(ShapeUtilTest, 
CompatibleNotIdenticalShapes) { Shape shape_1 = ShapeUtil::MakeShape(F32, {3, 2}); auto layout_1 = shape_1.mutable_layout(); @@ -188,6 +254,24 @@ TEST(ShapeUtilTest, IncompatibleDifferentElementShapes) { EXPECT_FALSE(ShapeUtil::Compatible(shape_1, shape_2)); } +TEST(ShapeUtilTest, EqualIgnoringFpPrecision) { + EXPECT_TRUE(ShapeUtil::EqualIgnoringFpPrecision( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {4, 3}, {0, 1}))); +} + +TEST(ShapeUtilTest, UnequalIgnoringFpPrecision) { + EXPECT_FALSE(ShapeUtil::EqualIgnoringFpPrecision( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {3, 4}, {0, 1}))); + EXPECT_FALSE(ShapeUtil::EqualIgnoringFpPrecision( + ShapeUtil::MakeShapeWithLayout(F32, {3, 4}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(F16, {3, 4}, {1, 0}))); + EXPECT_FALSE(ShapeUtil::EqualIgnoringFpPrecision( + ShapeUtil::MakeShapeWithLayout(F32, {4, 3}, {0, 1}), + ShapeUtil::MakeShapeWithLayout(PRED, {4, 3}, {0, 1}))); +} + TEST(ShapeUtilTest, CompatibleTuples) { Shape tuple1 = ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(PRED, {4, 5})}); @@ -250,6 +334,17 @@ TEST(ShapeUtilTest, IncompatibleScalarVsTuple) { EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1)); } +TEST(ShapeUtilTest, OpaqueVsArray) { + Shape shape1 = ShapeUtil::MakeShape(F32, {5, 7}); + Shape shape2 = ShapeUtil::MakeOpaqueShape(); + EXPECT_FALSE(ShapeUtil::Compatible(shape1, shape2)); + EXPECT_FALSE(ShapeUtil::Compatible(shape2, shape1)); + EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2)); + EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1)); + EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape1, shape2)); + EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape2, shape1)); +} + TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) { Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30}); 
shape1.mutable_layout()->add_padded_dimensions(10); @@ -295,6 +390,9 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) { EXPECT_EQ(8, ShapeUtil::ByteSizeOfPrimitiveType(C64)); EXPECT_EQ(8, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {}))); EXPECT_EQ(1600, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {10, 20}))); + + EXPECT_EQ(0, ShapeUtil::ByteSizeOfPrimitiveType(TOKEN)); + EXPECT_EQ(0, ShapeUtil::ByteSizeOf(ShapeUtil::MakeTokenShape())); } TEST(ShapeUtilTest, ByteSizeOfWithPadding) { @@ -307,6 +405,16 @@ TEST(ShapeUtilTest, ByteSizeOfWithPadding) { EXPECT_EQ(15 * 21 * 4, ShapeUtil::ByteSizeOf(shape)); } +TEST(ShapeUtilTest, NilShape) { + EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil())); + EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3}))); + EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1}))); + EXPECT_FALSE(ShapeUtil::IsNil( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})}))); + EXPECT_FALSE(ShapeUtil::IsNil( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})}))); +} + TEST(ShapeUtilTest, NestedTuple) { EXPECT_FALSE(ShapeUtil::IsNestedTuple(ShapeUtil::MakeTupleShape({}))); EXPECT_FALSE(ShapeUtil::IsNestedTuple( @@ -337,25 +445,30 @@ TEST(ShapeUtilTest, ElementsIn) { EXPECT_EQ(221, ShapeUtil::ElementsIn(ShapeUtil::MakeShape(S32, {13, 17}))); } -TEST(ShapeUtilTest, HasZeroElements) { - EXPECT_EQ(false, ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {}))); - EXPECT_EQ(true, ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {0}))); - EXPECT_EQ(false, ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {1}))); - EXPECT_EQ(false, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {1, 1}))); - EXPECT_EQ(false, ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {2}))); - EXPECT_EQ(false, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {2, 1}))); - EXPECT_EQ(false, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {3, 5}))); - EXPECT_EQ(true, - 
ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {3, 0, 5}))); - EXPECT_EQ(true, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {0, 3, 0}))); - EXPECT_EQ(false, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {1, 3, 5}))); - EXPECT_EQ(false, - ShapeUtil::HasZeroElements(ShapeUtil::MakeShape(S32, {13, 17}))); +TEST(ShapeUtilTest, IsZeroElementArray) { + EXPECT_FALSE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {}))); + EXPECT_TRUE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {0}))); + EXPECT_FALSE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {1}))); + EXPECT_FALSE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {1, 1}))); + EXPECT_FALSE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {2}))); + EXPECT_FALSE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {2, 1}))); + EXPECT_FALSE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {3, 5}))); + EXPECT_TRUE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {3, 0, 5}))); + EXPECT_TRUE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {0, 3, 0}))); + EXPECT_FALSE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {1, 3, 5}))); + EXPECT_FALSE( + ShapeUtil::IsZeroElementArray(ShapeUtil::MakeShape(S32, {13, 17}))); + + EXPECT_FALSE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeNil())); + EXPECT_FALSE(ShapeUtil::IsZeroElementArray(ShapeUtil::MakeTupleShape({}))); + EXPECT_FALSE(ShapeUtil::IsZeroElementArray( + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {0, 3, 0})}))); } TEST(ShapeUtilTest, SameDimensions) { @@ -449,19 +562,21 @@ TEST(ShapeUtilTest, IsLeafIndex) { TEST(ShapeUtilTest, HumanString) { Shape opaque = ShapeUtil::MakeOpaqueShape(); + Shape token = ShapeUtil::MakeTokenShape(); Shape scalar = ShapeUtil::MakeShape(F32, {}); Shape matrix = ShapeUtil::MakeShape(U32, {1, 2}); Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1}); Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, 
matrix, matrix2}); - Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix}); + Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token}); EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque)); + EXPECT_EQ("token[]", ShapeUtil::HumanString(token)); EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar)); EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix)); EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2)); EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", ShapeUtil::HumanString(tuple)); - EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(nested_tuple)); EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque)); @@ -470,8 +585,10 @@ TEST(ShapeUtilTest, HumanString) { EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2)); EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})", ShapeUtil::HumanStringWithLayout(tuple)); - EXPECT_EQ("((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0})", - ShapeUtil::HumanStringWithLayout(nested_tuple)); + EXPECT_EQ( + "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, " + "token[])", + ShapeUtil::HumanStringWithLayout(nested_tuple)); ProgramShape prog = ShapeUtil::MakeProgramShape( {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple); @@ -481,8 +598,9 @@ TEST(ShapeUtilTest, HumanString) { "(unknown): u32[1,2], " "(unknown): s32[3,4], " "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " - "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(prog)); prog.add_parameter_names("arg0"); @@ -497,8 +615,10 @@ TEST(ShapeUtilTest, HumanString) { "matrix: u32[1,2], " "matrix2: s32[3,4], " "tuple: (opaque[], 
f32[], u32[1,2], s32[3,4]), " - "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " + "token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(prog)); } @@ -585,11 +705,10 @@ TEST(ShapeUtilTest, ForEachIndex) { Shape shape = ShapeUtil::MakeShape(F32, data.dimensions); // Increments at every invocation. int invocations = 0; - auto increment_func = - [&invocations](tensorflow::gtl::ArraySlice indexes) { - invocations++; - return true; - }; + auto increment_func = [&invocations](absl::Span indexes) { + invocations++; + return true; + }; std::vector zero_base(data.dimensions.size(), 0); std::vector step(data.dimensions.size(), 1); @@ -606,8 +725,7 @@ TEST(ShapeUtilTest, ForEachIndexWithStatus) { // Increments at every invocation. int invocations = 0; auto increment_func = - [&invocations]( - tensorflow::gtl::ArraySlice indexes) -> StatusOr { + [&invocations](absl::Span indexes) -> StatusOr { if (++invocations == 5) { return Unimplemented("Cannot increment beyond 5."); } @@ -628,7 +746,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel) { Shape shape = ShapeUtil::MakeShape(F32, {10, 10}); int64 output[10][10]; int init = 5; - auto set_func = [&](tensorflow::gtl::ArraySlice indexes) { + auto set_func = [&](absl::Span indexes) { output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1]; }; @@ -713,14 +831,37 @@ TEST(ShapeUtilTest, ReshapeIsBitcast_3x2x2_6x2_Dim1IsMostMinor) { ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1}))); } -TEST(ShapeUtilTest, StripDegenerateDimensions) { - EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::StripDegenerateDimensions( - ShapeUtil::MakeShape(F32, {3, 1, 2})), - ShapeUtil::MakeShape(F32, {3, 2}))); - EXPECT_TRUE(ShapeUtil::Equal( - ShapeUtil::StripDegenerateDimensions( - ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 1, 2}, 10)), - 
ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 2}, 10))); +TEST(ShapeUtilTest, HasDegenerateDimensions) { + EXPECT_TRUE( + ShapeUtil::HasDegenerateDimensions(ShapeUtil::MakeShape(F32, {3, 1, 2}))); + EXPECT_TRUE( + ShapeUtil::HasDegenerateDimensions(ShapeUtil::MakeShape(F32, {3, 1, 1}))); + EXPECT_FALSE( + ShapeUtil::HasDegenerateDimensions(ShapeUtil::MakeShape(F32, {3, 3, 5}))); + EXPECT_FALSE( + ShapeUtil::HasDegenerateDimensions(ShapeUtil::MakeShape(F32, {3, 0, 5}))); +} + +TEST(ShapeUtilTest, PermuteDimensionsLayout) { + std::vector layout(3); + std::iota(layout.begin(), layout.end(), 0); + do { + Shape s = ShapeUtil::MakeShapeWithLayout(F32, {10, 100, 1000}, layout); + SCOPED_TRACE(absl::StrCat("s=", ShapeUtil::HumanString(s))); + + std::vector permutation(3); + std::iota(permutation.begin(), permutation.end(), 0); + do { + SCOPED_TRACE( + absl::StrCat("permutation=", absl::StrJoin(permutation, ","))); + + // TransposeIsBitcast takes the inverse of the permutation that + // PermuteDimensions takes. 
+ EXPECT_TRUE(ShapeUtil::TransposeIsBitcast( + s, ShapeUtil::PermuteDimensions(permutation, s), + InversePermutation(permutation))); + } while (std::next_permutation(permutation.begin(), permutation.end())); + } while (std::next_permutation(layout.begin(), layout.end())); } TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) { diff --git a/tensorflow/compiler/xla/sparse_index_array.cc b/tensorflow/compiler/xla/sparse_index_array.cc index 31844abd89a020c87c403353374a80fb639a3244..1c135dda864b3060b8bdc6369f18268d7c5c7f9e 100644 --- a/tensorflow/compiler/xla/sparse_index_array.cc +++ b/tensorflow/compiler/xla/sparse_index_array.cc @@ -33,7 +33,7 @@ SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank, } SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank, - tensorflow::gtl::ArraySlice indices) + absl::Span indices) : SparseIndexArray(max_indices, rank, std::vector(indices.begin(), indices.end())) {} @@ -48,25 +48,24 @@ int64 SparseIndexArray::index_count() const { return indices_.size() / rank_; } -tensorflow::gtl::ArraySlice SparseIndexArray::At( +absl::Span SparseIndexArray::At( int64 sparse_element_number) const { CHECK_GT(rank_, 0); CHECK_GE(sparse_element_number, 0); CHECK_LE(rank_ * sparse_element_number + rank_, indices_.size()); - return tensorflow::gtl::ArraySlice( + return absl::Span( indices_.data() + rank_ * sparse_element_number, rank_); } -tensorflow::gtl::MutableArraySlice SparseIndexArray::At( - int64 sparse_element_number) { +absl::Span SparseIndexArray::At(int64 sparse_element_number) { CHECK_GT(rank_, 0); CHECK_GE(sparse_element_number, 0); CHECK_LE(rank_ * sparse_element_number + rank_, indices_.size()); - return tensorflow::gtl::MutableArraySlice( - indices_.data() + rank_ * sparse_element_number, rank_); + return absl::Span(indices_.data() + rank_ * sparse_element_number, + rank_); } -void SparseIndexArray::Append(tensorflow::gtl::ArraySlice index) { +void SparseIndexArray::Append(absl::Span index) { 
CHECK_GT(rank_, 0); CHECK_EQ(index.size(), rank_); indices_.insert(indices_.end(), index.begin(), index.end()); @@ -90,12 +89,12 @@ bool SparseIndexArray::Validate(const Shape& shape) const { if (num_indices < 2) { return true; } - tensorflow::gtl::ArraySlice last = At(0); + absl::Span last = At(0); if (!IndexUtil::IndexInBounds(shape, last)) { return false; } for (int64 n = 1; n < num_indices; ++n) { - tensorflow::gtl::ArraySlice next = At(n); + absl::Span next = At(n); if (!IndexUtil::IndexInBounds(shape, next)) { return false; } diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h index f2ce22d6721ff8da46f741ccedc2a63dea5994c8..a96d483462efd77ae4761541e8c79b2c84fa49f3 100644 --- a/tensorflow/compiler/xla/sparse_index_array.h +++ b/tensorflow/compiler/xla/sparse_index_array.h @@ -20,10 +20,11 @@ limitations under the License. #include +#include "absl/container/inlined_vector.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { @@ -64,7 +65,7 @@ class SparseIndexArray { SparseIndexArray(int64 max_indices, int64 rank, std::vector indices = {}); SparseIndexArray(int64 max_indices, int64 rank, - tensorflow::gtl::ArraySlice indices); + absl::Span indices); // Returns the number of elements represented by the indices stored in the // array. @@ -72,12 +73,12 @@ class SparseIndexArray { // Returns a slice that refers to the given sparse index number. The argument // must be in the range [0, element_count()). - tensorflow::gtl::ArraySlice At(int64 sparse_element_number) const; - tensorflow::gtl::MutableArraySlice At(int64 sparse_element_number); + absl::Span At(int64 sparse_element_number) const; + absl::Span At(int64 sparse_element_number); // Adds the given index at the end of the array. 
The new size of the // SparseIndexArray must not exceed `max_indices`. - void Append(tensorflow::gtl::ArraySlice index); + void Append(absl::Span index); // Removes all indices from the array. void Clear(); @@ -95,8 +96,8 @@ class SparseIndexArray { int64 max_indices() const { return max_indices_; } // Returns a pointer to the int64 array that holds the sparse indices. - tensorflow::gtl::MutableArraySlice mutable_data() { return &indices_; } - tensorflow::gtl::ArraySlice data() const { return indices_; } + absl::Span mutable_data() { return absl::MakeSpan(indices_); } + absl::Span data() const { return indices_; } // Sorts this sparse index array along with the set of corresponding values. // The indices and values are sorted in the lexicographic order of the @@ -114,7 +115,7 @@ class SparseIndexArray { // std::cout << v[0] << ", " << v[1] << ", " << v[2] << std::endl; // template - void SortWithValues(tensorflow::gtl::MutableArraySlice values); + void SortWithValues(absl::Span values); private: std::vector indices_; @@ -123,8 +124,7 @@ class SparseIndexArray { }; template -void SparseIndexArray::SortWithValues( - tensorflow::gtl::MutableArraySlice values) { +void SparseIndexArray::SortWithValues(absl::Span values) { int64 num_elements = index_count(); CHECK_EQ(values.size(), num_elements); std::vector sort_order; @@ -139,7 +139,7 @@ void SparseIndexArray::SortWithValues( // Reorder the array elements according to sort_order. Work through the array // and follow cycles so we can do the reorder in-place. - tensorflow::gtl::InlinedVector saved_index(rank()); + absl::InlinedVector saved_index(rank()); for (int64 i = 0; i < num_elements; ++i) { // sort_order[i] == -1 indicates the element has already been copied. 
if (sort_order[i] < 0) { diff --git a/tensorflow/compiler/xla/sparse_index_array_test.cc b/tensorflow/compiler/xla/sparse_index_array_test.cc index 7377f88958dcb7daf3d3f4f0e07966fdc9294580..e54057c4007078c76b79fe44d5706665e266c083 100644 --- a/tensorflow/compiler/xla/sparse_index_array_test.cc +++ b/tensorflow/compiler/xla/sparse_index_array_test.cc @@ -33,7 +33,7 @@ TEST(SparseIndexArrayTest, Sort) { std::vector values = { 12.0, 13.0, 11.0, 15.0, 14.0, 16.0, }; - a.SortWithValues(&values); + a.SortWithValues(absl::MakeSpan(values)); ASSERT_EQ(a.data(), std::vector({1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8})); ASSERT_EQ(values, std::vector({11.0, 12.0, 13.0, 14.0, 15.0, 16.0})); diff --git a/tensorflow/compiler/xla/status_macros.cc b/tensorflow/compiler/xla/status_macros.cc index a6b1f9004f096abb3b01d315938b0a23bea1ca48..b88fe367d7416a26c1147fd5e10fb20772814fe5 100644 --- a/tensorflow/compiler/xla/status_macros.cc +++ b/tensorflow/compiler/xla/status_macros.cc @@ -17,9 +17,8 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stacktrace.h" @@ -37,8 +36,7 @@ static void LogError(const Status& status, const char* filename, int line, if (TF_PREDICT_TRUE(log_severity != tensorflow::NUM_SEVERITIES)) { string stack_trace; if (should_log_stack_trace) { - stack_trace = - tensorflow::strings::StrCat("\n", tensorflow::CurrentStackTrace()); + stack_trace = absl::StrCat("\n", tensorflow::CurrentStackTrace()); } switch (log_severity) { case tensorflow::INFO: @@ -142,17 +140,15 @@ Status MakeErrorStream::Impl::GetStatus() { is_done_ = true; const string& stream_str = stream_.str(); - const string str = - prior_message_handling_ == kAppendToPriorMessage - ? 
tensorflow::strings::StrCat(prior_message_, stream_str) - : tensorflow::strings::StrCat(stream_str, prior_message_); + const string str = prior_message_handling_ == kAppendToPriorMessage + ? absl::StrCat(prior_message_, stream_str) + : absl::StrCat(stream_str, prior_message_); if (TF_PREDICT_FALSE(str.empty())) { - return MakeError(file_, line_, code_, - tensorflow::strings::StrCat( - str, "Error without message at ", file_, ":", line_), - true /* should_log */, - tensorflow::ERROR /* log_severity */, - should_log_stack_trace_); + return MakeError( + file_, line_, code_, + absl::StrCat(str, "Error without message at ", file_, ":", line_), + true /* should_log */, tensorflow::ERROR /* log_severity */, + should_log_stack_trace_); } else { return MakeError(file_, line_, code_, str, should_log_, log_severity_, should_log_stack_trace_); diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index 0e1387c93938fa520562fcd63ac107a82b089a51..a32e2ad9851b0b5644f7e6f0f9ead6c438934c07 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -12,297 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - -// StatusOr is the union of a Status object and a T object. StatusOr models -// the concept of an object that is either a value, or an error Status -// explaining why such a value is not present. To this end, StatusOr does not -// allow its Status value to be Status::OK. -// -// The primary use-case for StatusOr is as the return value of a -// function which may fail. 
-// -// Example client usage for a StatusOr, where T is not a pointer: -// -// StatusOr result = DoBigCalculationThatCouldFail(); -// if (result.ok()) { -// float answer = result.ValueOrDie(); -// printf("Big calculation yielded: %f", answer); -// } else { -// LOG(ERROR) << result.status(); -// } -// -// Example client usage for a StatusOr: -// -// StatusOr result = FooFactory::MakeNewFoo(arg); -// if (result.ok()) { -// std::unique_ptr foo(result.ValueOrDie()); -// foo->DoSomethingCool(); -// } else { -// LOG(ERROR) << result.status(); -// } -// -// Example client usage for a StatusOr>: -// -// StatusOr> result = FooFactory::MakeNewFoo(arg); -// if (result.ok()) { -// std::unique_ptr foo = std::move(result.ValueOrDie()); -// foo->DoSomethingCool(); -// } else { -// LOG(ERROR) << result.status(); -// } -// -// Example factory implementation returning StatusOr: -// -// StatusOr FooFactory::MakeNewFoo(int arg) { -// if (arg <= 0) { -// return tensorflow::InvalidArgument("Arg must be positive"); -// } else { -// return new Foo(arg); -// } -// } -// -// Note that the assignment operators require that destroying the currently -// stored value cannot invalidate the argument; in other words, the argument -// cannot be an alias for the current value, or anything owned by the current -// value. #ifndef TENSORFLOW_COMPILER_XLA_STATUSOR_H_ #define TENSORFLOW_COMPILER_XLA_STATUSOR_H_ #include "tensorflow/compiler/xla/status.h" -#include "tensorflow/compiler/xla/statusor_internals.h" -#include "tensorflow/core/platform/macros.h" +#include "tensorflow/stream_executor/lib/statusor.h" namespace xla { -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. 
-template -class TF_MUST_USE_RESULT StatusOr; -#endif - -template -class StatusOr : private internal_statusor::StatusOrData, - private internal_statusor::TraitsBase< - std::is_copy_constructible::value, - std::is_move_constructible::value> { - template - friend class StatusOr; - - typedef internal_statusor::StatusOrData Base; - - public: - typedef T element_type; - - // Constructs a new StatusOr with Status::UNKNOWN status. This is marked - // 'explicit' to try to catch cases like 'return {};', where people think - // StatusOr> will be initialized with an empty vector, - // instead of a Status::UNKNOWN status. - explicit StatusOr(); - - // StatusOr will be copy constructible/assignable if T is copy - // constructible. - StatusOr(const StatusOr&) = default; - StatusOr& operator=(const StatusOr&) = default; - - // StatusOr will be move constructible/assignable if T is move - // constructible. - StatusOr(StatusOr&&) = default; - StatusOr& operator=(StatusOr&&) = default; - - // Conversion copy/move constructor, T must be convertible from U. - template ::value>::type* = nullptr> - StatusOr(const StatusOr& other); - template ::value>::type* = nullptr> - StatusOr(StatusOr&& other); - - // Conversion copy/move assignment operator, T must be convertible from U. - template ::value>::type* = nullptr> - StatusOr& operator=(const StatusOr& other); - template ::value>::type* = nullptr> - StatusOr& operator=(StatusOr&& other); - - // Constructs a new StatusOr with the given value. After calling this - // constructor, calls to ValueOrDie() will succeed, and calls to status() will - // return OK. - // - // NOTE: Not explicit - we want to use StatusOr as a return type - // so it is convenient and sensible to be able to do 'return T()' - // when the return type is StatusOr. - // - // REQUIRES: T is copy constructible. - StatusOr(const T& value); - - // Constructs a new StatusOr with the given non-ok status. After calling - // this constructor, calls to ValueOrDie() will CHECK-fail. 
- // - // NOTE: Not explicit - we want to use StatusOr as a return - // value, so it is convenient and sensible to be able to do 'return - // Status()' when the return type is StatusOr. - // - // REQUIRES: !status.ok(). This requirement is DCHECKed. - // In optimized builds, passing Status::OK() here will have the effect - // of passing tensorflow::error::INTERNAL as a fallback. - StatusOr(const Status& status); - StatusOr& operator=(const Status& status); - - // TODO(b/62186997): Add operator=(T) overloads. - - // Similar to the `const T&` overload. - // - // REQUIRES: T is move constructible. - StatusOr(T&& value); - - // RValue versions of the operations declared above. - StatusOr(Status&& status); - StatusOr& operator=(Status&& status); - - // Returns this->status().ok() - bool ok() const { return this->status_.ok(); } - - // Returns a reference to our status. If this contains a T, then - // returns Status::OK(). - const Status& status() const &; - Status status() &&; - - // Returns a reference to our current value, or CHECK-fails if !this->ok(). - // - // Note: for value types that are cheap to copy, prefer simple code: - // - // T value = statusor.ValueOrDie(); - // - // Otherwise, if the value type is expensive to copy, but can be left - // in the StatusOr, simply assign to a reference: - // - // T& value = statusor.ValueOrDie(); // or `const T&` - // - // Otherwise, if the value type supports an efficient move, it can be - // used as follows: - // - // T value = std::move(statusor).ValueOrDie(); - // - // The std::move on statusor instead of on the whole expression enables - // warnings about possible uses of the statusor object after the move. - // C++ style guide waiver for ref-qualified overloads granted in cl/143176389 - // See go/ref-qualifiers for more details on such overloads. 
- const T& ValueOrDie() const &; - T& ValueOrDie() &; - const T&& ValueOrDie() const &&; - T&& ValueOrDie() &&; - - T ConsumeValueOrDie() { return std::move(ValueOrDie()); } - - // Ignores any errors. This method does nothing except potentially suppress - // complaints from any tools that are checking that errors are not dropped on - // the floor. - void IgnoreError() const; -}; - -//////////////////////////////////////////////////////////////////////////////// -// Implementation details for StatusOr - -template -StatusOr::StatusOr() : Base(Status(tensorflow::error::UNKNOWN, "")) {} - -template -StatusOr::StatusOr(const T& value) : Base(value) {} - -template -StatusOr::StatusOr(const Status& status) : Base(status) {} - -template -StatusOr& StatusOr::operator=(const Status& status) { - this->Assign(status); - return *this; -} - -template -StatusOr::StatusOr(T&& value) : Base(std::move(value)) {} - -template -StatusOr::StatusOr(Status&& status) : Base(std::move(status)) {} - -template -StatusOr& StatusOr::operator=(Status&& status) { - this->Assign(std::move(status)); - return *this; -} - -template -template ::value>::type*> -inline StatusOr::StatusOr(const StatusOr& other) - : Base(static_cast::Base&>(other)) {} - -template -template ::value>::type*> -inline StatusOr& StatusOr::operator=(const StatusOr& other) { - if (other.ok()) - this->Assign(other.ValueOrDie()); - else - this->Assign(other.status()); - return *this; -} - -template -template ::value>::type*> -inline StatusOr::StatusOr(StatusOr&& other) - : Base(static_cast::Base&&>(other)) {} - -template -template ::value>::type*> -inline StatusOr& StatusOr::operator=(StatusOr&& other) { - if (other.ok()) { - this->Assign(std::move(other).ValueOrDie()); - } else { - this->Assign(std::move(other).status()); - } - return *this; -} - -template -const Status& StatusOr::status() const & { - return this->status_; -} -template -Status StatusOr::status() && { - return ok() ? 
Status::OK() : std::move(this->status_); -} - -template -const T& StatusOr::ValueOrDie() const & { - this->EnsureOk(); - return this->data_; -} - -template -T& StatusOr::ValueOrDie() & { - this->EnsureOk(); - return this->data_; -} - -template -const T&& StatusOr::ValueOrDie() const && { - this->EnsureOk(); - return std::move(this->data_); -} - -template -T&& StatusOr::ValueOrDie() && { - this->EnsureOk(); - return std::move(this->data_); -} - +// Use stream_executor's StatusOr so we don't duplicate code. template -void StatusOr::IgnoreError() const { - // no-op -} +using StatusOr = ::stream_executor::port::StatusOr; } // namespace xla diff --git a/tensorflow/compiler/xla/test.h b/tensorflow/compiler/xla/test.h index 87a8c5f3a528289d47c1729ae6719aae47037c36..a657554dc2fd4fd1838639cac011bc0bb8b3d1eb 100644 --- a/tensorflow/compiler/xla/test.h +++ b/tensorflow/compiler/xla/test.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPLIER_XLA_TEST_H_ -#define TENSORFLOW_COMPLIER_XLA_TEST_H_ +#ifndef TENSORFLOW_COMPILER_XLA_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_TEST_H_ // This header includes gmock.h and enables the use of gmock matchers in tests // in third_party/tensorflow/compiler/xla. @@ -45,4 +45,4 @@ limitations under the License. #include "tensorflow/core/platform/test.h" -#endif // TENSORFLOW_COMPLIER_XLA_TEST_H_ +#endif // TENSORFLOW_COMPILER_XLA_TEST_H_ diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h index 8918350135fbb86973b228b35f5873fea8695b2f..3ede5e6e38a7a9e922fc0744f014c395dbd2324c 100644 --- a/tensorflow/compiler/xla/test_helpers.h +++ b/tensorflow/compiler/xla/test_helpers.h @@ -19,9 +19,9 @@ limitations under the License.
#include #include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/test.h" diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index a62d49e9c759e0371f02902c0029029b61d39b79..36b8fb26440f0f71207cc9b2af4d14f21e618cfe 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -43,6 +43,7 @@ cc_library( "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], alwayslink = True, ) @@ -65,9 +66,9 @@ cc_library( srcs = ["test_utils.cc"], hdrs = ["test_utils.h"], deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_dataflow_analysis", @@ -75,6 +76,8 @@ cc_library( "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_headers_lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -88,6 +91,7 @@ cc_library( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:error_spec", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_comparison", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:test", @@ -96,6 +100,9 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ 
-111,20 +118,23 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -138,10 +148,11 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_verifier", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -152,8 +163,8 @@ tf_cc_binary( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", @@ -179,18 +190,18 @@ cc_library( 
"//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -198,6 +209,9 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -209,6 +223,7 @@ cc_library( deps = [ ":codegen_test_base", ":filecheck", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:test", @@ -258,7 +273,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", @@ -270,6 +285,8 @@ 
cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//third_party/eigen3", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", ], ) @@ -286,8 +303,8 @@ xla_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -302,7 +319,7 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -310,8 +327,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -330,8 +347,8 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -345,16 +362,16 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -372,14 +389,17 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", "//tensorflow/core:regexp_internal", "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", ], ) @@ -391,8 +411,8 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + 
"//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -406,7 +426,7 @@ xla_test( tags = ["enable_for_xla_interpreter"], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -415,9 +435,9 @@ xla_test( "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -435,14 +455,14 @@ xla_test( tags = ["optonly"], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -460,9 +480,9 @@ xla_test( deps = [ 
"//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -479,8 +499,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -497,8 +517,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -515,9 +535,9 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - 
"//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -531,6 +551,7 @@ xla_test( srcs = ["scalar_computations_test.cc"], shard_count = 32, deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -538,13 +559,15 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -557,12 +580,11 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -573,7 +595,7 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla:literal_util", + 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -581,12 +603,12 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -599,7 +621,7 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -607,13 +629,12 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -633,7 +654,7 @@ xla_test( deps = [ ":client_library_test_base", ":literal_test_util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", ], @@ -645,7 +666,7 @@ xla_test( tags = ["enable_for_xla_interpreter"], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -653,12 +674,13 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -676,8 +698,7 @@ xla_test( "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -685,6 +706,7 @@ xla_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -697,8 +719,22 @@ xla_test( "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +xla_test( + name = 
"scatter_test", + srcs = ["scatter_test.cc"], + deps = [ + ":client_library_test_base", + ":hlo_test_base", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -719,8 +755,7 @@ xla_test( "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -728,6 +763,7 @@ xla_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -743,8 +779,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -763,11 +799,12 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - 
"//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -779,7 +816,7 @@ xla_test( CONVOLUTION_TEST_DEPS = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", @@ -788,8 +825,9 @@ CONVOLUTION_TEST_DEPS = [ "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -801,7 +839,10 @@ xla_test( timeout = "long", srcs = ["convolution_test.cc"], shard_count = 25, - deps = CONVOLUTION_TEST_DEPS, + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], ) xla_test( @@ -811,7 +852,10 @@ xla_test( backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]}, backends = ["gpu"], shard_count = 25, - deps = CONVOLUTION_TEST_DEPS, + deps = CONVOLUTION_TEST_DEPS + [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], ) xla_test( @@ -826,13 +870,13 @@ xla_test( deps = [ "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -855,13 +899,14 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -873,7 +918,7 @@ xla_test( ":test_utils", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -884,9 +929,10 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - 
"//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -894,6 +940,7 @@ xla_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -905,7 +952,7 @@ xla_test( ":test_utils", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -916,9 +963,9 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -938,12 +985,12 @@ xla_test( ], deps = [ ":test_utils", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", 
"//tensorflow/compiler/xla/tests:literal_test_util", @@ -964,12 +1011,16 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -983,8 +1034,8 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1005,7 +1056,7 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", @@ -1029,19 +1080,21 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", 
"//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1056,9 +1109,9 @@ xla_test( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1086,14 +1139,17 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -1113,15 +1169,18 @@ xla_test_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1129,6 +1188,7 @@ xla_test( name = "reduce_window_test", timeout = "long", srcs = [], + shard_count = 20, tags = [ "enable_for_xla_interpreter", "optonly", @@ -1147,16 +1207,16 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1174,16 +1234,17 
@@ xla_test( deps = [ ":client_library_test_base", "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1194,12 +1255,28 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - ":client_library_test_base", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + ], +) + +xla_test( + name = "token_hlo_test", + srcs = ["token_hlo_test.cc"], + tags = [ + "enable_for_xla_interpreter", + ], + deps = [ + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -1210,12 +1287,13 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", 
"//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1228,10 +1306,12 @@ xla_test( name = "custom_call_test", srcs = ["custom_call_test.cc"], deps = [ + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/compiler/xla/tests:client_library_test_base", @@ -1240,6 +1320,7 @@ xla_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1254,8 +1335,8 @@ xla_test( "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1272,12 +1353,13 @@ xla_test( deps = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + 
"//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1297,13 +1379,14 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1315,8 +1398,8 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1332,8 +1415,8 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1349,22 +1432,24 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", 
"//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1372,18 +1457,18 @@ xla_test( name = "prng_test", srcs = ["prng_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -1403,17 +1488,16 @@ xla_test( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", 
"//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", - "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -1427,12 +1511,14 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1450,9 +1536,9 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1476,8 +1562,8 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - 
"//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1493,17 +1579,16 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", ], ) @@ -1511,20 +1596,20 @@ xla_test( name = "cross_replica_sum_test", srcs = ["cross_replica_sum_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - 
"//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", @@ -1541,7 +1626,7 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1555,21 +1640,21 @@ xla_test( name = "compilation_cache_test", srcs = ["compilation_cache_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -1581,13 +1666,14 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1595,22 +1681,22 @@ xla_test( name = "compute_constant_test", srcs = ["compute_constant_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -1625,8 +1711,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -1642,8 +1728,8 @@ xla_test( deps = [ ":client_library_test_base", "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - 
"//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1656,8 +1742,8 @@ xla_test( deps = [ ":client_library_test_base", "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1670,15 +1756,15 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1695,7 +1781,7 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", @@ -1704,6 +1790,7 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "@com_google_absl//absl/memory", ], ) @@ -1712,6 
+1799,7 @@ tf_cc_test( srcs = ["llvm_compiler_test.cc"], tags = ["requires-gpu-sm35"], deps = [ + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:cpu_plugin", @@ -1724,6 +1812,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/stream_executor", + "@com_google_absl//absl/memory", "@llvm//:core", ], ) @@ -1732,7 +1821,7 @@ xla_test( name = "round_trip_packed_literal_test", srcs = ["round_trip_packed_literal_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:packed_literal_reader", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", @@ -1744,6 +1833,7 @@ xla_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -1755,16 +1845,13 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_runner", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -1774,6 +1861,8 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//third_party/eigen3", + "@com_google_absl//absl/memory", + 
"@com_google_absl//absl/types:span", ], ) @@ -1781,18 +1870,12 @@ xla_test( name = "multioutput_fusion_test", srcs = ["multioutput_fusion_test.cc"], deps = [ - "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", - "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1800,6 +1883,9 @@ xla_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", ], ) @@ -1822,11 +1908,10 @@ xla_test( name = "local_client_allocation_test", srcs = ["local_client_allocation_test.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1834,6 +1919,7 @@ xla_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:optional", 
], ) @@ -1845,7 +1931,7 @@ xla_test( shard_count = 30, tags = ["optonly"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -1853,8 +1939,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", @@ -1870,6 +1956,16 @@ xla_test( ], ) +xla_test( + name = "outfeed_in_nested_computation_test", + srcs = ["outfeed_in_nested_computation_test.cc"], + deps = [ + "//tensorflow/compiler/xla/tests:local_client_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + tf_cc_test( name = "hlo_metadata_test", srcs = [ @@ -1879,7 +1975,7 @@ tf_cc_test( ":local_client_test_base", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/core:test_main", @@ -1891,7 +1987,7 @@ xla_test( srcs = ["round_trip_transfer_test.cc"], deps = [ "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", @@ -1912,22 +2008,21 @@ xla_test( deps = [ 
"//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "@com_google_absl//absl/types:span", ], ) @@ -1935,7 +2030,7 @@ xla_test( name = "deep_graph_test", srcs = ["deep_graph_test.cc"], deps = [ - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -1950,6 +2045,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -1960,7 +2056,7 @@ xla_test( ":literal_test_util", ":local_client_test_base", ":xla_internal_test_main", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -1968,8 +2064,10 @@ xla_test( "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/compiler/xla/service:stream_pool", 
"//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core:test", ], ) @@ -1989,6 +2087,7 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", ], ) @@ -2015,13 +2114,33 @@ tf_cc_test( xla_test( name = "test_utils_test", srcs = ["test_utils_test.cc"], + # There is nothing backend specific in this test, so just pick an arbitrary backend. + backends = ["cpu"], deps = [ ":local_client_test_base", ":test_utils", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", "//tensorflow/core:test", ], ) + +xla_test( + name = "iota_test", + srcs = ["iota_test.cc"], + shard_count = 30, + tags = [ + "enable_for_xla_interpreter", + # Require optimized builds, iota_test_cpu is very slow in fastbuild. + "optonly", + ], + deps = [ + ":client_library_test_base", + ":xla_internal_test_main", + "//tensorflow/core:lib", + ], +) diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 36a706496918ac8c15780473019e2a8d098ffa22..0bf4556b437fb1717a9c9773834fa3031cfbd6ea 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -19,14 +19,15 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -40,6 +41,7 @@ limitations under the License. namespace xla { namespace { + class ArrayElementwiseOpTest : public ClientLibraryTestBase { public: ErrorSpec error_spec_{0.0001, 0.0001}; @@ -51,16 +53,16 @@ class ArrayElementwiseOpTestParamCount XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementF32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.Neg(a); + auto a = ConstantR1(&builder, {}); + Neg(a); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, NegConstantF32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); - builder.Neg(a); + auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); + Neg(a); ComputeAndCompareR1(&builder, {2.5f, -3.14f, -2.25f, 10.0f, -6.0f}, {}, error_spec_); @@ -68,10 +70,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantF32) { XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-1, 0, 1, 324, - std::numeric_limits::min(), - std::numeric_limits::max()}); - builder.Neg(a); + auto a = ConstantR1(&builder, + {-1, 0, 1, 324, std::numeric_limits::min(), + std::numeric_limits::max()}); + Neg(a); // -min == min for int32 due to an 
overflow. In C++ it is undefined behavior // to do this calculation. For XLA we have not specified that, so it @@ -84,17 +86,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) { XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementC64) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.Neg(a); + auto a = ConstantR1(&builder, {}); + Neg(a); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {{-2.5f, 1.0f}, {0.0f, 3.14f}, {2.25f, -1.0f}, {-10.0f, 0.0f}}); - builder.Neg(a); + auto a = ConstantR1( + &builder, {{-2.5f, 1.0f}, {0.0f, 3.14f}, {2.25f, -1.0f}, {-10.0f, 0.0f}}); + Neg(a); ComputeAndCompareR1( &builder, {{2.5f, -1.0f}, {0.0f, -3.14f}, {-2.25f, 1.0f}, {10.0f, 0.0f}}, @@ -103,16 +105,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) { XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({ - -1, - 1, - 0, - 0x12345678, - static_cast(0xffffffff12345678l), - static_cast(0x8000000000000000LL), - static_cast(0x8000000000000001LL), - }); - builder.Neg(a); + auto a = + ConstantR1(&builder, { + -1, + 1, + 0, + 0x12345678, + static_cast(0xffffffff12345678l), + static_cast(0x8000000000000000LL), + static_cast(0x8000000000000001LL), + }); + Neg(a); LOG(INFO) << -static_cast(0x7FFFFFFFFFFFFFFFLL); ComputeAndCompareR1(&builder, @@ -130,8 +133,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS64) { XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.IsFinite(a); + auto a = ConstantR1(&builder, {}); + IsFinite(a); ComputeAndCompareR1(&builder, {}, {}); } @@ -141,21 +144,21 @@ static const float kNonCanonicalNaN = tensorflow::bit_cast(0x7FD01234); XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteScalarF32) { XlaBuilder builder(TestName()); - 
builder.IsFinite(builder.ConstantR0(NAN)); + IsFinite(ConstantR0(&builder, NAN)); ComputeAndCompareR0(&builder, false, {}); EXPECT_TRUE(std::isnan(kNonCanonicalNaN)); - builder.IsFinite(builder.ConstantR0(kNonCanonicalNaN)); + IsFinite(ConstantR0(&builder, kNonCanonicalNaN)); ComputeAndCompareR0(&builder, false, {}); const float inf = std::numeric_limits::infinity(); - builder.IsFinite(builder.ConstantR0(inf)); + IsFinite(ConstantR0(&builder, inf)); ComputeAndCompareR0(&builder, false, {}); - builder.IsFinite(builder.ConstantR0(-inf)); + IsFinite(ConstantR0(&builder, -inf)); ComputeAndCompareR0(&builder, false, {}); - builder.IsFinite(builder.ConstantR0(0.0f)); + IsFinite(ConstantR0(&builder, 0.0f)); ComputeAndCompareR0(&builder, true, {}); } @@ -163,9 +166,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) { XlaBuilder builder(TestName()); const float inf = std::numeric_limits::infinity(); EXPECT_TRUE(std::isnan(kNonCanonicalNaN)); - auto a = builder.ConstantR1( - {{NAN, 7.0f, kNonCanonicalNaN, -1.0f, inf, -inf}}); - builder.IsFinite(a); + auto a = ConstantR1(&builder, + {{NAN, 7.0f, kNonCanonicalNaN, -1.0f, inf, -inf}}); + IsFinite(a); ComputeAndCompareR1(&builder, {false, true, false, true, false, false}, {}); @@ -173,9 +176,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) { XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); - auto b = builder.ConstantR1({100.0f, 3.13f, 2.75f, 10.5f, -999.0f}); - builder.Add(a, b); + auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); + auto b = ConstantR1(&builder, {100.0f, 3.13f, 2.75f, 10.5f, -999.0f}); + Add(a, b); ComputeAndCompareR1(&builder, {97.5f, 6.27f, 5.0f, 0.5f, -993.0f}, {}, error_spec_); @@ -183,20 +186,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) { XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = 
builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Add(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Add(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {{-2.5f, 0.0f}, {0.0f, 3.14f}, {2.25f, 0.0f}, {1.0f, -10.0f}}); - auto b = builder.ConstantR1( - {{100.0f, 0.0f}, {3.13f, 0.0f}, {2.75f, 1.0f}, {-2.0f, 10.5f}}); - builder.Add(a, b); + auto a = ConstantR1( + &builder, {{-2.5f, 0.0f}, {0.0f, 3.14f}, {2.25f, 0.0f}, {1.0f, -10.0f}}); + auto b = ConstantR1( + &builder, {{100.0f, 0.0f}, {3.13f, 0.0f}, {2.75f, 1.0f}, {-2.0f, 10.5f}}); + Add(a, b); ComputeAndCompareR1( &builder, {97.5f, {3.13f, 3.14f}, {5.0f, 1.0f}, {-1.0f, 0.5f}}, {}, @@ -205,9 +208,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) { XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Add(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Add(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -224,8 +227,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { 0x8000000000000000LL, 0x8000000000000000LL, 1}; - std::unique_ptr lhs_literal = Literal::CreateR1({lhs}); - auto lhs_param = b.Parameter(0, lhs_literal->shape(), "lhs_param"); + std::unique_ptr lhs_literal = LiteralUtil::CreateR1({lhs}); + auto lhs_param = Parameter(&b, 0, lhs_literal->shape(), "lhs_param"); std::unique_ptr lhs_data = client_->TransferToServer(*lhs_literal).ConsumeValueOrDie(); @@ -238,12 +241,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { 0, 1, 0x8000000000000000LL}; - std::unique_ptr rhs_literal = Literal::CreateR1({rhs}); - auto rhs_param = b.Parameter(1, rhs_literal->shape(), "rhs_param"); + std::unique_ptr rhs_literal = 
LiteralUtil::CreateR1({rhs}); + auto rhs_param = Parameter(&b, 1, rhs_literal->shape(), "rhs_param"); std::unique_ptr rhs_data = client_->TransferToServer(*rhs_literal).ConsumeValueOrDie(); - b.Add(lhs_param, rhs_param); + Add(lhs_param, rhs_param); std::vector expected(lhs.size()); for (int64 i = 0; i < lhs.size(); ++i) { @@ -264,8 +267,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) { 1, 0, -1}; - std::unique_ptr lhs_literal = Literal::CreateR1({lhs}); - auto lhs_param = b.Parameter(0, lhs_literal->shape(), "lhs_param"); + std::unique_ptr lhs_literal = LiteralUtil::CreateR1({lhs}); + auto lhs_param = Parameter(&b, 0, lhs_literal->shape(), "lhs_param"); std::unique_ptr lhs_data = client_->TransferToServer(*lhs_literal).ConsumeValueOrDie(); @@ -277,12 +280,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) { 0x7FFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL}; - std::unique_ptr rhs_literal = Literal::CreateR1({rhs}); - auto rhs_param = b.Parameter(1, rhs_literal->shape(), "rhs_param"); + std::unique_ptr rhs_literal = LiteralUtil::CreateR1({rhs}); + auto rhs_param = Parameter(&b, 1, rhs_literal->shape(), "rhs_param"); std::unique_ptr rhs_data = client_->TransferToServer(*rhs_literal).ConsumeValueOrDie(); - auto sub = b.Sub(lhs_param, rhs_param); + Sub(lhs_param, rhs_param); std::vector expected(lhs.size()); for (int64 i = 0; i < lhs.size(); ++i) { @@ -292,6 +295,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) { ComputeAndCompareR1(&b, expected, {lhs_data.get(), rhs_data.get()}); } +XLA_TEST_F(ArrayElementwiseOpTest, CmpTwoConstantU64s) { + XlaBuilder b(TestName()); + + std::vector lhs{static_cast(0x8000000000000000ULL)}; + std::unique_ptr lhs_literal = LiteralUtil::CreateR1({lhs}); + auto lhs_param = Parameter(&b, 0, lhs_literal->shape(), "lhs_param"); + + std::vector rhs{static_cast(0x7FFFFFFFFFFFFFFFULL)}; + std::unique_ptr rhs_literal = LiteralUtil::CreateR1({rhs}); + auto rhs_param = Parameter(&b, 1, 
rhs_literal->shape(), "rhs_param"); + + Lt(lhs_param, rhs_param); + + ComputeAndCompare(&b, {std::move(*lhs_literal), std::move(*rhs_literal)}); +} + TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { const int count = GetParam(); XlaBuilder builder(TestName()); @@ -302,26 +321,26 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { b_values.push_back(2 * i / static_cast(count + 2)); } - std::unique_ptr a_literal = Literal::CreateR1({a_values}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({a_values}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a_constant = builder.ConstantR1(a_values); - auto a_param = builder.Parameter(0, a_literal->shape(), "a_param"); + auto a_constant = ConstantR1(&builder, a_values); + auto a_param = Parameter(&builder, 0, a_literal->shape(), "a_param"); - std::unique_ptr b_literal = Literal::CreateR1({b_values}); + std::unique_ptr b_literal = LiteralUtil::CreateR1({b_values}); std::unique_ptr b_data = client_->TransferToServer(*b_literal).ConsumeValueOrDie(); - auto b_constant = builder.Parameter(1, a_literal->shape(), "b_param"); - auto b_param = builder.ConstantR1(b_values); + auto b_constant = Parameter(&builder, 1, a_literal->shape(), "b_param"); + auto b_param = ConstantR1(&builder, b_values); - auto sum1 = builder.Add(a_constant, b_constant); - auto sum2 = builder.Add(a_constant, b_param); - auto sum3 = builder.Add(a_param, b_constant); - auto sum4 = builder.Add(a_param, b_param); + auto sum1 = Add(a_constant, b_constant); + auto sum2 = Add(a_constant, b_param); + auto sum3 = Add(a_param, b_constant); + auto sum4 = Add(a_param, b_param); - auto sum = builder.Add(sum1, sum2); - sum = builder.Add(sum, sum3); - sum = builder.Add(sum, sum4); + auto sum = Add(sum1, sum2); + sum = Add(sum, sum3); + sum = Add(sum, sum4); std::vector expected; for (int64 i = 0; i < count; ++i) { @@ -334,9 +353,9 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { 
XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); - auto b = builder.ConstantR1({100.0f, 3.13f, 2.75f, 10.5f, -999.0f}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f}); + auto b = ConstantR1(&builder, {100.0f, 3.13f, 2.75f, 10.5f, -999.0f}); + Sub(a, b); ComputeAndCompareR1(&builder, {-102.5f, 0.01f, -0.5f, -20.5f, 1005.0f}, {}, error_spec_); @@ -344,38 +363,38 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) { XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Sub(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-1, 0, 2, 1000000000}); - auto b = builder.ConstantR1({-1, 2, 1, -1}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, {-1, 0, 2, 1000000000}); + auto b = ConstantR1(&builder, {-1, 2, 1, -1}); + Sub(a, b); ComputeAndCompareR1(&builder, {0, -2, 1, 1000000001}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementS32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Sub(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {{-2.5f, 0.0f}, {0.0f, 3.14f}, {3.0f, 2.25f}}); - auto b = builder.ConstantR1( - {{0.0f, 10.0f}, {3.13f, 0.0f}, {2.75f, -0.25f}}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, + {{-2.5f, 0.0f}, {0.0f, 3.14f}, {3.0f, 2.25f}}); + auto b = ConstantR1( + 
&builder, {{0.0f, 10.0f}, {3.13f, 0.0f}, {2.75f, -0.25f}}); + Sub(a, b); ComputeAndCompareR1( &builder, {{-2.5f, -10.0f}, {-3.13f, 3.14f}, {0.25f, 2.5f}}, {}, @@ -384,18 +403,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) { XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Sub(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Sub(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); - auto b = builder.ConstantR1({10.0f, 5.1f, 1.0f, 10.0f, -6.0f}); - builder.Div(a, b); + auto a = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); + auto b = ConstantR1(&builder, {10.0f, 5.1f, 1.0f, 10.0f, -6.0f}); + Div(a, b); ComputeAndCompareR1(&builder, {-0.25f, 5.0f, 2.25f, -1.0f, -1.0f}, {}, error_spec_); @@ -403,14 +422,72 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) { XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Div(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Div(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } -XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) { +class IntegerDivideOpTest : public ArrayElementwiseOpTest { + protected: + template + void TestDivRem(absl::Span dividends, absl::Span divisors, + absl::Span quotients, + absl::Span remainders) { + { + XlaBuilder builder(TestName()); + XlaOp dividend; + XlaOp divisor; + auto dividend_data = + CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); + auto divisor_data = + CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); + Div(dividend, divisor); + + 
ComputeAndCompareR1(&builder, quotients, + {dividend_data.get(), divisor_data.get()}); + } + + // Test with a compile-time constant divisor. + { + XlaBuilder builder(TestName()); + XlaOp dividend; + auto dividend_data = + CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); + Div(dividend, ConstantR1(&builder, divisors)); + + ComputeAndCompareR1(&builder, quotients, {dividend_data.get()}); + } + + { + XlaBuilder builder(TestName()); + XlaOp dividend; + XlaOp divisor; + auto dividend_data = + CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); + auto divisor_data = + CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); + Rem(dividend, divisor); + + ComputeAndCompareR1(&builder, remainders, + {dividend_data.get(), divisor_data.get()}); + } + + // Test with a compile-time constant divisor. + { + XlaBuilder builder(TestName()); + XlaOp dividend; + auto dividend_data = + CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); + Rem(dividend, ConstantR1(&builder, divisors)); + + ComputeAndCompareR1(&builder, remainders, {dividend_data.get()}); + } + } +}; + +XLA_TEST_F(IntegerDivideOpTest, DivS32s) { // clang-format off // Some interesting values to test. std::vector vals = { @@ -434,58 +511,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) { } } - { - XlaBuilder builder(TestName()); - XlaOp dividend; - XlaOp divisor; - auto dividend_data = - CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); - auto divisor_data = - CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); - builder.Div(dividend, divisor); - - ComputeAndCompareR1(&builder, quotients, - {dividend_data.get(), divisor_data.get()}); - } - - // Test with a compile-time constant divisor. 
- { - XlaBuilder builder(TestName()); - XlaOp dividend; - auto dividend_data = - CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); - builder.Div(dividend, builder.ConstantR1(divisors)); - - ComputeAndCompareR1(&builder, quotients, {dividend_data.get()}); - } - - { - XlaBuilder builder(TestName()); - XlaOp dividend; - XlaOp divisor; - auto dividend_data = - CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); - auto divisor_data = - CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); - builder.Rem(dividend, divisor); - - ComputeAndCompareR1(&builder, remainders, - {dividend_data.get(), divisor_data.get()}); - } + TestDivRem(dividends, divisors, quotients, remainders); +} - // Test with a compile-time constant divisor. - { - XlaBuilder builder(TestName()); - XlaOp dividend; - auto dividend_data = - CreateR1Parameter(dividends, 0, "dividend", &builder, &dividend); - builder.Rem(dividend, builder.ConstantR1(divisors)); +XLA_TEST_F(IntegerDivideOpTest, SignedOverflow) { + std::vector dividends = {5, INT32_MIN}, divisors = {0, -1}, + quotients = {-1, INT32_MIN}, remainders = {5, 0}; - ComputeAndCompareR1(&builder, remainders, {dividend_data.get()}); - } + TestDivRem(dividends, divisors, quotients, remainders); } -XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) { +XLA_TEST_F(IntegerDivideOpTest, DivU32s) { // clang-format off // Some interesting values to test. 
std::vector vals = { @@ -505,62 +541,23 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) { } } - { - XlaBuilder builder(TestName()); - XlaOp dividend; - XlaOp divisor; - auto dividend_data = CreateR1Parameter(dividends, 0, "dividend", - &builder, &dividend); - auto divisor_data = - CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); - builder.Div(dividend, divisor); - - ComputeAndCompareR1(&builder, quotients, - {dividend_data.get(), divisor_data.get()}); - } - - { - XlaBuilder builder(TestName()); - XlaOp dividend; - auto dividend_data = CreateR1Parameter(dividends, 0, "dividend", - &builder, &dividend); - builder.Div(dividend, builder.ConstantR1(divisors)); - - ComputeAndCompareR1(&builder, quotients, {dividend_data.get()}); - } - - { - XlaBuilder builder(TestName()); - XlaOp dividend; - XlaOp divisor; - auto dividend_data = CreateR1Parameter(dividends, 0, "dividend", - &builder, &dividend); - auto divisor_data = - CreateR1Parameter(divisors, 1, "divisor", &builder, &divisor); - builder.Rem(dividend, divisor); - - ComputeAndCompareR1(&builder, remainders, - {dividend_data.get(), divisor_data.get()}); - } + TestDivRem(dividends, divisors, quotients, remainders); +} - { - XlaBuilder builder(TestName()); - XlaOp dividend; - auto dividend_data = CreateR1Parameter(dividends, 0, "dividend", - &builder, &dividend); - builder.Rem(dividend, builder.ConstantR1(divisors)); +XLA_TEST_F(IntegerDivideOpTest, UnsignedOverflow) { + std::vector dividends = {5}, divisors = {0}, quotients = {-1}, + remainders = {5}; - ComputeAndCompareR1(&builder, remainders, {dividend_data.get()}); - } + TestDivRem(dividends, divisors, quotients, remainders); } XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {{-2.5f, 1.0f}, {-25.5f, 0.0f}, {2.0f, -1.0f}}); - auto b = builder.ConstantR1( - {{10.0f, 0.0f}, {0.0f, 1.0f}, {2.0f, -1.0f}}); - builder.Div(a, b); + auto a = ConstantR1( + &builder, {{-2.5f, 1.0f}, {-25.5f, 0.0f}, {2.0f, -1.0f}}); + 
auto b = ConstantR1(&builder, + {{10.0f, 0.0f}, {0.0f, 1.0f}, {2.0f, -1.0f}}); + Div(a, b); ComputeAndCompareR1( &builder, {{-0.25f, 0.1f}, {0.0f, 25.5f}, {1.0f, 0.0f}}, {}, error_spec_); @@ -568,20 +565,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) { XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Div(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Div(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f, 3.0f, 3.0f, -1.0f, -8.0f}); - auto b = builder.ConstantR1( - {10.0f, 5.1f, 1.0f, 10.0f, -6.0f, 2.0f, -2.0f, 7.0f, -4.0f}); - builder.Rem(a, b); + auto a = ConstantR1( + &builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f, 3.0f, 3.0f, -1.0f, -8.0f}); + auto b = ConstantR1( + &builder, {10.0f, 5.1f, 1.0f, 10.0f, -6.0f, 2.0f, -2.0f, 7.0f, -4.0f}); + Rem(a, b); ComputeAndCompareR1( &builder, {-2.5f, 0.0f, 0.25f, 0.0f, -0.0f, 1.0f, 1.0f, -1.0f, -0.0f}, {}, @@ -590,20 +587,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) { XLA_TEST_F(ArrayElementwiseOpTest, RemZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Rem(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Rem(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {-2.5, 25.5, 2.25, -10.0, 6.0, 3.0, 3.0, -1.0, -8.0}); - auto b = builder.ConstantR1( - {10.0, 5.1, 1.0, 10.0, -6.0, 2.0, -2.0, 7.0, -4.0}); - builder.Rem(a, b); + auto a = ConstantR1( + &builder, {-2.5, 25.5, 2.25, -10.0, 6.0, 3.0, 3.0, -1.0, -8.0}); + auto b = ConstantR1( + &builder, {10.0, 5.1, 1.0, 
10.0, -6.0, 2.0, -2.0, 7.0, -4.0}); + Rem(a, b); ComputeAndCompareR1( &builder, {-2.5, 0.0, 0.25, 0.0, -0.0, 1.0, 1.0, -1.0, -0.0}, {}, @@ -612,9 +609,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) { XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); - auto b = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); - builder.Mul(a, b); + auto a = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); + auto b = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); + Mul(a, b); ComputeAndCompareR1(&builder, {-25.0f, 127.5f, 2.25f, -100.0f, -36.0f}, {}, error_spec_); @@ -622,9 +619,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) { XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Mul(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Mul(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -648,18 +645,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantS32s) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR1(a_data); - auto b = builder.ConstantR1(b_data); - builder.Mul(a, b); + auto a = ConstantR1(&builder, a_data); + auto b = ConstantR1(&builder, b_data); + Mul(a, b); ComputeAndCompareR1(&builder, expected, {}); } XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementS32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Mul(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Mul(a, b); ComputeAndCompareR1(&builder, {}, {}); } @@ -679,20 +676,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR1(a_data); - auto b = builder.ConstantR1(b_data); - builder.Mul(a, b); + auto a = 
ConstantR1(&builder, a_data); + auto b = ConstantR1(&builder, b_data); + Mul(a, b); ComputeAndCompareR1(&builder, expected, {}); } XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {{-2.5f, 0.0f}, {0.0f, 25.5f}, {2.0f, -10.0f}}); - auto b = builder.ConstantR1( - {{0.0f, 10.0f}, {5.0f, 1.0f}, {10.0f, -6.0f}}); - builder.Mul(a, b); + auto a = ConstantR1( + &builder, {{-2.5f, 0.0f}, {0.0f, 25.5f}, {2.0f, -10.0f}}); + auto b = ConstantR1(&builder, + {{0.0f, 10.0f}, {5.0f, 1.0f}, {10.0f, -6.0f}}); + Mul(a, b); ComputeAndCompareR1( &builder, {{0.0f, -25.0f}, {-25.5f, 127.5f}, {-40.0f, -112.0}}, {}, @@ -701,27 +698,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) { XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementC64s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Mul(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Mul(a, b); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, AndPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({false, false, true, true}); - auto b = builder.ConstantR1({false, true, false, true}); - builder.And(a, b); + auto a = ConstantR1(&builder, {false, false, true, true}); + auto b = ConstantR1(&builder, {false, true, false, true}); + And(a, b); ComputeAndCompareR1(&builder, {false, false, false, true}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, AndPredR2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{false, false}, {true, true}}); - auto b = builder.ConstantR2({{false, true}, {false, true}}); - builder.And(a, b); + auto a = ConstantR2(&builder, {{false, false}, {true, true}}); + auto b = ConstantR2(&builder, {{false, true}, {false, true}}); + And(a, b); Array2D expected_array({{false, false}, {false, true}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -729,27 +726,27 
@@ XLA_TEST_F(ArrayElementwiseOpTest, AndPredR2) { XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.And(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + And(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, AndS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0, -1, -8}); - auto b = builder.ConstantR1({5, -7, 12}); - builder.And(a, b); + auto a = ConstantR1(&builder, {0, -1, -8}); + auto b = ConstantR1(&builder, {5, -7, 12}); + And(a, b); ComputeAndCompareR1(&builder, {0, -7, 8}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, AndS32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{0, -5}, {-1, 5}}); - auto b = builder.ConstantR2({{1, -6}, {4, 5}}); - builder.And(a, b); + auto a = ConstantR2(&builder, {{0, -5}, {-1, 5}}); + auto b = ConstantR2(&builder, {{1, -6}, {4, 5}}); + And(a, b); Array2D expected_array({{0, -6}, {4, 5}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -757,27 +754,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, AndS32R2) { XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.And(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + And(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, AndU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0, 1, 8}); - auto b = builder.ConstantR1({5, 7, 12}); - builder.And(a, b); + auto a = ConstantR1(&builder, {0, 1, 8}); + auto b = ConstantR1(&builder, {5, 7, 12}); + And(a, b); ComputeAndCompareR1(&builder, {0, 1, 8}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, AndU32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{0, 1}, {3, 8}}); - auto b = builder.ConstantR2({{1, 0}, {7, 
6}}); - builder.And(a, b); + auto a = ConstantR2(&builder, {{0, 1}, {3, 8}}); + auto b = ConstantR2(&builder, {{1, 0}, {7, 6}}); + And(a, b); Array2D expected_array({{0, 0}, {3, 0}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -785,27 +782,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, AndU32R2) { XLA_TEST_F(ArrayElementwiseOpTest, AndZeroElementU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.And(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + And(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({false, false, true, true}); - auto b = builder.ConstantR1({false, true, false, true}); - builder.Or(a, b); + auto a = ConstantR1(&builder, {false, false, true, true}); + auto b = ConstantR1(&builder, {false, true, false, true}); + Or(a, b); ComputeAndCompareR1(&builder, {false, true, true, true}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrPredR2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{false, false}, {true, true}}); - auto b = builder.ConstantR2({{false, true}, {false, true}}); - builder.Or(a, b); + auto a = ConstantR2(&builder, {{false, false}, {true, true}}); + auto b = ConstantR2(&builder, {{false, true}, {false, true}}); + Or(a, b); Array2D expected_array({{false, true}, {true, true}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -813,27 +810,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, OrPredR2) { XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Or(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Or(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0, -1, 8}); - 
auto b = builder.ConstantR1({5, -7, 4}); - builder.Or(a, b); + auto a = ConstantR1(&builder, {0, -1, 8}); + auto b = ConstantR1(&builder, {5, -7, 4}); + Or(a, b); ComputeAndCompareR1(&builder, {5, -1, 12}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrS32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{0, -1}, {8, 8}}); - auto b = builder.ConstantR2({{5, -7}, {4, 1}}); - builder.Or(a, b); + auto a = ConstantR2(&builder, {{0, -1}, {8, 8}}); + auto b = ConstantR2(&builder, {{5, -7}, {4, 1}}); + Or(a, b); Array2D expected_array({{5, -1}, {12, 9}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -841,27 +838,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, OrS32R2) { XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Or(a, b); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Or(a, b); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0, 1, 8}); - auto b = builder.ConstantR1({5, 7, 4}); - builder.Or(a, b); + auto a = ConstantR1(&builder, {0, 1, 8}); + auto b = ConstantR1(&builder, {5, 7, 4}); + Or(a, b); ComputeAndCompareR1(&builder, {5, 7, 12}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, OrU32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{0, 1}, {8, 8}}); - auto b = builder.ConstantR2({{5, 7}, {4, 1}}); - builder.Or(a, b); + auto a = ConstantR2(&builder, {{0, 1}, {8, 8}}); + auto b = ConstantR2(&builder, {{5, 7}, {4, 1}}); + Or(a, b); Array2D expected_array({{5, 7}, {12, 9}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -869,25 +866,108 @@ XLA_TEST_F(ArrayElementwiseOpTest, OrU32R2) { XLA_TEST_F(ArrayElementwiseOpTest, OrZeroElementU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.Or(a, b); + auto a = 
ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Or(a, b); ComputeAndCompareR1(&builder, {}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, XorPredR1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {false, false, true, true}); + auto b = ConstantR1(&builder, {false, true, false, true}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {false, true, true, false}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorPredR2) { + XlaBuilder builder(TestName()); + auto a = ConstantR2(&builder, {{false, false}, {true, true}}); + auto b = ConstantR2(&builder, {{false, true}, {false, true}}); + Xor(a, b); + + Array2D expected_array({{false, true}, {true, false}}); + ComputeAndCompareR2(&builder, expected_array, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorZeroElementPredR1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorS32R1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {0, -1, 8}); + auto b = ConstantR1(&builder, {5, -7, 4}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {5, 6, 12}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorS32R2) { + XlaBuilder builder(TestName()); + auto a = ConstantR2(&builder, {{0, -1}, {8, 8}}); + auto b = ConstantR2(&builder, {{5, -7}, {4, 1}}); + Xor(a, b); + + Array2D expected_array({{5, 6}, {12, 9}}); + ComputeAndCompareR2(&builder, expected_array, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorZeroElementS32R1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorU32R1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {0, 1, 8}); + auto b = ConstantR1(&builder, {5, 7, 4}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {5, 6, 12}, {}); +} + 
+XLA_TEST_F(ArrayElementwiseOpTest, XorU32R2) { + XlaBuilder builder(TestName()); + auto a = ConstantR2(&builder, {{0, 1}, {8, 8}}); + auto b = ConstantR2(&builder, {{5, 7}, {4, 1}}); + Xor(a, b); + + Array2D expected_array({{5, 6}, {12, 9}}); + ComputeAndCompareR2(&builder, expected_array, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, XorZeroElementU32R1) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + Xor(a, b); + + ComputeAndCompareR1(&builder, {}, {}); +} XLA_TEST_F(ArrayElementwiseOpTest, NotPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({false, true, true, false}); - builder.Not(a); + auto a = ConstantR1(&builder, {false, true, true, false}); + Not(a); ComputeAndCompareR1(&builder, {true, false, false, true}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NotPredR2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{false, true}, {true, false}}); - builder.Not(a); + auto a = ConstantR2(&builder, {{false, true}, {true, false}}); + Not(a); Array2D expected_array({{true, false}, {false, true}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -895,24 +975,24 @@ XLA_TEST_F(ArrayElementwiseOpTest, NotPredR2) { XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementPredR1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.Not(a); + auto a = ConstantR1(&builder, {}); + Not(a); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NotS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-1, 0, 1}); - builder.Not(a); + auto a = ConstantR1(&builder, {-1, 0, 1}); + Not(a); ComputeAndCompareR1(&builder, {0, -1, -2}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NotS32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{-1, 0}, {1, 8}}); - builder.Not(a); + auto a = ConstantR2(&builder, {{-1, 0}, {1, 8}}); + Not(a); Array2D expected_array({{0, -1}, {-2, -9}}); ComputeAndCompareR2(&builder, 
expected_array, {}); @@ -920,24 +1000,24 @@ XLA_TEST_F(ArrayElementwiseOpTest, NotS32R2) { XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementS32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.Not(a); + auto a = ConstantR1(&builder, {}); + Not(a); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NotU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0, 4294967295}); - builder.Not(a); + auto a = ConstantR1(&builder, {0, 4294967295}); + Not(a); ComputeAndCompareR1(&builder, {4294967295, 0}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NotU32R2) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{0, 4294967295}, {1, 4294967294}}); - builder.Not(a); + auto a = ConstantR2(&builder, {{0, 4294967295}, {1, 4294967294}}); + Not(a); Array2D expected_array({{4294967295, 0}, {4294967294, 1}}); ComputeAndCompareR2(&builder, expected_array, {}); @@ -945,19 +1025,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, NotU32R2) { XLA_TEST_F(ArrayElementwiseOpTest, NotZeroElementU32R1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.Not(a); + auto a = ConstantR1(&builder, {}); + Not(a); ComputeAndCompareR1(&builder, {}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftS32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({static_cast(0x12345678), - static_cast(0xF0001000), 1, 3, 77, - 1, -3, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 15, 32, 100, -1}); - builder.ShiftLeft(a, b); + auto a = ConstantR1( + &builder, {static_cast(0x12345678), static_cast(0xF0001000), + 1, 3, 77, 1, -3, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 15, 32, 100, -1}); + ShiftLeft(a, b); ComputeAndCompareR1(&builder, {static_cast(0x23456780), 0x00100000, 0x4, @@ -967,11 +1047,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftS32) { XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticS32) { XlaBuilder builder(TestName()); - auto a = 
builder.ConstantR1({static_cast(0x92345678), - static_cast(0x10001000), 1, 3, 77, - 1, -3, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 2, 32, 100, -1}); - builder.ShiftRightArithmetic(a, b); + auto a = ConstantR1( + &builder, {static_cast(0x92345678), static_cast(0x10001000), + 1, 3, 77, 1, -3, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 2, 32, 100, -1}); + ShiftRightArithmetic(a, b); ComputeAndCompareR1( &builder, @@ -982,11 +1062,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticS32) { XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalS32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({static_cast(0x92345678), - static_cast(0x10001000), 1, 3, 77, - 1, -3, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 5, 32, 100, -1}); - builder.ShiftRightLogical(a, b); + auto a = ConstantR1( + &builder, {static_cast(0x92345678), static_cast(0x10001000), + 1, 3, 77, 1, -3, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 5, 32, 100, -1}); + ShiftRightLogical(a, b); ComputeAndCompareR1(&builder, {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {}); @@ -994,10 +1074,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalS32) { XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftU32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {0x12345678, 0xF0001000, 1, 3, 77, 1, ~3u, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 15, 32, 100, ~0u}); - builder.ShiftLeft(a, b); + auto a = ConstantR1(&builder, + {0x12345678, 0xF0001000, 1, 3, 77, 1, ~3u, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 15, 32, 100, ~0u}); + ShiftLeft(a, b); ComputeAndCompareR1( &builder, {0x23456780, 0x00100000, 0x4, 0x180, 2523136, 0, 0, 0}, {}); @@ -1005,10 +1085,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftLeftU32) { XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticU32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 2, 32, 100, ~0u}); 
- builder.ShiftRightArithmetic(a, b); + auto a = ConstantR1(&builder, + {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 2, 32, 100, ~0u}); + ShiftRightArithmetic(a, b); ComputeAndCompareR1( &builder, {0xF9234567, 0x00100010, 0, 0, 19, 0, ~0u, 0}, {}); @@ -1016,10 +1096,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightArithmeticU32) { XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalU32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77}); - auto b = builder.ConstantR1({4, 8, 2, 7, 5, 32, 100, ~0u}); - builder.ShiftRightLogical(a, b); + auto a = ConstantR1(&builder, + {0x92345678, 0x10001000, 1, 3, 77, 1, ~3u, 77}); + auto b = ConstantR1(&builder, {4, 8, 2, 7, 5, 32, 100, ~0u}); + ShiftRightLogical(a, b); ComputeAndCompareR1(&builder, {0x09234567, 0x00100010, 0, 0, 2, 0, 0, 0}, {}); @@ -1028,18 +1108,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, ShiftRightLogicalU32) { XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 5.0f, 2.25f, 10.0f, NAN}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 2.25f, 10.0f, NAN}); + Eq(lhs, rhs); ComputeAndCompareR1(&builder, {false, false, true, false, false}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Eq(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}); } @@ -1047,9 +1127,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) { SetFastMathDisabled(true); XlaBuilder 
builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, NAN}); - builder.Ge(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, NAN}); + Ge(lhs, rhs); ComputeAndCompareR1(&builder, {false, true, true, false, false}, {}); } @@ -1057,9 +1137,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, NAN}); - builder.Gt(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, NAN}); + Gt(lhs, rhs); ComputeAndCompareR1(&builder, {false, true, true, false, false}, {}); } @@ -1067,9 +1147,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareLeF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 5.0f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, NAN}); - builder.Le(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 5.0f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, NAN}); + Le(lhs, rhs); ComputeAndCompareR1(&builder, {true, true, false, false, false}, {}); } @@ -1077,9 +1157,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeF32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, NAN}); - builder.Lt(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 
10.0f, NAN}); + Lt(lhs, rhs); ComputeAndCompareR1(&builder, {true, false, false, false, false}, {}); } @@ -1088,9 +1168,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Eq(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, max}); + Eq(lhs, rhs); ComputeAndCompareR1( &builder, {true, false, false, false, true, false, false, false, true}, @@ -1099,9 +1180,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Eq(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}); } @@ -1109,26 +1190,26 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({{-2.5f, 10.0f}, - {1.0f, 25.5f}, - {2.25f, -3.0f}, - {NAN, 0.0f}, - {1.0f, 6.0f}}); - auto rhs = builder.ConstantR1({{0.0f, 10.0f}, - {1.0f, 5.0f}, - {2.25f, -3.0f}, - {10.0f, 0.0f}, - {1.0f, NAN}}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {{-2.5f, 10.0f}, + {1.0f, 25.5f}, + {2.25f, -3.0f}, + {NAN, 0.0f}, + {1.0f, 6.0f}}); + auto rhs = ConstantR1(&builder, {{0.0f, 10.0f}, + {1.0f, 5.0f}, + {2.25f, -3.0f}, + {10.0f, 0.0f}, + {1.0f, NAN}}); + Eq(lhs, rhs); ComputeAndCompareR1(&builder, {false, false, true, false, false}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementC64s) { XlaBuilder 
builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Eq(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}); } @@ -1138,17 +1219,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({{-2.5f, 10.0f}, - {1.0f, 25.5f}, - {2.25f, -3.0f}, - {NAN, 0.0f}, - {1.0f, 6.0f}}); - auto rhs = builder.ConstantR1({{0.0f, 10.0f}, - {1.0f, 5.0f}, - {2.25f, -3.0f}, - {10.0f, 0.0f}, - {1.0f, NAN}}); - builder.Ne(lhs, rhs); + auto lhs = ConstantR1(&builder, {{-2.5f, 10.0f}, + {1.0f, 25.5f}, + {2.25f, -3.0f}, + {NAN, 0.0f}, + {1.0f, 6.0f}}); + auto rhs = ConstantR1(&builder, {{0.0f, 10.0f}, + {1.0f, 5.0f}, + {2.25f, -3.0f}, + {10.0f, 0.0f}, + {1.0f, NAN}}); + Ne(lhs, rhs); ComputeAndCompareR1(&builder, {true, true, false, true, true}, {}); } @@ -1158,9 +1239,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({10.0f, 25.5f, 1.0f, 10.0f, NAN}); - builder.Ne(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {10.0f, 25.5f, 1.0f, 10.0f, NAN}); + Ne(lhs, rhs); ComputeAndCompareR1(&builder, {true, false, true, true, true}, {}); } @@ -1169,9 +1250,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Ne(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, 
max}); + Ne(lhs, rhs); ComputeAndCompareR1( &builder, {false, true, true, true, false, true, true, true, false}, {}); @@ -1181,9 +1263,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Ge(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, max}); + Ge(lhs, rhs); ComputeAndCompareR1( &builder, {true, false, false, true, true, false, true, true, true}, {}); @@ -1193,9 +1276,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Gt(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, max}); + Gt(lhs, rhs); ComputeAndCompareR1( &builder, {false, false, false, true, false, false, true, true, false}, @@ -1206,9 +1290,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Le(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, max}); + Le(lhs, rhs); ComputeAndCompareR1( &builder, {true, true, true, false, true, true, false, false, 
true}, {}); @@ -1218,9 +1303,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({min, min, min, 0, 0, 0, max, max, max}); - auto rhs = builder.ConstantR1({min, 0, max, -1, 0, 1, min, 0, max}); - builder.Lt(lhs, rhs); + auto lhs = + ConstantR1(&builder, {min, min, min, 0, 0, 0, max, max, max}); + auto rhs = ConstantR1(&builder, {min, 0, max, -1, 0, 1, min, 0, max}); + Lt(lhs, rhs); ComputeAndCompareR1( &builder, {false, true, true, false, false, true, false, false, false}, @@ -1230,9 +1316,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Eq(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Eq(lhs, rhs); ComputeAndCompareR1( &builder, {true, false, false, false, true, false, false, false, true}, @@ -1242,9 +1328,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Ne(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Ne(lhs, rhs); ComputeAndCompareR1( &builder, {false, true, true, true, false, true, true, true, false}, {}); @@ -1253,9 +1339,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) { const 
uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Ge(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Ge(lhs, rhs); ComputeAndCompareR1( &builder, {true, false, false, true, true, false, true, true, true}, {}); @@ -1264,9 +1350,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Gt(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Gt(lhs, rhs); ComputeAndCompareR1( &builder, {false, false, false, true, false, false, true, true, false}, @@ -1276,9 +1362,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Le(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Le(lhs, rhs); ComputeAndCompareR1( &builder, {true, true, true, false, true, true, false, false, true}, {}); @@ -1287,9 +1373,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) { XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 0, 0, 5, 5, 5, max, max, max}); - auto rhs = 
builder.ConstantR1({0, 1, max, 4, 5, 6, 0, 1, max}); - builder.Lt(lhs, rhs); + auto lhs = ConstantR1(&builder, {0, 0, 0, 5, 5, 5, max, max, max}); + auto rhs = ConstantR1(&builder, {0, 1, max, 4, 5, 6, 0, 1, max}); + Lt(lhs, rhs); ComputeAndCompareR1( &builder, {false, true, true, false, false, true, false, false, false}, @@ -1300,10 +1386,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); auto lhs = - builder.ConstantR1({4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f}); + ConstantR1(&builder, {4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f}); auto rhs = - builder.ConstantR1({2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f}); - builder.Pow(lhs, rhs); + ConstantR1(&builder, {2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f}); + Pow(lhs, rhs); ComputeAndCompareR1( &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_); @@ -1312,9 +1398,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) { XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) { SetFastMathDisabled(true); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({-2.0f, -0.6f, -0.6f, 0.0f}); - auto rhs = builder.ConstantR1({0.5f, 0.6f, -0.6f, -0.6f}); - builder.Pow(lhs, rhs); + auto lhs = ConstantR1(&builder, {-2.0f, -0.6f, -0.6f, 0.0f}); + auto rhs = ConstantR1(&builder, {0.5f, 0.6f, -0.6f, -0.6f}); + Pow(lhs, rhs); ComputeAndCompareR1(&builder, {NAN, NAN, NAN, INFINITY}, {}, error_spec_); @@ -1322,9 +1408,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) { XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Pow(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Pow(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -1336,14 +1422,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) { std::vector values = {1.0f, 2.0f, 3.2f, -4.0f}; std::vector exponents = 
{0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; - std::unique_ptr param_literal = Literal::CreateR1(values); + std::unique_ptr param_literal = LiteralUtil::CreateR1(values); std::unique_ptr param_data = client_->TransferToServer(*param_literal).ConsumeValueOrDie(); - auto sum = b.ConstantR0(0.0f); - auto param = b.Parameter(0, param_literal->shape(), "param"); + auto sum = ConstantR0(&b, 0.0f); + auto param = Parameter(&b, 0, param_literal->shape(), "param"); for (float exponent : exponents) { - sum = b.Add(sum, b.Pow(param, b.ConstantR0(exponent))); + sum = Add(sum, Pow(param, ConstantR0(&b, exponent))); } std::vector expected; @@ -1364,15 +1450,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - b.Pow(b.Exp(param0), param1); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + Pow(Exp(param0), param1); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1389,15 +1475,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { std::vector values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = 
client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - b.Log(b.Pow(param0, param1)); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + Log(Pow(param0, param1)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1414,15 +1500,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) { std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - b.Mul(b.Exp(param0), b.Exp(param1)); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + Mul(Exp(param0), Exp(param1)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1439,15 +1525,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) { std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = 
client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - b.Div(param0, b.Exp(param1)); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + Div(param0, Exp(param1)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1465,21 +1551,21 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) { std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; std::vector values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - std::unique_ptr literal2 = Literal::CreateR1(values2); + std::unique_ptr literal2 = LiteralUtil::CreateR1(values2); std::unique_ptr data2 = client_->TransferToServer(*literal2).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - auto param2 = b.Parameter(2, literal2->shape(), "param2"); - b.Div(b.Div(param0, param1), param2); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + auto param2 = Parameter(&b, 2, literal2->shape(), "param2"); + Div(Div(param0, param1), param2); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1497,22 
+1583,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) { std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; std::vector values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - std::unique_ptr literal2 = Literal::CreateR1(values2); + std::unique_ptr literal2 = LiteralUtil::CreateR1(values2); std::unique_ptr data2 = client_->TransferToServer(*literal2).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - auto param2 = b.Parameter(2, literal2->shape(), "param2"); - b.Div(param0, b.Div(param1, param2)); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + auto param2 = Parameter(&b, 2, literal2->shape(), "param2"); + Div(param0, Div(param1, param2)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1530,22 +1616,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) { std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f}; std::vector values2 = {0.1f, 1.1f, 6.9f, 9.5f, -11.0f, -0.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - std::unique_ptr literal2 = Literal::CreateR1(values2); + std::unique_ptr 
literal2 = LiteralUtil::CreateR1(values2); std::unique_ptr data2 = client_->TransferToServer(*literal2).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - auto param2 = b.Parameter(2, literal2->shape(), "param2"); - b.Div(param0, b.Pow(param1, param2)); + auto param0 = Parameter(&b, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + auto param2 = Parameter(&b, 2, literal2->shape(), "param2"); + Div(param0, Pow(param1, param2)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1564,27 +1650,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) { std::vector values2 = {0.1f, 1.1f, 6.9f, 12.5f, -15.0f, -0.5f}; std::vector values3 = {2.1f, 3.1f, 9.9f, -4.5f, -11.0f, -21.5f}; - std::unique_ptr literal0 = Literal::CreateR1(values0); + std::unique_ptr literal0 = LiteralUtil::CreateR1(values0); std::unique_ptr data0 = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1(values1); + std::unique_ptr literal1 = LiteralUtil::CreateR1(values1); std::unique_ptr data1 = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - std::unique_ptr literal2 = Literal::CreateR1(values2); + std::unique_ptr literal2 = LiteralUtil::CreateR1(values2); std::unique_ptr data2 = client_->TransferToServer(*literal2).ConsumeValueOrDie(); - std::unique_ptr literal3 = Literal::CreateR1(values3); + std::unique_ptr literal3 = LiteralUtil::CreateR1(values3); std::unique_ptr data3 = client_->TransferToServer(*literal3).ConsumeValueOrDie(); - auto param0 = b.Parameter(0, literal0->shape(), "param0"); - auto param1 = b.Parameter(1, literal1->shape(), "param1"); - auto param2 = b.Parameter(2, literal2->shape(), "param2"); - auto param3 = b.Parameter(3, literal3->shape(), "param2"); - b.Div(b.Div(param0, param1), b.Div(param2, param3)); + auto param0 = Parameter(&b, 0, 
literal0->shape(), "param0"); + auto param1 = Parameter(&b, 1, literal1->shape(), "param1"); + auto param2 = Parameter(&b, 2, literal2->shape(), "param2"); + auto param3 = Parameter(&b, 3, literal3->shape(), "param2"); + Div(Div(param0, param1), Div(param2, param3)); std::vector expected(values0.size()); for (int64 i = 0; i < values0.size(); ++i) { @@ -1604,8 +1690,8 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) { for (int i = 0; i < count; ++i) { values.push_back(i / static_cast(count)); } - auto x = builder.ConstantR1(values); - builder.Pow(x, builder.ConstantR0(2.0f)); + auto x = ConstantR1(&builder, values); + Pow(x, ConstantR0(&builder, 2.0f)); std::vector expected; expected.reserve(values.size()); @@ -1630,8 +1716,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) { Array4D expected(2, 2, 2, 2, expected_vector); - auto x = builder.ConstantR4FromArray4D(values); - builder.Pow(x, builder.ConstantR0(2.0f)); + auto x = ConstantR4FromArray4D(&builder, values); + Pow(x, ConstantR0(&builder, 2.0f)); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } @@ -1641,8 +1727,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4DZeroElements) { Array4D values(2, 2, 0, 2); Array4D expected(2, 2, 0, 2); - auto x = builder.ConstantR4FromArray4D(values); - builder.Pow(x, builder.ConstantR0(2.0f)); + auto x = ConstantR4FromArray4D(&builder, values); + Pow(x, ConstantR0(&builder, 2.0f)); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } @@ -1650,9 +1736,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4DZeroElements) { XLA_TEST_F(ArrayElementwiseOpTest, MinF32s) { XlaBuilder builder(TestName()); SetFastMathDisabled(true); - auto lhs = builder.ConstantR1({1.0f, 1.0f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({2.0f, -5.0f, 1.0f, 10.0f, NAN}); - builder.Min(lhs, rhs); + auto lhs = ConstantR1(&builder, {1.0f, 1.0f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {2.0f, -5.0f, 1.0f, 10.0f, NAN}); + Min(lhs, rhs); 
ComputeAndCompareR1(&builder, {1.0f, -5.0f, 1.0f, NAN, NAN}, {}, error_spec_); @@ -1660,18 +1746,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinF32s) { XLA_TEST_F(ArrayElementwiseOpTest, MinZeroElementF32s) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Min(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Min(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, MinF64s) { XlaBuilder builder(TestName()); SetFastMathDisabled(true); - auto lhs = builder.ConstantR1({1.0, 1.0, 2.25, NAN, 6.0}); - auto rhs = builder.ConstantR1({2.0, -5.0, 1.0, 10.0, NAN}); - builder.Min(lhs, rhs); + auto lhs = ConstantR1(&builder, {1.0, 1.0, 2.25, NAN, 6.0}); + auto rhs = ConstantR1(&builder, {2.0, -5.0, 1.0, 10.0, NAN}); + Min(lhs, rhs); ComputeAndCompareR1(&builder, {1.0, -5.0, 1.0, NAN, NAN}, {}, error_spec_); @@ -1680,9 +1766,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinF64s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxF32s) { XlaBuilder builder(TestName()); SetFastMathDisabled(true); - auto lhs = builder.ConstantR1({1.0f, 1.0f, 2.25f, NAN, 6.0f}); - auto rhs = builder.ConstantR1({2.0f, -5.0f, 1.0f, 10.0f, NAN}); - builder.Max(lhs, rhs); + auto lhs = ConstantR1(&builder, {1.0f, 1.0f, 2.25f, NAN, 6.0f}); + auto rhs = ConstantR1(&builder, {2.0f, -5.0f, 1.0f, 10.0f, NAN}); + Max(lhs, rhs); ComputeAndCompareR1(&builder, {2.0f, 1.0f, 2.25f, NAN, NAN}, {}, error_spec_); @@ -1690,18 +1776,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxF32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxZeroElementF32s) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - builder.Max(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Max(lhs, rhs); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, MaxF64s) { XlaBuilder 
builder(TestName()); SetFastMathDisabled(true); - auto lhs = builder.ConstantR1({1.0, 1.0, 2.25, NAN, 6.0}); - auto rhs = builder.ConstantR1({2.0, -5.0, 1.0, 10.0, NAN}); - builder.Max(lhs, rhs); + auto lhs = ConstantR1(&builder, {1.0, 1.0, 2.25, NAN, 6.0}); + auto rhs = ConstantR1(&builder, {2.0, -5.0, 1.0, 10.0, NAN}); + Max(lhs, rhs); ComputeAndCompareR1(&builder, {2.0, 1.0, 2.25, NAN, NAN}, {}, error_spec_); @@ -1711,11 +1797,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max}); - auto y = builder.ConstantR1( - {min, max, 0, -10, 0, -1, 0, 1, 0, 10, 0, max, min}); - builder.Max(x, y); + auto x = ConstantR1( + &builder, {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max}); + auto y = ConstantR1( + &builder, {min, max, 0, -10, 0, -1, 0, 1, 0, 10, 0, max, min}); + Max(x, y); std::vector expected = {min, max, 0, -1, 0, 0, 0, 1, 1, 10, max, max, max}; @@ -1726,11 +1812,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) { const int32 min = std::numeric_limits::min(); const int32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max}); - auto y = builder.ConstantR1( - {min, max, 0, -10, 0, -1, 0, 1, 0, 10, 0, max, min}); - builder.Min(x, y); + auto x = ConstantR1( + &builder, {min, min, min, -1, -1, 0, 0, 0, 1, 1, max, max, max}); + auto y = ConstantR1( + &builder, {min, max, 0, -10, 0, -1, 0, 1, 0, 10, 0, max, min}); + Min(x, y); std::vector expected = {min, min, min, -10, -1, -1, 0, 0, 0, 1, 0, max, min}; @@ -1740,9 +1826,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0, 0, 1, 1, 1, max, max, max}); - 
auto y = builder.ConstantR1({0, 1, 0, 1, 10, 0, 234234, max}); - builder.Max(x, y); + auto x = ConstantR1(&builder, {0, 0, 1, 1, 1, max, max, max}); + auto y = ConstantR1(&builder, {0, 1, 0, 1, 10, 0, 234234, max}); + Max(x, y); std::vector expected = {0, 1, 1, 1, 10, max, max, max}; ComputeAndCompareR1(&builder, expected, {}); @@ -1751,9 +1837,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) { XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) { const uint32 max = std::numeric_limits::max(); XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0, 0, 1, 1, 1, max, max, max}); - auto y = builder.ConstantR1({0, 1, 0, 1, 10, 0, 234234, max}); - builder.Min(x, y); + auto x = ConstantR1(&builder, {0, 0, 1, 1, 1, max, max, max}); + auto y = ConstantR1(&builder, {0, 1, 0, 1, 10, 0, 234234, max}); + Min(x, y); std::vector expected = {0, 0, 0, 1, 1, 0, 234234, max}; ComputeAndCompareR1(&builder, expected, {}); @@ -1761,11 +1847,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}); - auto y = builder.ConstantR1( - {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0}); - builder.Max(x, y); + auto x = ConstantR1( + &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}); + auto y = ConstantR1( + &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0}); + Max(x, y); std::vector expected = {-0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; @@ -1774,9 +1860,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) { XlaBuilder builder(TestName()); - auto u = builder.ConstantR1({3.5}); - auto v = builder.ConstantR1({}); - builder.Max(u, v); + auto u = ConstantR1(&builder, {3.5}); + auto v = ConstantR1(&builder, {}); + Max(u, v); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -1784,9 +1870,9 @@ 
XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S1AndR1S0F32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) { for (int broadcast_dim : {0, 1}) { XlaBuilder builder(TestName()); - auto u = builder.ConstantR1({3.5}); - auto v = builder.ConstantR2FromArray2D(Array2D(0, 2)); - builder.Max(u, v, /*broadcast_dimensions=*/{broadcast_dim}); + auto u = ConstantR1(&builder, {3.5}); + auto v = ConstantR2FromArray2D(&builder, Array2D(0, 2)); + Max(u, v, /*broadcast_dimensions=*/{broadcast_dim}); ComputeAndCompareR2(&builder, Array2D(0, 2), {}, error_spec_); } @@ -1794,10 +1880,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) { XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) { XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - auto m = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - builder.Max(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + auto m = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + Max(v, m, /*broadcast_dimensions=*/{1}); Array2D expected({{2.0f, 3.14f, 4.0f}, {2.25f, 3.0f, 4.0f}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -1805,9 +1891,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) { XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({}); - auto m = builder.ConstantR2({{}, {}}); - builder.Max(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {}); + auto m = ConstantR2(&builder, {{}, {}}); + Max(v, m, /*broadcast_dimensions=*/{1}); Array2D expected({{}, {}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -1815,10 +1901,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DZeroElementF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) { XlaBuilder builder(TestName()); - auto scalar = builder.ConstantR0(2); + auto scalar = ConstantR0(&builder, 2); Array3D a_3d({{{3, 9, 
-1}, {2, -10, 3}}, {{-2, 2, 8}, {12, 10, 4}}}); - auto array = builder.ConstantR3FromArray3D(a_3d); - builder.Max(array, scalar, /*broadcast_dimensions=*/{}); + auto array = ConstantR3FromArray3D(&builder, a_3d); + Max(array, scalar, /*broadcast_dimensions=*/{}); Array3D expected({{{3, 9, 2}, {2, 2, 3}}, {{2, 2, 8}, {12, 10, 4}}}); ComputeAndCompareR3(&builder, expected, {}); @@ -1826,10 +1912,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarS32s) { XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) { XlaBuilder builder(TestName()); - auto scalar = builder.ConstantR0(2); + auto scalar = ConstantR0(&builder, 2); Array3D a_3d(2, 0, 3); - auto array = builder.ConstantR3FromArray3D(a_3d); - builder.Max(array, scalar, /*broadcast_dimensions=*/{}); + auto array = ConstantR3FromArray3D(&builder, a_3d); + Max(array, scalar, /*broadcast_dimensions=*/{}); Array3D expected(2, 0, 3); ComputeAndCompareR3(&builder, expected, {}); @@ -1837,10 +1923,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) { XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) { XlaBuilder builder(TestName()); - auto m = - builder.ConstantR2({{-10.4f, 64.0f, 6.0f}, {0.1f, 32.0f, 16.1f}}); - auto v = builder.ConstantR1({-10.2f, 16.4f}); - builder.Min(m, v, /*broadcast_dimensions=*/{0}); + auto m = ConstantR2(&builder, + {{-10.4f, 64.0f, 6.0f}, {0.1f, 32.0f, 16.1f}}); + auto v = ConstantR1(&builder, {-10.2f, 16.4f}); + Min(m, v, /*broadcast_dimensions=*/{0}); Array2D expected({{-10.4f, -10.2f, -10.2f}, {0.1f, 16.4f, 16.1f}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -1848,9 +1934,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) { XlaBuilder builder(TestName()); - auto m = builder.ConstantR2({{}, {}}); - auto v = builder.ConstantR1({-10.2f, 16.4f}); - builder.Min(m, v, /*broadcast_dimensions=*/{0}); + auto m = ConstantR2(&builder, {{}, {}}); + auto v = ConstantR1(&builder, {-10.2f, 
16.4f}); + Min(m, v, /*broadcast_dimensions=*/{0}); Array2D expected({{}, {}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -1859,11 +1945,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DZeroElementF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) { XlaBuilder builder(TestName()); auto array2d = - builder.ConstantR2({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}}); - auto array4d = builder.ConstantR4FromArray4D( - {{{{-12.1f, 32.3f, 6.2f}}, {{0.0f, 32.5f, 3.0f}}}, - {{{-2.5f, 64.29f, 6.5f}}, {{-0.01f, 32.25f, 2.6f}}}}); - builder.Min(array2d, array4d, /*broadcast_dimensions=*/{1, 3}); + ConstantR2(&builder, {{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}}); + auto array4d = ConstantR4FromArray4D( + &builder, {{{{-12.1f, 32.3f, 6.2f}}, {{0.0f, 32.5f, 3.0f}}}, + {{{-2.5f, 64.29f, 6.5f}}, {{-0.01f, 32.25f, 2.6f}}}}); + Min(array2d, array4d, /*broadcast_dimensions=*/{1, 3}); Array4D expected( {{{{-12.2f, 32.3f, 6.1f}}, {{0.0f, 32.2f, 2.5f}}}, @@ -1874,10 +1960,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) { XlaBuilder builder(TestName()); auto array2d = - builder.ConstantR2({{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}}); + ConstantR2(&builder, {{-12.2f, 64.3f, 6.1f}, {0.0f, 32.2f, 2.5f}}); Array4D arg(2, 2, 0, 3); - auto array4d = builder.ConstantR4FromArray4D(arg); - builder.Min(array2d, array4d, /*broadcast_dimensions=*/{1, 3}); + auto array4d = ConstantR4FromArray4D(&builder, arg); + Min(array2d, array4d, /*broadcast_dimensions=*/{1, 3}); Array4D expected(2, 2, 0, 3); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -1885,9 +1971,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo4DZeroElementF32s) { XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - auto y = builder.ConstantR1({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); - builder.Min(x, y); + auto x = ConstantR1(&builder, {0, 1, 2, 3, 
4, 5, 6, 7, 8, 9}); + auto y = ConstantR1(&builder, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); + Min(x, y); std::vector expected = {0, 1, 2, 3, 4, 4, 3, 2, 1, 0}; ComputeAndCompareR1(&builder, expected, {}); @@ -1895,9 +1981,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinTenS32s) { XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - auto y = builder.ConstantR1({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); - builder.Max(x, y); + auto x = ConstantR1(&builder, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto y = ConstantR1(&builder, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}); + Max(x, y); std::vector expected = {9, 8, 7, 6, 5, 5, 6, 7, 8, 9}; ComputeAndCompareR1(&builder, expected, {}); @@ -1905,19 +1991,20 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxTenS32s) { XLA_TEST_F(ArrayElementwiseOpTest, RemTwoConstantS32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-3, 26, 2, -1, 1}); - auto b = builder.ConstantR1({10, 5, 1, 10, -10}); - builder.Rem(a, b); + auto a = ConstantR1(&builder, {-3, 26, 2, -1, 1}); + auto b = ConstantR1(&builder, {10, 5, 1, 10, -10}); + Rem(a, b); ComputeAndCompareR1(&builder, {-3, 1, 0, -1, 1}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) { XlaBuilder builder(TestName()); - auto minimum = builder.ConstantR1({1.0f, -6.5f, 1.0f, 2.25f, 0.0f}); - auto argument = builder.ConstantR1({2.0f, 10.0f, -5.0f, 1.0f, 10.0f}); - auto maximum = builder.ConstantR1({3.0f, 0.5f, 25.5f, 5.0f, 123.0}); - builder.Clamp(minimum, argument, maximum); + auto minimum = ConstantR1(&builder, {1.0f, -6.5f, 1.0f, 2.25f, 0.0f}); + auto argument = + ConstantR1(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 10.0f}); + auto maximum = ConstantR1(&builder, {3.0f, 0.5f, 25.5f, 5.0f, 123.0}); + Clamp(minimum, argument, maximum); ComputeAndCompareR1(&builder, {2.0f, 0.5f, 1.0f, 2.25f, 10.0f}, {}, error_spec_); @@ -1925,10 +2012,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) { 
XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) { XlaBuilder builder(TestName()); - auto minimum = builder.ConstantR0(0.0f); - auto argument = builder.ConstantR1({2.0f, 10.0f, -5.0f, 1.0f, 4.0f}); - auto maximum = builder.ConstantR0(5.0f); - builder.Clamp(minimum, argument, maximum); + auto minimum = ConstantR0(&builder, 0.0f); + auto argument = ConstantR1(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 4.0f}); + auto maximum = ConstantR0(&builder, 5.0f); + Clamp(minimum, argument, maximum); ComputeAndCompareR1(&builder, {2.0f, 5.0f, 0.0f, 1.0f, 4.0f}, {}, error_spec_); @@ -1936,16 +2023,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) { XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) { XlaBuilder builder(TestName()); - auto min_scalar = builder.ConstantR0(0.0f); - auto min_vector = builder.ConstantR1({1.0f, -6.5f, 1.0f, 2.25f, 0.0f}); - auto arg_vector = builder.ConstantR1({2.0f, 10.0f, -5.0f, 1.0f, 4.0f}); - auto max_scalar = builder.ConstantR0(3.0f); - auto max_vector = builder.ConstantR1({3.0f, 0.5f, 25.5f, 5.0f, 123.0}); + auto min_scalar = ConstantR0(&builder, 0.0f); + auto min_vector = + ConstantR1(&builder, {1.0f, -6.5f, 1.0f, 2.25f, 0.0f}); + auto arg_vector = + ConstantR1(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 4.0f}); + auto max_scalar = ConstantR0(&builder, 3.0f); + auto max_vector = + ConstantR1(&builder, {3.0f, 0.5f, 25.5f, 5.0f, 123.0}); // Perform clamp with broadcasted scalar and vector. 
- builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar), - builder.Clamp(min_scalar, arg_vector, max_vector)), - builder.Add(builder.Clamp(min_vector, arg_vector, max_vector), - builder.Clamp(min_scalar, arg_vector, max_scalar))); + Add(Add(Clamp(min_vector, arg_vector, max_scalar), + Clamp(min_scalar, arg_vector, max_vector)), + Add(Clamp(min_vector, arg_vector, max_vector), + Clamp(min_scalar, arg_vector, max_scalar))); ComputeAndCompareR1(&builder, {8.0f, 7.0f, 2.0f, 6.5f, 14.0f}, {}, error_spec_); @@ -1953,52 +2043,52 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) { XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) { XlaBuilder builder(TestName()); - auto min_vector = builder.ConstantR1({1, -6, 1, 2, 0, -5}); - auto arg_vector = builder.ConstantR1({2, 10, -5, 1, 4, 10}); - auto max_vector = builder.ConstantR1({3, 0, 25, 5, 123, -1}); - builder.Clamp(min_vector, arg_vector, max_vector); + auto min_vector = ConstantR1(&builder, {1, -6, 1, 2, 0, -5}); + auto arg_vector = ConstantR1(&builder, {2, 10, -5, 1, 4, 10}); + auto max_vector = ConstantR1(&builder, {3, 0, 25, 5, 123, -1}); + Clamp(min_vector, arg_vector, max_vector); ComputeAndCompareR1(&builder, {2, 0, 1, 2, 4, -1}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, ClampS32ScalarVector) { XlaBuilder builder(TestName()); - auto min_scalar = builder.ConstantR0(0); - auto min_vector = builder.ConstantR1({1, -6, 1, 2, 0}); - auto arg_vector = builder.ConstantR1({2, 10, -5, 1, 4}); - auto max_scalar = builder.ConstantR0(3); - auto max_vector = builder.ConstantR1({3, 1, 25, 5, 123}); + auto min_scalar = ConstantR0(&builder, 0); + auto min_vector = ConstantR1(&builder, {1, -6, 1, 2, 0}); + auto arg_vector = ConstantR1(&builder, {2, 10, -5, 1, 4}); + auto max_scalar = ConstantR0(&builder, 3); + auto max_vector = ConstantR1(&builder, {3, 1, 25, 5, 123}); // Perform clamp with broadcasted scalar and vector. 
- builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar), - builder.Clamp(min_scalar, arg_vector, max_vector)), - builder.Add(builder.Clamp(min_vector, arg_vector, max_vector), - builder.Clamp(min_scalar, arg_vector, max_scalar))); + Add(Add(Clamp(min_vector, arg_vector, max_scalar), + Clamp(min_scalar, arg_vector, max_vector)), + Add(Clamp(min_vector, arg_vector, max_vector), + Clamp(min_scalar, arg_vector, max_scalar))); ComputeAndCompareR1(&builder, {8, 8, 2, 6, 14}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) { XlaBuilder builder(TestName()); - auto min_vector = builder.ConstantR1({1, 2, 1, 2, 0, ~0u - 4}); - auto arg_vector = builder.ConstantR1({2, 10, 5, 1, 4, 10}); - auto max_vector = builder.ConstantR1({3, 5, 25, 5, 123, ~0u}); - builder.Clamp(min_vector, arg_vector, max_vector); + auto min_vector = ConstantR1(&builder, {1, 2, 1, 2, 0, ~0u - 4}); + auto arg_vector = ConstantR1(&builder, {2, 10, 5, 1, 4, 10}); + auto max_vector = ConstantR1(&builder, {3, 5, 25, 5, 123, ~0u}); + Clamp(min_vector, arg_vector, max_vector); ComputeAndCompareR1(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {}); } XLA_TEST_F(ArrayElementwiseOpTest, ClampU32ScalarVector) { XlaBuilder builder(TestName()); - auto min_scalar = builder.ConstantR0(0); - auto min_vector = builder.ConstantR1({1, 0, 1, 2, 0}); - auto arg_vector = builder.ConstantR1({2, 10, 0, 1, 4}); - auto max_scalar = builder.ConstantR0(3); - auto max_vector = builder.ConstantR1({3, 1, 25, 5, 123}); + auto min_scalar = ConstantR0(&builder, 0); + auto min_vector = ConstantR1(&builder, {1, 0, 1, 2, 0}); + auto arg_vector = ConstantR1(&builder, {2, 10, 0, 1, 4}); + auto max_scalar = ConstantR0(&builder, 3); + auto max_vector = ConstantR1(&builder, {3, 1, 25, 5, 123}); // Perform clamp with broadcasted scalar and vector. 
- builder.Add(builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar), - builder.Clamp(min_scalar, arg_vector, max_vector)), - builder.Add(builder.Clamp(min_vector, arg_vector, max_vector), - builder.Clamp(min_scalar, arg_vector, max_scalar))); + Add(Add(Clamp(min_vector, arg_vector, max_scalar), + Clamp(min_scalar, arg_vector, max_vector)), + Add(Clamp(min_vector, arg_vector, max_vector), + Clamp(min_scalar, arg_vector, max_scalar))); ComputeAndCompareR1(&builder, {8, 8, 2, 6, 14}, {}); } @@ -2007,18 +2097,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR1({7.2f, 2.3f, 3.4f, 5.6f}); + LiteralUtil::CreateR1({7.2f, 2.3f, 3.4f, 5.6f}); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto p0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto p1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Add(p0, p1); + auto p0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto p1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Add(p0, p1); ComputeAndCompareR1(&builder, {8.3f, 4.5f, 6.7f, 11.1f}, {param0_data.get(), param1_data.get()}, @@ -2029,18 +2119,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR3FromArray3D(Array3D(0, 7, 0)); + LiteralUtil::CreateR3FromArray3D(Array3D(0, 7, 0)); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR3FromArray3D(Array3D(0, 7, 0)); + LiteralUtil::CreateR3FromArray3D(Array3D(0, 7, 0)); std::unique_ptr 
param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto p0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto p1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Add(p0, p1); + auto p0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto p1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Add(p0, p1); Array3D expected(0, 7, 0); ComputeAndCompareR3( @@ -2051,13 +2141,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); + LiteralUtil::CreateR1({1.1f, 2.2f, 3.3f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto a = builder.ConstantR1({1.1f, 2.2f, 3.3f, 4.4f}); - auto p = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Add(a, p); + auto a = ConstantR1(&builder, {1.1f, 2.2f, 3.3f, 4.4f}); + auto p = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Add(a, p); ComputeAndCompareR1(&builder, {2.2f, 4.4f, 6.6f, 9.9f}, {param0_data.get()}, error_spec_); @@ -2065,8 +2155,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) { XLA_TEST_F(ArrayElementwiseOpTest, CosF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({3.14159f, 0.0f, 1.570796f, -0.78539f}); - builder.Cos(a); + auto a = ConstantR1(&builder, {3.14159f, 0.0f, 1.570796f, -0.78539f}); + Cos(a); ComputeAndCompareR1(&builder, {-1.0f, 1.0f, 0.0f, 0.707107f}, {}, error_spec_); @@ -2074,8 +2164,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, CosF32s) { XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({3.14159f, 0.0f, 1.570796f, -0.78539f}); - builder.Sin(a); + auto a = ConstantR1(&builder, {3.14159f, 0.0f, 1.570796f, -0.78539f}); + Sin(a); ComputeAndCompareR1(&builder, {0.0f, 0.0f, 1.0f, -0.707107f}, {}, error_spec_); @@ -2083,9 
+2173,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) { XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({0.0f, 5.0f, 0.0f, -3.0f, 2.0f, -8.0f}); - auto b = builder.ConstantR1({6.0f, 0.0f, -4.0f, 0.0f, 2.0f, 8.0f}); - builder.Atan2(a, b); + auto a = ConstantR1(&builder, {0.0f, 5.0f, 0.0f, -3.0f, 2.0f, -8.0f}); + auto b = ConstantR1(&builder, {6.0f, 0.0f, -4.0f, 0.0f, 2.0f, 8.0f}); + Atan2(a, b); ComputeAndCompareR1( &builder, @@ -2095,8 +2185,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) { XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({-2.5f, 3.14f, 2.25f}); - builder.Tanh(a); + auto a = ConstantR1(&builder, {-2.5f, 3.14f, 2.25f}); + Tanh(a); ComputeAndCompareR1(&builder, {-0.986614f, 0.996260f, 0.978026}, {}, error_spec_); @@ -2107,7 +2197,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) { // the input tensor is large enough to exercise the vectorized tanh // implementation on XLA CPU. XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR1( + auto input_literal = LiteralUtil::CreateR1( {1.02, -0.32, 0.85, 0.90, 1.23, -0.91, -0.49, 0.80, -0.67, 0.16, -0.07, 0.39, -0.41, 0.04, 1.36, 1.25, 0.41, 0.65, -1.08, 0.32, -1.45, -0.77, -1.09, 0.91, -1.03, -0.30, -1.11, -1.17, 1.50, -0.85, @@ -2118,8 +2208,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) { TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(*input_literal)); - auto input = builder.Parameter(0, input_literal->shape(), "input"); - builder.Tanh(input); + auto input = Parameter(&builder, 0, input_literal->shape(), "input"); + Tanh(input); ComputeAndCompareR1( &builder, @@ -2149,7 +2239,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) { // Just to help make sense of the scales here -- exp(89) saturates float32 and // exp(-10) is smaller than our error spec. 
- std::unique_ptr input_literal = Literal::CreateR1( + std::unique_ptr input_literal = LiteralUtil::CreateR1( {1.02, -0.32, 0.85, 0.9, 1.23, -0.91, -0.49, 0.8, -1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05, -195.6, -194.5, -193.4, -192.3, -191.2, -190.1, -189.0, -187.9, -19.6, -18.5, -17.4, @@ -2164,8 +2254,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, client_->TransferToServer(*input_literal)); - auto input = builder.Parameter(0, input_literal->shape(), "input"); - builder.Exp(input); + auto input = Parameter(&builder, 0, input_literal->shape(), "input"); + Exp(input); std::vector expected_result; int64 input_size = input_literal->shape().dimensions(0); @@ -2183,7 +2273,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) { // implementation on XLA CPU. XlaBuilder builder(TestName()); - std::unique_ptr input_literal = Literal::CreateR1( + std::unique_ptr input_literal = LiteralUtil::CreateR1( {-1.29, -1.41, -1.25, -13.5, -11.7, -17.9, -198, -167, 1.29, 1.41, 1.25, 13.5, 11.7, 17.9, 198, 167, 1.27e+03, 1.33e+03, 1.74e+03, 1.6e+04, 1.84e+04, @@ -2202,8 +2292,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, client_->TransferToServer(*input_literal)); - auto input = builder.Parameter(0, input_literal->shape(), "input"); - builder.Log(input); + auto input = Parameter(&builder, 0, input_literal->shape(), "input"); + Log(input); std::vector expected_result; int64 input_size = input_literal->shape().dimensions(0); @@ -2218,9 +2308,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) { XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678}); - builder.Clz(a); + auto a = ConstantR1( + &builder, {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678}); + Clz(a); ComputeAndCompareR1(&builder, {32, 31, 27, 15, 9, 3, 0}, {}); } @@ 
-2228,8 +2318,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) { XLA_TEST_F(ArrayElementwiseOpTest, ClzS64s) { XlaBuilder builder(TestName()); auto a = - builder.ConstantR1({0, 1, 0x80000000, 0x7FFFFFFFF2345678ul, -1}); - builder.Clz(a); + ConstantR1(&builder, {0, 1, 0x80000000, 0x7FFFFFFFF2345678ul, -1}); + Clz(a); ComputeAndCompareR1(&builder, {64, 63, 32, 1, 0}, {}); } @@ -2241,12 +2331,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) { // c---------------------/ XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({1.1f, 2.2f, 3.3f, 4.4f}); - auto b = builder.ConstantR1({2.1f, 3.2f, 4.3f, 5.4f}); - auto c = builder.ConstantR1({-3.3f, -15.5f, -7.7f, -29.9f}); + auto a = ConstantR1(&builder, {1.1f, 2.2f, 3.3f, 4.4f}); + auto b = ConstantR1(&builder, {2.1f, 3.2f, 4.3f, 5.4f}); + auto c = ConstantR1(&builder, {-3.3f, -15.5f, -7.7f, -29.9f}); - auto add = builder.Add(a, b); - builder.Add(add, c); + auto add = Add(a, b); + Add(add, c); ComputeAndCompareR1(&builder, {-0.1f, -10.1f, -0.1f, -20.1f}, {}, error_spec_); @@ -2259,12 +2349,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) { // a---------------------/ XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({91.1f, 2.2f, 3.3f, 4.4f}); - auto b = builder.ConstantR1({2.1f, 3.2f, 4.3f, 5.4f}); - auto c = builder.ConstantR1({-3.3f, -15.5f, -7.7f, -29.9f}); + auto a = ConstantR1(&builder, {91.1f, 2.2f, 3.3f, 4.4f}); + auto b = ConstantR1(&builder, {2.1f, 3.2f, 4.3f, 5.4f}); + auto c = ConstantR1(&builder, {-3.3f, -15.5f, -7.7f, -29.9f}); - auto add = builder.Add(b, c); - builder.Add(a, add); + auto add = Add(b, c); + Add(a, add); ComputeAndCompareR1(&builder, {89.9f, -10.1f, -0.1f, -20.1f}, {}, error_spec_); @@ -2276,12 +2366,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddWithNeg) { // b ----- (neg) ----/ XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({91.1f, 2.2f, 3.3f, 4.4f}); - auto b = builder.ConstantR1({2.1f, 3.2f, 4.3f, 5.4f}); + auto a = ConstantR1(&builder, 
{91.1f, 2.2f, 3.3f, 4.4f}); + auto b = ConstantR1(&builder, {2.1f, 3.2f, 4.3f, 5.4f}); - auto neg_a = builder.Neg(a); - auto neg_b = builder.Neg(b); - builder.Add(neg_a, neg_b); + auto neg_a = Neg(a); + auto neg_b = Neg(b); + Add(neg_a, neg_b); ComputeAndCompareR1(&builder, {-93.2f, -5.4f, -7.6f, -9.8f}, {}, error_spec_); @@ -2297,14 +2387,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) { // d -----/ XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({91.1f, 2.2f, 3.3f, 4.4f}); - auto b = builder.ConstantR1({2.1f, 3.2f, 4.3f, 5.4f}); - auto c = builder.ConstantR1({-3.3f, -15.5f, -7.7f, -29.9f}); - auto d = builder.ConstantR1({-19.0f, 10.0f, -40.0f, 20.2f}); + auto a = ConstantR1(&builder, {91.1f, 2.2f, 3.3f, 4.4f}); + auto b = ConstantR1(&builder, {2.1f, 3.2f, 4.3f, 5.4f}); + auto c = ConstantR1(&builder, {-3.3f, -15.5f, -7.7f, -29.9f}); + auto d = ConstantR1(&builder, {-19.0f, 10.0f, -40.0f, 20.2f}); - auto add_ab = builder.Add(a, b); - auto add_cd = builder.Add(c, d); - builder.Add(add_ab, add_cd); + auto add_ab = Add(a, b); + auto add_cd = Add(c, d); + Add(add_ab, add_cd); ComputeAndCompareR1(&builder, {70.9f, -0.1f, -40.1f, 0.1f}, {}, error_spec_); @@ -2312,11 +2402,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) { XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) { XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto b = - builder.ConstantR2({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); - builder.Add(a, b); + auto a = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto b = ConstantR2(&builder, + {{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); + Add(a, b); Array2D expected_array( {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}}); @@ -2326,10 +2416,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) { XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) { // Add a scalar + matrix. 
XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto scalar = builder.ConstantR0(3.0f); - builder.Add(scalar, a); + auto a = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto scalar = ConstantR0(&builder, 3.0f); + Add(scalar, a); Array2D expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); @@ -2338,10 +2428,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) { XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) { // Add a matrix + scalar. XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto scalar = builder.ConstantR0(3.0f); - builder.Add(a, scalar); + auto a = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto scalar = ConstantR0(&builder, 3.0f); + Add(a, scalar); Array2D expected_array({{0.5f, 6.14f, 4.0f}, {5.25f, -7.0f, 6.33f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); @@ -2351,13 +2441,13 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) { // Test simple broadcasting of a R1F32 over R2F32. The vector's size matches // only dim 0 of the matrix. XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({20.0f, 40.0f, 60.0f}); + auto v = ConstantR1(&builder, {20.0f, 40.0f, 60.0f}); // clang-format off - auto m = builder.ConstantR2({ + auto m = ConstantR2(&builder, { {-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); // clang-format on - builder.Add(v, m, /*broadcast_dimensions=*/{1}); + Add(v, m, /*broadcast_dimensions=*/{1}); Array2D expected_array( {{17.5f, 43.14f, 61.0f}, {22.25f, 30.0f, 63.33f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); @@ -2366,27 +2456,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) { // Test broadcasting in Eq comparison. 
XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({42, 73}); - auto m = builder.ConstantR2({{42, 73}, {42, 52}}); + auto v = ConstantR1(&builder, {42, 73}); + auto m = ConstantR2(&builder, {{42, 73}, {42, 52}}); // This test exercises both possible broadcast dimensions for a vector/matrix // comparison. - auto cmp_dim_0 = builder.Eq(v, m, /*broadcast_dimensions=*/{1}); - auto cmp_dim_1 = builder.Eq(v, m, /*broadcast_dimensions=*/{0}); - auto result = builder.Tuple({cmp_dim_0, cmp_dim_1}); + auto cmp_dim_0 = Eq(v, m, /*broadcast_dimensions=*/{1}); + auto cmp_dim_1 = Eq(v, m, /*broadcast_dimensions=*/{0}); + Tuple(&builder, {cmp_dim_0, cmp_dim_1}); - auto expected = Literal::MakeTuple( - {Literal::CreateR2({{true, true}, {true, false}}).get(), - Literal::CreateR2({{true, false}, {false, false}}).get()}); + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{true, true}, {true, false}}).get(), + LiteralUtil::CreateR2({{true, false}, {false, false}}).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) { // Test broadcasting in Ne comparison. XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({42, 73}); - auto m = builder.ConstantR2({{42, 73}, {42, 52}}); - builder.Ne(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {42, 73}); + auto m = ConstantR2(&builder, {{42, 73}, {42, 52}}); + Ne(v, m, /*broadcast_dimensions=*/{1}); const string expected = R"(pred[2,2] { { 00 }, @@ -2398,9 +2488,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) { // Test broadcasting in Ge comparison. 
XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1, 2, 3, 4}); - auto m = builder.ConstantR2({{1, 0, 5, 6}, {42, 52, 10, 4}}); - builder.Ge(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {1, 2, 3, 4}); + auto m = ConstantR2(&builder, {{1, 0, 5, 6}, {42, 52, 10, 4}}); + Ge(v, m, /*broadcast_dimensions=*/{1}); const string expected = R"(pred[2,4] { { 1100 }, @@ -2412,9 +2502,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) { // Test broadcasting in Gt comparison. XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1, 2, 3, 4}); - auto m = builder.ConstantR2({{1, 0, 5, 6}, {42, 52, 10, 4}}); - builder.Gt(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {1, 2, 3, 4}); + auto m = ConstantR2(&builder, {{1, 0, 5, 6}, {42, 52, 10, 4}}); + Gt(v, m, /*broadcast_dimensions=*/{1}); const string expected = R"(pred[2,4] { { 0100 }, @@ -2426,9 +2516,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) { // Test broadcasting in Le comparison. XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1, 2, 3, 4}); - auto m = builder.ConstantR2({{1, 0, 5, 6}, {42, 52, 10, 4}}); - builder.Le(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {1, 2, 3, 4}); + auto m = ConstantR2(&builder, {{1, 0, 5, 6}, {42, 52, 10, 4}}); + Le(v, m, /*broadcast_dimensions=*/{1}); const string expected = R"(pred[2,4] { { 1011 }, @@ -2440,9 +2530,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) { // Test broadcasting in Lt comparison. 
XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1, 2, 3, 4}); - auto m = builder.ConstantR2({{1, 0, 5, 6}, {42, 52, 10, 4}}); - builder.Lt(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {1, 2, 3, 4}); + auto m = ConstantR2(&builder, {{1, 0, 5, 6}, {42, 52, 10, 4}}); + Lt(v, m, /*broadcast_dimensions=*/{1}); const string expected = R"(pred[2,4] { { 0011 }, @@ -2455,9 +2545,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) { // Test simple broadcasting of a R1F32 over R2F32 when the order of binary op // arguments is reversed. XlaBuilder builder(TestName()); - auto m = builder.ConstantR2({{1.5f, 2.5f, 3.5f}, {4.5f, 5.5f, 6.5f}}); - auto v = builder.ConstantR1({2.0f, 4.0f, 6.0f}); - builder.Mul(m, v, /*broadcast_dimensions=*/{1}); + auto m = + ConstantR2(&builder, {{1.5f, 2.5f, 3.5f}, {4.5f, 5.5f, 6.5f}}); + auto v = ConstantR1(&builder, {2.0f, 4.0f, 6.0f}); + Mul(m, v, /*broadcast_dimensions=*/{1}); Array2D expected_array({{3.0f, 10.0f, 21.0f}, {9.0f, 22.0f, 39.0f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); } @@ -2468,10 +2559,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) { // m's shape in XLA notation is {3, 2} // md's shape in XLA notation is {3, 1} // The result has shape {3, 2}, where md is broadcast over m - auto m = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto md = builder.ConstantR2({{10.0f, 20.0f, 30.0f}}); - builder.Add(m, md); + auto m = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto md = ConstantR2(&builder, {{10.0f, 20.0f, 30.0f}}); + Add(m, md); Array2D expected_array( {{7.5f, 23.14f, 31.0f}, {12.25f, 10.0f, 33.33f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); @@ -2483,10 +2574,10 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim0) { // m's shape in XLA notation is {3, 2} // md's shape in XLA notation is {1, 2} // The result has shape {3, 2}, where md is broadcast 
over m - auto m = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto md = builder.ConstantR2({{10.0f}, {20.0f}}); - builder.Add(m, md); + auto m = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto md = ConstantR2(&builder, {{10.0f}, {20.0f}}); + Add(m, md); Array2D expected_array( {{7.5f, 13.14f, 11.0f}, {22.25f, 10.0f, 23.33f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); @@ -2501,9 +2592,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DsWithDegenerateDimsOuterProduct) { // a's shape in XLA notation is {1, 4} // b's shape in XLA notation is {3, 1} // The result has shape {3, 4}. - auto a = builder.ConstantR2({{0.0f}, {10.0f}, {20.0f}, {30.0f}}); - auto b = builder.ConstantR2({{1.0f, 2.0f, 3.0f}}); - builder.Add(a, b); + auto a = ConstantR2(&builder, {{0.0f}, {10.0f}, {20.0f}, {30.0f}}); + auto b = ConstantR2(&builder, {{1.0f, 2.0f, 3.0f}}); + Add(a, b); Array2D expected_array({{1.0f, 2.0f, 3.0f}, {11.0f, 12.0f, 13.0f}, {21.0f, 22.0f, 23.0f}, @@ -2515,9 +2606,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) { // Add together a (2,2) array and a (2) array, using dimension 0 for // broadcasting (though there are two ways to broadcast these shapes). XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({20.0f, 40.0f}); - auto m = builder.ConstantR2({{10.0f, 50.0f}, {77.0f, 88.0f}}); - builder.Add(v, m, /*broadcast_dimensions=*/{1}); + auto v = ConstantR1(&builder, {20.0f, 40.0f}); + auto m = ConstantR2(&builder, {{10.0f, 50.0f}, {77.0f, 88.0f}}); + Add(v, m, /*broadcast_dimensions=*/{1}); Array2D expected_array({{30.0f, 90.0f}, {97.0f, 128.0f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); } @@ -2526,9 +2617,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) { // Add together a (2,2) array and a (2) array, using dimension 1 for // broadcasting (though there are two ways to broadcast these shapes). 
XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({20.0f, 40.0f}); - auto m = builder.ConstantR2({{10.0f, 50.0f}, {77.0f, 88.0f}}); - builder.Add(v, m, /*broadcast_dimensions=*/{0}); + auto v = ConstantR1(&builder, {20.0f, 40.0f}); + auto m = ConstantR2(&builder, {{10.0f, 50.0f}, {77.0f, 88.0f}}); + Add(v, m, /*broadcast_dimensions=*/{0}); Array2D expected_array({{30.0f, 70.0f}, {117.0f, 128.0f}}); ComputeAndCompareR2(&builder, expected_array, {}, error_spec_); } @@ -2538,12 +2629,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) { XlaBuilder builder(TestName()); Array3D a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}); - auto a = builder.ConstantR3FromArray3D(a_3d); + auto a = ConstantR3FromArray3D(&builder, a_3d); Array3D b_3d({{{2.0f, 4.0f}, {6.0f, 8.0f}, {10.0f, 12.0f}}, {{14.0f, 16.0f}, {18.0f, 20.0f}, {22.0f, 24.0f}}}); - auto b = builder.ConstantR3FromArray3D(b_3d); - builder.Add(a, b); + auto b = ConstantR3FromArray3D(&builder, b_3d); + Add(a, b); Array3D expected_3d( {{{3.0f, 6.0f}, {9.0f, 12.0f}, {15.0f, 18.0f}}, @@ -2565,9 +2656,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver2) { {11.0f, 12.0f}}, }); // clang-format on - auto a = builder.ConstantR3FromArray3D(a_3d); - auto v = builder.ConstantR1({10.0f, 20.0f}); - builder.Add(a, v, /*broadcast_dimensions=*/{2}); + auto a = ConstantR3FromArray3D(&builder, a_3d); + auto v = ConstantR1(&builder, {10.0f, 20.0f}); + Add(a, v, /*broadcast_dimensions=*/{2}); Array3D expected_3d( {{{11.0f, 22.0f}, {13.0f, 24.0f}, {15.0f, 26.0f}}, @@ -2589,9 +2680,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo3DTwoWaysOver0) { {11.0f, 12.0f}}, }); // clang-format on - auto a = builder.ConstantR3FromArray3D(a_3d); - auto v = builder.ConstantR1({10.0f, 20.0f}); - builder.Add(a, v, /*broadcast_dimensions=*/{0}); + auto a = ConstantR3FromArray3D(&builder, a_3d); + auto v = ConstantR1(&builder, {10.0f, 20.0f}); + Add(a, v, /*broadcast_dimensions=*/{0}); 
// clang-format off Array3D expected_3d({ @@ -2619,12 +2710,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo3D) { {9.0f, 10.0f}, {11.0f, 12.0f}}, }); - auto a = builder.ConstantR3FromArray3D(a_3d); - auto m = builder.ConstantR2({ + auto a = ConstantR3FromArray3D(&builder, a_3d); + auto m = ConstantR2(&builder, { {10.0f, 20.0f, 30.0f}, {40.0f, 50.0f, 60.0f}, }); - builder.Add(a, m, /*broadcast_dimensions=*/{0, 1}); + Add(a, m, /*broadcast_dimensions=*/{0, 1}); Array3D expected_3d({ {{11.0f, 12.0f}, @@ -2644,12 +2735,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) { XlaBuilder builder(TestName()); Array3D a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}); - auto a = builder.ConstantR3FromArray3D(a_3d); + auto a = ConstantR3FromArray3D(&builder, a_3d); Array3D b_3d({{{7.0f, 1.0f}, {3.0f, 10.0f}, {15.0f, 6.0f}}}); - auto b = builder.ConstantR3FromArray3D(b_3d); + auto b = ConstantR3FromArray3D(&builder, b_3d); - builder.Gt(a, b); + Gt(a, b); Array3D expected_3d( {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}}); @@ -2684,9 +2775,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) { } } - auto a = builder.ConstantR4FromArray4D(*operand_a_4d); - auto b = builder.ConstantR4FromArray4D(*operand_b_4d); - builder.Add(a, b); + auto a = ConstantR4FromArray4D(&builder, *operand_a_4d); + auto b = ConstantR4FromArray4D(&builder, *operand_b_4d); + Add(a, b); ComputeAndCompareR4(&builder, *expected_4d, {}, error_spec_); } @@ -2712,9 +2803,9 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) { } } - auto a = builder.ConstantR4FromArray4D(*operand_a_4d); - auto b = builder.ConstantR1(operand_b_1d); - builder.Add(a, b, {1}); + auto a = ConstantR4FromArray4D(&builder, *operand_a_4d); + auto b = ConstantR1(&builder, operand_b_1d); + Add(a, b, {1}); ComputeAndCompareR4(&builder, *expected_4d, {}, error_spec_); } @@ -2730,11 +2821,12 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) { 
std::iota(r1.begin(), r1.end(), 1.0); XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR4FromArray4DWithLayout( - r4, LayoutUtil::MakeLayout({0, 1, 2, 3})); - auto a = builder.ConstantLiteral(*a_literal); - auto b = builder.ConstantR1(r1); - builder.Add(a, b, {1}); + std::unique_ptr a_literal = + LiteralUtil::CreateR4FromArray4DWithLayout( + r4, LayoutUtil::MakeLayout({0, 1, 2, 3})); + auto a = ConstantLiteral(&builder, *a_literal); + auto b = ConstantR1(&builder, r1); + Add(a, b, {1}); for (int i0 = 0; i0 < d0; ++i0) { for (int i1 = 0; i1 < d1; ++i1) { @@ -2752,22 +2844,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) { XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) { XlaBuilder builder(TestName()); auto shape = ShapeUtil::MakeOpaqueShape(); - auto x = builder.Parameter(0, shape, "x"); - builder.Add(x, x); + auto x = Parameter(&builder, 0, shape, "x"); + Add(x, x); auto computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); EXPECT_THAT(computation_status.status().ToString(), ::testing::ContainsRegex( - "Expected non-opaque argument for lhs of binary operation")); + "Expected array argument for lhs of binary operation")); } XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) { XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto b = - builder.ConstantR2({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); - builder.Add(a, b, /*broadcast_dimensions=*/{0, 1}); + auto a = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto b = ConstantR2(&builder, + {{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); + Add(a, b, /*broadcast_dimensions=*/{0, 1}); Array2D expected_array( {{-4.0f, 11.28f, 43.0f}, {1.25f, -14.0f, 8.88f}}); @@ -2776,11 +2868,11 @@ XLA_TEST_F(ArrayElementwiseOpTest, IdentityBroadcastOfSameRankIsAllowed) { XLA_TEST_F(ArrayElementwiseOpTest, 
NonIdentityBroadcastOfSameRankIsDisallowed) { XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); - auto b = - builder.ConstantR2({{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); - builder.Add(a, b, /*broadcast_dimensions=*/{1, 0}); + auto a = ConstantR2(&builder, + {{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}}); + auto b = ConstantR2(&builder, + {{-1.5f, 8.14f, 42.0}, {-1.0f, -4.0f, 5.55f}}); + Add(a, b, /*broadcast_dimensions=*/{1, 0}); auto computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); @@ -2792,15 +2884,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) { // broadcast. XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) { XlaBuilder builder(TestName()); - auto x_literal = Literal::CreateR1({1, 2, 3}); - auto y_literal = Literal::CreateR1({4, 5}); + auto x_literal = LiteralUtil::CreateR1({1, 2, 3}); + auto y_literal = LiteralUtil::CreateR1({4, 5}); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); - auto x = builder.Parameter(0, x_literal->shape(), "x"); - auto y = builder.Parameter(1, y_literal->shape(), "y"); - auto slice = builder.Slice(x, {1}, {2}, {1}); - builder.Sub(slice, y); + auto x = Parameter(&builder, 0, x_literal->shape(), "x"); + auto y = Parameter(&builder, 1, y_literal->shape(), "y"); + auto slice = Slice(x, {1}, {2}, {1}); + Sub(slice, y); ComputeAndCompareR1(&builder, {-2, -3}, {x_data.get(), y_data.get()}, error_spec_); diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc index fcd9ff55e393f64476ddd4754e0fa74427f1cb51..caeb0bf49a0dde9eeac02037b2ea04fd024d100c 100644 --- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc +++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -29,10 +29,10 @@ class AxpySimpleTest : public ClientLibraryTestBase {}; TEST_F(AxpySimpleTest, AxTenValues) { XlaBuilder builder("ax_10"); - auto alpha = builder.ConstantR0(3.1415926535); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - builder.Mul(alpha, x); + auto alpha = ConstantR0(&builder, 3.1415926535); + auto x = ConstantR1( + &builder, {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); + Mul(alpha, x); std::vector expected = { -3.14159265, 3.14159265, 6.28318531, -6.28318531, -9.42477796, @@ -42,11 +42,11 @@ TEST_F(AxpySimpleTest, AxTenValues) { XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { XlaBuilder builder("axpy_10"); - auto alpha = builder.ConstantR0(3.1415926535); - auto x = builder.ConstantR1({}); - auto y = builder.ConstantR1({}); - auto ax = builder.Mul(alpha, x); - builder.Add(ax, y); + auto alpha = ConstantR0(&builder, 3.1415926535); + auto x = ConstantR1(&builder, {}); + auto y = ConstantR1(&builder, {}); + auto ax = Mul(alpha, x); + Add(ax, y); std::vector expected = {}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -54,13 +54,13 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { TEST_F(AxpySimpleTest, AxpyTenValues) { XlaBuilder builder("axpy_10"); - auto alpha = builder.ConstantR0(3.1415926535); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - auto y = builder.ConstantR1( - {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0}); - auto ax = builder.Mul(alpha, x); - builder.Add(ax, y); + auto alpha = ConstantR0(&builder, 3.1415926535); + auto x = ConstantR1( + 
&builder, {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); + auto y = ConstantR1( + &builder, {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0}); + auto ax = Mul(alpha, x); + Add(ax, y); TF_ASSERT_OK_AND_ASSIGN(ProgramShape shape, builder.GetProgramShape()); diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc index 22c3394e6f34bd018ffaaaa4d9d68339673c3764..af0b8522394a0c591e6c42ad12db8853ef66243c 100644 --- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc +++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -35,10 +35,10 @@ class BadRngShapeValidationTest : public ClientLibraryTestBase {}; TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR0(0.0); - auto one = builder.ConstantR0(1.0); + auto zero = ConstantR0(&builder, 0.0); + auto one = ConstantR0(&builder, 1.0); Shape default_constructed; - builder.RngUniform(zero, one, default_constructed); + RngUniform(zero, one, default_constructed); StatusOr computation = builder.Build(); EXPECT_FALSE(computation.ok()); @@ -49,13 +49,13 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR0(0.0); - auto one = builder.ConstantR0(1.0); + auto 
zero = ConstantR0(&builder, 0.0); + auto one = ConstantR0(&builder, 1.0); Shape sans_layout; sans_layout.set_element_type(F32); sans_layout.add_dimensions(1); - builder.RngUniform(zero, one, sans_layout); + RngUniform(zero, one, sans_layout); StatusOr computation = builder.Build(); ASSERT_TRUE(computation.ok()); diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc index f3dac75a44b948c4b45b80b93e7462073010979e..ac90a3adb6dbad30e3ef0b11438fb9a6fd6f8574 100644 --- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc +++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc @@ -17,13 +17,15 @@ limitations under the License. #include #include +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -40,7 +42,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/math/math_util.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -62,7 +63,7 @@ class BatchNormalizationTest {5.0f, 4.4f}, // p2 }); input_array_.FillWithPZ(pz); - input_literal_ = std::move(*Literal::CreateR4FromArray4D(input_array_)); + input_literal_ = std::move(*LiteralUtil::CreateR4FromArray4D(input_array_)); CHECK_EQ(kSamples, input_array_.planes()); CHECK_EQ(kZ, input_array_.depth()); CHECK_EQ(kY, input_array_.height()); @@ -101,9 +102,9 @@ INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest, XLA_TEST_P(BatchNormalizationTest, SubtractInZ) { XlaBuilder builder("subtract_in_z_one_sample"); - auto x = builder.ConstantLiteral(input_literal_); - auto y = builder.ConstantR1({3.14, 4.25}); - builder.Sub(x, y, /*broadcast_dimensions=*/{1}); + auto x = ConstantLiteral(&builder, input_literal_); + auto y = ConstantR1(&builder, {3.14, 4.25}); + Sub(x, y, /*broadcast_dimensions=*/{1}); Array4D expected(kSamples, kZ, kY, kX); Array2D pz({ @@ -117,8 +118,8 @@ XLA_TEST_P(BatchNormalizationTest, SubtractInZ) { XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) { XlaBuilder builder("square_tesseract_elementwise"); - auto x = builder.ConstantLiteral(input_literal_); - builder.SquareF32(x); + auto x = ConstantLiteral(&builder, input_literal_); + Square(x); using tensorflow::MathUtil; @@ -134,11 +135,10 @@ XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) { XLA_TEST_P(BatchNormalizationTest, SumToZ) { XlaBuilder builder("sum_to_z"); - auto input_activations = builder.ConstantLiteral(input_literal_); + auto input_activations = ConstantLiteral(&builder, input_literal_); XlaComputation add = CreateScalarAddComputation(F32, &builder); // Reduce all but the Z dimension. 
- builder.Reduce(input_activations, builder.ConstantR0(0.0f), add, - {0, 2, 3}); + Reduce(input_activations, ConstantR0(&builder, 0.0f), add, {0, 2, 3}); std::vector expected = {6, 12.6}; ComputeAndCompareR1(&builder, expected, {}, error_spec_); @@ -146,13 +146,13 @@ XLA_TEST_P(BatchNormalizationTest, SumToZ) { XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) { XlaBuilder builder("square_and_reduce"); - auto input_activations = builder.ConstantLiteral(input_literal_); - auto set_means = builder.ConstantR1({2.f, 4.2f}); - auto activation_deviations = builder.Sub(input_activations, set_means, - /*broadcast_dimensions=*/{1}); + auto input_activations = ConstantLiteral(&builder, input_literal_); + auto set_means = ConstantR1(&builder, {2.f, 4.2f}); + auto activation_deviations = Sub(input_activations, set_means, + /*broadcast_dimensions=*/{1}); XlaComputation add = CreateScalarAddComputation(F32, &builder); - auto dev_squares = builder.SquareF32(activation_deviations); - builder.Reduce(dev_squares, builder.ConstantR0(0.0f), add, {0, 2, 3}); + auto dev_squares = Square(activation_deviations); + Reduce(dev_squares, ConstantR0(&builder, 0.0f), add, {0, 2, 3}); std::vector expected = {18, 0.06}; ComputeAndCompareR1(&builder, expected, {}, error_spec_); @@ -160,8 +160,8 @@ XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) { XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) { XlaBuilder builder("variance_to_stddev"); - auto variance = builder.ConstantR1({6.f, .02f}); - builder.SqrtF32(variance); + auto variance = ConstantR1(&builder, {6.f, .02f}); + Sqrt(variance); std::vector expected = {2.44948974f, 0.14142136f}; ComputeAndCompareR1(&builder, expected, {}, error_spec_); @@ -172,50 +172,50 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) { XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) { XlaBuilder builder("batch_normalize_per_spec"); auto input_activations = - CheckShape(&builder, builder.ConstantLiteral(input_literal_), + CheckShape(&builder, 
ConstantLiteral(&builder, input_literal_), ShapeUtil::MakeShape(F32, {3, 2, 1, 1})); - auto gamma = builder.ConstantR1({1.0, 1.0}); - auto beta = builder.ConstantR1({0.0, 0.0}); + auto gamma = ConstantR1(&builder, {1.0, 1.0}); + auto beta = ConstantR1(&builder, {0.0, 0.0}); XlaComputation add = CreateScalarAddComputation(F32, &builder); // Reduce all dimensions except dimension 1. Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2}); auto sum = CheckShape( &builder, - builder.Reduce(input_activations, builder.ConstantR0(0.0f), add, - /*dimensions_to_reduce=*/{0, 2, 3}), + Reduce(input_activations, ConstantR0(&builder, 0.0f), add, + /*dimensions_to_reduce=*/{0, 2, 3}), TwoElementVectorF32); auto input_shape = builder.GetShape(input_activations).ConsumeValueOrDie(); auto sum_shape = builder.GetShape(sum).ConsumeValueOrDie(); - auto count = builder.ConstantR0(ShapeUtil::ElementsIn(input_shape) / - ShapeUtil::ElementsIn(sum_shape)); - auto set_means = builder.Div(sum, count); + auto count = + ConstantR0(&builder, ShapeUtil::ElementsIn(input_shape) / + ShapeUtil::ElementsIn(sum_shape)); + auto set_means = Div(sum, count); const float kEpsilon = 1e-9f; - auto epsilon = builder.ConstantR0(kEpsilon); - auto epsilon2 = builder.ConstantR1({kEpsilon, kEpsilon}); - auto activation_deviations = builder.Sub(input_activations, set_means, - /*broadcast_dimensions=*/{1}); - auto dev_squares = builder.SquareF32(activation_deviations); - auto sum_of_squares = CheckShape( - &builder, - builder.Reduce(dev_squares, builder.ConstantR0(0.0f), add, - /*dimensions_to_reduce=*/{0, 2, 3}), - TwoElementVectorF32); - auto variance = builder.Div(sum_of_squares, count); - auto standard_deviation = builder.SqrtF32(variance); + auto epsilon = ConstantR0(&builder, kEpsilon); + auto epsilon2 = ConstantR1(&builder, {kEpsilon, kEpsilon}); + auto activation_deviations = Sub(input_activations, set_means, + /*broadcast_dimensions=*/{1}); + auto dev_squares = Square(activation_deviations); + auto 
sum_of_squares = + CheckShape(&builder, + Reduce(dev_squares, ConstantR0(&builder, 0.0f), add, + /*dimensions_to_reduce=*/{0, 2, 3}), + TwoElementVectorF32); + auto variance = Div(sum_of_squares, count); + auto standard_deviation = Sqrt(variance); auto standard_deviation_above_epsilon = - CheckShape(&builder, builder.Gt(standard_deviation, epsilon), + CheckShape(&builder, Gt(standard_deviation, epsilon), ShapeUtil::MakeShape(PRED, {2})); - auto gt_eps = builder.Select(standard_deviation_above_epsilon, - standard_deviation, epsilon2); - auto normalization_factors = builder.ReciprocalF32(gt_eps); + auto gt_eps = + Select(standard_deviation_above_epsilon, standard_deviation, epsilon2); + auto normalization_factors = Reciprocal(gt_eps); auto normalized_input_activations = - builder.Mul(activation_deviations, normalization_factors, - /*broadcast_dimensions=*/{1}); - /* auto output_activations = */ builder.Add( - builder.Mul(normalized_input_activations, gamma, - /*broadcast_dimensions=*/{1}), - beta, /*broadcast_dimensions=*/{1}); + Mul(activation_deviations, normalization_factors, + /*broadcast_dimensions=*/{1}); + /* auto output_activations = */ Add(Mul(normalized_input_activations, gamma, + /*broadcast_dimensions=*/{1}), + beta, /*broadcast_dimensions=*/{1}); Array4D expected(kSamples, kZ, kY, kX); Array2D pz({ @@ -232,46 +232,47 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) { const int kFeatureIndex = 3; XlaBuilder builder(TestName()); - auto operand = builder.ConstantR4FromArray4D( - {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}}); + auto operand = ConstantR4FromArray4D( + &builder, {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}}); - auto scale = builder.ConstantR1({2.0f, 3.0f}); + auto scale = ConstantR1(&builder, {2.0f, 3.0f}); - auto offset = builder.ConstantR1({1.0f, 2.0f}); + auto offset = ConstantR1(&builder, {1.0f, 2.0f}); - builder.BatchNormTraining(operand, scale, offset, - /*epsilon=*/0.001, kFeatureIndex); + 
BatchNormTraining(operand, scale, offset, + /*epsilon=*/0.001, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR4({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}}, - {{{1.9f, 3.3f}}, {{3.7f, 6.0f}}}}) + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}}, + {{{1.9f, 3.3f}}, {{3.7f, 6.0f}}}}) .get(), - Literal::CreateR1({4, 5}).get(), - Literal::CreateR1({5, 5}).get()}); + LiteralUtil::CreateR1({4, 5}).get(), + LiteralUtil::CreateR1({5, 5}).get()}); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1)); } -XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) { +XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnDimension2) { const int kFeatureIndex = 2; XlaBuilder builder(TestName()); - auto operand = builder.ConstantR4FromArray4D( + auto operand = ConstantR4FromArray4D( + &builder, {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}); - auto scale = builder.ConstantR1({2.0f, 3.0f}); + auto scale = ConstantR1(&builder, {2.0f, 3.0f}); - auto offset = builder.ConstantR1({1.0f, 2.0f}); + auto offset = ConstantR1(&builder, {1.0f, 2.0f}); - builder.BatchNormTraining(operand, scale, offset, - /*epsilon=*/0.001, kFeatureIndex); + BatchNormTraining(operand, scale, offset, + /*epsilon=*/0.001, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR4({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}}, - {{{1.9f}, {3.3f}}, {{3.7f}, {6.0f}}}}) + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}}, + {{{1.9f}, {3.3f}}, {{3.7f}, {6.0f}}}}) .get(), - Literal::CreateR1({4, 5}).get(), - Literal::CreateR1({5, 5}).get()}); + LiteralUtil::CreateR1({4, 5}).get(), + LiteralUtil::CreateR1({5, 5}).get()}); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1)); } @@ -294,14 +295,14 @@ XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) { CreateR1Parameter(std::vector(260, 1.0f), /*parameter_number=*/2, "offset", 
&builder, &h2); - builder.BatchNormTraining(h0, h1, h2, - /*epsilon=*/1, kFeatureIndex); + BatchNormTraining(h0, h1, h2, + /*epsilon=*/1, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR3FromArray3D(Array3D(260, 2, 2, 1.0f)) + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR3FromArray3D(Array3D(260, 2, 2, 1.0f)) .get(), - Literal::CreateR1(std::vector(260, 1.0f)).get(), - Literal::CreateR1(std::vector(260, 0.0f)).get()}); + LiteralUtil::CreateR1(std::vector(260, 1.0f)).get(), + LiteralUtil::CreateR1(std::vector(260, 0.0f)).get()}); ComputeAndCompareTuple(&builder, *expected, {operand.get(), scale.get(), offset.get()}, @@ -327,14 +328,15 @@ XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) { /*parameter_number=*/2, "offset", &builder, &h2); // var = 125, mean = 15, epsilon = -100 - builder.BatchNormTraining(h0, h1, h2, - /*epsilon=*/-100, kFeatureIndex); + BatchNormTraining(h0, h1, h2, + /*epsilon=*/-100, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR3FromArray3D({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}}) + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR3FromArray3D( + {{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}}) .get(), - Literal::CreateR1(std::vector(1, 15.0f)).get(), - Literal::CreateR1(std::vector(1, 125.0f)).get()}); + LiteralUtil::CreateR1(std::vector(1, 15.0f)).get(), + LiteralUtil::CreateR1(std::vector(1, 125.0f)).get()}); ComputeAndCompareTuple(&builder, *expected, {operand.get(), scale.get(), offset.get()}, @@ -346,26 +348,27 @@ XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) { XlaBuilder builder(TestName()); auto operand = - builder.ConstantR4FromArray4D(Array4D(2, 2, 2, 1, 0.0f)); + ConstantR4FromArray4D(&builder, Array4D(2, 2, 2, 1, 0.0f)); - auto scale = builder.ConstantR1({1.0f, 1.0f}); + auto scale = ConstantR1(&builder, {1.0f, 1.0f}); - auto mean = builder.ConstantR1({0.0f, 0.0f}); + auto mean = ConstantR1(&builder, {0.0f, 0.0f}); - auto var = builder.ConstantR1({1.0f, 
1.0f}); + auto var = ConstantR1(&builder, {1.0f, 1.0f}); - auto grad_output = builder.ConstantR4FromArray4D( + auto grad_output = ConstantR4FromArray4D( + &builder, {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}); - builder.BatchNormGrad(operand, scale, mean, var, grad_output, - /*epsilon=*/0.0, kFeatureIndex); + BatchNormGrad(operand, scale, mean, var, grad_output, + /*epsilon=*/0.0, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR4({{{{-3.f}, {-3.f}}, {{-1.f}, {-1.f}}}, - {{{1.f}, {1.f}}, {{3.f}, {3.f}}}}) + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4({{{{-3.f}, {-3.f}}, {{-1.f}, {-1.f}}}, + {{{1.f}, {1.f}}, {{3.f}, {3.f}}}}) .get(), - Literal::CreateR1({0, 0}).get(), - Literal::CreateR1({16, 20}).get()}); + LiteralUtil::CreateR1({0, 0}).get(), + LiteralUtil::CreateR1({16, 20}).get()}); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1)); } @@ -379,7 +382,7 @@ struct BatchNormTestParam { friend ::std::ostream& operator<<(::std::ostream& os, const BatchNormTestParam& p) { - os << "bounds={" << tensorflow::str_util::Join(p.bounds, ", ") << "}, "; + os << "bounds={" << absl::StrJoin(p.bounds, ", ") << "}, "; os << "feature_index=" << p.feature_index << ", "; os << "random_value_mean=" << p.random_value_mean << ", "; os << "random_value_var=" << p.random_value_var; @@ -511,22 +514,23 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) { auto normalized = *ReferenceUtil::BatchNorm4D(input_array, mean4D, var4D, scale4D, offset4D, epsilon); - auto expected_normalized = Literal::CreateR4FromArray4D(normalized); + auto expected_normalized = + LiteralUtil::CreateR4FromArray4D(normalized); - auto offset_literal = Literal::CreateR1(offset); - auto scale_literal = Literal::CreateR1(scale); - auto input_literal = Literal::CreateR4FromArray4D(input_array); + auto offset_literal = LiteralUtil::CreateR1(offset); + auto scale_literal = LiteralUtil::CreateR1(scale); + auto input_literal = 
LiteralUtil::CreateR4FromArray4D(input_array); auto input_activations = - builder.Parameter(0, input_literal->shape(), "input"); + Parameter(&builder, 0, input_literal->shape(), "input"); auto scale_activations = - builder.Parameter(1, scale_literal->shape(), "offset"); + Parameter(&builder, 1, scale_literal->shape(), "offset"); auto offset_activations = - builder.Parameter(2, offset_literal->shape(), "scale"); + Parameter(&builder, 2, offset_literal->shape(), "scale"); - auto expected = Literal::MakeTuple({expected_normalized.get(), - Literal::CreateR1(mean).get(), - Literal::CreateR1(var).get()}); + auto expected = LiteralUtil::MakeTuple( + {expected_normalized.get(), LiteralUtil::CreateR1(mean).get(), + LiteralUtil::CreateR1(var).get()}); std::unique_ptr input_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -535,8 +539,8 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) { std::unique_ptr offset_data = client_->TransferToServer(*offset_literal).ConsumeValueOrDie(); - builder.BatchNormTraining(input_activations, scale_activations, - offset_activations, epsilon, feature_index); + BatchNormTraining(input_activations, scale_activations, offset_activations, + epsilon, feature_index); // Run all HLO passes during this test. 
In particular, ClientLibraryTestBase // disables constant folding, but we want it enabled for our zero-sized tensor @@ -611,21 +615,21 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) { auto normalized = *ReferenceUtil::BatchNorm4D(input_array, mean4D, var4D, scale4D, offset4D, epsilon); - auto offset_literal = Literal::CreateR1(offset); - auto scale_literal = Literal::CreateR1(scale); - auto mean_literal = Literal::CreateR1(mean); - auto var_literal = Literal::CreateR1(var); - auto input_literal = Literal::CreateR4FromArray4D(input_array); + auto offset_literal = LiteralUtil::CreateR1(offset); + auto scale_literal = LiteralUtil::CreateR1(scale); + auto mean_literal = LiteralUtil::CreateR1(mean); + auto var_literal = LiteralUtil::CreateR1(var); + auto input_literal = LiteralUtil::CreateR4FromArray4D(input_array); auto input_activations = - builder.Parameter(0, input_literal->shape(), "input"); + Parameter(&builder, 0, input_literal->shape(), "input"); auto scale_activations = - builder.Parameter(1, scale_literal->shape(), "offset"); + Parameter(&builder, 1, scale_literal->shape(), "offset"); auto offset_activations = - builder.Parameter(2, offset_literal->shape(), "scale"); - auto mean_activations = builder.Parameter(3, mean_literal->shape(), "mean"); + Parameter(&builder, 2, offset_literal->shape(), "scale"); + auto mean_activations = Parameter(&builder, 3, mean_literal->shape(), "mean"); auto variance_activations = - builder.Parameter(4, var_literal->shape(), "variance"); + Parameter(&builder, 4, var_literal->shape(), "variance"); Array4D expected = normalized; @@ -640,9 +644,9 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) { std::unique_ptr variance_data = client_->TransferToServer(*var_literal).ConsumeValueOrDie(); - builder.BatchNormInference(input_activations, scale_activations, - offset_activations, mean_activations, - variance_activations, epsilon, feature_index); + BatchNormInference(input_activations, scale_activations, 
offset_activations, + mean_activations, variance_activations, epsilon, + feature_index); // Run all HLO passes during this test. In particular, ClientLibraryTestBase // disables constant folding, but we want it enabled for our zero-sized tensor @@ -729,7 +733,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) { var4D, [epsilon](float a) { return a + epsilon; }); auto rsqrt_var_add_epsilon = *ReferenceUtil::MapArray4D( - var_add_epsilon, [epsilon](float a) { return 1 / std::sqrt(a); }); + var_add_epsilon, [](float a) { return 1 / std::sqrt(a); }); auto grad_output_times_var = *ReferenceUtil::MapArray4D(grad_output_array, var_add_epsilon, @@ -798,21 +802,23 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) { }); auto expected_grad_activation = - Literal::CreateR4FromArray4D(grad_activation); + LiteralUtil::CreateR4FromArray4D(grad_activation); - auto input_literal = Literal::CreateR4FromArray4D(input_array); - auto scale_literal = Literal::CreateR1(scale); - auto mean_literal = Literal::CreateR1(mean); - auto var_literal = Literal::CreateR1(var); + auto input_literal = LiteralUtil::CreateR4FromArray4D(input_array); + auto scale_literal = LiteralUtil::CreateR1(scale); + auto mean_literal = LiteralUtil::CreateR1(mean); + auto var_literal = LiteralUtil::CreateR1(var); auto grad_output_literal = - Literal::CreateR4FromArray4D(grad_output_array); - - auto input_parameter = builder.Parameter(0, input_literal->shape(), "input"); - auto scale_parameter = builder.Parameter(1, scale_literal->shape(), "scale"); - auto mean_parameter = builder.Parameter(2, mean_literal->shape(), "mean"); - auto var_parameter = builder.Parameter(3, var_literal->shape(), "variance"); + LiteralUtil::CreateR4FromArray4D(grad_output_array); + + auto input_parameter = + Parameter(&builder, 0, input_literal->shape(), "input"); + auto scale_parameter = + Parameter(&builder, 1, scale_literal->shape(), "scale"); + auto mean_parameter = Parameter(&builder, 2, mean_literal->shape(), 
"mean"); + auto var_parameter = Parameter(&builder, 3, var_literal->shape(), "variance"); auto grad_output_parameter = - builder.Parameter(4, grad_output_literal->shape(), "grad_output"); + Parameter(&builder, 4, grad_output_literal->shape(), "grad_output"); std::unique_ptr input_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -825,14 +831,13 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) { std::unique_ptr grad_output_data = client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie(); - builder.BatchNormGrad(input_parameter, scale_parameter, mean_parameter, - var_parameter, grad_output_parameter, epsilon, - feature_index); + BatchNormGrad(input_parameter, scale_parameter, mean_parameter, var_parameter, + grad_output_parameter, epsilon, feature_index); auto expected = - Literal::MakeTuple({expected_grad_activation.get(), - Literal::CreateR1(grad_scale).get(), - Literal::CreateR1(grad_offset).get()}); + LiteralUtil::MakeTuple({expected_grad_activation.get(), + LiteralUtil::CreateR1(grad_scale).get(), + LiteralUtil::CreateR1(grad_offset).get()}); // Run all HLO passes during this test. In particular, ClientLibraryTestBase // disables constant folding, but we want it enabled for our zero-sized tensor diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc index ca337e78840e77377719636cd4cf33af2578210d..65589b0d6af2ffca26776541eb05a093f43e0a9a 100644 --- a/tensorflow/compiler/xla/tests/bfloat16_test.cc +++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -51,9 +51,9 @@ class Bfloat16Test : public ClientLibraryTestBase { XLA_TEST_F(Bfloat16Test, ScalarOperation) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR0(static_cast(2.0f)); - auto y = builder.ConstantR0(static_cast(1.0f)); - builder.Add(x, y); + auto x = ConstantR0(&builder, static_cast(2.0f)); + auto y = ConstantR0(&builder, static_cast(1.0f)); + Add(x, y); ComputeAndCompareR0(&builder, static_cast(3.0f), {}, error_spec_); @@ -61,16 +61,16 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) { XLA_TEST_F(Bfloat16Test, LogOperation) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR0(static_cast(4.0f)); - builder.Log(x); + auto x = ConstantR0(&builder, static_cast(4.0f)); + Log(x); ComputeAndCompareR0(&builder, static_cast(1.387f), {}, - error_spec_); + ErrorSpec(0.01, 0.01)); } XLA_TEST_F(Bfloat16Test, NegateScalarF16) { XlaBuilder builder(TestName()); - builder.Neg(builder.ConstantR0(static_cast(2.1f))); + Neg(ConstantR0(&builder, static_cast(2.1f))); ComputeAndCompareR0(&builder, static_cast(-2.1f), {}, error_spec_); @@ -80,75 +80,76 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) { const int kFeatureIndex = 2; XlaBuilder builder(TestName()); - auto operand = builder.ConstantR4FromArray4D( + auto operand = ConstantR4FromArray4D( + &builder, {{{{static_cast(1.f)}, {static_cast(2.f)}}, {{static_cast(3.f)}, {static_cast(4.f)}}}, {{{static_cast(5.f)}, {static_cast(6.f)}}, {{static_cast(7.f)}, 
{static_cast(8.f)}}}}); - auto scale = builder.ConstantR1( - {static_cast(2.0f), static_cast(3.0f)}); + auto scale = ConstantR1( + &builder, {static_cast(2.0f), static_cast(3.0f)}); - auto offset = builder.ConstantR1( - {static_cast(1.0f), static_cast(2.0f)}); + auto offset = ConstantR1( + &builder, {static_cast(1.0f), static_cast(2.0f)}); - auto tuple = builder.BatchNormTraining(operand, scale, offset, - /*epsilon=*/0.001, kFeatureIndex); + BatchNormTraining(operand, scale, offset, /*epsilon=*/0.001, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR4( + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4( {{{{static_cast(-1.6875f)}, {static_cast(-2.04f)}}, {{static_cast(0.105f)}, {static_cast(0.66f)}}}, {{{static_cast(1.89f)}, {static_cast(3.35f)}}, {{static_cast(3.7f)}, {static_cast(6.04f)}}}}) .get(), - Literal::CreateR1( + LiteralUtil::CreateR1( {static_cast(4), static_cast(5)}) .get(), - Literal::CreateR1( + LiteralUtil::CreateR1( {static_cast(5), static_cast(5)}) .get()}); - ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.01)); + ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.01, 0.02)); } XLA_TEST_F(Bfloat16Test, BatchNormGrad) { const int kFeatureIndex = 2; XlaBuilder builder(TestName()); - auto operand = builder.ConstantR4FromArray4D( - Array4D(2, 2, 2, 1, static_cast(0.0f))); + auto operand = ConstantR4FromArray4D( + &builder, Array4D(2, 2, 2, 1, static_cast(0.0f))); - auto scale = builder.ConstantR1( - {static_cast(1.0f), static_cast(1.0f)}); + auto scale = ConstantR1( + &builder, {static_cast(1.0f), static_cast(1.0f)}); - auto mean = builder.ConstantR1( - {static_cast(0.0f), static_cast(0.0f)}); + auto mean = ConstantR1( + &builder, {static_cast(0.0f), static_cast(0.0f)}); - auto var = builder.ConstantR1( - {static_cast(1.0f), static_cast(1.0f)}); + auto var = ConstantR1( + &builder, {static_cast(1.0f), static_cast(1.0f)}); - auto grad_output = builder.ConstantR4FromArray4D( + auto 
grad_output = ConstantR4FromArray4D( + &builder, {{{{static_cast(1.f)}, {static_cast(2.f)}}, {{static_cast(3.f)}, {static_cast(4.f)}}}, {{{static_cast(5.f)}, {static_cast(6.f)}}, {{static_cast(7.f)}, {static_cast(8.f)}}}}); - builder.BatchNormGrad(operand, scale, mean, var, grad_output, - /*epsilon=*/0.0, kFeatureIndex); + BatchNormGrad(operand, scale, mean, var, grad_output, + /*epsilon=*/0.0, kFeatureIndex); - auto expected = Literal::MakeTuple( - {Literal::CreateR4( + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR4( {{{{static_cast(-3.f)}, {static_cast(-3.f)}}, {{static_cast(-1.f)}, {static_cast(-1.f)}}}, {{{static_cast(1.f)}, {static_cast(1.f)}}, {{static_cast(3.f)}, {static_cast(3.f)}}}}) .get(), - Literal::CreateR1( + LiteralUtil::CreateR1( {static_cast(0), static_cast(0)}) .get(), - Literal::CreateR1( + LiteralUtil::CreateR1( {static_cast(16), static_cast(20)}) .get()}); diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc index 48203b1d40ea69ff00a57c2c9e42620739b23d59..0d7a3aa46a9c12c19d954c11ae3a2cccbed886ef 100644 --- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc +++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -33,9 +33,9 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) { auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR2FromArray2D(*alhs); - auto rhs = builder.ConstantR2FromArray2D(*arhs); - builder.Add(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, *alhs); + auto rhs = ConstantR2FromArray2D(&builder, *arhs); + Add(lhs, rhs); auto aexpected = ReferenceUtil::MapWithIndexArray2D( *alhs, [&](float lhs_value, int64 row, int64 col) { @@ -49,9 +49,9 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) { auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR2FromArray2D(*alhs); - auto rhs = builder.ConstantR2FromArray2D(*arhs); - builder.Add(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, *alhs); + auto rhs = ConstantR2FromArray2D(&builder, *arhs); + Add(lhs, rhs); auto aexpected = ReferenceUtil::MapWithIndexArray2D( *alhs, [&](float lhs_value, int64 row, int64 col) { @@ -65,9 +65,9 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) { auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR2FromArray2D(*alhs); - auto rhs = builder.ConstantR2FromArray2D(*arhs); - builder.Add(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, *alhs); + auto rhs = ConstantR2FromArray2D(&builder, *arhs); + Add(lhs, rhs); auto aexpected = ReferenceUtil::MapWithIndexArray2D( *alhs, [&](float lhs_value, int64 
row, int64 col) { @@ -81,9 +81,9 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR2FromArray2D(*alhs); - auto rhs = builder.ConstantR2FromArray2D(*arhs); - builder.Add(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, *alhs); + auto rhs = ConstantR2FromArray2D(&builder, *arhs); + Add(lhs, rhs); auto aexpected = ReferenceUtil::MapWithIndexArray2D( *alhs, [&](float lhs_value, int64 row, int64 col) { @@ -94,11 +94,12 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { TEST_F(BinopScalingTest, R0PlusR2F32) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR0(42.0); - auto rhs = builder.ConstantR2({ - {1.0, 2.0}, {3.0, 4.0}, - }); - builder.Add(lhs, rhs); + auto lhs = ConstantR0(&builder, 42.0); + auto rhs = ConstantR2(&builder, { + {1.0, 2.0}, + {3.0, 4.0}, + }); + Add(lhs, rhs); Array2D expected(2, 2); expected(0, 0) = 42.0 + 1.0; @@ -129,9 +130,9 @@ TEST_F(BinopScalingTest, R4PlusR0S32) { }); // clang-format on - auto lhs = builder.ConstantR4FromArray4D(lhs_array); - auto rhs = builder.ConstantR0(42); - builder.Add(lhs, rhs); + auto lhs = ConstantR4FromArray4D(&builder, lhs_array); + auto rhs = ConstantR0(&builder, 42); + Add(lhs, rhs); ComputeAndCompareR4(&builder, expected, {}); } diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc index bff60f25ec8f15d372d251ac313200301a04f20f..c6b5108fe9e5bcf843982676d822f1942359da71 100644 --- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc +++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -43,8 +43,8 @@ class BitcastConvertTest : public ClientLibraryTestBase { TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42, 64}); - builder.BitcastConvertType(a, S32); + auto a = ConstantR1(&builder, {42, 64}); + BitcastConvertType(a, S32); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}); @@ -52,8 +52,8 @@ TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) { TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0f, 64.0f}); - builder.BitcastConvertType(a, F32); + auto a = ConstantR1(&builder, {42.0f, 64.0f}); + BitcastConvertType(a, F32); std::vector expected = {42.0f, 64.0f}; ComputeAndCompareR1(&builder, expected, {}); @@ -62,10 +62,10 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) { TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) { XlaBuilder builder(TestName()); auto a = - builder.ConstantR1({0, static_cast(0x80000000), 0x3F800000, - static_cast(0xBF800000), 0x3F000000, - static_cast(0xBF000000)}); - builder.BitcastConvertType(a, F32); + ConstantR1(&builder, {0, static_cast(0x80000000), + 0x3F800000, static_cast(0xBF800000), + 0x3F000000, static_cast(0xBF000000)}); + BitcastConvertType(a, F32); std::vector expected = {0.0f, -0.0f, 1.0f, -1.0f, 0.5f, -0.5f}; ComputeAndCompareR1(&builder, expected, {}); @@ -73,8 +73,8 @@ TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) { XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.BitcastConvertType(a, F32); + auto a = 
ConstantR1(&builder, {}); + BitcastConvertType(a, F32); std::vector expected = {}; ComputeAndCompareR1(&builder, expected, {}); @@ -82,8 +82,8 @@ XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) { TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.6, 64.4}); - builder.BitcastConvertType(a, S32); + auto a = ConstantR1(&builder, {42.6, 64.4}); + BitcastConvertType(a, S32); std::vector expected = {0x422a6666, 0x4280cccd}; ComputeAndCompareR1(&builder, expected, {}); @@ -91,9 +91,9 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) { TEST_F(BitcastConvertTest, ConvertS32Extremes) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {std::numeric_limits::min(), std::numeric_limits::max()}); - builder.BitcastConvertType(a, F32); + auto a = ConstantR1(&builder, {std::numeric_limits::min(), + std::numeric_limits::max()}); + BitcastConvertType(a, F32); std::vector expected = {-0.0f, NAN}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0, 0)); @@ -102,10 +102,10 @@ TEST_F(BitcastConvertTest, ConvertS32Extremes) { TEST_F(BitcastConvertTest, ConvertMapToS32) { XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); - auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in"); - b->BitcastConvertType(param, S32); - auto a = builder.ConstantR1({42.0f, 64.0f}); - builder.Map({a}, b->BuildAndNoteError(), {0}); + auto param = Parameter(b.get(), 0, ShapeUtil::MakeShape(F32, {}), "in"); + BitcastConvertType(param, S32); + auto a = ConstantR1(&builder, {42.0f, 64.0f}); + Map(&builder, {a}, b->BuildAndNoteError(), {0}); std::vector expected = {0x42280000, 0x42800000}; ComputeAndCompareR1(&builder, expected, {}); @@ -114,10 +114,10 @@ TEST_F(BitcastConvertTest, ConvertMapToS32) { TEST_F(BitcastConvertTest, ConvertMapToF32) { XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); - auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), 
"in"); - b->BitcastConvertType(param, F32); - auto a = builder.ConstantR1({0x42280000, 0x42800000}); - builder.Map({a}, b->BuildAndNoteError(), {0}); + auto param = Parameter(b.get(), 0, ShapeUtil::MakeShape(S32, {}), "in"); + BitcastConvertType(param, F32); + auto a = ConstantR1(&builder, {0x42280000, 0x42800000}); + Map(&builder, {a}, b->BuildAndNoteError(), {0}); std::vector expected = {42.0f, 64.0f}; ComputeAndCompareR1(&builder, expected, {}); @@ -130,9 +130,9 @@ TEST_F(BitcastConvertTest, ConvertMapToF32) { // the new convert should have the same element type as the old convert. TEST_F(BitcastConvertTest, ConvertReshape) { XlaBuilder builder(TestName()); - auto input = builder.ConstantR1({0x42280000}); - auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{}); - builder.BitcastConvertType(reshape, F32); + auto input = ConstantR1(&builder, {0x42280000}); + auto reshape = Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{}); + BitcastConvertType(reshape, F32); ComputeAndCompareR0(&builder, 42.0f, {}); } diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc index 34c86e007beea1cbac04641bdbdab62dc567f13e..fe4267c73bd170f22a0456533f45e50be823a80b 100644 --- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc @@ -20,7 +20,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -37,41 +38,43 @@ class BroadcastSimpleTest : public ClientLibraryTestBase { XlaBuilder* builder) { switch (op) { case HloOpcode::kMinimum: { - return builder->Min(lhs, rhs); + return Min(lhs, rhs); } case HloOpcode::kMaximum: { - return builder->Max(lhs, rhs); + return Max(lhs, rhs); } case HloOpcode::kMultiply: { - return builder->Mul(lhs, rhs); + return Mul(lhs, rhs); } default: { // Default to Add - return builder->Add(lhs, rhs); + return Add(lhs, rhs); } } } - std::unique_ptr MakeR3Data( - tensorflow::gtl::ArraySlice bounds, - tensorflow::gtl::ArraySlice minor_to_major, Shape* r3_shape, - Array3D* r3_array, float start, float end, int seed) { + std::unique_ptr MakeR3Data(absl::Span bounds, + absl::Span minor_to_major, + Shape* r3_shape, + Array3D* r3_array, float start, + float end, int seed) { *r3_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major); r3_array->FillRandom(start, end, seed); - auto r3_data = Literal::CreateR3FromArray3D(*r3_array)->Relayout( + auto r3_data = LiteralUtil::CreateR3FromArray3D(*r3_array)->Relayout( LayoutUtil::MakeLayout(minor_to_major)); std::unique_ptr r3_global_data = client_->TransferToServer(*r3_data).ConsumeValueOrDie(); return r3_global_data; } - std::unique_ptr MakeR2Data( - tensorflow::gtl::ArraySlice bounds, - tensorflow::gtl::ArraySlice minor_to_major, Shape* r2_shape, - Array2D* r2_array, float start, float end, int seed) { + std::unique_ptr MakeR2Data(absl::Span bounds, + absl::Span minor_to_major, + Shape* r2_shape, + Array2D* r2_array, float start, + 
float end, int seed) { *r2_shape = ShapeUtil::MakeShapeWithLayout(F32, bounds, minor_to_major); r2_array->FillRandom(start, end, seed); - auto r2_data = Literal::CreateR2FromArray2D(*r2_array)->Relayout( + auto r2_data = LiteralUtil::CreateR2FromArray2D(*r2_array)->Relayout( LayoutUtil::MakeLayout(minor_to_major)); std::unique_ptr r2_global_data = client_->TransferToServer(*r2_data).ConsumeValueOrDie(); @@ -104,13 +107,13 @@ using ::testing::HasSubstr; XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR0(1.5), {}); + Broadcast(ConstantR0(&b, 1.5), {}); ComputeAndCompareR0(&b, 1.5, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR0(2.25), {2, 3}); + Broadcast(ConstantR0(&b, 2.25), {2, 3}); Array2D expected(2, 3, 2.25); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } @@ -122,7 +125,7 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { CreateR0Parameter(2.25f, /*parameter_number=*/0, /*name=*/"src", /*builder=*/&b, /*data_handle=*/&src); - b.Broadcast(src, {2, 3}); + Broadcast(src, {2, 3}); Array2D expected(2, 3, 2.25); ComputeAndCompareR2(&b, expected, {param_data.get()}, ErrorSpec(0.0001)); @@ -130,21 +133,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR0(2.25), {2, 0}); + Broadcast(ConstantR0(&b, 2.25), {2, 0}); Array2D expected(2, 0); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR0(2.25), {0, 2}); + Broadcast(ConstantR0(&b, 2.25), {0, 2}); Array2D expected(0, 2); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR1({1, 2, 3}), {2}); + Broadcast(ConstantR1(&b, {1, 2, 3}), {2}); Array2D 
expected(2, 3); expected(0, 0) = 1; @@ -156,6 +159,86 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } +XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) { + XlaBuilder b(TestName()); + BroadcastInDim(ConstantR1(&b, {1, 2}), + ShapeUtil::MakeShape(F32, {2, 2}), {1}); + + Array2D expected(2, 2); + expected(0, 0) = 1; + expected(0, 1) = 2; + expected(1, 0) = 1; + expected(1, 1) = 2; + + ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) { + XlaBuilder b(TestName()); + BroadcastInDim(ConstantR1(&b, {1, 2}), + ShapeUtil::MakeShape(F32, {2, 2}), {0}); + + Array2D expected(2, 2); + expected(0, 0) = 1; + expected(0, 1) = 1; + expected(1, 0) = 2; + expected(1, 1) = 2; + + ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) { + XlaBuilder b(TestName()); + BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), + ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 1}); + + Array3D expected(2, 2, 2); + expected(0, 0, 0) = 1.0; + expected(1, 0, 0) = 2.0; + expected(0, 0, 1) = 1.0; + expected(1, 0, 1) = 2.0; + expected(0, 1, 0) = 5.0; + expected(1, 1, 0) = 6.0; + expected(1, 1, 1) = 6.0; + expected(0, 1, 1) = 5.0; + + ComputeAndCompareR3(&b, expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) { + XlaBuilder b(TestName()); + BroadcastInDim(ConstantR2(&b, {{1.0, 5.0}, {2.0, 6.0}}), + ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 2}); + + Array3D expected(2, 2, 2); + expected(0, 0, 0) = 1.0; + expected(1, 0, 0) = 2.0; + expected(0, 0, 1) = 5.0; + expected(1, 0, 1) = 6.0; + expected(0, 1, 0) = 1.0; + expected(1, 1, 0) = 2.0; + expected(1, 1, 1) = 6.0; + expected(0, 1, 1) = 5.0; + + ComputeAndCompareR3(&b, expected, {}, ErrorSpec(0.0001)); +} + +XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsNotPossibleWithBroadCast) { + XlaBuilder b(TestName()); 
+ BroadcastInDim(ConstantR1(&b, {1, 2}), + ShapeUtil::MakeShape(F32, {3, 2}), {1}); + + Array2D expected(3, 2); + expected(0, 0) = 1; + expected(0, 1) = 2; + expected(1, 0) = 1; + expected(1, 1) = 2; + expected(2, 0) = 1; + expected(2, 1) = 2; + + ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); +} + // Tests implicit broadcasting of PREDs. XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { XlaBuilder b(TestName()); @@ -172,7 +255,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { XlaOp x, y; auto x_data = CreateR2Parameter(x_vals, 0, "x", &b, &x); auto y_data = CreateR3Parameter(y_vals, 1, "y", &b, &y); - b.And(x, y, /*broadcast_dimensions=*/{1, 2}); + And(x, y, /*broadcast_dimensions=*/{1, 2}); Array3D expected(2, 2, 1); expected(0, 0, 0) = false; @@ -185,7 +268,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR1({}), {2}); + Broadcast(ConstantR1(&b, {}), {2}); Array2D expected(2, 0); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); @@ -193,7 +276,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) { XlaBuilder b(TestName()); - b.Broadcast(b.ConstantR1({1, 2, 3}), {0}); + Broadcast(ConstantR1(&b, {1, 2, 3}), {0}); Array2D expected(0, 3); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); @@ -209,14 +292,14 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) { // dimensions. 
XlaBuilder b(TestName()); - b.Add(b.ConstantR2({{1.0, 5.0}}), - b.ConstantLiteral(*Literal::CreateR3( - {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})), - /*broadcast_dimensions=*/{1, 2}); + Add(ConstantR2(&b, {{1.0, 5.0}}), + ConstantLiteral(&b, *LiteralUtil::CreateR3( + {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})), + /*broadcast_dimensions=*/{1, 2}); auto expected = - Literal::CreateR3({{{3.0, 7.0}, {4.0, 8.0}, {5.0, 9.0}}, - {{6.0, 10.0}, {7.0, 11.0}, {8.0, 12.0}}}); + LiteralUtil::CreateR3({{{3.0, 7.0}, {4.0, 8.0}, {5.0, 9.0}}, + {{6.0, 10.0}, {7.0, 11.0}, {8.0, 12.0}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } @@ -260,13 +343,14 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { MakeR3Data(spec.input_bounds, spec.minor2major_layout, &r3_implicit_shape, &r3_implicit_array, 1.0, 0.2, 56789); - auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input"); - auto r3_parameter = builder.Parameter(1, r3_shape, "input"); - XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); + auto r3_implicit_parameter = + Parameter(&builder, 0, r3_implicit_shape, "input"); + auto r3_parameter = Parameter(&builder, 1, r3_shape, "input"); + BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); Array3D expected_array(spec.output_bounds[0], spec.output_bounds[1], spec.output_bounds[2]); - auto Each = ([&](tensorflow::gtl::ArraySlice indices, float* value) { + auto Each = ([&](absl::Span indices, float* value) { float r3_implicit = r3_implicit_array(indices[0] % spec.input_bounds[0], indices[1] % spec.input_bounds[1], indices[2] % spec.input_bounds[2]); @@ -284,7 +368,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { } } } - auto expected = Literal::CreateR3FromArray3D(expected_array); + auto expected = LiteralUtil::CreateR3FromArray3D(expected_array); ComputeAndCompareLiteral( &builder, *expected, {r3_implicit_global_data.get(), r3_global_data.get()}, @@ -306,10 +390,10 @@ XLA_TEST_F(BroadcastSimpleTest, 
Add3DTo3DDegenerate_1_2) { auto r1 = CreateR3Parameter(r1d, 1, "r1", &b, &r1h); auto r3 = CreateR3Parameter(r3d, 0, "r3", &b, &r3h); - b.Add(r3h, r1h); + Add(r3h, r1h); auto expected = - Literal::CreateR3({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}}); + LiteralUtil::CreateR3({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}}); ComputeAndCompareLiteral(&b, *expected, {r3.get(), r1.get()}, ErrorSpec(0.0001)); @@ -317,79 +401,81 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r3, r1); + auto r1 = ConstantLiteral(&b, *LiteralUtil::CreateR3({{{1, 2}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}}); + LiteralUtil::CreateR3({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r3, r1); + auto r1 = ConstantLiteral(&b, *LiteralUtil::CreateR3({{{1}, {2}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}}); + LiteralUtil::CreateR3({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}, {3, 4}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, 
{7, 8}}})); - b.Add(r3, r1); + auto r1 = + ConstantLiteral(&b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}}); + LiteralUtil::CreateR3({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}, {{3, 4}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r3, r1); + auto r1 = + ConstantLiteral(&b, *LiteralUtil::CreateR3({{{1, 2}}, {{3, 4}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}}); + LiteralUtil::CreateR3({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { XlaBuilder b(TestName()); - auto r1 = - b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}, {{3}, {4}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r3, r1); + auto r1 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}}); + LiteralUtil::CreateR3({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}}})); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 
6}, {7, 8}}})); - b.Add(r3, r1); + auto r1 = ConstantLiteral(&b, *LiteralUtil::CreateR3({{{1}}})); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1); auto expected = - Literal::CreateR3({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}}); + LiteralUtil::CreateR3({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } @@ -509,14 +595,14 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { &r2_implicit_shape2, &r2_implicit_array2, 0.8, 0.4, 56789); auto r2_implicit_parameter1 = - builder.Parameter(0, r2_implicit_shape1, "input0"); - auto r2_parameter = builder.Parameter(1, r2_shape, "input1"); + Parameter(&builder, 0, r2_implicit_shape1, "input0"); + auto r2_parameter = Parameter(&builder, 1, r2_shape, "input1"); auto r2_implicit_parameter2 = - builder.Parameter(2, r2_implicit_shape2, "input2"); + Parameter(&builder, 2, r2_implicit_shape2, "input2"); XlaOp op1 = BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder); - XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); + BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); Array2D expected_array(spec.output_bounds[0], spec.output_bounds[1]); @@ -530,7 +616,7 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { *v = ApplyOpToFloats(spec.op2, tmp, v3); }); - auto expected = Literal::CreateR2FromArray2D(expected_array); + auto expected = LiteralUtil::CreateR2FromArray2D(expected_array); ComputeAndCompareLiteral( &builder, *expected, {r2_implicit_global_data1.get(), r2_global_data.get(), @@ -544,80 +630,82 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances, XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}})); - auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); - b.Add(r2, r1); + auto r1 = ConstantLiteral(&b, *LiteralUtil::CreateR2({{1, 2}})); + auto r2 = + ConstantLiteral(&b, 
*LiteralUtil::CreateR2({{1, 2}, {3, 4}})); + Add(r2, r1); - auto expected = Literal::CreateR2({{2, 4}, {4, 6}}); + auto expected = LiteralUtil::CreateR2({{2, 4}, {4, 6}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { XlaBuilder b(TestName()); - auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1}, {2}})); - auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); - b.Add(r2, r1); + auto r1 = ConstantLiteral(&b, *LiteralUtil::CreateR2({{1}, {2}})); + auto r2 = + ConstantLiteral(&b, *LiteralUtil::CreateR2({{1, 2}, {3, 4}})); + Add(r2, r1); - auto expected = Literal::CreateR2({{2, 3}, {5, 6}}); + auto expected = LiteralUtil::CreateR2({{2, 3}, {5, 6}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { XlaBuilder b(TestName()); - auto r1 = b.ConstantR1({10, 20}); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r3, r1, {0}); + auto r1 = ConstantR1(&b, {10, 20}); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r3, r1, {0}); - auto expected = - Literal::CreateR3({{{11, 12}, {13, 14}}, {{25, 26}, {27, 28}}}); + auto expected = LiteralUtil::CreateR3( + {{{11, 12}, {13, 14}}, {{25, 26}, {27, 28}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { XlaBuilder b(TestName()); - auto r1 = b.ConstantR1({10, 20}); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r1, r3, {1}); + auto r1 = ConstantR1(&b, {10, 20}); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r1, r3, {1}); - auto expected = - Literal::CreateR3({{{11, 12}, {23, 24}}, {{15, 16}, {27, 28}}}); + auto expected = LiteralUtil::CreateR3( + {{{11, 12}, {23, 24}}, {{15, 16}, {27, 28}}}); 
ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { XlaBuilder b(TestName()); - auto r1 = b.ConstantR1({10, 20}); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); - b.Add(r1, r3, {2}); + auto r1 = ConstantR1(&b, {10, 20}); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + Add(r1, r3, {2}); - auto expected = - Literal::CreateR3({{{11, 22}, {13, 24}}, {{15, 26}, {17, 28}}}); + auto expected = LiteralUtil::CreateR3( + {{{11, 22}, {13, 24}}, {{15, 26}, {17, 28}}}); ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { XlaBuilder b(TestName()); - auto r1_0 = b.ConstantR1({1000, 2000}); - auto r1_1 = b.ConstantR1({100, 200}); - auto r1_2 = b.ConstantR1({10, 20}); - auto r3 = b.ConstantLiteral( - *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); + auto r1_0 = ConstantR1(&b, {1000, 2000}); + auto r1_1 = ConstantR1(&b, {100, 200}); + auto r1_2 = ConstantR1(&b, {10, 20}); + auto r3 = ConstantLiteral( + &b, *LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); for (int i = 0; i < 3; ++i) { - r3 = b.Add(r1_0, r3, {0}); - r3 = b.Add(r3, r1_1, {1}); - r3 = b.Add(r1_2, r3, {2}); + r3 = Add(r1_0, r3, {0}); + r3 = Add(r3, r1_1, {1}); + r3 = Add(r1_2, r3, {2}); } - r3 = b.Mul(r3, b.ConstantR0(-2)); + r3 = Mul(r3, ConstantR0(&b, -2)); - auto expected = Literal::CreateR3( + auto expected = LiteralUtil::CreateR3( {{{-6 * 1110 - 2, -6 * 1120 - 4}, {-6 * 1210 - 6, -6 * 1220 - 8}}, {{-6 * 2110 - 10, -6 * 2120 - 12}, {-6 * 2210 - 14, -6 * 2220 - 16}}}); @@ -626,19 +714,19 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { XlaBuilder b(TestName()); - auto r1_0 = b.ConstantR1({1000, 2000}); - auto r1_1 = b.ConstantR1({100, 200}); - auto r1_2 = b.ConstantR1({10, 20}); - auto r0 = 
b.ConstantR0(3); - auto r3 = b.Broadcast(r0, {2, 2, 2}); + auto r1_0 = ConstantR1(&b, {1000, 2000}); + auto r1_1 = ConstantR1(&b, {100, 200}); + auto r1_2 = ConstantR1(&b, {10, 20}); + auto r0 = ConstantR0(&b, 3); + auto r3 = Broadcast(r0, {2, 2, 2}); for (int i = 0; i < 3; ++i) { - r3 = b.Add(r1_0, r3, {0}); - r3 = b.Add(r3, r1_1, {1}); - r3 = b.Add(r1_2, r3, {2}); + r3 = Add(r1_0, r3, {0}); + r3 = Add(r3, r1_1, {1}); + r3 = Add(r1_2, r3, {2}); } - r3 = b.Mul(r3, b.ConstantR0(-1)); + r3 = Mul(r3, ConstantR0(&b, -1)); - auto expected = Literal::CreateR3( + auto expected = LiteralUtil::CreateR3( {{{-3 * 1110 - 3, -3 * 1120 - 3}, {-3 * 1210 - 3, -3 * 1220 - 3}}, {{-3 * 2110 - 3, -3 * 2120 - 3}, {-3 * 2210 - 3, -3 * 2220 - 3}}}); @@ -650,10 +738,10 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { // results in a shape incompatible with the lhs [2, 3, 1]. XlaBuilder b(TestName()); - b.Add(b.ConstantR2({{1.0, 5.0}, {1.0, 5.0}}), - b.ConstantLiteral(*Literal::CreateR3( - {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})), - /*broadcast_dimensions=*/{1, 2}); + Add(ConstantR2(&b, {{1.0, 5.0}, {1.0, 5.0}}), + ConstantLiteral(&b, *LiteralUtil::CreateR3( + {{{2.0}, {3.0}, {4.0}}, {{5.0}, {6.0}, {7.0}}})), + /*broadcast_dimensions=*/{1, 2}); auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); @@ -665,26 +753,26 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. 
XlaBuilder b(TestName()); - b.Add(b.ConstantR2({{1.0, 2.0}}), - b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); + Add(ConstantR2(&b, {{1.0, 2.0}}), + ConstantR2(&b, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); EXPECT_THAT(result_status.status().error_message(), - HasSubstr("op BINOP_ADD with incompatible shapes")); + HasSubstr("op add with incompatible shapes")); } XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. XlaBuilder b(TestName()); - b.Add(b.ConstantR2({{1.0, 2.0}}), - b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); + Add(ConstantR2(&b, {{1.0, 2.0}}), + ConstantR2(&b, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); auto result_status = Execute(&b, {}); EXPECT_FALSE(result_status.ok()); EXPECT_THAT(result_status.status().error_message(), - HasSubstr("op BINOP_ADD with incompatible shapes")); + HasSubstr("op add with incompatible shapes")); } } // namespace diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc index 51b9f0d3e330e73f5d110f0a62f824179d5c7cf7..74d4d2eb10c32b270a83aa04dd2e6025d7a56c26 100644 --- a/tensorflow/compiler/xla/tests/broadcast_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -37,7 +37,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) { // Test degenerate case of broadcasting a scalar into a scalar. 
auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {}), input, {})); @@ -46,14 +46,14 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR0(42.0), *result, - error_spec_)); + EXPECT_TRUE(LiteralTestUtil::Near(*LiteralUtil::CreateR0(42.0), + *result, error_spec_)); } XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {2, 2}), input, {})); @@ -63,14 +63,14 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { auto result = ExecuteAndTransfer(std::move(hlo_module), {}); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{42.0, 42.0}, {42.0, 42.0}}), *result, + *LiteralUtil::CreateR2({{42.0, 42.0}, {42.0, 42.0}}), *result, error_spec_)); } XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); // Broadcast vector in both dimension 0 and dimension 1. Join them in a tuple // to enable testing of the results. 
@@ -86,18 +86,18 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { auto result = ExecuteAndTransfer(std::move(hlo_module), {}); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}), + *LiteralUtil::CreateR2({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}), LiteralSlice(*result, {0}), error_spec_)); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}), + *LiteralUtil::CreateR2({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}), LiteralSlice(*result, {1}), error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1})); @@ -106,9 +106,9 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - EXPECT_TRUE( - LiteralTestUtil::Near(*Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}), - *result, error_spec_)); + EXPECT_TRUE(LiteralTestUtil::Near( + *LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}), *result, + error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { @@ -116,7 +116,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { // the dimensions, ie transpose. 
auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0})); @@ -125,15 +125,15 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - EXPECT_TRUE( - LiteralTestUtil::Near(*Literal::CreateR2({{1.0, 3.0}, {2.0, 4.0}}), - *result, error_spec_)); + EXPECT_TRUE(LiteralTestUtil::Near( + *LiteralUtil::CreateR2({{1.0, 3.0}, {2.0, 4.0}}), *result, + error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2})); @@ -143,15 +143,15 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { auto result = ExecuteAndTransfer(std::move(hlo_module), {}); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR3({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}}, - {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}), + *LiteralUtil::CreateR3({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}), *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1.0, 2.0}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1.0, 2.0}))); // Broadcast vector in dimension 1. 
builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -166,8 +166,9 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { Array2D pz({{1, 2}, {1, 2}}); expected.FillWithPZ(pz); - EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); + EXPECT_TRUE( + LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(expected), + *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { @@ -176,7 +177,7 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { int64 r1_size = input_data.size(); std::iota(input_data.begin(), input_data.end(), 0.0f); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1(input_data))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1(input_data))); // Broadcast vector in dimension 3. builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -196,8 +197,9 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { } expected.FillWithYX(yx); - EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); + EXPECT_TRUE( + LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(expected), + *result, error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { @@ -207,7 +209,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { std::vector r1_array(64, 42.0); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1(r1_array))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1(r1_array))); // Broadcast vector in dimension 1. 
builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -218,14 +220,14 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR4FromArray4D(r4_array), + EXPECT_TRUE(LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(r4_array), *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { auto builder = HloComputation::Builder(TestName()); auto input = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {})); @@ -238,15 +240,16 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { Array4D expected(64, 64, 3, 3); expected.Fill(1.0f); - EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); + EXPECT_TRUE( + LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(expected), + *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { auto builder = HloComputation::Builder(TestName()); Array2D to_broadcast({{1.0f, 2.0f}, {3.0f, 4.0f}}); auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2FromArray2D(to_broadcast))); + LiteralUtil::CreateR2FromArray2D(to_broadcast))); // Broadcast vector in dimensions 2 and 3. 
builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -260,8 +263,9 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { Array4D expected(3, 3, 2, 2); expected.FillWithYX(to_broadcast); - EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); + EXPECT_TRUE( + LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(expected), + *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { @@ -280,7 +284,7 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { } } auto input = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR3FromArray3D(input_vals))); + LiteralUtil::CreateR3FromArray3D(input_vals))); // Broadcast vector in dimensions 2 and 3. builder.AddInstruction(HloInstruction::CreateBroadcast( @@ -291,8 +295,9 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); + EXPECT_TRUE( + LiteralTestUtil::Near(*LiteralUtil::CreateR4FromArray4D(expected), + *result, error_spec_)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc index 5fd33b50c94356839bbed58acd43b7d0286f4a7e..b1d18210eaafdfec0920c0cccaa0dfdbd6de5609 100644 --- a/tensorflow/compiler/xla/tests/call_test.cc +++ b/tensorflow/compiler/xla/tests/call_test.cc @@ -16,8 +16,9 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -34,7 +35,7 @@ class CallOpTest : public ClientLibraryTestBase { protected: XlaComputation CreateR0F32IdentityComputation() { XlaBuilder builder("Identity"); - builder.Parameter(0, r0f32_, "x"); + Parameter(&builder, 0, r0f32_, "x"); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -42,9 +43,9 @@ class CallOpTest : public ClientLibraryTestBase { XlaComputation CreateR1S0F32AdditionComputation() { XlaBuilder builder("Addition"); - auto x = builder.Parameter(0, r1s0f32_, "x"); - auto y = builder.Parameter(1, r1s0f32_, "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, r1s0f32_, "x"); + auto y = Parameter(&builder, 1, r1s0f32_, "y"); + Add(x, y); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -52,9 +53,9 @@ class CallOpTest : public ClientLibraryTestBase { XlaComputation CreateR1S2F32AdditionComputation() { XlaBuilder builder("Addition"); - auto x = builder.Parameter(0, r1s2f32_, "x"); - auto y = builder.Parameter(1, r1s2f32_, "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, r1s2f32_, "x"); + auto y = Parameter(&builder, 1, r1s2f32_, "y"); + Add(x, y); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -62,7 +63,7 @@ class CallOpTest : public ClientLibraryTestBase { XlaComputation CreateR0F32TupleComputation() { XlaBuilder builder("Tuple"); - builder.Tuple({builder.Parameter(0, 
r0f32_, "x")}); + Tuple(&builder, {Parameter(&builder, 0, r0f32_, "x")}); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -76,8 +77,9 @@ class CallOpTest : public ClientLibraryTestBase { XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) { XlaBuilder builder(TestName()); XlaComputation callee = CreateR0F32IdentityComputation(); - auto constant = builder.ConstantLiteral(*Literal::CreateR0(42.0)); - builder.Call(callee, {constant}); + auto constant = + ConstantLiteral(&builder, *LiteralUtil::CreateR0(42.0)); + Call(&builder, callee, {constant}); ComputeAndCompareR0(&builder, 42.0, {}, ErrorSpec(0.01f)); } @@ -85,9 +87,9 @@ XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) { XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) { XlaBuilder builder(TestName()); XlaComputation callee = CreateR1S0F32AdditionComputation(); - auto x = builder.ConstantLiteral(*Literal::CreateR1({})); - auto y = builder.ConstantLiteral(*Literal::CreateR1({})); - builder.Call(callee, {x, y}); + auto x = ConstantLiteral(&builder, *LiteralUtil::CreateR1({})); + auto y = ConstantLiteral(&builder, *LiteralUtil::CreateR1({})); + Call(&builder, callee, {x, y}); ComputeAndCompareR1(&builder, {}, {}, ErrorSpec(0.01f)); } @@ -95,9 +97,11 @@ XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) { XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) { XlaBuilder builder(TestName()); XlaComputation callee = CreateR1S2F32AdditionComputation(); - auto x = builder.ConstantLiteral(*Literal::CreateR1({1.0f, 2.0f})); - auto y = builder.ConstantLiteral(*Literal::CreateR1({2.0f, 3.0f})); - builder.Call(callee, {x, y}); + auto x = + ConstantLiteral(&builder, *LiteralUtil::CreateR1({1.0f, 2.0f})); + auto y = + ConstantLiteral(&builder, *LiteralUtil::CreateR1({2.0f, 3.0f})); + Call(&builder, callee, {x, y}); ComputeAndCompareR1(&builder, {3.0f, 5.0f}, {}, ErrorSpec(0.01f)); } @@ -105,40 +109,40 @@ XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) { XLA_TEST_F(CallOpTest, 
CallTreeTwoDeepBranchFactorThree) { XlaBuilder builder("inner"); { - auto x = builder.Parameter(0, r0f32_, "x"); - builder.Add(x, builder.ConstantR0(1.0)); + auto x = Parameter(&builder, 0, r0f32_, "x"); + Add(x, ConstantR0(&builder, 1.0)); } TF_ASSERT_OK_AND_ASSIGN(XlaComputation inner, builder.Build()); XlaBuilder builder2("outer"); { - auto x = builder2.Parameter(0, r0f32_, "x"); - x = builder2.Call(inner, {x}); - x = builder2.Call(inner, {x}); - x = builder2.Call(inner, {x}); + auto x = Parameter(&builder2, 0, r0f32_, "x"); + x = Call(&builder2, inner, {x}); + x = Call(&builder2, inner, {x}); + x = Call(&builder2, inner, {x}); } TF_ASSERT_OK_AND_ASSIGN(XlaComputation outer, builder2.Build()); XlaBuilder builder3("outermost"); { - auto x = builder3.Parameter(0, r0f32_, "x"); - x = builder3.Call(outer, {x}); - x = builder3.Call(outer, {x}); - x = builder3.Call(outer, {x}); + auto x = Parameter(&builder3, 0, r0f32_, "x"); + x = Call(&builder3, outer, {x}); + x = Call(&builder3, outer, {x}); + x = Call(&builder3, outer, {x}); } TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr start, - client_->TransferToServer(*Literal::CreateR0(1.0f))); + client_->TransferToServer(*LiteralUtil::CreateR0(1.0f))); ComputeAndCompareR0(&builder3, 10.0f, {start.get()}, ErrorSpec(0.0f)); } XLA_TEST_F(CallOpTest, CallR0F32Tuple) { XlaBuilder builder(TestName()); XlaComputation callee = CreateR0F32TupleComputation(); - auto elem = Literal::CreateR0(42.0); - auto tuple = Literal::MakeTuple({elem.get()}); - builder.Call(callee, {builder.ConstantLiteral(*elem)}); + auto elem = LiteralUtil::CreateR0(42.0); + auto tuple = LiteralUtil::MakeTuple({elem.get()}); + Call(&builder, callee, {ConstantLiteral(&builder, *elem)}); ComputeAndCompareTuple(&builder, *tuple, {}, ErrorSpec(0.01f)); } diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc index 
660ff0cad5666219a4a7cb1eedbed03f06e651ba..a4eb57fc7b9abd460a7d158d0dc629eba88018cd 100644 --- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc +++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc @@ -17,8 +17,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -36,11 +36,11 @@ class CheckExecutionArityTest : public ClientLibraryTestBase {}; TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { XlaBuilder builder("add_two_params"); - auto param_literal = Literal::CreateR1({1.1f, 2.2f}); + auto param_literal = LiteralUtil::CreateR1({1.1f, 2.2f}); - auto p0 = builder.Parameter(0, param_literal->shape(), "param0"); - auto p1 = builder.Parameter(1, param_literal->shape(), "param1"); - auto add = builder.Add(p0, p1); + auto p0 = Parameter(&builder, 0, param_literal->shape(), "param0"); + auto p1 = Parameter(&builder, 1, param_literal->shape(), "param1"); + Add(p0, p1); auto param0_data = client_->TransferToServer(*param_literal).ConsumeValueOrDie(); @@ -77,20 +77,20 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) { XlaBuilder builder("add_two_params"); - auto p0 = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); - auto p1 = builder.Parameter(1, ShapeUtil::MakeShape(F32, {4}), "param1"); - auto add = builder.Mul(p0, p1); + auto p0 = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "param0"); + auto p1 = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {4}), "param1"); + Mul(p0, p1); auto 
computation_status = builder.Build(); ASSERT_IS_OK(computation_status.status()); auto computation = computation_status.ConsumeValueOrDie(); - auto f32_literal = Literal::CreateR0(1.1f); + auto f32_literal = LiteralUtil::CreateR0(1.1f); auto f32_data = client_->TransferToServer(*f32_literal).ConsumeValueOrDie(); - auto f32_4_literal = Literal::CreateR1({1.0f, 2.0f, 3.0f, 4.0f}); + auto f32_4_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f, 4.0f}); auto f32_4_data = client_->TransferToServer(*f32_4_literal).ConsumeValueOrDie(); - auto u8_4_literal = Literal::CreateR1U8("hola"); + auto u8_4_literal = LiteralUtil::CreateR1U8("hola"); auto u8_4_data = client_->TransferToServer(*u8_4_literal).ConsumeValueOrDie(); // Match diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index bf8ed4d9fb0bc61b86ef0b5872711a122a3d416b..8a236db0ff2f63332892de822461dd1cc17276ca 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -17,17 +17,18 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -94,15 +95,14 @@ string ClientLibraryTestBase::TestName() const { } StatusOr> ClientLibraryTestBase::Execute( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span arguments) { // Build the computation, as a convenience. TF_ASSIGN_OR_RETURN(auto computation, builder->Build()); return client_->Execute(computation, arguments, &execution_options_); } StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( - const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + const XlaComputation& computation, absl::Span arguments, const Shape* shape_with_output_layout) { ExecutionOptions execution_options = execution_options_; if (shape_with_output_layout != nullptr) { @@ -114,7 +114,7 @@ StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( } StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments, + XlaBuilder* builder, absl::Span arguments, const Shape* shape_with_output_layout) { // Build the computation, as a convenience. 
TF_ASSIGN_OR_RETURN(auto computation, builder->Build()); @@ -123,8 +123,7 @@ StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( StatusOr> ClientLibraryTestBase::ExecuteAndTransferReference( - const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + const XlaComputation& computation, absl::Span arguments, const Shape* shape_with_output_layout) { ExecutionOptions execution_options = execution_options_; if (shape_with_output_layout != nullptr) { @@ -137,7 +136,7 @@ ClientLibraryTestBase::ExecuteAndTransferReference( } string ClientLibraryTestBase::ExecuteToString( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span arguments) { auto computation_status = builder->Build(); if (!computation_status.ok()) { return computation_status.status().ToString(); @@ -155,23 +154,22 @@ string ClientLibraryTestBase::ExecuteToString( void ClientLibraryTestBase::ComputeAndCompareR1( XlaBuilder* builder, const tensorflow::core::Bitmap& expected, - tensorflow::gtl::ArraySlice arguments) { - std::unique_ptr expected_literal = Literal::CreateR1(expected); + absl::Span arguments) { + std::unique_ptr expected_literal = LiteralUtil::CreateR1(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } void ClientLibraryTestBase::ComputeAndCompareLiteral( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_layout) { + absl::Span arguments, const Shape* shape_with_layout) { EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments, shape_with_layout)); } void ClientLibraryTestBase::ComputeAndCompareLiteral( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error, + absl::Span arguments, ErrorSpec error, const Shape* shape_with_layout) { EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments, error, shape_with_layout)); @@ -179,7 +177,7 @@ 
void ClientLibraryTestBase::ComputeAndCompareLiteral( Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( const xla::XlaComputation& computation, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const std::function& verify_output) { // Try with no layout requirement. @@ -195,8 +193,8 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( AsInt64Slice(expected.shape().dimensions()), minor_to_major); TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments, &layout)); - verify_output(*actual, tensorflow::strings::StrCat( - "Test with output layout: ", + verify_output(*actual, + absl::StrCat("Test with output layout: ", ShapeUtil::HumanStringWithLayout(layout))); } while (std::next_permutation(minor_to_major.begin(), minor_to_major.end())); return Status::OK(); @@ -204,7 +202,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( const xla::XlaComputation& computation, const Literal& /*expected*/, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const std::function& verify_output, const Shape* output_with_layout) { @@ -251,13 +249,12 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( // Every argument has an assigned layout. 
TF_ASSIGN_OR_RETURN( auto actual, - ExecuteAndTransfer( - computation, - tensorflow::gtl::ArraySlice(arguments_with_layout), - output_with_layout)); + ExecuteAndTransfer(computation, + absl::Span(arguments_with_layout), + output_with_layout)); string error_message = "Test with input layouts: "; for (const auto& str : layout_strings) { - tensorflow::strings::StrAppend(&error_message, str, " "); + absl::StrAppend(&error_message, str, " "); } verify_output(*actual, error_message); return Status::OK(); @@ -268,14 +265,20 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments_passed_in, + absl::Span arguments_passed_in, const Shape* shape_with_layout) { std::vector arguments(arguments_passed_in.begin(), arguments_passed_in.end()); + + // Transfer and use elements of arguments_, if the AddParam() API was used. + std::vector> owning_arguments; if (!arguments_.empty()) { CHECK(arguments.empty()); for (const auto& argument : arguments_) { - arguments.push_back(argument.get()); + owning_arguments.push_back( + client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument)) + .ValueOrDie()); + arguments.push_back(owning_arguments.back().get()); } } @@ -283,10 +286,6 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( if (ShapeUtil::ElementIsFloating(expected.shape()) || ShapeUtil::ElementIsComplex(expected.shape())) { LOG(WARNING) << "performing exact comparison of floating point numbers"; - } else { - TF_RET_CHECK(ShapeUtil::ElementIsIntegral(expected.shape()) || - expected.shape().element_type() == PRED) - << ShapeUtil::HumanString(expected.shape()); } // We allow using a float expected literal for a bfloat16 output. In this // case, we need to convert the expected literal to bfloat16. 
@@ -294,7 +293,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( std::unique_ptr converted_expected; Shape layout_shape; if (use_bfloat16_) { - converted_expected = Literal::ConvertF32ToBF16(expected); + converted_expected = LiteralUtil::ConvertF32ToBF16(expected); expected_ptr = converted_expected.get(); if (shape_with_layout != nullptr) { layout_shape = *shape_with_layout; @@ -326,19 +325,23 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments_passed_in, - ErrorSpec error, const Shape* shape_with_layout) { + absl::Span arguments_passed_in, ErrorSpec error, + const Shape* shape_with_layout) { std::vector arguments(arguments_passed_in.begin(), arguments_passed_in.end()); + + // Transfer and use elements of arguments_, if the AddParam() API was used. + std::vector> owning_arguments; if (!arguments_.empty()) { CHECK(arguments.empty()); for (const auto& argument : arguments_) { - arguments.push_back(argument.get()); + owning_arguments.push_back( + client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument)) + .ValueOrDie()); + arguments.push_back(owning_arguments.back().get()); } } - TF_RET_CHECK(ShapeUtil::ElementIsFloating(expected.shape()) || - ShapeUtil::ElementIsComplex(expected.shape())); TF_ASSIGN_OR_RETURN(auto computation, builder->Build()); // We allow using a float expected literal for a bfloat16 output. In this // case, we need to convert the expected literal to bfloat16. 
@@ -346,7 +349,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( std::unique_ptr converted_expected; Shape layout_shape; if (use_bfloat16_) { - converted_expected = Literal::ConvertF32ToBF16(expected); + converted_expected = LiteralUtil::ConvertF32ToBF16(expected); expected_ptr = converted_expected.get(); if (shape_with_layout != nullptr) { layout_shape = *shape_with_layout; @@ -378,8 +381,8 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } void ClientLibraryTestBase::ComputeAndCompareR1U8( - XlaBuilder* builder, tensorflow::StringPiece expected, - tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::string_view expected, + absl::Span arguments) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); if (!actual_status.ok()) { @@ -388,7 +391,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8( auto actual = actual_status.ConsumeValueOrDie(); // Turn the expected value into a literal. 
- std::unique_ptr expected_literal = Literal::CreateR1U8(expected); + std::unique_ptr expected_literal = LiteralUtil::CreateR1U8(expected); VLOG(1) << "expected: " << expected_literal->ToString(); VLOG(1) << "actual: " << actual->ToString(); @@ -398,7 +401,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8( void ClientLibraryTestBase::ComputeAndCompareTuple( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); if (!actual_status.ok()) { @@ -410,7 +413,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple( void ClientLibraryTestBase::ComputeAndCompareTuple( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + absl::Span arguments, ErrorSpec error) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); if (!actual_status.ok()) { @@ -421,7 +424,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple( } void ClientLibraryTestBase::ComputeAndCompare( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span arguments) { auto status_or_data = ComputeValueAndReference(builder, arguments); EXPECT_IS_OK(status_or_data); if (!status_or_data.ok()) { @@ -433,8 +436,7 @@ void ClientLibraryTestBase::ComputeAndCompare( } void ClientLibraryTestBase::ComputeAndCompare( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments, - ErrorSpec error) { + XlaBuilder* builder, absl::Span arguments, ErrorSpec error) { auto status_or_data = ComputeValueAndReference(builder, arguments); EXPECT_IS_OK(status_or_data); if (!status_or_data.ok()) { @@ -447,12 +449,20 @@ void ClientLibraryTestBase::ComputeAndCompare( StatusOr, std::unique_ptr>> ClientLibraryTestBase::ComputeValueAndReference( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span 
arguments) { // Transfer the arguments to the executor service. We put the unique_ptr's // into a vector to keep the data alive on the service until the end of this // function. std::vector> argument_data; std::vector> ref_argument_data; + + // Use `arguments_` if the AddParam() API was used. Otherwise, use + // plain `arguments`. + if (!arguments_.empty()) { + CHECK_EQ(arguments.size(), 0); + arguments = arguments_; + } + for (const auto& arg : arguments) { TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone())); TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg)); @@ -486,11 +496,11 @@ ClientLibraryTestBase::ComputeValueAndReference( XlaComputation ClientLibraryTestBase::CreateScalarRelu() { XlaBuilder builder("relu"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); - auto z_value = builder.Parameter(0, shape, "z_value"); + auto z_value = Parameter(&builder, 0, shape, "z_value"); auto zero = use_bfloat16_ - ? builder.ConstantR0(static_cast(0.0f)) - : builder.ConstantR0(0.0f); - builder.Max(z_value, zero); + ? ConstantR0(&builder, static_cast(0.0f)) + : ConstantR0(&builder, 0.0f); + Max(z_value, zero); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -499,9 +509,9 @@ XlaComputation ClientLibraryTestBase::CreateScalarRelu() { XlaComputation ClientLibraryTestBase::CreateScalarMax() { XlaBuilder builder("max"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? 
BF16 : F32, {}); - auto x = builder.Parameter(0, shape, "x"); - auto y = builder.Parameter(1, shape, "y"); - builder.Max(x, y); + auto x = Parameter(&builder, 0, shape, "x"); + auto y = Parameter(&builder, 1, shape, "y"); + Max(x, y); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -510,13 +520,13 @@ XlaComputation ClientLibraryTestBase::CreateScalarMax() { XlaComputation ClientLibraryTestBase::CreateScalarReluSensitivity() { XlaBuilder builder("relu_sensitivity"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); - auto activation = builder.Parameter(0, shape, "activation"); - auto backprop = builder.Parameter(1, shape, "backprop"); + auto activation = Parameter(&builder, 0, shape, "activation"); + auto backprop = Parameter(&builder, 1, shape, "backprop"); auto zero = use_bfloat16_ - ? builder.ConstantR0(static_cast(0.0f)) - : builder.ConstantR0(0.0f); - auto activation_gtz = builder.Gt(activation, zero); - builder.Select(activation_gtz, /*on_true=*/backprop, /*on_false=*/zero); + ? 
ConstantR0(&builder, static_cast(0.0f)) + : ConstantR0(&builder, 0.0f); + auto activation_gtz = Gt(activation, zero); + Select(activation_gtz, /*on_true=*/backprop, /*on_false=*/zero); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); @@ -525,7 +535,7 @@ XlaComputation ClientLibraryTestBase::CreateScalarReluSensitivity() { std::unique_ptr> ClientLibraryTestBase::CreatePatternedMatrix( int rows, int cols, float offset) { - auto array = MakeUnique>(rows, cols); + auto array = absl::make_unique>(rows, cols); for (int64 row = 0; row < rows; ++row) { for (int64 col = 0; col < cols; ++col) { (*array)(row, col) = col + (row * 1000.0f) + offset; @@ -540,7 +550,7 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols, int cols_padded) { CHECK_GE(rows_padded, rows); CHECK_GE(cols_padded, cols); - auto array = MakeUnique>(rows_padded, cols_padded, 0.0); + auto array = absl::make_unique>(rows_padded, cols_padded, 0.0); for (int64 row = 0; row < rows; ++row) { for (int64 col = 0; col < cols; ++col) { (*array)(row, col) = col + (row * 1000.0f); @@ -551,16 +561,16 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols, XlaOp ClientLibraryTestBase::AddParam(const Literal& argument, XlaBuilder* builder) { - XlaOp data_handle; - arguments_.push_back(CreateParameterAndTransferLiteral( - arguments_.size(), argument, "", builder, &data_handle)); - return data_handle; + arguments_.push_back(argument.Clone()); + return Parameter(builder, /*parameter_number=*/arguments_.size() - 1, + MaybeConvertShapeToBfloat16(argument.shape()), ""); } XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder) { - return builder->ConstantLiteral( - use_bfloat16_ ? *Literal::ConvertF32ToBF16(literal) : literal); + return ConstantLiteral(builder, use_bfloat16_ + ? 
*LiteralUtil::ConvertF32ToBF16(literal) + : literal); } std::unique_ptr @@ -573,22 +583,39 @@ ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number, nullptr, builder, data_handle); } +Shape ClientLibraryTestBase::MaybeConvertShapeToBfloat16(const Shape& shape) { + if (!use_bfloat16_) { + return shape; + } + Shape new_shape = shape; + ShapeUtil::ForEachMutableSubshape(&new_shape, + [](Shape* subshape, const ShapeIndex&) { + if (subshape->element_type() == F32) { + subshape->set_element_type(BF16); + } + }); + return new_shape; +} + +Literal ClientLibraryTestBase::MaybeConvertLiteralToBfloat16( + const Literal& literal) { + if (use_bfloat16_) { + return std::move(*LiteralUtil::ConvertF32ToBF16(literal)); + } + return literal.Clone(); +} + std::unique_ptr ClientLibraryTestBase::CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, const DeviceHandle* device_handle, XlaBuilder* builder, XlaOp* data_handle) { - const Literal* param_literal = &literal; - std::unique_ptr converted_literal; - if (use_bfloat16_) { - converted_literal = Literal::ConvertF32ToBF16(literal); - param_literal = converted_literal.get(); - } + Literal param_literal = MaybeConvertLiteralToBfloat16(literal); std::unique_ptr data = - client_->TransferToServer(*param_literal, device_handle) + client_->TransferToServer(param_literal, device_handle) .ConsumeValueOrDie(); *data_handle = - builder->Parameter(parameter_number, param_literal->shape(), name); + Parameter(builder, parameter_number, param_literal.shape(), name); return data; } diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 0499fec5898a42affa0e0a712dee10187355c13e..22dfdfb0e4c67cc06fa748177c75cf35572196c8 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -21,22 +21,23 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/bitmap.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -48,8 +49,8 @@ namespace xla { // use_bfloat16_params with that value. Returns the result. 
template std::vector ExpandUseBfloat16( - tensorflow::gtl::ArraySlice use_bfloat16_params, - tensorflow::gtl::ArraySlice specs) { + absl::Span use_bfloat16_params, + absl::Span specs) { std::vector expanded; for (bool use_bfloat16 : use_bfloat16_params) { for (const auto& spec : specs) { @@ -73,8 +74,9 @@ class ClientLibraryTestBase : public ::testing::Test { string TestName() const; void SetFastMathDisabled(bool disabled) { - execution_options_.mutable_debug_options()->set_xla_enable_fast_math( - !disabled); + auto* opts = execution_options_.mutable_debug_options(); + opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_gpu_enable_fast_math(!disabled); } void SetSeed(uint64 seed) { execution_options_.set_seed(seed); } @@ -91,15 +93,15 @@ class ClientLibraryTestBase : public ::testing::Test { // execution options. Modify execution_options_ in your test if you want to // customize the options. StatusOr> Execute( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments); + XlaBuilder* builder, absl::Span arguments); StatusOr> ExecuteAndTransfer( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments, + XlaBuilder* builder, absl::Span arguments, const Shape* shape_with_output_layout = nullptr); StatusOr> ExecuteAndTransfer( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const Shape* shape_with_output_layout = nullptr); // This executes the computation via the reference client (which connects a @@ -107,13 +109,13 @@ class ClientLibraryTestBase : public ::testing::Test { // computation. StatusOr> ExecuteAndTransferReference( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const Shape* shape_with_output_layout = nullptr); // Run a computation and return its value as a string. If an error // occurs, then instead return the error as a string. 
string ExecuteToString(XlaBuilder* builder, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); // Convenience methods for building and running a computation, transferring // the result, and comparing it to the expected value(s). Methods are @@ -123,102 +125,98 @@ class ClientLibraryTestBase : public ::testing::Test { // for integral types without the ErrorSpec parameter. template void ComputeAndCompareR0(XlaBuilder* builder, NativeT expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); template void ComputeAndCompareR0(XlaBuilder* builder, NativeT expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, ErrorSpec error); template void ComputeAndCompareR1(XlaBuilder* builder, - tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span expected, + absl::Span arguments); template void ComputeAndCompareR1(XlaBuilder* builder, - tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span expected, + absl::Span arguments, ErrorSpec error); // As above, but uses a bitmap to hold the predicate vector to avoid // deficiencies of vector. 
void ComputeAndCompareR1(XlaBuilder* builder, const tensorflow::core::Bitmap& expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); template void ComputeAndCompareR2(XlaBuilder* builder, const Array2D& expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); template void ComputeAndCompareR2(XlaBuilder* builder, const Array2D& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, ErrorSpec error); template void ComputeAndCompareR3(XlaBuilder* builder, const Array3D& expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); template void ComputeAndCompareR3(XlaBuilder* builder, const Array3D& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, ErrorSpec error); template void ComputeAndCompareR4(XlaBuilder* builder, const Array4D& expected, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); template void ComputeAndCompareR4(XlaBuilder* builder, const Array4D& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, ErrorSpec error); // Build and run the computation and compare the result with the given // literal. shape_with_layout indicates the result layout to request when // calling Execute. - void ComputeAndCompareLiteral( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_layout = nullptr); - void ComputeAndCompareLiteral( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error, - const Shape* shape_with_layout = nullptr); + void ComputeAndCompareLiteral(XlaBuilder* builder, const Literal& expected, + absl::Span arguments, + const Shape* shape_with_layout = nullptr); + void ComputeAndCompareLiteral(XlaBuilder* builder, const Literal& expected, + absl::Span arguments, + ErrorSpec error, + const Shape* shape_with_layout = nullptr); // ComputeAndCompare variant which returns an error status. 
Status ComputeAndCompareLiteralWithStatus( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const Shape* shape_with_layout = nullptr); Status ComputeAndCompareLiteralWithStatus( XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error, + absl::Span arguments, ErrorSpec error, const Shape* shape_with_layout = nullptr); // Compare the result of the computation to a strings. In XLA strings are // represented using rank-1 U8 shapes. - void ComputeAndCompareR1U8( - XlaBuilder* builder, tensorflow::StringPiece expected, - tensorflow::gtl::ArraySlice arguments); + void ComputeAndCompareR1U8(XlaBuilder* builder, absl::string_view expected, + absl::Span arguments); // Convenience method for running a built computation, transferring the // result, and comparing it to the expected tuple literal. - void ComputeAndCompareTuple( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments); - void ComputeAndCompareTuple( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error); + void ComputeAndCompareTuple(XlaBuilder* builder, const Literal& expected, + absl::Span arguments); + void ComputeAndCompareTuple(XlaBuilder* builder, const Literal& expected, + absl::Span arguments, + ErrorSpec error); // Convenience method for running a built computation and comparing the result // with the reference result. void ComputeAndCompare(XlaBuilder* builder, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); void ComputeAndCompare(XlaBuilder* builder, - tensorflow::gtl::ArraySlice arguments, - ErrorSpec error); + absl::Span arguments, ErrorSpec error); // Create scalar operations for use in reductions. 
XlaComputation CreateScalarRelu(); @@ -284,7 +282,7 @@ class ClientLibraryTestBase : public ::testing::Test { template XlaOp AddParam(const Array& argument, XlaBuilder* builder) { - return AddParam(*Literal::CreateFromArray(argument), builder); + return AddParam(*LiteralUtil::CreateFromArray(argument), builder); } // Creates a constant instruction with the given literal. When the @@ -299,13 +297,14 @@ class ClientLibraryTestBase : public ::testing::Test { template XlaOp CreateConstantFromArray(const Array& array, XlaBuilder* builder) { - return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder); + return CreateConstantFromLiteral(*LiteralUtil::CreateFromArray(array), + builder); } // Same as CreateConstantFromArray, but for scalars. template XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) { - return CreateConstantFromLiteral(*Literal::CreateR0(value), + return CreateConstantFromLiteral(*LiteralUtil::CreateR0(value), builder); } @@ -334,7 +333,7 @@ class ClientLibraryTestBase : public ::testing::Test { // converted to bfloat16. template std::unique_ptr CreateR1Parameter( - tensorflow::gtl::ArraySlice values, int64 parameter_number, + absl::Span values, int64 parameter_number, const string& name, XlaBuilder* builder, XlaOp* data_handle); // Creates a parameter instruction that wraps the given constant array @@ -373,6 +372,13 @@ class ClientLibraryTestBase : public ::testing::Test { // The float type used in this test, BF16 or F32 according to use_bfloat16. PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; } + // Executes the computation and calculates the expected reference value using + // the reference client. Returns two literals in the order of (expected, + // actual). + StatusOr, std::unique_ptr>> + ComputeValueAndReference(XlaBuilder* builder, + absl::Span arguments); + Client* client_; Client* ref_client_; // To compute reference result. 
ExecutionOptions execution_options_; @@ -380,37 +386,34 @@ class ClientLibraryTestBase : public ::testing::Test { private: Status ComputeAndCompareLiteralWithAllOutputLayouts( const xla::XlaComputation& computation, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const std::function& verify_output); Status ComputeAndCompareLiteralWithAllInputLayouts( const xla::XlaComputation& computation, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const std::function& verify_output, const Shape* output_with_layout = nullptr); - // Executes the computation and calculates the expected reference value using - // the reference client. Returns two literals in the order of (expected, - // actual). - StatusOr, std::unique_ptr>> - ComputeValueAndReference(XlaBuilder* builder, - tensorflow::gtl::ArraySlice arguments); + // Converts an f32 shape/literal to bf16 if use_bfloat16_ is true. + Literal MaybeConvertLiteralToBfloat16(const Literal& literal); + Shape MaybeConvertShapeToBfloat16(const Shape& shape); // Whether to run tests with all float-type input/output converted to // bfloat16. bool use_bfloat16_ = false; // Arguments to be passed to the computation when it runs. 
- std::vector> arguments_; + std::vector arguments_; }; template void ClientLibraryTestBase::ComputeAndCompareR0( XlaBuilder* builder, NativeT expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { std::unique_ptr expected_literal = - Literal::CreateR0(expected); + LiteralUtil::CreateR0(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } @@ -418,7 +421,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0( template void ClientLibraryTestBase::ComputeAndCompareR0( XlaBuilder* builder, NativeT expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + absl::Span arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || @@ -426,25 +429,25 @@ void ClientLibraryTestBase::ComputeAndCompareR0( std::is_same::value, "Float or complex type required when specifying an ErrorSpec"); std::unique_ptr expected_literal = - Literal::CreateR0(expected); + LiteralUtil::CreateR0(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments, error); } template void ClientLibraryTestBase::ComputeAndCompareR1( - XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span expected, + absl::Span arguments) { std::unique_ptr expected_literal = - Literal::CreateR1(expected); + LiteralUtil::CreateR1(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } template void ClientLibraryTestBase::ComputeAndCompareR1( - XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + XlaBuilder* builder, absl::Span expected, + absl::Span arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || @@ -452,7 +455,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1( std::is_same::value, "Float or complex 
type required when specifying an ErrorSpec"); std::unique_ptr expected_literal = - Literal::CreateR1(expected); + LiteralUtil::CreateR1(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments, error); } @@ -460,9 +463,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1( template void ClientLibraryTestBase::ComputeAndCompareR2( XlaBuilder* builder, const Array2D& expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { std::unique_ptr expected_literal = - Literal::CreateR2FromArray2D(expected); + LiteralUtil::CreateR2FromArray2D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } @@ -470,7 +473,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2( template void ClientLibraryTestBase::ComputeAndCompareR2( XlaBuilder* builder, const Array2D& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + absl::Span arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || @@ -478,7 +481,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2( std::is_same::value, "Float or complex type required when specifying an ErrorSpec"); std::unique_ptr expected_literal = - Literal::CreateR2FromArray2D(expected); + LiteralUtil::CreateR2FromArray2D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments, error); } @@ -486,9 +489,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2( template void ClientLibraryTestBase::ComputeAndCompareR3( XlaBuilder* builder, const Array3D& expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { std::unique_ptr expected_literal = - Literal::CreateR3FromArray3D(expected); + LiteralUtil::CreateR3FromArray3D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } @@ -496,7 +499,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3( template void 
ClientLibraryTestBase::ComputeAndCompareR3( XlaBuilder* builder, const Array3D& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + absl::Span arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || @@ -504,7 +507,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3( std::is_same::value, "Float or complex type required when specifying an ErrorSpec"); std::unique_ptr expected_literal = - Literal::CreateR3FromArray3D(expected); + LiteralUtil::CreateR3FromArray3D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments, error); } @@ -512,9 +515,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3( template void ClientLibraryTestBase::ComputeAndCompareR4( XlaBuilder* builder, const Array4D& expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { std::unique_ptr expected_literal = - Literal::CreateR4FromArray4D(expected); + LiteralUtil::CreateR4FromArray4D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments); } @@ -522,7 +525,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4( template void ClientLibraryTestBase::ComputeAndCompareR4( XlaBuilder* builder, const Array4D& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { + absl::Span arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || @@ -530,7 +533,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4( std::is_same::value, "Float or complex type required when specifying an ErrorSpec"); std::unique_ptr expected_literal = - Literal::CreateR4FromArray4D(expected); + LiteralUtil::CreateR4FromArray4D(expected); ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, arguments, error); } @@ -539,27 +542,27 @@ template std::unique_ptr ClientLibraryTestBase::CreateR0Parameter( NativeT value, int64 parameter_number, const string& name, XlaBuilder* 
builder, XlaOp* data_handle) { - std::unique_ptr literal = Literal::CreateR0(value); + std::unique_ptr literal = LiteralUtil::CreateR0(value); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = Literal::ConvertF32ToBF16(*literal); + literal = LiteralUtil::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - *data_handle = builder->Parameter(parameter_number, literal->shape(), name); + *data_handle = Parameter(builder, parameter_number, literal->shape(), name); return data; } template std::unique_ptr ClientLibraryTestBase::CreateR1Parameter( - tensorflow::gtl::ArraySlice values, int64 parameter_number, + absl::Span values, int64 parameter_number, const string& name, XlaBuilder* builder, XlaOp* data_handle) { - std::unique_ptr literal = Literal::CreateR1(values); + std::unique_ptr literal = LiteralUtil::CreateR1(values); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = Literal::ConvertF32ToBF16(*literal); + literal = LiteralUtil::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - *data_handle = builder->Parameter(parameter_number, literal->shape(), name); + *data_handle = Parameter(builder, parameter_number, literal->shape(), name); return data; } @@ -567,13 +570,13 @@ template std::unique_ptr ClientLibraryTestBase::CreateR2Parameter( const Array2D& array_2d, int64 parameter_number, const string& name, XlaBuilder* builder, XlaOp* data_handle) { - std::unique_ptr literal = Literal::CreateR2FromArray2D(array_2d); + std::unique_ptr literal = LiteralUtil::CreateR2FromArray2D(array_2d); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = Literal::ConvertF32ToBF16(*literal); + literal = LiteralUtil::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - *data_handle = builder->Parameter(parameter_number, literal->shape(), 
name); + *data_handle = Parameter(builder, parameter_number, literal->shape(), name); return data; } @@ -581,13 +584,13 @@ template std::unique_ptr ClientLibraryTestBase::CreateR3Parameter( const Array3D& array_3d, int64 parameter_number, const string& name, XlaBuilder* builder, XlaOp* data_handle) { - std::unique_ptr literal = Literal::CreateR3FromArray3D(array_3d); + std::unique_ptr literal = LiteralUtil::CreateR3FromArray3D(array_3d); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = Literal::ConvertF32ToBF16(*literal); + literal = LiteralUtil::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - *data_handle = builder->Parameter(parameter_number, literal->shape(), name); + *data_handle = Parameter(builder, parameter_number, literal->shape(), name); return data; } @@ -606,7 +609,7 @@ template std::unique_ptr> ClientLibraryTestBase::CreatePseudorandomR2( const int rows, const int cols, NativeT min_value, NativeT max_value, uint32 seed) { - auto result = MakeUnique>(rows, cols); + auto result = absl::make_unique>(rows, cols); PseudorandomGenerator generator(min_value, max_value, seed); for (int y = 0; y < rows; ++y) { for (int x = 0; x < cols; ++x) { diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 08671cf62445826649b5c97003f998ae98a59d97..c898dacf489db97223e2918414daf5de88bece64 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -18,8 +18,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -43,8 +43,8 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) { std::vector> layouts = {{0, 1}, {1, 0}}; for (const std::vector& execute_layout : layouts) { for (const std::vector& transfer_layout : layouts) { - b.Add(b.ConstantR2({{1, 2}, {3, 4}}), - b.ConstantR2({{10, 20}, {30, 40}})); + Add(ConstantR2(&b, {{1, 2}, {3, 4}}), + ConstantR2(&b, {{10, 20}, {30, 40}})); TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build()); ExecutionOptions execution_options = execution_options_; @@ -56,7 +56,7 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) { client_->Execute(computation, {}, &execution_options)); std::unique_ptr expected_literal = - Literal::CreateR2WithLayout( + LiteralUtil::CreateR2WithLayout( {{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(transfer_layout)); TF_ASSERT_OK_AND_ASSIGN( @@ -72,8 +72,8 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) { XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { XlaBuilder b(TestName()); - b.Tuple({b.ConstantR2({{1, 2}, {3, 4}}), - b.ConstantR2({{10, 20}, {30, 40}})}); + Tuple(&b, {ConstantR2(&b, {{1, 2}, {3, 4}}), + ConstantR2(&b, {{10, 20}, {30, 40}})}); TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build()); @@ -112,13 +112,13 @@ XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) { XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg; Shape shape = ShapeUtil::MakeShape(S32, {2, 2}); - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr const_arg, - client_->TransferToServer(*Literal::CreateR2({{5, 6}, {7, 8}}))); + 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr const_arg, + client_->TransferToServer( + *LiteralUtil::CreateR2({{5, 6}, {7, 8}}))); XlaBuilder b(TestName() + ".add"); - b.Add(b.Parameter(0, shape, "param_0"), - b.ConstantR2({{1, 2}, {3, 4}})); + Add(Parameter(&b, 0, shape, "param_0"), + ConstantR2(&b, {{1, 2}, {3, 4}})); TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build()); // We can't really test parallel execution on CPU since all of the cores in a @@ -136,7 +136,7 @@ XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) { TF_ASSERT_OK_AND_ASSIGN(auto results, client_->ExecuteParallel(computation_instances)); - auto expected_result = Literal::CreateR2({{6, 8}, {10, 12}}); + auto expected_result = LiteralUtil::CreateR2({{6, 8}, {10, 12}}); TF_ASSERT_OK_AND_ASSIGN( auto result_literal, diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc index 50a006964869b3e5dce431d441f7cd81af9df910..03d56964998f9abea21d6f82dee8faf86f9fe1d4 100644 --- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc +++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc @@ -17,11 +17,12 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -30,7 +31,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -38,10 +38,9 @@ namespace { class CompilationCacheTest : public ClientLibraryTestBase { public: - void ExecuteComputationR0F32( - const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, float expected_result, - bool expect_cache_hit) { + void ExecuteComputationR0F32(const XlaComputation& computation, + absl::Span arguments, + float expected_result, bool expect_cache_hit) { ExecutionProfile execution_profile; std::unique_ptr result = client_ @@ -50,13 +49,13 @@ class CompilationCacheTest : public ClientLibraryTestBase { &execution_profile) .ConsumeValueOrDie(); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR0(expected_result), *result, error_spec_)); + *LiteralUtil::CreateR0(expected_result), *result, error_spec_)); EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit()); } void ExecuteComputationR2F32( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, std::initializer_list> expected_result, bool expect_cache_hit) { ExecutionProfile execution_profile; @@ -67,7 +66,7 @@ class CompilationCacheTest : public ClientLibraryTestBase { std::unique_ptr result = client_->Transfer(*data_handle).ConsumeValueOrDie(); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2(expected_result), *result, error_spec_)); + *LiteralUtil::CreateR2(expected_result), *result, error_spec_)); EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit()); } @@ -77,7 +76,7 @@ class CompilationCacheTest : public ClientLibraryTestBase { // TODO(b/74197823): Disabled because there is no cache in the new design. 
XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledMultipleTimes) { XlaBuilder builder(TestName()); - builder.Neg(builder.ConstantR0(42.0)); + Neg(ConstantR0(&builder, 42.0)); XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false); @@ -89,17 +88,17 @@ XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledMultipleTimes) { XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledWithDifferentParameters) { std::unique_ptr data_42 = - client_->TransferToServer(*Literal::CreateR0(42.0f)) + client_->TransferToServer(*LiteralUtil::CreateR0(42.0f)) .ConsumeValueOrDie(); std::unique_ptr data_123 = - client_->TransferToServer(*Literal::CreateR0(123.0f)) + client_->TransferToServer(*LiteralUtil::CreateR0(123.0f)) .ConsumeValueOrDie(); std::unique_ptr data_456 = - client_->TransferToServer(*Literal::CreateR0(456.0f)) + client_->TransferToServer(*LiteralUtil::CreateR0(456.0f)) .ConsumeValueOrDie(); XlaBuilder builder(TestName()); - builder.Neg(builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param")); + Neg(Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "param")); XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation, {data_42.get()}, -42.0, @@ -115,16 +114,16 @@ XLA_TEST_F(CompilationCacheTest, // TODO(b/74197823): Disabled because there is no cache in the new design. 
XLA_TEST_F(CompilationCacheTest, DISABLED_MultipleComputations) { XlaBuilder builder_neg(TestName() + "_neg"); - builder_neg.Neg(builder_neg.ConstantR0(42.0)); + Neg(ConstantR0(&builder_neg, 42.0)); XlaComputation computation_neg = builder_neg.Build().ConsumeValueOrDie(); XlaBuilder builder_exp(TestName() + "_exp"); - builder_exp.Exp(builder_exp.ConstantR0(1.0)); + Exp(ConstantR0(&builder_exp, 1.0)); XlaComputation computation_exp = builder_exp.Build().ConsumeValueOrDie(); XlaBuilder builder_add(TestName() + "_add"); - builder_add.Add(builder_add.ConstantR0(2.0), - builder_add.ConstantR0(3.0)); + Add(ConstantR0(&builder_add, 2.0), + ConstantR0(&builder_add, 3.0)); XlaComputation computation_add = builder_add.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation_neg, {}, -42.0, @@ -143,18 +142,18 @@ XLA_TEST_F(CompilationCacheTest, DISABLED_DifferentParameterLayouts) { // layouts. Use these arrays as parameters to a simple computation. If the // layout of the array changes then computation should be recompiled (cache // miss). 
- auto rowmaj_array = Literal::CreateR2WithLayout( + auto rowmaj_array = LiteralUtil::CreateR2WithLayout( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({1, 0})); auto rowmaj_handle = client_->TransferToServer(*rowmaj_array).ConsumeValueOrDie(); - auto colmaj_array = Literal::CreateR2WithLayout( + auto colmaj_array = LiteralUtil::CreateR2WithLayout( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})); auto colmaj_handle = client_->TransferToServer(*colmaj_array).ConsumeValueOrDie(); XlaBuilder builder(TestName()); - builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "param0"); XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR2F32(computation, {colmaj_handle.get()}, diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index ba22530f1cfee56337f862c25122d399dbf0f1e4..8226b6de3f780197bc0f1145b617dba99803927f 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -17,12 +17,13 @@ limitations under the License. #include #include +#include "absl/strings/match.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -32,7 +33,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -99,7 +99,7 @@ TEST_F(ComputeConstantTest, ScalarInt32Literal) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); - auto computation = b.ConstantR0(42); + auto computation = ConstantR0(&b, 42); EXPECT_TRUE(IsConstant(computation, &b)); auto value = ComputeConstantScalar(client, computation, &b); @@ -113,7 +113,7 @@ TEST_F(ComputeConstantTest, ScalarFloatAdd) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto computation = - b.Add(b.ConstantR0(42.5f), b.ConstantR0(1.5f)); + Add(ConstantR0(&b, 42.5f), ConstantR0(&b, 1.5f)); EXPECT_TRUE(IsConstant(computation, &b)); auto value = ComputeConstantScalar(client, computation, &b); @@ -127,8 +127,8 @@ TEST_F(ComputeConstantTest, ScalarRng) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto computation = - b.RngUniform(b.ConstantR0(1.1f), b.ConstantR0(2.1f), - ShapeUtil::MakeShape(F32, {})); + RngUniform(ConstantR0(&b, 1.1f), ConstantR0(&b, 2.1f), + ShapeUtil::MakeShape(F32, {})); EXPECT_FALSE(IsConstant(computation, &b)); auto value = ComputeConstantScalar(client, computation, &b); @@ -141,12 +141,12 @@ TEST_F(ComputeConstantTest, DirectParamMissing) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); - auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"); + auto computation = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "param"); EXPECT_FALSE(IsConstant(computation, &b)); auto value = ComputeConstantScalar(client, computation, &b); - EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(), - 
"depends on a parameter")) + EXPECT_TRUE( + absl::StrContains(value.status().ToString(), "depends on a parameter")) << value.status(); } } @@ -156,13 +156,13 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto computation = - b.Add(b.ConstantR0(1.0f), - b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param")); + Add(ConstantR0(&b, 1.0f), + Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "param")); EXPECT_FALSE(IsConstant(computation, &b)); auto value = ComputeConstantScalar(client, computation, &b); - EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(), - "depends on a parameter")) + EXPECT_TRUE( + absl::StrContains(value.status().ToString(), "depends on a parameter")) << value.status(); } } @@ -174,18 +174,18 @@ TEST_F(ComputeConstantTest, UnrelatedParam) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); - auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0"); + auto param_a = Parameter(&b, 10, ShapeUtil::MakeShape(F32, {}), "param0"); auto constant_4 = - b.Add(b.ConstantR0(2.5f), b.ConstantR0(1.5f)); - auto not_constant_a = b.Add(constant_4, param_a); + Add(ConstantR0(&b, 2.5f), ConstantR0(&b, 1.5f)); + auto not_constant_a = Add(constant_4, param_a); - auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1"); + auto param_b = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "param1"); auto constant_9 = - b.Mul(b.ConstantR0(2.0f), b.ConstantR0(4.5f)); - auto not_constant_b = b.Add(param_b, constant_9); + Mul(ConstantR0(&b, 2.0f), ConstantR0(&b, 4.5f)); + auto not_constant_b = Add(param_b, constant_9); - auto constant_13 = b.Add(constant_4, constant_9); - b.Add(not_constant_b, b.Add(constant_13, not_constant_a)); + auto constant_13 = Add(constant_4, constant_9); + Add(not_constant_b, Add(constant_13, not_constant_a)); EXPECT_TRUE(IsConstant(constant_13, &b)); @@ -201,13 +201,13 @@ 
TEST_F(ComputeConstantTest, NonScalarAdd) { XlaBuilder b(TestName()); auto computation = - b.Add(b.ConstantR1({1, 2}), b.ConstantR1({3, 4})); + Add(ConstantR1(&b, {1, 2}), ConstantR1(&b, {3, 4})); EXPECT_TRUE(IsConstant(computation, &b)); TF_ASSERT_OK_AND_ASSIGN(auto computed, ComputeConstantLiteral(client, computation, &b)); std::unique_ptr expected_literal = - Literal::CreateR1({4, 6}); + LiteralUtil::CreateR1({4, 6}); EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } @@ -216,12 +216,12 @@ TEST_F(ComputeConstantTest, IntegerDivide) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); - auto computation = b.Div(b.ConstantR0(15), b.ConstantR0(3)); + auto computation = Div(ConstantR0(&b, 15), ConstantR0(&b, 3)); EXPECT_TRUE(IsConstant(computation, &b)); TF_ASSERT_OK_AND_ASSIGN(auto computed, ComputeConstantLiteral(client, computation, &b)); - std::unique_ptr expected_literal = Literal::CreateR0(5); + std::unique_ptr expected_literal = LiteralUtil::CreateR0(5); EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } @@ -237,13 +237,13 @@ XLA_TEST_F(ComputeConstantTest, Layout) { TF_ASSERT_OK_AND_ASSIGN( auto computed, ComputeConstantLiteral( client, - b.Add(b.ConstantR2({{1, 2}, {3, 4}}), - b.ConstantR2({{10, 20}, {30, 40}})), + Add(ConstantR2(&b, {{1, 2}, {3, 4}}), + ConstantR2(&b, {{10, 20}, {30, 40}})), &b, &layout_proto)); std::unique_ptr expected_literal = - Literal::CreateR2WithLayout({{11, 22}, {33, 44}}, - LayoutUtil::MakeLayout(layout)); + LiteralUtil::CreateR2WithLayout( + {{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(layout)); ASSERT_TRUE(LiteralTestUtil::EqualShapesAndLayouts( expected_literal->shape(), computed->shape())); EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc index 
a4c8a83eb15f7cc279b6c8f1bf1394c0afb9f7cf..be017477d84eb9faf5aa79dcdf54d6b6aaf6fd8e 100644 --- a/tensorflow/compiler/xla/tests/concat_test.cc +++ b/tensorflow/compiler/xla/tests/concat_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -39,7 +39,7 @@ using ::testing::HasSubstr; // Concatenate expects at least one argument. XLA_TEST_F(ConcatTest, Concat_Nothing) { XlaBuilder builder(TestName()); - builder.ConcatInDim({}, 0); + ConcatInDim(&builder, {}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); EXPECT_THAT(computation_status.status().ToString(), @@ -49,8 +49,8 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) { // Concatenate with one argument works. 
XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0, 64.0}); - builder.ConcatInDim({a}, 0); + auto a = ConstantR1(&builder, {42.0, 64.0}); + ConcatInDim(&builder, {a}, 0); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -58,8 +58,8 @@ XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) { XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.ConcatInDim({a}, 0); + auto a = ConstantR1(&builder, {}); + ConcatInDim(&builder, {a}, 0); std::vector expected = {}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -69,9 +69,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) { // to concatenate on. XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR0(42.0); - auto b = builder.ConstantR0(64.0); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR0(&builder, 42.0); + auto b = ConstantR0(&builder, 64.0); + ConcatInDim(&builder, {a, b}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); EXPECT_THAT(computation_status.status().ToString(), @@ -80,9 +80,9 @@ XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) { XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({}); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR1(&builder, {}); + auto b = ConstantR1(&builder, {}); + ConcatInDim(&builder, {a, b}, 0); std::vector expected = {}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -90,9 +90,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) { XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - auto b = builder.ConstantR1({256.0}); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR1(&builder, {}); + auto b 
= ConstantR1(&builder, {256.0}); + ConcatInDim(&builder, {a, b}, 0); std::vector expected = {256}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -100,9 +100,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) { XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0, 64.0}); - auto b = builder.ConstantR1({}); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR1(&builder, {42.0, 64.0}); + auto b = ConstantR1(&builder, {}); + ConcatInDim(&builder, {a, b}, 0); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -110,9 +110,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) { XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0, 64.0}); - auto b = builder.ConstantR1({256.0}); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR1(&builder, {42.0, 64.0}); + auto b = ConstantR1(&builder, {256.0}); + ConcatInDim(&builder, {a, b}, 0); std::vector expected = {42, 64, 256}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -130,9 +130,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR1(lhs); - auto b = builder.ConstantR1(rhs); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR1(&builder, lhs); + auto b = ConstantR1(&builder, rhs); + ConcatInDim(&builder, {a, b}, 0); ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); } @@ -140,9 +140,9 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) { XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) { for (int dim : {0, 1}) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2FromArray2D(Array2D(0, 0)); - auto b = builder.ConstantR2FromArray2D(Array2D(0, 0)); - builder.ConcatInDim({a, b}, dim); + auto a = ConstantR2FromArray2D(&builder, Array2D(0, 0)); + auto b = ConstantR2FromArray2D(&builder, Array2D(0, 0)); + 
ConcatInDim(&builder, {a, b}, dim); ComputeAndCompareR2(&builder, Array2D(0, 0), {}, ErrorSpec(0.0001)); @@ -153,9 +153,9 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim0) { XlaBuilder builder(TestName()); auto a_array = CreatePatternedMatrix(1, 1); auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0); - auto a = builder.ConstantR2FromArray2D(*a_array); - auto b = builder.ConstantR2FromArray2D(*b_array); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR2FromArray2D(&builder, *a_array); + auto b = ConstantR2FromArray2D(&builder, *b_array); + ConcatInDim(&builder, {a, b}, 0); Array2D expected({ {0}, @@ -168,9 +168,9 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) { XlaBuilder builder(TestName()); auto a_array = CreatePatternedMatrix(1, 1); auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0); - auto a = builder.ConstantR2FromArray2D(*a_array); - auto b = builder.ConstantR2FromArray2D(*b_array); - builder.ConcatInDim({a, b}, 1); + auto a = ConstantR2FromArray2D(&builder, *a_array); + auto b = ConstantR2FromArray2D(&builder, *b_array); + ConcatInDim(&builder, {a, b}, 1); Array2D expected({ {0, 64}, @@ -181,9 +181,9 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) { XLA_TEST_F(ConcatTest, Concat2x0With2x5) { XlaBuilder builder(TestName()); auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0); - auto a = builder.ConstantR2FromArray2D(Array2D(2, 0)); - auto b = builder.ConstantR2FromArray2D(*b_array); - builder.ConcatInDim({a, b}, 1); + auto a = ConstantR2FromArray2D(&builder, Array2D(2, 0)); + auto b = ConstantR2FromArray2D(&builder, *b_array); + ConcatInDim(&builder, {a, b}, 1); ComputeAndCompareR2(&builder, *b_array, {}, ErrorSpec(0.0001)); } @@ -192,9 +192,9 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) { XlaBuilder builder(TestName()); auto a_array = CreatePatternedMatrix(2, 3); auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0); - auto a = builder.ConstantR2FromArray2D(*a_array); - auto b = 
builder.ConstantR2FromArray2D(*b_array); - builder.ConcatInDim({a, b}, 1); + auto a = ConstantR2FromArray2D(&builder, *a_array); + auto b = ConstantR2FromArray2D(&builder, *b_array); + ConcatInDim(&builder, {a, b}, 1); Array2D expected({ {0, 1, 2, 64, 65, 66, 67, 68}, @@ -206,9 +206,9 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) { XLA_TEST_F(ConcatTest, Concat3x2With0x2) { XlaBuilder builder(TestName()); auto a_array = CreatePatternedMatrix(3, 2); - auto a = builder.ConstantR2FromArray2D(*a_array); - auto b = builder.ConstantR2FromArray2D(Array2D(0, 2)); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR2FromArray2D(&builder, *a_array); + auto b = ConstantR2FromArray2D(&builder, Array2D(0, 2)); + ConcatInDim(&builder, {a, b}, 0); ComputeAndCompareR2(&builder, *a_array, {}, ErrorSpec(0.0001)); } @@ -217,9 +217,9 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) { XlaBuilder builder(TestName()); auto a_array = CreatePatternedMatrix(3, 2); auto b_array = CreatePatternedMatrix(5, 2, /*offset=*/64.0); - auto a = builder.ConstantR2FromArray2D(*a_array); - auto b = builder.ConstantR2FromArray2D(*b_array); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR2FromArray2D(&builder, *a_array); + auto b = ConstantR2FromArray2D(&builder, *b_array); + ConcatInDim(&builder, {a, b}, 0); Array2D expected({ {0, 1}, @@ -236,9 +236,9 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) { XLA_TEST_F(ConcatTest, Concat_R3_3x0x2_3x0x1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR3FromArray3D(Array3D(3, 0, 2)); - auto b = builder.ConstantR3FromArray3D(Array3D(3, 0, 1)); - builder.ConcatInDim({a, b}, 2); + auto a = ConstantR3FromArray3D(&builder, Array3D(3, 0, 2)); + auto b = ConstantR3FromArray3D(&builder, Array3D(3, 0, 1)); + ConcatInDim(&builder, {a, b}, 2); ComputeAndCompareR3(&builder, Array3D(3, 0, 3), {}, ErrorSpec(0.0001)); } @@ -257,9 +257,9 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) { {{7}}, {{8}}, }); - auto a = builder.ConstantR3FromArray3D(a_array); - auto b = 
builder.ConstantR3FromArray3D(b_array); - builder.ConcatInDim({a, b}, 2); + auto a = ConstantR3FromArray3D(&builder, a_array); + auto b = ConstantR3FromArray3D(&builder, b_array); + ConcatInDim(&builder, {a, b}, 2); Array3D expected({ {{0, 1, 6}}, @@ -271,10 +271,10 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) { XLA_TEST_F(ConcatTest, Concat_R1_1x1_1x1_1x1) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0}); - auto b = builder.ConstantR1({64.0}); - auto c = builder.ConstantR1({256.0}); - builder.ConcatInDim({a, b, c}, 0); + auto a = ConstantR1(&builder, {42.0}); + auto b = ConstantR1(&builder, {64.0}); + auto c = ConstantR1(&builder, {256.0}); + ConcatInDim(&builder, {a, b, c}, 0); std::vector expected = {42, 64, 256}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -300,10 +300,10 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) { {{7}}, {{11}}, }); - auto a = builder.ConstantR3FromArray3D(a_array); - auto b = builder.ConstantR3FromArray3D(b_array); - auto c = builder.ConstantR3FromArray3D(c_array); - builder.ConcatInDim({a, b, c}, 2); + auto a = ConstantR3FromArray3D(&builder, a_array); + auto b = ConstantR3FromArray3D(&builder, b_array); + auto c = ConstantR3FromArray3D(&builder, c_array); + ConcatInDim(&builder, {a, b, c}, 2); Array3D expected({ {{0, 1, 2, 3}}, @@ -315,11 +315,11 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) { XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0}); - auto b = builder.ConstantR1({64.0}); - auto c = builder.ConstantR1({256.0}); + auto a = ConstantR1(&builder, {42.0}); + auto b = ConstantR1(&builder, {64.0}); + auto c = ConstantR1(&builder, {256.0}); // concatenated = (a concat b) concat c - builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0); + ConcatInDim(&builder, {ConcatInDim(&builder, {a, b}, 0), c}, 0); std::vector expected = {42, 64, 256}; ComputeAndCompareR1(&builder, expected, {}, 
ErrorSpec(0.0001)); @@ -327,11 +327,11 @@ XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) { XLA_TEST_F(ConcatTest, DoubleConcatRightAssociative) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0}); - auto b = builder.ConstantR1({64.0}); - auto c = builder.ConstantR1({256.0}); + auto a = ConstantR1(&builder, {42.0}); + auto b = ConstantR1(&builder, {64.0}); + auto c = ConstantR1(&builder, {256.0}); // concatenated = a concat (b concat c) - builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0); + ConcatInDim(&builder, {a, ConcatInDim(&builder, {b, c}, 0)}, 0); std::vector expected = {42, 64, 256}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -346,9 +346,9 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim0) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR2FromArray2D(lhs); - auto b = builder.ConstantR2FromArray2D(rhs); - builder.ConcatInDim({a, b}, 0); + auto a = ConstantR2FromArray2D(&builder, lhs); + auto b = ConstantR2FromArray2D(&builder, rhs); + ConcatInDim(&builder, {a, b}, 0); Array2D expected(2, 1024); for (int i = 0; i < 1024; ++i) { @@ -367,9 +367,9 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim1) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR2FromArray2D(lhs); - auto b = builder.ConstantR2FromArray2D(rhs); - builder.ConcatInDim({a, b}, 1); + auto a = ConstantR2FromArray2D(&builder, lhs); + auto b = ConstantR2FromArray2D(&builder, rhs); + ConcatInDim(&builder, {a, b}, 1); Array2D expected(1, 2048); for (int i = 0; i < 1024; ++i) { @@ -392,9 +392,9 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) { } XlaBuilder builder(TestName()); - auto a = builder.ConstantR2FromArray2D(lhs); - auto b = builder.ConstantR2FromArray2D(rhs); - builder.ConcatInDim({a, b}, 1); + auto a = ConstantR2FromArray2D(&builder, lhs); + auto b = ConstantR2FromArray2D(&builder, rhs); + ConcatInDim(&builder, {a, b}, 1); Array2D expected(64, 66); for (int i0 = 0; i0 < 64; ++i0) { @@ 
-410,22 +410,37 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) { XlaBuilder builder(TestName()); auto opaque_shape = ShapeUtil::MakeOpaqueShape(); auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1}); - auto x = builder.Parameter(0, r1f32, "x"); - auto y = builder.Parameter(1, opaque_shape, "y"); - builder.ConcatInDim({x, y}, 0); + auto x = Parameter(&builder, 0, r1f32, "x"); + auto y = Parameter(&builder, 1, opaque_shape, "y"); + ConcatInDim(&builder, {x, y}, 0); StatusOr computation_status = builder.Build(); ASSERT_FALSE(computation_status.ok()); EXPECT_THAT( computation_status.status().ToString(), - HasSubstr("Expected non-opaque argument for operand of concatenation")); + HasSubstr("Expected array argument for operand of concatenation")); +} + +// Show that we can't concatenate with tokens. +XLA_TEST_F(ConcatTest, CannotConcatTokens) { + XlaBuilder builder(TestName()); + auto token_shape = ShapeUtil::MakeTokenShape(); + auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1}); + auto x = Parameter(&builder, 0, r1f32, "x"); + auto y = Parameter(&builder, 1, token_shape, "y"); + ConcatInDim(&builder, {x, y}, 0); + StatusOr computation_status = builder.Build(); + ASSERT_FALSE(computation_status.ok()); + EXPECT_THAT( + computation_status.status().ToString(), + HasSubstr("Expected array argument for operand of concatenation")); } XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) { XlaBuilder builder(TestName()); - auto p0 = builder.ConstantR1({true}); - auto p1 = builder.ConstantR1({false}); - auto p2 = builder.ConstantR1({true}); - builder.ConcatInDim({p0, p1, p2}, 0); + auto p0 = ConstantR1(&builder, {true}); + auto p1 = ConstantR1(&builder, {false}); + auto p2 = ConstantR1(&builder, {true}); + ConcatInDim(&builder, {p0, p1, p2}, 0); bool expected[] = {true, false, true}; ComputeAndCompareR1(&builder, expected, {}); @@ -433,11 +448,11 @@ XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) { XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) { XlaBuilder 
builder(TestName()); - auto a0 = builder.ConstantR1({1}); - auto a1 = builder.ConstantR1({2, 3}); - auto a2 = builder.ConstantR1({4, 5, 6}); - auto a3 = builder.ConstantR1({7, 8, 9, 10}); - builder.ConcatInDim({a0, a1, a2, a3}, 0); + auto a0 = ConstantR1(&builder, {1}); + auto a1 = ConstantR1(&builder, {2, 3}); + auto a2 = ConstantR1(&builder, {4, 5, 6}); + auto a3 = ConstantR1(&builder, {7, 8, 9, 10}); + ConcatInDim(&builder, {a0, a1, a2, a3}, 0); std::vector expected(10); std::iota(expected.begin(), expected.end(), 1); @@ -472,7 +487,7 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) { auto p1 = CreateR3Parameter(arr1, /*parameter_number=*/1, "p1", &builder, &h1); - builder.ConcatInDim({h0, h1}, 2); + ConcatInDim(&builder, {h0, h1}, 2); ComputeAndCompareR3(&builder, expected, {p0.get(), p1.get()}); } @@ -499,9 +514,9 @@ TEST_P(ConcatR2BinaryTest, DoIt) { rhs.FillUnique(1000); XlaBuilder builder(TestName()); - auto a0 = builder.ConstantR2FromArray2D(lhs); - auto a1 = builder.ConstantR2FromArray2D(rhs); - builder.ConcatInDim({a0, a1}, spec.concat_dimension); + auto a0 = ConstantR2FromArray2D(&builder, lhs); + auto a1 = ConstantR2FromArray2D(&builder, rhs); + ConcatInDim(&builder, {a0, a1}, spec.concat_dimension); std::unique_ptr> expected = ReferenceUtil::Concat2D(lhs, rhs, spec.concat_dimension); @@ -519,19 +534,19 @@ TEST_P(ConcatR2BinaryTest, DoIt) { // concat XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) { auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); - auto x_literal = Literal::CreateR0(2.f); - auto y_literal = Literal::CreateR0(3.f); + auto x_literal = LiteralUtil::CreateR0(2.f); + auto y_literal = LiteralUtil::CreateR0(3.f); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, f32_scalar, "x"); - auto y = builder.Parameter(1, f32_scalar, "y"); - auto mul = builder.Mul(x, y); - auto add1 = 
builder.Add(mul, builder.ConstantR1({1.f, 2.f})); - auto add2 = builder.Add(mul, builder.ConstantR1({3.f, 4.f})); - auto add3 = builder.Add(mul, builder.ConstantR1({5.f, 6.f})); - builder.ConcatInDim({add1, add2, add3}, /*dimension=*/0); + auto x = Parameter(&builder, 0, f32_scalar, "x"); + auto y = Parameter(&builder, 1, f32_scalar, "y"); + auto mul = Mul(x, y); + auto add1 = Add(mul, ConstantR1(&builder, {1.f, 2.f})); + auto add2 = Add(mul, ConstantR1(&builder, {3.f, 4.f})); + auto add3 = Add(mul, ConstantR1(&builder, {5.f, 6.f})); + ConcatInDim(&builder, {add1, add2, add3}, /*dimension=*/0); ComputeAndCompareR1(&builder, {7., 8., 9., 10., 11., 12.}, {x_data.get(), y_data.get()}, ErrorSpec(1e-4)); @@ -541,21 +556,21 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) { // produces the correct result in rank 1. XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) { auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); - auto x_literal = Literal::CreateR1({2.0f, 3.0f, 5.0f, 6.0f}); - auto y_literal = Literal::CreateR0(1.5f); - auto z_literal = Literal::CreateR0(5.5f); + auto x_literal = LiteralUtil::CreateR1({2.0f, 3.0f, 5.0f, 6.0f}); + auto y_literal = LiteralUtil::CreateR0(1.5f); + auto z_literal = LiteralUtil::CreateR0(5.5f); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie(); XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, x_literal->shape(), "x"); - auto y = builder.Parameter(1, f32_scalar, "y"); - auto z = builder.Parameter(2, f32_scalar, "z"); - auto bcast = builder.Broadcast(y, {5}); - auto bcast2 = builder.Broadcast(z, {3}); - auto concat = builder.ConcatInDim({bcast, x}, /*dimension=*/0); - builder.ConcatInDim({concat, bcast2}, /*dimension=*/0); + auto x = Parameter(&builder, 0, x_literal->shape(), "x"); + auto y = Parameter(&builder, 1, f32_scalar, "y"); + auto z = 
Parameter(&builder, 2, f32_scalar, "z"); + auto bcast = Broadcast(y, {5}); + auto bcast2 = Broadcast(z, {3}); + auto concat = ConcatInDim(&builder, {bcast, x}, /*dimension=*/0); + ConcatInDim(&builder, {concat, bcast2}, /*dimension=*/0); ComputeAndCompareR1( &builder, @@ -569,21 +584,21 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) { XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) { auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); Array3D x3d(3, 5, 7, 3.14f); - auto x_literal = Literal::CreateR3FromArray3D(x3d); - auto y_literal = Literal::CreateR0(1.5f); - auto z_literal = Literal::CreateR0(5.5f); + auto x_literal = LiteralUtil::CreateR3FromArray3D(x3d); + auto y_literal = LiteralUtil::CreateR0(1.5f); + auto z_literal = LiteralUtil::CreateR0(5.5f); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie(); auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie(); XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, x_literal->shape(), "x"); - auto y = builder.Parameter(1, f32_scalar, "y"); - auto z = builder.Parameter(2, f32_scalar, "y"); - auto y_bcast = builder.Broadcast(y, {1, 5, 7}); - auto z_bcast = builder.Broadcast(z, {4, 1, 7}); - auto concat = builder.ConcatInDim({y_bcast, x}, /*dimension=*/0); - builder.ConcatInDim({concat, z_bcast}, /*dimension=*/1); + auto x = Parameter(&builder, 0, x_literal->shape(), "x"); + auto y = Parameter(&builder, 1, f32_scalar, "y"); + auto z = Parameter(&builder, 2, f32_scalar, "y"); + auto y_bcast = Broadcast(y, {1, 5, 7}); + auto z_bcast = Broadcast(z, {4, 1, 7}); + auto concat = ConcatInDim(&builder, {y_bcast, x}, /*dimension=*/0); + ConcatInDim(&builder, {concat, z_bcast}, /*dimension=*/1); Array3D y_bcast3d(1, 5, 7, 1.5f); Array3D z_bcast3d(4, 1, 7, 5.5f); auto concat0 = ReferenceUtil::Concat3D(y_bcast3d, x3d, 0); diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc 
b/tensorflow/compiler/xla/tests/conditional_test.cc index 7ff6706935740c7d76ee5cd03eae292386760397..25d10ab00af11b8ebb8147917e7cdbb21f9a42c4 100644 --- a/tensorflow/compiler/xla/tests/conditional_test.cc +++ b/tensorflow/compiler/xla/tests/conditional_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -26,8 +26,8 @@ class ConditionalOpTest : public ClientLibraryTestBase { protected: XlaComputation CreateR0ConstantComputation(float value) { XlaBuilder builder("Constant"); - builder.Parameter(0, empty_tuple_, "tuple"); - builder.ConstantR0(value); + Parameter(&builder, 0, empty_tuple_, "tuple"); + ConstantR0(&builder, value); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -35,7 +35,7 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateR0IdentityComputation() { XlaBuilder builder("Identity"); - builder.Parameter(0, r0f32_, "x"); + Parameter(&builder, 0, r0f32_, "x"); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -43,8 +43,8 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateCeilComputation(const Shape& shape) { XlaBuilder builder("Ceil"); - auto param = builder.Parameter(0, shape, "param"); - builder.Ceil(param); + auto param = Parameter(&builder, 0, shape, "param"); + Ceil(param); 
auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -60,8 +60,8 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateFloorComputation(const Shape& shape) { XlaBuilder builder("Floor"); - auto param = builder.Parameter(0, shape, "param"); - builder.Floor(param); + auto param = Parameter(&builder, 0, shape, "param"); + Floor(param); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -78,12 +78,12 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateTupleCeilComputation(const string& computation_name, const Shape& tuple_shape) { XlaBuilder builder(computation_name); - auto tuple = builder.Parameter(0, tuple_shape, "tuple"); - auto x = builder.GetTupleElement(tuple, 0); - auto y = builder.GetTupleElement(tuple, 1); - auto x_ceil = builder.Ceil(x); - auto y_ceil = builder.Ceil(y); - builder.Tuple({x_ceil, y_ceil}); + auto tuple = Parameter(&builder, 0, tuple_shape, "tuple"); + auto x = GetTupleElement(tuple, 0); + auto y = GetTupleElement(tuple, 1); + auto x_ceil = Ceil(x); + auto y_ceil = Ceil(y); + Tuple(&builder, {x_ceil, y_ceil}); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -100,12 +100,12 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateTupleFloorComputation(const string& computation_name, const Shape& tuple_shape) { XlaBuilder builder(computation_name); - auto tuple = builder.Parameter(0, tuple_shape, "tuple"); - auto x = builder.GetTupleElement(tuple, 0); - auto y = builder.GetTupleElement(tuple, 1); - auto x_floor = builder.Floor(x); - auto y_floor = builder.Floor(y); - builder.Tuple({x_floor, y_floor}); + auto tuple = Parameter(&builder, 0, tuple_shape, "tuple"); + auto x = GetTupleElement(tuple, 0); + auto y = GetTupleElement(tuple, 1); + auto x_floor = Floor(x); + auto 
y_floor = Floor(y); + Tuple(&builder, {x_floor, y_floor}); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -122,10 +122,10 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateTupleAddComputation(const string& computation_name, const Shape& tuple_shape) { XlaBuilder builder(computation_name); - auto tuple = builder.Parameter(0, tuple_shape, "tuple"); - auto x = builder.GetTupleElement(tuple, 0); - auto y = builder.GetTupleElement(tuple, 1); - builder.Add(x, y); + auto tuple = Parameter(&builder, 0, tuple_shape, "tuple"); + auto x = GetTupleElement(tuple, 0); + auto y = GetTupleElement(tuple, 1); + Add(x, y); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -142,10 +142,10 @@ class ConditionalOpTest : public ClientLibraryTestBase { XlaComputation CreateTupleSubComputation(const string& computation_name, const Shape& tuple_shape) { XlaBuilder builder(computation_name); - auto tuple = builder.Parameter(0, tuple_shape, "tuple"); - auto x = builder.GetTupleElement(tuple, 0); - auto y = builder.GetTupleElement(tuple, 1); - builder.Sub(x, y); + auto tuple = Parameter(&builder, 0, tuple_shape, "tuple"); + auto x = GetTupleElement(tuple, 0); + auto y = GetTupleElement(tuple, 1); + Sub(x, y); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); @@ -172,198 +172,215 @@ class ConditionalOpTest : public ClientLibraryTestBase { // Test true and false computations that do not take any parameters. 
XLA_TEST_F(ConditionalOpTest, Parameters0) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operands = builder.Tuple({}); + XlaOp pred; + auto pred_arg = CreateR0Parameter(true, 0, "pred", &builder, &pred); + auto operands = Tuple(&builder, {}); auto true_computation = CreateR0ConstantComputation(56.0f); auto false_computation = CreateR0ConstantComputation(12.0f); - builder.Conditional(pred, operands, true_computation, operands, - false_computation); + Conditional(pred, operands, true_computation, operands, false_computation); - ComputeAndCompareR0(&builder, 56.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 56.0f, {pred_arg.get()}, error_spec_); } // Test true and false computations that take in 1 parameter. XLA_TEST_F(ConditionalOpTest, Parameters1) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.0f); - auto operand2 = builder.ConstantR0(12.0f); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.0f); + auto operand2 = ConstantR0(&builder, 12.0f); auto identity = CreateR0IdentityComputation(); - builder.Conditional(pred, operand1, identity, operand2, identity); + Conditional(pred, operand1, identity, operand2, identity); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test conditional with two different computations in the true and false cases // that take in different arguments. 
XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.4f); - auto operand2 = builder.ConstantR0(12.6f); - builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2, - CreateR0FloorComputation()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.4f); + auto operand2 = ConstantR0(&builder, 12.6f); + Conditional(pred, operand1, CreateR0CeilComputation(), operand2, + CreateR0FloorComputation()); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test conditional with two different computations in the true and false cases // that take in the same arguments. XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand = builder.ConstantR0(12.6f); - builder.Conditional(pred, operand, CreateR0CeilComputation(), operand, - CreateR0FloorComputation()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand = ConstantR0(&builder, 12.6f); + Conditional(pred, operand, CreateR0CeilComputation(), operand, + CreateR0FloorComputation()); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test conditional with the same computation in the true and false cases but // take in different arguments. 
XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.4f); - auto operand2 = builder.ConstantR0(12.6f); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.4f); + auto operand2 = ConstantR0(&builder, 12.6f); auto floor = CreateR0FloorComputation(); - builder.Conditional(pred, operand1, floor, operand2, floor); + Conditional(pred, operand1, floor, operand2, floor); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test conditional with the same computation in the true and false cases that // take in the same arguments. XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand = builder.ConstantR0(12.6f); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand = ConstantR0(&builder, 12.6f); auto floor = CreateR0FloorComputation(); - builder.Conditional(pred, operand, floor, operand, floor); + Conditional(pred, operand, floor, operand, floor); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test conditional with different instances of the same computation in the true // and false cases. 
XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.4f); - auto operand2 = builder.ConstantR0(12.6f); - builder.Conditional(pred, operand1, CreateR0FloorComputation(), operand2, - CreateR0FloorComputation()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.4f); + auto operand2 = ConstantR0(&builder, 12.6f); + Conditional(pred, operand1, CreateR0FloorComputation(), operand2, + CreateR0FloorComputation()); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test the case when a call invokes a computation that contains a conditional. XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) { Shape r0bool = ShapeUtil::MakeShape(PRED, {}); XlaBuilder inner_builder(TestName() + ".inner_conditional"); - auto pred_cond = inner_builder.Parameter(0, r0bool, "param0"); - auto true_operand = inner_builder.Parameter(1, r0f32_, "param1"); - auto false_operand = inner_builder.Parameter(2, r0f32_, "param2"); - inner_builder.Conditional(pred_cond, true_operand, CreateR0CeilComputation(), - false_operand, CreateR0FloorComputation()); + auto pred_cond = Parameter(&inner_builder, 0, r0bool, "param0"); + auto true_operand = Parameter(&inner_builder, 1, r0f32_, "param1"); + auto false_operand = Parameter(&inner_builder, 2, r0f32_, "param2"); + Conditional(pred_cond, true_operand, CreateR0CeilComputation(), false_operand, + CreateR0FloorComputation()); auto inner_builder_result = inner_builder.Build(); XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.4f); - auto operand2 = builder.ConstantR0(12.6f); - builder.Call(inner_builder_result.ConsumeValueOrDie(), - {pred, operand1, operand2}); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", 
&builder, &pred); + auto operand1 = ConstantR0(&builder, 56.4f); + auto operand2 = ConstantR0(&builder, 12.6f); + Call(&builder, inner_builder_result.ConsumeValueOrDie(), + {pred, operand1, operand2}); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test true and false computations that take in 2 parameters and predicate is // true. XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operand1 = builder.ConstantR0(56.0f); - auto operand2 = builder.ConstantR0(12.0f); - auto operands = builder.Tuple({operand1, operand2}); - builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands, - CreateR0TupleSubComputation()); - - ComputeAndCompareR0(&builder, 68.0f, {}, error_spec_); + XlaOp pred; + auto pred_arg = CreateR0Parameter(true, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.0f); + auto operand2 = ConstantR0(&builder, 12.0f); + auto operands = Tuple(&builder, {operand1, operand2}); + Conditional(pred, operands, CreateR0TupleAddComputation(), operands, + CreateR0TupleSubComputation()); + + ComputeAndCompareR0(&builder, 68.0f, {pred_arg.get()}, error_spec_); } // Test true and false computations that take in 2 parameters and predicate is // false. 
XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(56.0f); - auto operand2 = builder.ConstantR0(12.0f); - auto operands = builder.Tuple({operand1, operand2}); - builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands, - CreateR0TupleSubComputation()); - - ComputeAndCompareR0(&builder, 44.0f, {}, error_spec_); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 56.0f); + auto operand2 = ConstantR0(&builder, 12.0f); + auto operands = Tuple(&builder, {operand1, operand2}); + Conditional(pred, operands, CreateR0TupleAddComputation(), operands, + CreateR0TupleSubComputation()); + + ComputeAndCompareR0(&builder, 44.0f, {pred_arg.get()}, error_spec_); } // Test true and false computations that take in 2 array parameters and // predicate is true. XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operand1 = builder.ConstantR1({24.0f, 56.0f}); - auto operand2 = builder.ConstantR1({10.0f, 11.0f}); - auto operands = builder.Tuple({operand1, operand2}); - builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands, - CreateR1TupleSubComputation()); - - ComputeAndCompareR1(&builder, {34.0f, 67.0f}, {}, error_spec_); + XlaOp pred; + auto pred_arg = CreateR0Parameter(true, 0, "pred", &builder, &pred); + auto operand1 = ConstantR1(&builder, {24.0f, 56.0f}); + auto operand2 = ConstantR1(&builder, {10.0f, 11.0f}); + auto operands = Tuple(&builder, {operand1, operand2}); + Conditional(pred, operands, CreateR1TupleAddComputation(), operands, + CreateR1TupleSubComputation()); + + ComputeAndCompareR1(&builder, {34.0f, 67.0f}, {pred_arg.get()}, + error_spec_); } // Test true and false computations that take in 2 array parameters and // predicate is false. 
XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operand1 = builder.ConstantR1({24.0f, 56.0f}); - auto operand2 = builder.ConstantR1({10.0f, 11.0f}); - auto operands = builder.Tuple({operand1, operand2}); - builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands, - CreateR1TupleSubComputation()); - - ComputeAndCompareR1(&builder, {14.0f, 45.0f}, {}, error_spec_); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR1(&builder, {24.0f, 56.0f}); + auto operand2 = ConstantR1(&builder, {10.0f, 11.0f}); + auto operands = Tuple(&builder, {operand1, operand2}); + Conditional(pred, operands, CreateR1TupleAddComputation(), operands, + CreateR1TupleSubComputation()); + + ComputeAndCompareR1(&builder, {14.0f, 45.0f}, {pred_arg.get()}, + error_spec_); } // Test true and false computations that return a tuple of scalars. XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operands = builder.Tuple( - {builder.ConstantR0(12.2f), builder.ConstantR0(25.6f)}); - builder.Conditional(pred, operands, CreateR0TupleCeilComputation(), operands, - CreateR0TupleFloorComputation()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operands = Tuple(&builder, {ConstantR0(&builder, 12.2f), + ConstantR0(&builder, 25.6f)}); + Conditional(pred, operands, CreateR0TupleCeilComputation(), operands, + CreateR0TupleFloorComputation()); ComputeAndCompareTuple( &builder, - *Literal::MakeTuple({Literal::CreateR0(12.0f).get(), - Literal::CreateR0(25.0f).get()}), - {}, error_spec_); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(12.0f).get(), + LiteralUtil::CreateR0(25.0f).get()}), + {pred_arg.get()}, error_spec_); } // Test true and false computations that return a tuple of arrays. 
XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operands = builder.Tuple({builder.ConstantR1({12.2f, 15.8f}), - builder.ConstantR1({25.6f, 29.2f})}); - builder.Conditional(pred, operands, CreateR1TupleCeilComputation(), operands, - CreateR1TupleFloorComputation()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(true, 0, "pred", &builder, &pred); + auto operands = + Tuple(&builder, {ConstantR1(&builder, {12.2f, 15.8f}), + ConstantR1(&builder, {25.6f, 29.2f})}); + Conditional(pred, operands, CreateR1TupleCeilComputation(), operands, + CreateR1TupleFloorComputation()); ComputeAndCompareTuple( &builder, - *Literal::MakeTuple({Literal::CreateR1({13.0f, 16.0f}).get(), - Literal::CreateR1({26.0f, 30.0f}).get()}), - {}, error_spec_); + *LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1({13.0f, 16.0f}).get(), + LiteralUtil::CreateR1({26.0f, 30.0f}).get()}), + {pred_arg.get()}, error_spec_); } // Test true and false computations that return a tuple of a predicate, a @@ -371,85 +388,91 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) { XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) { XlaBuilder true_builder(TestName() + ".true"); { - true_builder.Parameter(0, empty_tuple_, "tuple"); - auto true_pred = true_builder.ConstantR0(true); - auto true_scalar = true_builder.ConstantR0(12.2f); - auto true_array = true_builder.ConstantR1({12.8f, 14.6f}); - true_builder.Tuple({true_pred, true_scalar, true_array}); + Parameter(&true_builder, 0, empty_tuple_, "tuple"); + auto true_pred = ConstantR0(&true_builder, true); + auto true_scalar = ConstantR0(&true_builder, 12.2f); + auto true_array = ConstantR1(&true_builder, {12.8f, 14.6f}); + Tuple(&true_builder, {true_pred, true_scalar, true_array}); } auto true_builder_result = true_builder.Build(); EXPECT_IS_OK(true_builder_result.status()); XlaBuilder false_builder(TestName() + ".false"); { - false_builder.Parameter(0, empty_tuple_, 
"tuple"); - auto false_pred = false_builder.ConstantR0(false); - auto false_scalar = false_builder.ConstantR0(25.6f); - auto false_array = false_builder.ConstantR1({26.4f, 32.6f}); - false_builder.Tuple({false_pred, false_scalar, false_array}); + Parameter(&false_builder, 0, empty_tuple_, "tuple"); + auto false_pred = ConstantR0(&false_builder, false); + auto false_scalar = ConstantR0(&false_builder, 25.6f); + auto false_array = ConstantR1(&false_builder, {26.4f, 32.6f}); + Tuple(&false_builder, {false_pred, false_scalar, false_array}); } auto false_builder_result = false_builder.Build(); EXPECT_IS_OK(false_builder_result.status()); XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operands = builder.Tuple({}); - builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(), - operands, false_builder_result.ConsumeValueOrDie()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(true, 0, "pred", &builder, &pred); + auto operands = Tuple(&builder, {}); + Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(), operands, + false_builder_result.ConsumeValueOrDie()); ComputeAndCompareTuple( &builder, - *Literal::MakeTuple({Literal::CreateR0(true).get(), - Literal::CreateR0(12.2f).get(), - Literal::CreateR1({12.8f, 14.6f}).get()}), - {}, error_spec_); + *LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(true).get(), + LiteralUtil::CreateR0(12.2f).get(), + LiteralUtil::CreateR1({12.8f, 14.6f}).get()}), + {pred_arg.get()}, error_spec_); } // Test true and false computations that return a nested tuple. 
XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) { XlaBuilder true_builder(TestName() + ".true"); { - true_builder.Parameter(0, empty_tuple_, "tuple"); - auto true_constant1 = true_builder.ConstantR0(12.2f); - auto true_constant2 = true_builder.ConstantR1({12.8f, 14.6f}); - auto true_constant3 = true_builder.ConstantR1({25.4f, 29.8f}); - auto true_constant4 = true_builder.ConstantR0(35.6f); - true_builder.Tuple({true_builder.Tuple({true_constant1, true_constant2}), - true_builder.Tuple({true_constant3, true_constant4})}); + Parameter(&true_builder, 0, empty_tuple_, "tuple"); + auto true_constant1 = ConstantR0(&true_builder, 12.2f); + auto true_constant2 = ConstantR1(&true_builder, {12.8f, 14.6f}); + auto true_constant3 = ConstantR1(&true_builder, {25.4f, 29.8f}); + auto true_constant4 = ConstantR0(&true_builder, 35.6f); + Tuple(&true_builder, + {Tuple(&true_builder, {true_constant1, true_constant2}), + Tuple(&true_builder, {true_constant3, true_constant4})}); } auto true_builder_result = true_builder.Build(); EXPECT_IS_OK(true_builder_result.status()); XlaBuilder false_builder(TestName() + ".false"); { - false_builder.Parameter(0, empty_tuple_, "tuple"); - auto false_constant1 = false_builder.ConstantR0(46.6f); - auto false_constant2 = false_builder.ConstantR1({54.4f, 58.4f}); - auto false_constant3 = false_builder.ConstantR1({62.1f, 67.4f}); - auto false_constant4 = false_builder.ConstantR0(9.3f); - false_builder.Tuple( - {false_builder.Tuple({false_constant1, false_constant2}), - false_builder.Tuple({false_constant3, false_constant4})}); + Parameter(&false_builder, 0, empty_tuple_, "tuple"); + auto false_constant1 = ConstantR0(&false_builder, 46.6f); + auto false_constant2 = ConstantR1(&false_builder, {54.4f, 58.4f}); + auto false_constant3 = ConstantR1(&false_builder, {62.1f, 67.4f}); + auto false_constant4 = ConstantR0(&false_builder, 9.3f); + Tuple(&false_builder, + {Tuple(&false_builder, {false_constant1, false_constant2}), + Tuple(&false_builder, 
{false_constant3, false_constant4})}); } auto false_builder_result = false_builder.Build(); EXPECT_IS_OK(false_builder_result.status()); XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto operands = builder.Tuple({}); - builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(), - operands, false_builder_result.ConsumeValueOrDie()); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operands = Tuple(&builder, {}); + Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(), operands, + false_builder_result.ConsumeValueOrDie()); ComputeAndCompareTuple( &builder, - *Literal::MakeTuple( - {Literal::MakeTuple({Literal::CreateR0(46.6f).get(), - Literal::CreateR1({54.4f, 58.4f}).get()}) + *LiteralUtil::MakeTuple( + {LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(46.6f).get(), + LiteralUtil::CreateR1({54.4f, 58.4f}).get()}) .get(), - Literal::MakeTuple({Literal::CreateR1({62.1f, 67.4f}).get(), - Literal::CreateR0(9.3f).get()}) + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1({62.1f, 67.4f}).get(), + LiteralUtil::CreateR0(9.3f).get()}) .get()}), - {}, error_spec_); + {pred_arg.get()}, error_spec_); } // Test conditional that takes in scalar operands in the form of external @@ -464,8 +487,8 @@ XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) { CreateR0Parameter(56.3f, 1, "operand1", &builder, &operand1); auto operand2_param = CreateR0Parameter(12.7f, 2, "operand2", &builder, &operand2); - builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2, - CreateR0FloorComputation()); + Conditional(pred, operand1, CreateR0CeilComputation(), operand2, + CreateR0FloorComputation()); ComputeAndCompareR0( &builder, 57.0f, @@ -484,8 +507,8 @@ XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) { &builder, &operand1); auto operand2_param = CreateR1Parameter({10.2f, 11.6f}, 2, "operand2", &builder, &operand2); - builder.Conditional(pred, operand1, 
CreateR1CeilComputation(), operand2, - CreateR1FloorComputation()); + Conditional(pred, operand1, CreateR1CeilComputation(), operand2, + CreateR1FloorComputation()); ComputeAndCompareR1( &builder, {10.0f, 11.0f}, @@ -499,29 +522,29 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) { { Shape r0bool = ShapeUtil::MakeShape(PRED, {}); Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_}); - auto param0 = inner_builder.Parameter(0, tuple_shape, "param0"); - auto pred_cond = inner_builder.GetTupleElement(param0, 0); - auto true_operand = inner_builder.GetTupleElement(param0, 1); - auto false_operand = inner_builder.GetTupleElement(param0, 2); - inner_builder.Conditional(pred_cond, true_operand, - CreateR0CeilComputation(), false_operand, - CreateR0FloorComputation()); + auto param0 = Parameter(&inner_builder, 0, tuple_shape, "param0"); + auto pred_cond = GetTupleElement(param0, 0); + auto true_operand = GetTupleElement(param0, 1); + auto false_operand = GetTupleElement(param0, 2); + Conditional(pred_cond, true_operand, CreateR0CeilComputation(), + false_operand, CreateR0FloorComputation()); } auto inner_builder_result = inner_builder.Build(); EXPECT_IS_OK(inner_builder_result.status()); XlaBuilder builder(TestName()); - auto pred1 = builder.ConstantR0(true); - auto pred2 = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(1.1f); - auto operand2 = builder.ConstantR0(12.2f); - auto operand3 = builder.ConstantR0(43.3f); - auto tuple_operand = builder.Tuple({pred2, operand1, operand2}); - builder.Conditional(pred1, tuple_operand, - inner_builder_result.ConsumeValueOrDie(), operand3, - CreateR0IdentityComputation()); - - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + XlaOp pred1, pred2; + auto pred1_arg = CreateR0Parameter(true, 0, "pred1", &builder, &pred1); + auto pred2_arg = CreateR0Parameter(false, 1, "pred2", &builder, &pred2); + auto operand1 = ConstantR0(&builder, 1.1f); + auto operand2 = ConstantR0(&builder, 12.2f); + auto 
operand3 = ConstantR0(&builder, 43.3f); + auto tuple_operand = Tuple(&builder, {pred2, operand1, operand2}); + Conditional(pred1, tuple_operand, inner_builder_result.ConsumeValueOrDie(), + operand3, CreateR0IdentityComputation()); + + ComputeAndCompareR0(&builder, 12.0f, + {pred1_arg.get(), pred2_arg.get()}, error_spec_); } XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) { @@ -529,36 +552,36 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) { { Shape r0bool = ShapeUtil::MakeShape(PRED, {}); Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_}); - auto param0 = inner_builder.Parameter(0, tuple_shape, "param0"); - auto pred_cond = inner_builder.GetTupleElement(param0, 0); - auto true_operand = inner_builder.GetTupleElement(param0, 1); - auto false_operand = inner_builder.GetTupleElement(param0, 2); - inner_builder.Conditional(pred_cond, true_operand, - CreateR0CeilComputation(), false_operand, - CreateR0FloorComputation()); + auto param0 = Parameter(&inner_builder, 0, tuple_shape, "param0"); + auto pred_cond = GetTupleElement(param0, 0); + auto true_operand = GetTupleElement(param0, 1); + auto false_operand = GetTupleElement(param0, 2); + Conditional(pred_cond, true_operand, CreateR0CeilComputation(), + false_operand, CreateR0FloorComputation()); } auto inner_builder_result = inner_builder.Build(); EXPECT_IS_OK(inner_builder_result.status()); XlaBuilder builder(TestName()); - auto pred2 = builder.ConstantR0(false); - auto operand1 = builder.ConstantR0(1.1f); - auto operand2 = builder.ConstantR0(12.2f); - auto tuple_operand = builder.Tuple({pred2, operand1, operand2}); - builder.Call(inner_builder_result.ConsumeValueOrDie(), {tuple_operand}); + XlaOp pred; + auto pred_arg = CreateR0Parameter(false, 0, "pred", &builder, &pred); + auto operand1 = ConstantR0(&builder, 1.1f); + auto operand2 = ConstantR0(&builder, 12.2f); + auto tuple_operand = Tuple(&builder, {pred, operand1, operand2}); + Call(&builder, 
inner_builder_result.ConsumeValueOrDie(), {tuple_operand}); - ComputeAndCompareR0(&builder, 12.0f, {}, error_spec_); + ComputeAndCompareR0(&builder, 12.0f, {pred_arg.get()}, error_spec_); } // Test a mismatch in the shape of the true operand and true computation. XLA_TEST_F(ConditionalOpTest, ShapeMismatch) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto operand1 = builder.ConstantR0(56.0f); - auto operand2 = builder.ConstantR0(12.0f); - auto operands = builder.Tuple({operand1, operand2}); - builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands, - CreateR0TupleSubComputation()); + auto pred = ConstantR0(&builder, true); + auto operand1 = ConstantR0(&builder, 56.0f); + auto operand2 = ConstantR0(&builder, 12.0f); + auto operands = Tuple(&builder, {operand1, operand2}); + Conditional(pred, operands, CreateR1TupleAddComputation(), operands, + CreateR0TupleSubComputation()); auto result = builder.Build(); EXPECT_FALSE(result.ok()); @@ -572,51 +595,104 @@ XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) { XlaComputation swapper; { XlaBuilder builder(TestName() + ".swapper"); - auto param0 = builder.Parameter(0, tuple_shape, "sp0"); - auto x = builder.GetTupleElement(param0, 0); - auto y = builder.GetTupleElement(param0, 1); - builder.Tuple({y, x}); + auto param0 = Parameter(&builder, 0, tuple_shape, "sp0"); + auto x = GetTupleElement(param0, 0); + auto y = GetTupleElement(param0, 1); + Tuple(&builder, {y, x}); swapper = builder.Build().ConsumeValueOrDie(); } XlaComputation forwarder; { XlaBuilder builder(TestName() + ".forwarder"); - auto param0 = builder.Parameter(0, tuple_shape, "fp0"); - auto x = builder.GetTupleElement(param0, 0); - auto y = builder.GetTupleElement(param0, 1); - builder.Tuple({x, y}); + auto param0 = Parameter(&builder, 0, tuple_shape, "fp0"); + auto x = GetTupleElement(param0, 0); + auto y = GetTupleElement(param0, 1); + Tuple(&builder, {x, y}); forwarder = 
builder.Build().ConsumeValueOrDie(); } XlaComputation main; { XlaBuilder builder(TestName() + ".main"); - auto param0 = builder.Parameter(0, tuple_shape, "mp0"); - auto x = builder.GetTupleElement(param0, 0); - auto y = builder.GetTupleElement(param0, 1); - auto lt_pred = builder.Lt(x, y); - auto res = builder.Conditional(lt_pred, param0, forwarder, param0, swapper); - auto ge_pred = builder.Ge(x, y); - builder.Conditional(ge_pred, res, swapper, res, forwarder); + auto param0 = Parameter(&builder, 0, tuple_shape, "mp0"); + auto x = GetTupleElement(param0, 0); + auto y = GetTupleElement(param0, 1); + auto lt_pred = Lt(x, y); + auto res = Conditional(lt_pred, param0, forwarder, param0, swapper); + auto ge_pred = Ge(x, y); + Conditional(ge_pred, res, swapper, res, forwarder); main = builder.Build().ConsumeValueOrDie(); } auto test_swap = [&](float a, float b) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR0(a); - auto y = builder.ConstantR0(b); - auto tuple_operand = builder.Tuple({x, y}); - builder.Call(main, {tuple_operand}); + XlaOp x, y; + auto x_arg = CreateR0Parameter(a, 0, "x", &builder, &x); + auto y_arg = CreateR0Parameter(b, 1, "y", &builder, &y); + auto tuple_operand = Tuple(&builder, {x, y}); + Call(&builder, main, {tuple_operand}); ComputeAndCompareTuple( &builder, - *Literal::MakeTuple({Literal::CreateR0(a).get(), - Literal::CreateR0(b).get()}), - {}, error_spec_); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(a).get(), + LiteralUtil::CreateR0(b).get()}), + {x_arg.get(), y_arg.get()}, error_spec_); }; test_swap(3.11f, 9.4f); test_swap(11.24f, 5.55f); } +// Test conditional that duplicates tuple elements in the then and else +// computations. This is a regression test for b/112550242. 
+XLA_TEST_F(ConditionalOpTest, DuplicateElementsConditional) { + const Shape scalar = ShapeUtil::MakeShape(S32, {}); + const Shape tuple2 = ShapeUtil::MakeTupleShape({scalar, scalar}); + XlaComputation then_comp; + { + XlaBuilder builder(TestName() + ".then"); + auto p = Parameter(&builder, 0, tuple2, "then.p"); + auto e0 = GetTupleElement(p, 0); + auto e1 = GetTupleElement(p, 1); + Tuple(&builder, {e0, e1, e0}); + then_comp = builder.Build().ConsumeValueOrDie(); + } + XlaComputation else_comp; + { + XlaBuilder builder(TestName() + ".else"); + auto p = Parameter(&builder, 0, tuple2, "else.p"); + auto e0 = GetTupleElement(p, 0); + auto e1 = GetTupleElement(p, 1); + Tuple(&builder, {e0, e1, e1}); + else_comp = builder.Build().ConsumeValueOrDie(); + } + + { + // Pred is true case. + std::vector args; + args.push_back(std::move( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(123).get(), + LiteralUtil::CreateR0(-42).get()}))); + args.push_back(std::move(*LiteralUtil::CreateR0(true))); + XlaBuilder builder(TestName() + ".main"); + auto p = Parameter(&builder, 0, tuple2, "p0"); + auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1"); + Conditional(p_pred, p, then_comp, p, else_comp); + ComputeAndCompare(&builder, args); + } + { + // Pred is false case. 
+ std::vector args; + args.push_back(std::move( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(123).get(), + LiteralUtil::CreateR0(-42).get()}))); + args.push_back(std::move(*LiteralUtil::CreateR0(false))); + XlaBuilder builder(TestName() + ".main"); + auto p = Parameter(&builder, 0, tuple2, "p0"); + auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1"); + Conditional(p_pred, p, then_comp, p, else_comp); + ComputeAndCompare(&builder, args); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc index 916ffadbc798ec0dd016f45b0bc4c36233455ee7..49375748319ad5fe40db507a034ec4b07adb7e84 100644 --- a/tensorflow/compiler/xla/tests/constants_test.cc +++ b/tensorflow/compiler/xla/tests/constants_test.cc @@ -22,10 +22,11 @@ limitations under the License. #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -39,7 +40,7 @@ class ConstantsTest : public ClientLibraryTestBase { TEST_F(ConstantsTest, ZeroCellF32) { XlaBuilder builder(TestName()); - builder.ConstantR1({}); + ConstantR1(&builder, {}); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -48,7 +49,7 @@ TEST_F(ConstantsTest, OneCellF32) { std::vector constant = {2.0}; XlaBuilder builder(TestName()); - builder.ConstantR1(constant); + ConstantR1(&builder, constant); ComputeAndCompareR1(&builder, constant, {}, error_spec_); } @@ -57,7 +58,7 @@ 
TEST_F(ConstantsTest, OneCellS32) { std::vector constant = {2}; XlaBuilder builder(TestName()); - builder.ConstantR1(constant); + ConstantR1(&builder, constant); ComputeAndCompareR1(&builder, constant, {}); } @@ -66,7 +67,7 @@ TEST_F(ConstantsTest, OneCellU32) { std::vector constant = {2}; XlaBuilder builder(TestName()); - builder.ConstantR1(constant); + ConstantR1(&builder, constant); ComputeAndCompareR1(&builder, constant, {}); } @@ -75,7 +76,7 @@ TEST_F(ConstantsTest, EightCells) { std::vector constant = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; XlaBuilder builder(TestName()); - builder.ConstantR1(constant); + ConstantR1(&builder, constant); ComputeAndCompareR1(&builder, constant, {}, error_spec_); } @@ -85,14 +86,14 @@ TEST_F(ConstantsTest, SixteenCells) { 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; XlaBuilder builder(TestName()); - builder.ConstantR1(constant); + ConstantR1(&builder, constant); ComputeAndCompareR1(&builder, constant, {}, error_spec_); } TEST_F(ConstantsTest, Empty_0x2) { XlaBuilder builder(TestName()); - builder.ConstantR2FromArray2D(Array2D(0, 2)); + ConstantR2FromArray2D(&builder, Array2D(0, 2)); ComputeAndCompareR2(&builder, Array2D(0, 2), {}, error_spec_); } @@ -102,15 +103,15 @@ TEST_F(ConstantsTest, Small_2x2) { MakeLinspaceArray2D(100.0, 200.0, 2, 2); XlaBuilder builder(TestName()); - builder.ConstantR2FromArray2D(*constant); + ConstantR2FromArray2D(&builder, *constant); ComputeAndCompareR2(&builder, *constant, {}, error_spec_); } TEST_F(ConstantsTest, Empty_3x0x2) { XlaBuilder builder(TestName()); - auto constant = builder.ConstantLiteral( - *Literal::CreateR3FromArray3D(Array3D(3, 0, 2))); + ConstantLiteral(&builder, *LiteralUtil::CreateR3FromArray3D( + Array3D(3, 0, 2))); ComputeAndCompareR3(&builder, Array3D(3, 0, 2), {}); } @@ -125,8 +126,7 @@ TEST_F(ConstantsTest, Small_2x2x2) { {{5.f, 6.f}, // y0 {7.f, 8.f}}, // y1 }); - auto constant = - builder.ConstantLiteral(*Literal::CreateR3FromArray3D(array3d)); + 
ConstantLiteral(&builder, *LiteralUtil::CreateR3FromArray3D(array3d)); ComputeAndCompareR3(&builder, array3d, {}); } @@ -141,17 +141,17 @@ TEST_F(ConstantsTest, Small_3x2x1x1) { }); input_array.FillWithPZ(pz); std::unique_ptr input_literal = - Literal::CreateR4FromArray4D(input_array); + LiteralUtil::CreateR4FromArray4D(input_array); { XlaBuilder builder(TestName()); - builder.ConstantLiteral(*input_literal); + ConstantLiteral(&builder, *input_literal); ComputeAndCompareR4(&builder, input_array, {}, error_spec_); } { XlaBuilder builder(TestName()); - builder.ConstantR4FromArray4D(input_array); + ConstantR4FromArray4D(&builder, input_array); ComputeAndCompareR4(&builder, input_array, {}, error_spec_); } } @@ -159,17 +159,26 @@ TEST_F(ConstantsTest, Small_3x2x1x1) { // TODO(b/29263943): Support tuple constants. TEST_F(ConstantsTest, DISABLED_TupleConstant) { XlaBuilder builder(TestName()); - builder.ConstantLiteral( - *Literal::MakeTuple({Literal::CreateR2({{1.0}, {2.0}}).get(), - Literal::CreateR1({2.0, 42}).get()})); + ConstantLiteral(&builder, + *LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0}, {2.0}}).get(), + LiteralUtil::CreateR1({2.0, 42}).get()})); std::unique_ptr result = ExecuteAndTransfer(&builder, {}).ConsumeValueOrDie(); - LiteralTestUtil::ExpectR2Near( - {{1.0}, {2.0}}, LiteralSlice(*result, {0}), error_spec_); - LiteralTestUtil::ExpectR1Near( - {2.0, 42.0}, LiteralSlice(*result, {1}), error_spec_); + LiteralTestUtil::ExpectR2Near({{1.0}, {2.0}}, + LiteralSlice(*result, {0}), error_spec_); + LiteralTestUtil::ExpectR1Near({2.0, 42.0}, LiteralSlice(*result, {1}), + error_spec_); +} + +TEST_F(ConstantsTest, Token) { + XlaBuilder builder(TestName()); + ConstantLiteral(&builder, *LiteralUtil::CreateToken()); + // TODO(b/80000000): tokens cannot be returned from computations. 
+ Tuple(&builder, {}); + TF_ASSERT_OK(Execute(&builder, {}).status()); } } // namespace diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc index 722d882471a41a75c1e5e60f8c1a151b76c7e004..7a203d6873dbb5b69f96c50048c2c5ff3150c544 100644 --- a/tensorflow/compiler/xla/tests/convert_test.cc +++ b/tensorflow/compiler/xla/tests/convert_test.cc @@ -13,13 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include #include +#include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -45,44 +47,107 @@ class ConvertTest : public ClientLibraryTestBase { TEST_F(ConvertTest, ConvertR1S32ToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42, 64}); - builder.ConvertElementType(a, S32); + auto a = ConstantR1(&builder, {42, 64}); + ConvertElementType(a, S32); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}); } +TEST_F(ConvertTest, ConvertR1S32ToR1U32) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {42, 64}); + ConvertElementType(a, U32); + + std::vector expected = {42, 64}; + ComputeAndCompareR1(&builder, expected, {}); +} + +TEST_F(ConvertTest, ConvertR1S32ToR1PRED) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {42, 0, -64}); + ConvertElementType(a, PRED); + + std::array expected = {true, false, true}; + ComputeAndCompareR1(&builder, expected, {}); +} + +TEST_F(ConvertTest, ConvertR1U32ToR1U32) { + XlaBuilder builder(TestName()); + auto 
a = ConstantR1(&builder, {42, 64}); + ConvertElementType(a, U32); + + std::vector expected = {42, 64}; + ComputeAndCompareR1(&builder, expected, {}); +} + +TEST_F(ConvertTest, ConvertR1U32ToR1S32) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {42, 64}); + ConvertElementType(a, S32); + + std::vector expected = {42, 64}; + ComputeAndCompareR1(&builder, expected, {}); +} + +TEST_F(ConvertTest, ConvertR1U32ToR1PRED) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {42, 0, 64}); + ConvertElementType(a, PRED); + + std::array expected = {true, false, true}; + ComputeAndCompareR1(&builder, expected, {}); +} + TEST_F(ConvertTest, ConvertR1F32ToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.0f, 64.0f}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {42.0f, 64.0f}); + ConvertElementType(a, F32); std::vector expected = {42.0f, 64.0f}; - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR1(&builder, expected, {}); +} + +TEST_F(ConvertTest, ConvertR1F32ToR1PRED) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {42.0f, 0.0f, 64.0f}); + ConvertElementType(a, PRED); + + std::array expected = {true, false, true}; + ComputeAndCompareR1(&builder, expected, {}); } TEST_F(ConvertTest, ConvertR1S32ToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42, 64}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {42, 64}); + ConvertElementType(a, F32); std::vector expected = {42.0f, 64.0f}; - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR1(&builder, expected, {}); } TEST_F(ConvertTest, ConvertR1PREDToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({true, false, true}); - builder.ConvertElementType(a, S32); + auto a = ConstantR1(&builder, {true, false, true}); + ConvertElementType(a, S32); std::vector expected = {1, 0, 1}; 
ComputeAndCompareR1(&builder, expected, {}); } +TEST_F(ConvertTest, ConvertR1PREDToR1U32) { + XlaBuilder builder(TestName()); + auto a = ConstantR1(&builder, {true, false, true}); + ConvertElementType(a, U32); + + std::vector expected = {1, 0, 1}; + ComputeAndCompareR1(&builder, expected, {}); +} + TEST_F(ConvertTest, ConvertR1PREDToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({true, false, true}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {true, false, true}); + ConvertElementType(a, F32); std::vector expected = {1., 0., 1.}; ComputeAndCompareR1(&builder, expected, {}); @@ -90,17 +155,17 @@ TEST_F(ConvertTest, ConvertR1PREDToR1F32) { XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {}); + ConvertElementType(a, F32); std::vector expected = {}; - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR1(&builder, expected, {}); } TEST_F(ConvertTest, ConvertR1F32ToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({42.6, 64.4}); - builder.ConvertElementType(a, S32); + auto a = ConstantR1(&builder, {42.6, 64.4}); + ConvertElementType(a, S32); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}); @@ -145,12 +210,12 @@ XLA_TEST_F(ConvertTest, ConvertR1S64ToR1F32) { static_cast(0x8000008000000000LL), static_cast(0x8000010000000000LL), }; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, F32); + ConvertElementType(arg_param, F32); std::vector expected(arg.size()); 
for (int64 i = 0; i < arg.size(); ++i) { @@ -164,12 +229,12 @@ XLA_TEST_F(ConvertTest, ConvertR1U32ToR1F32) { std::vector arg{0, 1, 0x1000, 0x7fffffff, 0x80000000, 0x80000001, 0x80000002, 0x80000003, 0x80000080, 0x80000081, 0x80000082, 0xFFFFFFFF}; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, F32); + ConvertElementType(arg_param, F32); std::vector expected(arg.size()); for (int64 i = 0; i < arg.size(); ++i) { @@ -182,12 +247,12 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) { XlaBuilder builder(TestName()); std::vector arg{0.0f, 1.0f, 16777216.0f, 16777218.0f, 2147483647.0f, 4294967040.0f}; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, U32); + ConvertElementType(arg_param, U32); std::vector expected(arg.size()); for (int64 i = 0; i < arg.size(); ++i) { @@ -199,12 +264,12 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) { XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) { XlaBuilder builder(TestName()); std::vector arg{0, 1, 0x1000, 0x7fffffff, 0x80000082, 0xFFFFFFFF}; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = 
client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, S64); + ConvertElementType(arg_param, S64); std::vector expected(arg.size()); for (int64 i = 0; i < arg.size(); ++i) { @@ -216,12 +281,12 @@ XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) { XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) { XlaBuilder builder(TestName()); std::vector arg{0, 1, 0x1000, -1, -0x1000}; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, S64); + ConvertElementType(arg_param, S64); std::vector expected(arg.size()); for (int64 i = 0; i < arg.size(); ++i) { @@ -253,12 +318,12 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) { 9223370937343148032.f, -9223371487098961920.f, -9223370937343148032.f}; - std::unique_ptr arg_literal = Literal::CreateR1({arg}); - auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); + std::unique_ptr arg_literal = LiteralUtil::CreateR1({arg}); + auto arg_param = Parameter(&builder, 0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = client_->TransferToServer(*arg_literal).ConsumeValueOrDie(); - builder.ConvertElementType(arg_param, S64); + ConvertElementType(arg_param, S64); std::vector expected(arg.size()); for (int64 i = 0; i < arg.size(); ++i) { @@ -269,8 +334,8 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) { XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({32, 64}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {32, 64}); + ConvertElementType(a, F32); std::vector expected = {32.0, 64.0}; ComputeAndCompareR1(&builder, expected, {}); @@ -278,8 +343,8 @@ 
XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) { XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({32, 64}); - builder.ConvertElementType(a, S32); + auto a = ConstantR1(&builder, {32, 64}); + ConvertElementType(a, S32); std::vector expected = {32, 64}; ComputeAndCompareR1(&builder, expected, {}); @@ -287,8 +352,8 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) { XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({32, 64}); - builder.ConvertElementType(a, U32); + auto a = ConstantR1(&builder, {32, 64}); + ConvertElementType(a, U32); std::vector expected = {32, 64}; ComputeAndCompareR1(&builder, expected, {}); @@ -296,8 +361,8 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) { XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({32.0f, 64.0f}); - builder.ConvertElementType(a, F64); + auto a = ConstantR1(&builder, {32.0f, 64.0f}); + ConvertElementType(a, F64); std::vector expected = {32.0, 64.0}; ComputeAndCompareR1(&builder, expected, {}); @@ -305,8 +370,8 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) { XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({32.0, 64.0}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {32.0, 64.0}); + ConvertElementType(a, F32); std::vector expected = {32.0f, 64.0f}; ComputeAndCompareR1(&builder, expected, {}); @@ -314,9 +379,9 @@ XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) { TEST_F(ConvertTest, ConvertS32Extremes) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1( - {std::numeric_limits::min(), std::numeric_limits::max()}); - builder.ConvertElementType(a, F32); + auto a = ConstantR1(&builder, {std::numeric_limits::min(), + std::numeric_limits::max()}); + ConvertElementType(a, F32); std::vector expected = { static_cast(std::numeric_limits::min()), @@ -327,10 +392,10 @@ 
TEST_F(ConvertTest, ConvertS32Extremes) { TEST_F(ConvertTest, ConvertMapToS32) { XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); - auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in"); - b->ConvertElementType(param, S32); - auto a = builder.ConstantR1({42.0f, 64.0f}); - builder.Map({a}, b->BuildAndNoteError(), {0}); + auto param = Parameter(b.get(), 0, ShapeUtil::MakeShape(F32, {}), "in"); + ConvertElementType(param, S32); + auto a = ConstantR1(&builder, {42.0f, 64.0f}); + Map(&builder, {a}, b->BuildAndNoteError(), {0}); std::vector expected = {42, 64}; ComputeAndCompareR1(&builder, expected, {}); @@ -339,10 +404,10 @@ TEST_F(ConvertTest, ConvertMapToS32) { TEST_F(ConvertTest, ConvertMapToF32) { XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); - auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in"); - b->ConvertElementType(param, F32); - auto a = builder.ConstantR1({42, 64}); - builder.Map({a}, b->BuildAndNoteError(), {0}); + auto param = Parameter(b.get(), 0, ShapeUtil::MakeShape(S32, {}), "in"); + ConvertElementType(param, F32); + auto a = ConstantR1(&builder, {42, 64}); + Map(&builder, {a}, b->BuildAndNoteError(), {0}); std::vector expected = {42.0f, 64.0f}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -355,9 +420,9 @@ TEST_F(ConvertTest, ConvertMapToF32) { // the new convert should have the same element type as the old convert. 
TEST_F(ConvertTest, ConvertReshape) { XlaBuilder builder(TestName()); - auto input = builder.ConstantR1({42}); - auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{}); - builder.ConvertElementType(reshape, F32); + auto input = ConstantR1(&builder, {42}); + auto reshape = Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{}); + ConvertElementType(reshape, F32); ComputeAndCompareR0(&builder, 42.0f, {}, ErrorSpec(0.0001)); } @@ -383,21 +448,21 @@ std::vector GetInterestingF16ConversionTestCases() { XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) { std::vector test_cases = GetInterestingF16ConversionTestCases(); std::vector input; - c_transform(test_cases, std::back_inserter(input), - [](float f) { return Eigen::half(f); }); + absl::c_transform(test_cases, std::back_inserter(input), + [](float f) { return Eigen::half(f); }); std::vector expected_output; - c_transform(input, std::back_inserter(expected_output), - [](Eigen::half h) { return static_cast(h); }); + absl::c_transform(input, std::back_inserter(expected_output), + [](Eigen::half h) { return static_cast(h); }); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr dot_lhs_handle, - client_->TransferToServer(*Literal::CreateR1(input))); + client_->TransferToServer(*LiteralUtil::CreateR1(input))); XlaBuilder builder(TestName()); - builder.ConvertElementType( - builder.Parameter( - 0, ShapeUtil::MakeShape(F16, {static_cast(input.size())}), - "param"), + ConvertElementType( + Parameter(&builder, 0, + ShapeUtil::MakeShape(F16, {static_cast(input.size())}), + "param"), F32); ComputeAndCompareR1(&builder, expected_output, {dot_lhs_handle.get()}); @@ -406,18 +471,18 @@ XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) { XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) { std::vector input = GetInterestingF16ConversionTestCases(); std::vector expected_output; - c_transform(input, std::back_inserter(expected_output), - [](float f) { return Eigen::half(f); }); + absl::c_transform(input, std::back_inserter(expected_output), + 
[](float f) { return Eigen::half(f); }); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr dot_lhs_handle, - client_->TransferToServer(*Literal::CreateR1(input))); + client_->TransferToServer(*LiteralUtil::CreateR1(input))); XlaBuilder builder(TestName()); - builder.ConvertElementType( - builder.Parameter( - 0, ShapeUtil::MakeShape(F32, {static_cast(input.size())}), - "param"), + ConvertElementType( + Parameter(&builder, 0, + ShapeUtil::MakeShape(F32, {static_cast(input.size())}), + "param"), F16); ComputeAndCompareR1(&builder, expected_output, {dot_lhs_handle.get()}); @@ -426,28 +491,28 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) { XLA_TEST_F(ConvertTest, ConvertC64ToC64) { XlaBuilder builder(TestName()); std::vector x = {{42.0f, 64.0f}}; - builder.ConvertElementType(builder.ConstantR1(x), C64); + ConvertElementType(ConstantR1(&builder, x), C64); ComputeAndCompareR1(&builder, x, {}, ErrorSpec(0.0001)); } XLA_TEST_F(ConvertTest, ConvertS64S64) { XlaBuilder builder(TestName()); std::vector x = {{-42, 64}}; - builder.ConvertElementType(builder.ConstantR1(x), S64); + ConvertElementType(ConstantR1(&builder, x), S64); ComputeAndCompareR1(&builder, x, {}); } XLA_TEST_F(ConvertTest, ConvertU64U64) { XlaBuilder builder(TestName()); std::vector x = {{42, 64}}; - builder.ConvertElementType(builder.ConstantR1(x), U64); + ConvertElementType(ConstantR1(&builder, x), U64); ComputeAndCompareR1(&builder, x, {}); } XLA_TEST_F(ConvertTest, ConvertU64S64) { XlaBuilder builder(TestName()); std::vector unsigned_x = {{42, UINT64_MAX}}; - builder.ConvertElementType(builder.ConstantR1(unsigned_x), S64); + ConvertElementType(ConstantR1(&builder, unsigned_x), S64); std::vector signed_x = {{42, -1}}; ComputeAndCompareR1(&builder, signed_x, {}); } @@ -455,11 +520,31 @@ XLA_TEST_F(ConvertTest, ConvertU64S64) { XLA_TEST_F(ConvertTest, ConvertS64U64) { XlaBuilder builder(TestName()); std::vector signed_x = {{42, -1, INT64_MIN}}; - builder.ConvertElementType(builder.ConstantR1(signed_x), U64); + 
ConvertElementType(ConstantR1(&builder, signed_x), U64); std::vector unsigned_x = { {42, UINT64_MAX, tensorflow::MathUtil::IPow(2, 63)}}; ComputeAndCompareR1(&builder, unsigned_x, {}); } +XLA_TEST_F(ConvertTest, ConvertBF16F32) { + XlaBuilder builder(TestName()); + + std::vector all_bfloats(1 << 16); + for (int i = 0; i < all_bfloats.size(); ++i) { + all_bfloats[i].value = i; + } + + std::vector expected(all_bfloats.size()); + for (int i = 0; i < expected.size(); ++i) { + expected[i] = (1U << 16) * i; + } + + // Exhaustively test all bf16 to f32 conversions. + xla::XlaOp all_bfloats_bf16 = ConstantR1(&builder, all_bfloats); + xla::XlaOp all_bfloats_f32 = ConvertElementType(all_bfloats_bf16, F32); + BitcastConvertType(all_bfloats_f32, U32); + ComputeAndCompareR1(&builder, expected, {}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc index b5a42e305987df030c15d089f5877f73bb61de1b..38b6da4fa96b0f6b7ed2d56852eb3ab2872f3520 100644 --- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc @@ -17,11 +17,11 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -88,19 +88,20 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) { XLA_TEST_F(ConvolutionDimensionNumbersTest, TwoConvsWithDifferentDimensionNumbers) { - auto input_array = MakeUnique>(2, 3, 5, 5); + auto input_array = absl::make_unique>(2, 3, 5, 5); input_array->FillWithMultiples(0.1); - auto weight_array = MakeUnique>(4, 3, 1, 1); + auto weight_array = absl::make_unique>(4, 3, 1, 1); weight_array->FillWithMultiples(0.2); auto weight_data = - client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array)) + client_ + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D(*weight_array)) .ConsumeValueOrDie(); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(*input_array); + auto input = ConstantR4FromArray4D(&builder, *input_array); auto weight = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight"); - auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight"); + auto conv1 = Conv(input, weight, {1, 1}, Padding::kValid); ConvolutionDimensionNumbers dim_nums = XlaBuilder::CreateDefaultConvDimensionNumbers(); @@ -117,8 +118,7 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest, dim_nums.set_kernel_input_feature_dimension( dim_nums.kernel_output_feature_dimension()); dim_nums.set_kernel_output_feature_dimension(old_kernel_input_feature_dim); - builder.ConvWithGeneralDimensions(input, conv1, {1, 1}, 
Padding::kValid, - dim_nums); + ConvWithGeneralDimensions(input, conv1, {1, 1}, Padding::kValid, dim_nums); auto expected_conv1 = ReferenceUtil::ConvArray4D(*input_array, *weight_array, {1, 1}, Padding::kValid); diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 947959beb144e1509a77ad2f94b8493de46ba6f2..d2c6478b02423c93860244bc5eb91e652a3eac2e 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -18,24 +18,24 @@ limitations under the License. #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -47,9 +47,9 @@ class ConvolutionTest : public ClientLibraryTestBase { #if XLA_TEST_BACKEND_GPU // XLA:GPU sometimes uses FFT convolution 
which isn't as precise as spatial // convolution. So relax the absolute error threshold. - ErrorSpec error_spec_ = ErrorSpec(1e-2); + ErrorSpec error_spec_ = ErrorSpec(1e-2, 1e-4); #else - ErrorSpec error_spec_ = ErrorSpec(1e-4); + ErrorSpec error_spec_ = ErrorSpec(1e-4, 1e-4); #endif }; @@ -70,16 +70,16 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest { const int kKernelSizeY = 2; const int kOutputActivationSizeZ = 256; const int kMiniBatchSize = 4; - auto alhs = - MakeUnique>(kMiniBatchSize, kInputActivationSizeZ, - kInputActivationSizeY, kInputActivationSizeX); + auto alhs = absl::make_unique>( + kMiniBatchSize, kInputActivationSizeZ, kInputActivationSizeY, + kInputActivationSizeX); alhs->FillWithMultiples(static_cast(1.0f)); ASSERT_EQ(3, alhs->width()); ASSERT_EQ(3, alhs->height()); - auto arhs = - MakeUnique>(kOutputActivationSizeZ, kInputActivationSizeZ, - kKernelSizeY, kKernelSizeX); + auto arhs = absl::make_unique>(kOutputActivationSizeZ, + kInputActivationSizeZ, + kKernelSizeY, kKernelSizeX); Array2D rhs_raster({ {1.0f, 0.0f}, // row 0 {0.0f, 0.0f}, // row 1 @@ -89,9 +89,9 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest { ASSERT_EQ(2, arhs->height()); XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR4FromArray4D(*alhs); - auto rhs = builder.ConstantR4FromArray4D(*arhs); - builder.Conv(lhs, rhs, {1, 1}, Padding::kValid); + auto lhs = ConstantR4FromArray4D(&builder, *alhs); + auto rhs = ConstantR4FromArray4D(&builder, *arhs); + Conv(lhs, rhs, {1, 1}, Padding::kValid); ComputeAndCompare(&builder, {}, error_spec_); } @@ -109,9 +109,9 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShapeWithType({1, 1, 1, 2}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 1, 1, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); - 
builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D input_data(1, 1, 1, 2); input_data.FillWithYX(Array2D({ @@ -123,8 +123,8 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest { })); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}, + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}, error_spec_); } }; @@ -140,9 +140,9 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShapeWithType({1, 1, 4, 4}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D input_data(1, 1, 4, 4); input_data.FillWithYX(Array2D({ @@ -157,8 +157,8 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest { {7.0f, 8.0f}, })); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}, + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}, error_spec_); } }; @@ -174,9 +174,9 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShapeWithType({1, 1, 4, 4}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = 
builder.Parameter(1, filter_shape, "filter"); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D input_data(1, 1, 4, 4); input_data.FillWithYX(Array2D({ @@ -192,8 +192,8 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest { })); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}, + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}, error_spec_); } }; @@ -210,9 +210,9 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShapeWithType({1, 1, 4, 4}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 1, 3, 3}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D input_data(1, 1, 4, 4); input_data.FillWithYX(Array2D({{1.0f, 2.0f, 3.0f, 4.0f}, @@ -224,8 +224,8 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest { {{5.0f, 6.0f, 7.0f}, {8.0f, 9.0f, 10.0f}, {11.0f, 12.0f, 13.0f}})); // clang-format on ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}, + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}, error_spec_); } }; @@ -238,9 +238,9 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) { { Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); Shape filter_shape = 
ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); - builder.Conv(input, filter, {1}, Padding::kValid); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1}, Padding::kValid); } Array3D input({{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}}}); @@ -249,10 +249,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) { Array3D expected({{{510, 610, 710, 810}}}); auto input_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(input)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input)) .ConsumeValueOrDie(); auto filter_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(filter)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter)) .ConsumeValueOrDie(); ComputeAndCompareR3(&builder, expected, @@ -268,10 +268,10 @@ class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest { { Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Convolution dimensions are bf0_oi0->bo0. 
- builder.ConvGeneralDilated( + ConvGeneralDilated( input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, /*lhs_dilation=*/{1}, /*rhs_dilation=*/{2}, /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); @@ -284,10 +284,10 @@ class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest { Array3D expected({{{570.0f, 670.0f, 770.0f}}}); auto input_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(input)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input)) .ConsumeValueOrDie(); auto filter_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(filter)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter)) .ConsumeValueOrDie(); ComputeAndCompareR3(&builder, expected, @@ -304,10 +304,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) { { Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Convolution dimensions are bf0_oi0->bo0. 
- builder.ConvGeneralDilated( + ConvGeneralDilated( input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, /*lhs_dilation=*/{2}, /*rhs_dilation=*/{1}, /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); @@ -319,10 +319,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) { Array3D expected({{{190, 320, 230, 380, 270, 440, 310, 500}}}); auto input_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(input)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input)) .ConsumeValueOrDie(); auto filter_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(filter)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter)) .ConsumeValueOrDie(); ComputeAndCompareR3(&builder, expected, @@ -335,10 +335,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) { { Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5}); Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Convolution dimensions are bf0_oi0->bo0. 
- builder.ConvGeneralDilated( + ConvGeneralDilated( input, filter, /*window_strides=*/{1}, /*padding=*/{{0, 0}}, /*lhs_dilation=*/{2}, /*rhs_dilation=*/{2}, /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); @@ -350,10 +350,10 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) { Array3D expected({{{510, 0, 610, 0, 710, 0, 810}}}); auto input_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(input)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input)) .ConsumeValueOrDie(); auto filter_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(filter)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter)) .ConsumeValueOrDie(); ComputeAndCompareR3(&builder, expected, @@ -369,10 +369,10 @@ class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { { Shape input_shape = ShapeUtil::MakeShapeWithType({1, 2, 5}); Shape filter_shape = ShapeUtil::MakeShapeWithType({1, 2, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Convolution dimensions are bf0_oi0->bo0. 
- builder.ConvGeneralDilated( + ConvGeneralDilated( input, filter, /*window_strides=*/{1}, /*padding=*/{{2, 2}}, /*lhs_dilation=*/{1}, /*rhs_dilation=*/{1}, /*dimension_numbers=*/builder.CreateDefaultConvDimensionNumbers(1)); @@ -386,10 +386,10 @@ class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { {{{0.0f, 260.0f, 510.0f, 610.0f, 710.0f, 810.0f, 350.0f, 0.0f}}}); auto input_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(input)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(input)) .ConsumeValueOrDie(); auto filter_literal = - client_->TransferToServer(*Literal::CreateR3FromArray3D(filter)) + client_->TransferToServer(*LiteralUtil::CreateR3FromArray3D(filter)) .ConsumeValueOrDie(); ComputeAndCompareR3(&builder, expected, @@ -408,8 +408,8 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { Shape input_shape = ShapeUtil::MakeShape(F32, input_dims); Shape filter_shape = ShapeUtil::MakeShape(F32, filter_dims); { - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Tensorflow dimension numbers for 3D convolution. 
ConvolutionDimensionNumbers dnums; @@ -429,21 +429,20 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { dnums.set_kernel_input_feature_dimension(3); dnums.set_kernel_output_feature_dimension(4); - builder.ConvWithGeneralDimensions(input, filter, {1, 1, 1}, Padding::kValid, - dnums); + ConvWithGeneralDimensions(input, filter, {1, 1, 1}, Padding::kValid, dnums); } std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); iota(input_elems.begin(), input_elems.end(), 1.0f); - auto input_r1 = Literal::CreateR1(input_elems); + auto input_r1 = LiteralUtil::CreateR1(input_elems); auto input_r5 = input_r1->Reshape(input_dims).ConsumeValueOrDie(); std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); iota(filter_elems.begin(), filter_elems.end(), 1.0f); - auto filter_r1 = Literal::CreateR1(filter_elems); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); auto filter_r5 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie(); - auto expected_r1 = Literal::CreateR1( + auto expected_r1 = LiteralUtil::CreateR1( {19554, 19962, 20370, 22110, 22590, 23070, 34890, 35730, 36570, 37446, 38358, 39270, 50226, 51498, 52770, 52782, 54126, 55470}); auto expected_r5 = expected_r1->Reshape({1, 3, 1, 2, 3}).ConsumeValueOrDie(); @@ -466,7 +465,7 @@ void iota_int_init_value(std::vector& values, int init_value) { } template -class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest { +class Convolve2D_1x3x3x5_3x3x5x3_Valid : public ConvolutionTest { public: void RunTest() { XlaBuilder builder(TestName()); @@ -475,8 +474,8 @@ class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest { Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); { - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, 
"filter"); // Tensorflow dimension numbers for 2D convolution. ConvolutionDimensionNumbers dnums; @@ -493,21 +492,20 @@ class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest { dnums.set_kernel_input_feature_dimension(2); dnums.set_kernel_output_feature_dimension(3); - builder.ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, - dnums); + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums); } std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); iota_int_init_value(input_elems, 1); - auto input_r1 = Literal::CreateR1(input_elems); + auto input_r1 = LiteralUtil::CreateR1(input_elems); auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie(); std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); iota_int_init_value(filter_elems, 1); - auto filter_r1 = Literal::CreateR1(filter_elems); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie(); - auto expected_r1 = Literal::CreateR1( + auto expected_r1 = LiteralUtil::CreateR1( {static_cast(92115), static_cast(93150), static_cast(94185)}); auto expected_r4 = expected_r1->Reshape({1, 1, 1, 3}).ConsumeValueOrDie(); @@ -522,8 +520,139 @@ class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest { } }; -TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x5_Valid, TestTypes); -TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x5_Valid, Types) { this->RunTest(); } +TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x3_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x3_Valid, Types) { this->RunTest(); } + +template +class Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 3, 3, 5}; + std::vector filter_dims = {3, 3, 1, 15}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, 
"input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/5); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(16029), static_cast(16218), static_cast(16407), + static_cast(17172), static_cast(17370), static_cast(17568), + static_cast(18369), static_cast(18576), static_cast(18783), + static_cast(19620), static_cast(19836), static_cast(20052), + static_cast(20925), static_cast(21150), static_cast(21375)}); + auto expected_r4 = expected_r1->Reshape({1, 1, 1, 15}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(*input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(*filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, *expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + 
+TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 2, 2, 6}; + std::vector filter_dims = {2, 2, 2, 12}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/3); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(5076), static_cast(5160), static_cast(5244), + static_cast(5328), static_cast(6164), static_cast(6264), + 
static_cast(6364), static_cast(6464), static_cast(7380), + static_cast(7496), static_cast(7612), static_cast(7728)}); + auto expected_r4 = expected_r1->Reshape({1, 1, 1, 12}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(*input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(*filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, *expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) { + this->RunTest(); +} // Test fixture to run convolution tests with and without convolution // canonicalization enabled. @@ -541,8 +670,8 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization, Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29}); Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); ConvolutionDimensionNumbers dnums; dnums.set_input_feature_dimension(0); @@ -551,7 +680,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization, dnums.set_kernel_output_feature_dimension(1); dnums.set_output_batch_dimension(0); dnums.set_output_feature_dimension(1); - builder.ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums); + ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums); Array2D param0(4, 29); param0.FillUnique(); @@ -563,8 +692,8 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization, expected_result.Fill(0); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(param0)), - std::move(*Literal::CreateFromArray(param1))}, + {std::move(*LiteralUtil::CreateFromArray(param0)), + std::move(*LiteralUtil::CreateFromArray(param1))}, error_spec_); } @@ -599,8 
+728,8 @@ class Convolve1D1WindowTestBase Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); { - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); // Tensorflow dimension numbers for 1D convolution. ConvolutionDimensionNumbers dnums; @@ -614,24 +743,23 @@ class Convolve1D1WindowTestBase dnums.set_kernel_input_feature_dimension(1); dnums.set_kernel_output_feature_dimension(2); - builder.ConvWithGeneralDimensions(input, filter, {1}, Padding::kValid, - dnums); + ConvWithGeneralDimensions(input, filter, {1}, Padding::kValid, dnums); } std::vector input_elems(ShapeUtil::ElementsIn(input_shape), static_cast(1.0f)); - auto input_r1 = Literal::CreateR1(input_elems); + auto input_r1 = LiteralUtil::CreateR1(input_elems); auto input_r3 = input_r1->Reshape(input_dims).ConsumeValueOrDie(); std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), static_cast(1.0f)); - auto filter_r1 = Literal::CreateR1(filter_elems); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); auto filter_r3 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie(); std::vector expect_elems(batch * output_feature * num_windows, static_cast(window_size * input_feature)); - auto expected_r1 = Literal::CreateR1(expect_elems); + auto expected_r1 = LiteralUtil::CreateR1(expect_elems); auto expected_r3 = expected_r1->Reshape({batch, num_windows, output_feature}) .ConsumeValueOrDie(); @@ -726,9 +854,9 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2}); Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); 
- builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D input_data(1, 1, 1, 2); input_data.FillWithYX(Array2D({ @@ -740,8 +868,8 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) { })); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}, + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}, error_spec_); } @@ -754,9 +882,9 @@ XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) { XlaBuilder builder(TestName()); Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2}); Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto filter = builder.Parameter(1, filter_shape, "filter"); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D input_data(1, 1, 1, 2); input_data.FillIota(0); @@ -764,8 +892,47 @@ XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) { filter_data.FillIota(10); ComputeAndCompare(&builder, - {std::move(*Literal::CreateFromArray(input_data)), - std::move(*Literal::CreateFromArray(filter_data))}); + {std::move(*LiteralUtil::CreateFromArray(input_data)), + std::move(*LiteralUtil::CreateFromArray(filter_data))}); +} + +class ConvolutionHloTest : public HloTestBase {}; + +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64Forward)) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %arg0 = f64[3,56,56,16] parameter(0) + %arg1 = f64[3,3,3,64] parameter(1) + ROOT %conv = f64[54,54,16,64] convolution(%arg0, %arg1), window={size=3x3}, 
dim_labels=f01b_i01o->01bf +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); +} + +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %arg0 = f64[2,5,8,1] parameter(0) + %arg1 = f64[2,5,8,2] parameter(1) + ROOT %conv = f64[4,4,1,2] convolution(%arg0, %arg1), window={size=5x8 pad=1_2x1_2}, dim_labels=f01b_i01o->01bf +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); +} + +XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardInput)) { + constexpr char kHlo[] = R"( +HloModule TestModule + +ENTRY Test { + %output = f64[4,5,16,16] parameter(0) + %kernel = f64[5,3,7,7] parameter(1) + %reverse = f64[5,3,7,7] reverse(f64[5,3,7,7] %kernel), dimensions={2,3} + ROOT %convolution = f64[4,3,16,16] convolution(%output, %reverse), window={size=7x7 pad=3_3x3_3}, dim_labels=bf01_io01->bf01 +})"; + EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } } // namespace diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index fea850dc135e33fe098aa755c6fdd93319cd2837..6784c16715da72d337edf70fa51db42c59404136 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -27,8 +27,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -55,12 +55,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) { XlaBuilder builder(TestName()); const Array4D input_array(1, 1, 1, 1, {2}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {3}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); const Array4D expected(1, 1, 1, 1, {6}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -70,12 +70,12 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) { XlaBuilder builder(TestName()); const Array4D input_array(5, 1, 1, 1, {1, 2, 3, 4, 5}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {2}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); const Array4D expected(5, 1, 1, 1, {2, 4, 6, 8, 10}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -86,12 +86,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) { Array4D input_array(2, 1, 3, 4); input_array.FillWithMultiples(1); - 
auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {2.3}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(2, 1, 3, 4); expected.FillWithMultiples(2.3); @@ -102,12 +102,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) { XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 1, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 3, 1, 1, {12, 34, 56}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -117,12 +117,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 2, {1, 2}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 1, 1, {12}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -132,12 +132,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = 
ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 1, 2, {12, 23}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -147,12 +147,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 2, 1, {12, 34}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -162,12 +162,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 2, 1, {10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 1, 2, {13, 24}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -177,12 +177,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); 
const Array4D filter_array(1, 1, 2, 2, {1000, 100, 10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 1, 1, {1234}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -194,13 +194,13 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) { Array4D input_array( 2, 2, 2, 3, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, // plane 0 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 0, 0}); // plane 1 - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array( 2, 2, 1, 2, {1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected( 2, 2, 2, 2, @@ -213,12 +213,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {10}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 2}, Padding::kValid); + Conv(input, filter, {1, 2}, Padding::kValid); Array4D expected(1, 1, 1, 2, {10, 30}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -228,12 +228,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = 
ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {10}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 2}, Padding::kValid); + Conv(input, filter, {1, 2}, Padding::kValid); Array4D expected(1, 1, 1, 3, {10, 30, 50}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -243,12 +243,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 3, {100, 10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 2}, Padding::kValid); + Conv(input, filter, {1, 2}, Padding::kValid); Array4D expected(1, 1, 1, 1, {123}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -258,12 +258,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 3, {100, 10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 2}, Padding::kValid); + Conv(input, filter, {1, 2}, Padding::kValid); Array4D expected(1, 1, 1, 2, {123, 345}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -273,12 +273,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto 
input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 1, {10}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {2, 2}, Padding::kValid); + Conv(input, filter, {2, 2}, Padding::kValid); Array4D expected(1, 1, 2, 2, {10, 30, 70, 90}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -288,12 +288,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 1, {1}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 3, {10, 20, 30}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D expected(1, 1, 1, 1, {20}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -303,12 +303,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 5, {10000, 1000, 100, 10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D expected(1, 1, 1, 3, {123, 1230, 12300}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -318,15 +318,15 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = 
ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 3, 3, {10000, 0, 1000, // row 0 0, 100, 0, // row 1 10, 0, 1}); // row 2 - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D expected(1, 1, 2, 2, {104, 230, 2300, 10400}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -336,12 +336,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) { XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 2, {1, 2, 3, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 2, 1, 1, {10, 1}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kSame); + Conv(input, filter, {1, 1}, Padding::kSame); Array4D expected(1, 1, 1, 2, {13, 24}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -351,12 +351,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) { XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 2, 2, {7, 13, 17, 23}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 2, 2, {216, 276, 396, 456}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -366,12 +366,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) { XlaBuilder builder(TestName()); Array4D input_array(1, 
1, 1, 3, {1, 2, 3}); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); const Array4D filter_array(1, 1, 1, 2, {7, 13}); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 1, 1, 2, {33, 53}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -383,15 +383,15 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) { std::vector input_data(64); std::iota(input_data.begin(), input_data.end(), 0.0); Array4D input_array(1, 1, 8, 8, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(128); std::fill(filter_data.begin(), filter_data.begin() + 64, 1.0); std::fill(filter_data.begin() + 64, filter_data.begin() + 128, 2.0); const Array4D filter_array(2, 1, 8, 8, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 2, 1, 1, {2016, 4032}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -403,14 +403,14 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) { std::vector input_data(16 * 1 * 1 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); Array4D input_array(16, 1, 1, 1, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * 1 * 1); std::iota(filter_data.begin(), filter_data.end(), 1.0); const Array4D filter_array(1, 1, 1, 1, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = 
ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); std::vector expected_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; @@ -432,14 +432,14 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) { } } } - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * ky * kx); std::iota(filter_data.begin(), filter_data.end(), 1.0); const Array4D filter_array(1, 1, ky, kx, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); std::vector expected_data(bs); for (int i = 0; i < bs; ++i) { @@ -463,14 +463,14 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) { } } } - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * ky * kx); std::iota(filter_data.begin(), filter_data.end(), 1.0); const Array4D filter_array(1, 1, ky, kx, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); std::vector expected_data = { 23, @@ -492,14 +492,14 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) { } } } - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * 8 * 8); std::iota(filter_data.begin(), filter_data.end(), 1.0); const Array4D filter_array(1, 1, 8, 8, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, 
filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); std::vector expected_data = { 19664, 21744, 23824, 25904, 27984, 30064, 32144, 34224, @@ -515,7 +515,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { std::vector input_data(2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); Array4D input_array(1, 2, 8, 8, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(2 * 2 * 8 * 8); std::fill(filter_data.begin(), filter_data.begin() + filter_data.size() / 4, @@ -527,9 +527,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { std::fill(filter_data.begin() + 3 * filter_data.size() / 4, filter_data.end(), 4.0); const Array4D filter_array(2, 2, 8, 8, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(1, 2, 1, 1, {14240, 30496}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -541,7 +541,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { std::vector input_data(2 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); Array4D input_array(2, 2, 8, 8, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(2 * 2 * 8 * 8); std::fill(filter_data.begin(), filter_data.begin() + filter_data.size() / 4, @@ -553,9 +553,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { std::fill(filter_data.begin() + 3 * filter_data.size() / 4, filter_data.end(), 4.0); const Array4D filter_array(2, 2, 8, 8, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = 
ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(2, 2, 1, 1, {14240, 30496, 38816, 87840}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); @@ -567,7 +567,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { std::vector input_data(32 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); Array4D input_array(32, 2, 8, 8, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(2 * 2 * 8 * 8); std::fill(filter_data.begin(), filter_data.begin() + filter_data.size() / 4, @@ -579,9 +579,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { std::fill(filter_data.begin() + 3 * filter_data.size() / 4, filter_data.end(), 4.0); const Array4D filter_array(2, 2, 8, 8, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + Conv(input, filter, {1, 1}, Padding::kValid); std::vector expected_data = { 14240, 30496, 38816, 87840, 63392, 145184, 87968, @@ -613,9 +613,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) { } } - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); Array4D expected(16, 16, 1, 1); for (int i0 = 0; i0 < 16; ++i0) { @@ -635,9 +635,9 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) { Array4D input_array(1, 1, 4, 6, input_data); Array4D filter_array(1, 1, 2, 3, {1, 10, 100, 2, 20, 200}); - auto input = 
builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneralDilated( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -654,9 +654,9 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneralDilated( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -677,9 +677,9 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { 200, 20, 2, // 300, 30, 3, // 400, 40, 4}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneralDilated( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1}, /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2}, /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -699,9 +699,9 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneral( + auto 
input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, -1}}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -718,9 +718,9 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneral( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, 2}}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -737,9 +737,9 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneral( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {2, -1}}, XlaBuilder::CreateDefaultConvDimensionNumbers()); @@ -756,9 +756,9 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneralDilated( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, 
/*window_strides=*/{}, /*padding=*/{{0, 0}, {3, 2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, @@ -781,9 +781,9 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) { Array4D input_array(1, 1, 1, 5, input_data); Array4D filter_array(1, 1, 1, 2, {10, 1}); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.ConvGeneralDilated( + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-3, -2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, @@ -821,9 +821,9 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) { Array4D filter_array(oz, iz, ky, kx, kernel_data); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); std::unique_ptr> expected = ReferenceUtil::ConvArray4D( input_array, filter_array, {1, 1}, Padding::kValid); @@ -854,9 +854,9 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) { Array4D filter_array(oz, iz, ky, kx, kernel_data); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); std::unique_ptr> expected = ReferenceUtil::ConvArray4D( input_array, filter_array, {1, 1}, Padding::kValid); @@ -887,9 
+887,9 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) { Array4D filter_array(oz, iz, ky, kx, kernel_data); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); std::unique_ptr> expected = ReferenceUtil::ConvArray4D( input_array, filter_array, {1, 1}, Padding::kValid); @@ -920,9 +920,9 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) { Array4D filter_array(oz, iz, ky, kx, kernel_data); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); std::unique_ptr> expected = ReferenceUtil::ConvArray4D( input_array, filter_array, {1, 1}, Padding::kValid); @@ -954,9 +954,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Array4D filter_array(oz, iz, ky, kx, kernel_data); XlaBuilder builder(TestName()); - auto input = builder.ConstantR4FromArray4D(input_array); - auto filter = builder.ConstantR4FromArray4D(filter_array); - builder.Conv(input, filter, {1, 1}, Padding::kValid); + auto input = ConstantR4FromArray4D(&builder, input_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); + Conv(input, filter, {1, 1}, Padding::kValid); std::unique_ptr> expected = ReferenceUtil::ConvArray4D( input_array, filter_array, {1, 1}, Padding::kValid); @@ -970,12 +970,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { std::vector input_data(1 * 2 * 
3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); Array4D input_array(1, 2, 3, 1, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 2 * 1 * 1); std::iota(filter_data.begin(), filter_data.end(), 1.0); Array4D filter_array(1, 2, 1, 1, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); ConvolutionDimensionNumbers dnums; // NHWC input format. @@ -995,7 +995,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { dnums.set_kernel_output_feature_dimension(3); // Tests padding sizes that don't correspond either to SAME or VALID padding. - builder.ConvGeneral(input, filter, {1, 1}, {{2, 1}, {2, 3}}, dnums); + ConvGeneral(input, filter, {1, 1}, {{2, 1}, {2, 3}}, dnums); std::vector expected_data = { 0, 0, 0, 0, 0, 0, 0, // @@ -1014,12 +1014,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); Array4D input_array(1, 2, 3, 1, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * 1 * 1); std::iota(filter_data.begin(), filter_data.end(), 2.0); Array4D filter_array(1, 1, 1, 1, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); ConvolutionDimensionNumbers dnums; // NHWC input format. @@ -1039,7 +1039,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { dnums.set_kernel_output_feature_dimension(3); // Tests padding sizes that don't correspond either to SAME or VALID padding. 
- builder.ConvGeneral(input, filter, {1, 1}, {{2, 1}, {2, 3}}, dnums); + ConvGeneral(input, filter, {1, 1}, {{2, 1}, {2, 3}}, dnums); std::vector expected_data = { 0, 0, 0, 0, 0, 0, 0, 0, // @@ -1058,12 +1058,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); Array4D input_array(1, 2, 3, 1, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * 1 * 1); std::iota(filter_data.begin(), filter_data.end(), 2.0); Array4D filter_array(1, 1, 1, 1, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); ConvolutionDimensionNumbers dnums; // NHWC input format. @@ -1083,7 +1083,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { dnums.set_kernel_output_feature_dimension(3); // Tests zero padding sizes. This can use matmul for computation. - builder.ConvGeneral(input, filter, {1, 1}, {{0, 0}, {0, 0}}, dnums); + ConvGeneral(input, filter, {1, 1}, {{0, 0}, {0, 0}}, dnums); std::vector expected_data = { 2, 4, 6, // @@ -1099,12 +1099,12 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { std::vector input_data(1 * 2 * 3 * 2); std::iota(input_data.begin(), input_data.end(), 1.0); Array4D input_array(1, 2, 3, 2, input_data); - auto input = builder.ConstantR4FromArray4D(input_array); + auto input = ConstantR4FromArray4D(&builder, input_array); std::vector filter_data(1 * 1 * 2 * 3); std::iota(filter_data.begin(), filter_data.end(), 2.0); Array4D filter_array(1, 1, 2, 3, filter_data); - auto filter = builder.ConstantR4FromArray4D(filter_array); + auto filter = ConstantR4FromArray4D(&builder, filter_array); ConvolutionDimensionNumbers dnums; // NHWC input format. 
@@ -1124,7 +1124,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { dnums.set_kernel_output_feature_dimension(3); // Tests zero padding sizes. This can use matmul for computation. - builder.ConvGeneral(input, filter, {1, 1}, {{0, 0}, {0, 0}}, dnums); + ConvGeneral(input, filter, {1, 1}, {{0, 0}, {0, 0}}, dnums); std::vector expected_data = { 12, 15, 18, // @@ -1148,14 +1148,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding) { XlaBuilder builder(TestName()); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); - auto weights = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 2, /*values=*/{5, 6})); - auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvWithGeneralPadding(gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {1, 0}}); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); + auto weights = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 2, /*values=*/{5, 6})); + auto mirrored_weights = Rev(weights, {2, 3}); + ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {1, 0}}); ComputeAndCompareR4(&builder, {{{{5, 16, 27}}}}, {}, error_spec_); } @@ -1167,16 +1167,16 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingGreaterThanHighPadding) { XlaBuilder builder(TestName()); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 1, /*values=*/{1})); - auto weights = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); - auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvGeneralDilated(gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {0, 3}}, - /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, - XlaBuilder::CreateDefaultConvDimensionNumbers()); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 1, 
/*values=*/{1})); + auto weights = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); + auto mirrored_weights = Rev(weights, {2, 3}); + ConvGeneralDilated(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {0, 3}}, + /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); ComputeAndCompareR4(&builder, {{{{100, 0}}}}, {}, error_spec_); } @@ -1187,14 +1187,14 @@ XLA_TEST_F(ConvolutionVariantsTest, XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { XlaBuilder builder(TestName()); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 1, /*values=*/{1})); - auto weights = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); - auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvWithGeneralPadding(gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {1, 1}}); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 1, /*values=*/{1})); + auto weights = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); + auto mirrored_weights = Rev(weights, {2, 3}); + ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {1, 1}}); ComputeAndCompareR4(&builder, {{{{10}}}}, {}, error_spec_); } @@ -1208,14 +1208,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) { XlaBuilder builder(TestName()); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); - auto weights = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 2, /*values=*/{1, 10})); - auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvWithGeneralPadding(gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {0, 2}}); + auto gradients = ConstantR4FromArray4D( + &builder, 
Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); + auto weights = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 2, /*values=*/{1, 10})); + auto mirrored_weights = Rev(weights, {2, 3}); + ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {0, 2}}); ComputeAndCompareR4(&builder, {{{{12, 23, 30, 0}}}}, {}, error_spec_); } @@ -1229,17 +1229,17 @@ XLA_TEST_F(ConvolutionVariantsTest, // weight gradients: 24,130,240 // // This pattern will be fused to backward convolution with padding=(1,2). - auto activations = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {1, 2}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - XlaBuilder::CreateDefaultConvDimensionNumbers()); - builder.Transpose(forward_conv, {0, 1, 2, 3}); + auto activations = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); + auto forward_conv = + ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {1, 2}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); + Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{24, 130, 240}}}}, {}, error_spec_); } @@ -1255,17 +1255,17 @@ XLA_TEST_F(ConvolutionVariantsTest, // This pattern will be fused to backward convolution with padding=(2,1). // Note: both (2,1) and (2,0) are valid padding for the backward convolution // because the stride is 2. 
- auto activations = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {2, 0}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - XlaBuilder::CreateDefaultConvDimensionNumbers()); - builder.Transpose(forward_conv, {0, 1, 2, 3}); + auto activations = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); + auto forward_conv = + ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {2, 0}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); + Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24}}}}, {}, error_spec_); } @@ -1282,17 +1282,17 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { // because the stride is 2. ConvolutionFolding prefers (2,2) because cuDNN // supports even padding only -- using (2,1) would need extra effort of // canonicalization. 
- auto activations = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); - auto gradients = builder.ConstantR4FromArray4D( - Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {2, 1}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - XlaBuilder::CreateDefaultConvDimensionNumbers()); - builder.Transpose(forward_conv, {0, 1, 2, 3}); + auto activations = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 4, /*values=*/{1, 2, 3, 4})); + auto gradients = ConstantR4FromArray4D( + &builder, Array4D(1, 1, 1, 3, /*values=*/{100, 10, 1})); + auto forward_conv = + ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); + Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24, 130}}}}, {}, error_spec_); } @@ -1300,14 +1300,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { XlaBuilder builder(TestName()); - auto gradients = builder.ConstantR3FromArray3D( - Array3D(1, 1, 1, /*value=*/1)); + auto gradients = ConstantR3FromArray3D( + &builder, Array3D(1, 1, 1, /*value=*/1)); auto weights = - builder.ConstantR3FromArray3D(Array3D({{{1, 10, 100}}})); - auto mirrored_weights = builder.Rev(weights, {2}); - builder.ConvWithGeneralPadding(gradients, mirrored_weights, - /*window_strides=*/{1}, - /*padding=*/{{1, 1}}); + ConstantR3FromArray3D(&builder, Array3D({{{1, 10, 100}}})); + auto mirrored_weights = Rev(weights, {2}); + ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1}, + /*padding=*/{{1, 1}}); ComputeAndCompareR3(&builder, {{{10}}}, {}, error_spec_); } @@ -1315,17 +1315,17 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { 
XlaBuilder builder(TestName()); auto activations = - builder.ConstantR3FromArray3D(Array3D({{{1, 2, 3, 4}}})); + ConstantR3FromArray3D(&builder, Array3D({{{1, 2, 3, 4}}})); auto gradients = - builder.ConstantR3FromArray3D(Array3D({{{100, 10, 1}}})); + ConstantR3FromArray3D(&builder, Array3D({{{100, 10, 1}}})); auto forward_conv = - builder.ConvGeneralDilated(activations, gradients, - /*window_strides=*/{1}, - /*padding=*/{{2, 1}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, - XlaBuilder::CreateDefaultConvDimensionNumbers( - /*num_spatial_dims=*/1)); - builder.Transpose(forward_conv, {0, 1, 2}); + ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1}, + /*padding=*/{{2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, + XlaBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/1)); + Transpose(forward_conv, {0, 1, 2}); ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); } @@ -1333,52 +1333,52 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { XlaBuilder builder(TestName()); - auto gradients_flat = Literal::CreateR1({1}); + auto gradients_flat = LiteralUtil::CreateR1({1}); auto gradients_literal = gradients_flat->Reshape({1, 1, 1, 1, 1}).ConsumeValueOrDie(); - auto gradients = builder.ConstantLiteral(*gradients_literal); + auto gradients = ConstantLiteral(&builder, *gradients_literal); - auto weights_flat = Literal::CreateR1({1, 10, 100}); + auto weights_flat = LiteralUtil::CreateR1({1, 10, 100}); auto weights_literal = weights_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie(); - auto weights = builder.ConstantLiteral(*weights_literal); + auto weights = ConstantLiteral(&builder, *weights_literal); - auto expected_flat = Literal::CreateR1({10}); + auto expected_flat = LiteralUtil::CreateR1({10}); auto expected_literal = expected_flat->Reshape({1, 1, 1, 1, 1}).ConsumeValueOrDie(); - auto mirrored_weights = builder.Rev(weights, {2, 3, 4}); - 
builder.ConvWithGeneralPadding(gradients, mirrored_weights, - /*window_strides=*/{1, 1, 1}, - /*padding=*/{{0, 0}, {0, 0}, {1, 1}}); + auto mirrored_weights = Rev(weights, {2, 3, 4}); + ConvWithGeneralPadding(gradients, mirrored_weights, + /*window_strides=*/{1, 1, 1}, + /*padding=*/{{0, 0}, {0, 0}, {1, 1}}); ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { XlaBuilder builder(TestName()); - auto activations_flat = Literal::CreateR1({1, 2, 3, 4}); + auto activations_flat = LiteralUtil::CreateR1({1, 2, 3, 4}); auto activations_literal = activations_flat->Reshape({1, 1, 1, 1, 4}).ConsumeValueOrDie(); - auto activations = builder.ConstantLiteral(*activations_literal); + auto activations = ConstantLiteral(&builder, *activations_literal); - auto gradients_flat = Literal::CreateR1({100, 10, 1}); + auto gradients_flat = LiteralUtil::CreateR1({100, 10, 1}); auto gradients_literal = gradients_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie(); - auto gradients = builder.ConstantLiteral(*gradients_literal); + auto gradients = ConstantLiteral(&builder, *gradients_literal); - auto expected_flat = Literal::CreateR1({13, 24, 130}); + auto expected_flat = LiteralUtil::CreateR1({13, 24, 130}); auto expected_literal = expected_flat->Reshape({1, 1, 1, 1, 3}).ConsumeValueOrDie(); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1, 1, 1}, - /*padding=*/{{0, 0}, {0, 0}, {2, 1}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2}, - XlaBuilder::CreateDefaultConvDimensionNumbers( - /*num_spatial_dims=*/3)); - builder.Transpose(forward_conv, {0, 1, 2, 3, 4}); + auto forward_conv = + ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1, 1, 1}, + /*padding=*/{{0, 0}, {0, 0}, {2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2}, + XlaBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/3)); + Transpose(forward_conv, {0, 
1, 2, 3, 4}); ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); } diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc index 2b3390ca98cb2922410d451c06811aa9d4ff8c0b..526626c1ddd902a4ba6c608f2b9355cece9ec833 100644 --- a/tensorflow/compiler/xla/tests/copy_test.cc +++ b/tensorflow/compiler/xla/tests/copy_test.cc @@ -16,10 +16,10 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -54,41 +54,42 @@ class CopyOpTest : public HloTestBase { void TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3); void TestCopyConstantLayoutR4(size_t n1, size_t n2, size_t n3, size_t n4, - tensorflow::gtl::ArraySlice permutation); + absl::Span permutation); }; XLA_TEST_F(CopyOpTest, CopyR0Bool) { - TestCopyOp(*Literal::CreateR0(true)); + TestCopyOp(*LiteralUtil::CreateR0(true)); } XLA_TEST_F(CopyOpTest, CopyR1S0U32) { - TestCopyOp(*Literal::CreateR1({})); + TestCopyOp(*LiteralUtil::CreateR1({})); } XLA_TEST_F(CopyOpTest, CopyR1S3U32) { - TestCopyOp(*Literal::CreateR1({1, 2, 3})); + TestCopyOp(*LiteralUtil::CreateR1({1, 2, 3})); } XLA_TEST_F(CopyOpTest, CopyR3F32_2x2x3) { - TestCopyOp(*Literal::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, - {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); + TestCopyOp( + *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); } XLA_TEST_F(CopyOpTest, CopyR4S32_2x2x3x2) { - TestCopyOp(*Literal::CreateR4( 
+ TestCopyOp(*LiteralUtil::CreateR4( {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}}, {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}})); } XLA_TEST_F(CopyOpTest, CopyR4S32_0x2x3x2) { - TestCopyOp(*Literal::CreateR4FromArray4D(Array4D(0, 2, 3, 2))); + TestCopyOp(*LiteralUtil::CreateR4FromArray4D(Array4D(0, 2, 3, 2))); } XLA_TEST_F(CopyOpTest, CopyParameterScalar) { auto builder = HloComputation::Builder(TestName()); // Copy literal to device to use as parameter. - auto literal = Literal::CreateR0(42.0); + auto literal = LiteralUtil::CreateR0(42.0); Shape shape = literal->shape(); auto param0 = builder.AddInstruction( @@ -109,7 +110,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) { XLA_TEST_F(CopyOpTest, CopyConstantR2Twice) { auto builder = HloComputation::Builder(TestName()); - auto literal = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto literal = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); @@ -131,7 +132,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) { HloComputation::Builder builder(TestName()); std::unique_ptr literal = - Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); // Reverse the minor-to-major order of the literal. 
Layout* literal_layout = literal->mutable_shape_do_not_use()->mutable_layout(); @@ -168,7 +169,7 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) { HloComputation::Builder builder(TestName()); - std::unique_ptr literal = Literal::CreateR3FromArray3D(a); + std::unique_ptr literal = LiteralUtil::CreateR3FromArray3D(a); HloInstruction* constant = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); @@ -186,9 +187,9 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) { LiteralTestUtil::ExpectR3EqualArray3D(a, *result); } -void CopyOpTest::TestCopyConstantLayoutR4( - size_t n1, size_t n2, size_t n3, size_t n4, - tensorflow::gtl::ArraySlice permutation) { +void CopyOpTest::TestCopyConstantLayoutR4(size_t n1, size_t n2, size_t n3, + size_t n4, + absl::Span permutation) { Array4D a(n1, n2, n3, n4); for (size_t i = 0; i < n1; ++i) { for (size_t j = 0; j < n2; ++j) { @@ -202,7 +203,7 @@ void CopyOpTest::TestCopyConstantLayoutR4( HloComputation::Builder builder(TestName()); - std::unique_ptr literal = Literal::CreateR4FromArray4D(a); + std::unique_ptr literal = LiteralUtil::CreateR4FromArray4D(a); HloInstruction* constant = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); @@ -248,7 +249,7 @@ XLA_TEST_F(CopyOpClientTest, Copy0x0) { auto empty = Literal::CreateFromShape(in_shape); XlaBuilder builder(TestName()); - auto param0 = builder.Parameter(0, in_shape, "input"); + Parameter(&builder, 0, in_shape, "input"); auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie(); auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape) diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc index b15988776513a60c9e5c85d4780912106db98e75..d12a4e7fcd7813775a81677bcaa07af60ff9b477 100644 --- a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc +++ 
b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -32,28 +32,44 @@ class TrivialCrossReplicaSumTest : public HloTestBase {}; XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) { const char* module_str = R"( HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + ENTRY test_computation { p = f32[3] parameter(0) - ROOT crs = f32[3] cross-replica-sum(p) + ROOT crs = f32[3] cross-replica-sum(p), to_apply=add })"; - auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie(); - auto literal = Literal::CreateR1({1, 2, 3}); + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal = LiteralUtil::CreateR1({1, 2, 3}); EXPECT_EQ(*literal, *ExecuteAndTransfer(std::move(module), {literal.get()})); } XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) { const char* module_str = R"( HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + ENTRY test_computation { p0 = f32[3] parameter(0) p1 = f32[2] parameter(1) - ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1) + ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add })"; - auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie(); - auto 
literal0 = Literal::CreateR1({1, 2, 3}); - auto literal1 = Literal::CreateR1({10, 20}); + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal0 = LiteralUtil::CreateR1({1, 2, 3}); + auto literal1 = LiteralUtil::CreateR1({10, 20}); EXPECT_EQ( - *Literal::MakeTuple({literal0.get(), literal1.get()}), + *LiteralUtil::MakeTuple({literal0.get(), literal1.get()}), *ExecuteAndTransfer(std::move(module), {literal0.get(), literal1.get()})); } @@ -63,15 +79,23 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) { XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) { const char* module_str = R"( HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + ENTRY test_computation { p0 = f32[3] parameter(0) p1 = f32[2] constant({10, 20}) - ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1) + ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add })"; - auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie(); - auto literal0 = Literal::CreateR1({1, 2, 3}); - auto literal1 = Literal::CreateR1({10, 20}); - EXPECT_EQ(*Literal::MakeTuple({literal0.get(), literal1.get()}), + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal0 = LiteralUtil::CreateR1({1, 2, 3}); + auto literal1 = LiteralUtil::CreateR1({10, 20}); + EXPECT_EQ(*LiteralUtil::MakeTuple({literal0.get(), literal1.get()}), *ExecuteAndTransfer(std::move(module), {literal0.get()})); } diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc index b43d5c9ff5d75ee0e1b3c9ceb2bc295e631ac107..6f7fc0e6e52a69387a4c491871b6fcd97ac638b6 100644 --- a/tensorflow/compiler/xla/tests/custom_call_test.cc +++ b/tensorflow/compiler/xla/tests/custom_call_test.cc @@ -16,8 +16,9 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -73,7 +74,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); builder.AddInstruction( HloInstruction::CreateCustomCall(r0f32_, {constant}, "R0F32Add2")); @@ -94,7 +95,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) { array(1, 1) = 4.0f; auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2FromArray2D(array))); + HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D(array))); builder.AddInstruction( HloInstruction::CreateCustomCall(r0f32_, {constant}, "R2F32ReduceSum")); @@ -110,7 +111,7 @@ XLA_TEST_F(CustomCallTest, auto b = HloComputation::Builder(TestName()); auto input = b.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2FromArray2D( + HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D( Array2D{{1.0f, 2.0f}, {3.0f, 4.0f}}))); auto incremented = b.AddInstruction(HloInstruction::CreateCustomCall( ShapeUtil::MakeShape(F32, {1, 2, 2}), {input}, "Add1ToValues")); @@ -135,8 +136,8 @@ class CustomCallClientAPITest : public ClientLibraryTestBase {}; // are reserved for internal use. 
XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) { XlaBuilder builder(TestName()); - builder.CustomCall("$illegal", /*operands=*/{}, - ShapeUtil::MakeShape(F32, {1})); + CustomCall(&builder, "$illegal", /*operands=*/{}, + ShapeUtil::MakeShape(F32, {1})); StatusOr> result = Execute(&builder, /*arguments=*/{}); diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc index bfe688e20d182d581c3e3b545ac2289413deef7c..86fd1ceb1368feedb14088fa7045224440f6c4f9 100644 --- a/tensorflow/compiler/xla/tests/deallocation_test.cc +++ b/tensorflow/compiler/xla/tests/deallocation_test.cc @@ -15,16 +15,16 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/gtl/array_slice.h" namespace xla { namespace { @@ -36,7 +36,7 @@ class DeallocationTest : public ClientLibraryTestBase { // Build and execute the given computation then verify the results can be // transferred from the device successfully. 
std::unique_ptr ExecuteAndCheckTransfer( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span arguments) { XlaComputation computation = builder->Build().ConsumeValueOrDie(); auto global_data = client_->Execute(computation, arguments, &execution_options_) @@ -48,7 +48,7 @@ class DeallocationTest : public ClientLibraryTestBase { TEST_F(DeallocationTest, DeallocateScalar) { XlaBuilder builder(TestName()); - builder.ConstantR0(42.0); + ConstantR0(&builder, 42.0); auto global_data = ExecuteAndCheckTransfer(&builder, {}); // A result can be transferred an arbitrary number of times. Add an extra @@ -66,7 +66,7 @@ TEST_F(DeallocationTest, DeallocateScalar) { TEST_F(DeallocationTest, DeallocateVector) { XlaBuilder builder(TestName()); - builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); + ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); ASSERT_IS_OK(client_->Unregister(*global_data)); @@ -79,7 +79,7 @@ TEST_F(DeallocationTest, DeallocateVector) { TEST_F(DeallocationTest, DeallocateEmptyVector) { XlaBuilder builder(TestName()); - builder.ConstantR1({}); + ConstantR1(&builder, {}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); ASSERT_IS_OK(client_->Unregister(*global_data)); @@ -92,8 +92,8 @@ TEST_F(DeallocationTest, DeallocateEmptyVector) { XLA_TEST_F(DeallocationTest, DeallocateTuple) { XlaBuilder builder(TestName()); - builder.Tuple({builder.ConstantR0(42.0), - builder.ConstantR1({1.0, 2.0, 3.0})}); + Tuple(&builder, {ConstantR0(&builder, 42.0), + ConstantR1(&builder, {1.0, 2.0, 3.0})}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); ASSERT_IS_OK(client_->Unregister(*global_data)); @@ -106,9 +106,10 @@ XLA_TEST_F(DeallocationTest, DeallocateTuple) { XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { XlaBuilder builder(TestName()); - auto element = builder.ConstantR0(42.0); - auto inner_tuple = builder.Tuple({builder.ConstantR0(42.0), element}); - 
builder.Tuple({element, inner_tuple, element}); + auto element = ConstantR0(&builder, 42.0); + auto inner_tuple = + Tuple(&builder, {ConstantR0(&builder, 42.0), element}); + Tuple(&builder, {element, inner_tuple, element}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); ASSERT_IS_OK(client_->Unregister(*global_data)); @@ -122,9 +123,9 @@ XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) { XlaBuilder builder(TestName()); auto inner_tuple = - builder.Tuple({builder.ConstantR0(42.0), - builder.ConstantR1({1.0, 2.0, 3.0})}); - builder.Tuple({inner_tuple, builder.ConstantR1({0.123, 0.456})}); + Tuple(&builder, {ConstantR0(&builder, 42.0), + ConstantR1(&builder, {1.0, 2.0, 3.0})}); + Tuple(&builder, {inner_tuple, ConstantR1(&builder, {0.123, 0.456})}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); ASSERT_IS_OK(client_->Unregister(*global_data)); diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc index 12789fe66530fe03eb33316eda652336f29971ab..eb15fc0593adf2d1bd84da4d0f708b6244f0fb33 100644 --- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc +++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc @@ -16,11 +16,12 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -42,7 +42,7 @@ class DeconstructTupleTest : public ClientLibraryTestBase { // Build and execute the given computation then verify the results can be // transferred from the device successfully. 
std::unique_ptr ExecuteAndCheckTransfer( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, absl::Span arguments) { XlaComputation computation = builder->Build().ConsumeValueOrDie(); auto global_data = client_->Execute(computation, arguments, &execution_options_) @@ -54,9 +54,9 @@ class DeconstructTupleTest : public ClientLibraryTestBase { TEST_F(DeconstructTupleTest, DeconstructTuple) { XlaBuilder builder(TestName()); - auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); - builder.Tuple({const1, const2}); + auto const1 = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); + auto const2 = ConstantR1(&builder, {2.0, 4.0, 6.0, 8.0}); + Tuple(&builder, {const1, const2}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); auto result_status = client_->DeconstructTuple(*global_data); @@ -73,9 +73,9 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) { TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { XlaBuilder builder(TestName()); - auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); - builder.Tuple({const1, const2}); + auto const1 = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); + auto const2 = ConstantR1(&builder, {2.0, 4.0, 6.0, 8.0}); + Tuple(&builder, {const1, const2}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); auto result_status1 = client_->DeconstructTuple(*global_data); @@ -103,9 +103,9 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { XlaBuilder builder(TestName()); - auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); - builder.Tuple({const1, const2, const2, const1}); + auto const1 = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); + auto const2 = ConstantR1(&builder, {2.0, 4.0, 6.0, 8.0}); + Tuple(&builder, {const1, const2, const2, const1}); auto global_data 
= ExecuteAndCheckTransfer(&builder, {}); auto result_status = client_->DeconstructTuple(*global_data); @@ -129,9 +129,9 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { XlaBuilder builder(TestName()); - auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); - builder.Tuple({const1, const2, const1}); + auto const1 = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); + auto const2 = ConstantR1(&builder, {2.0, 4.0, 6.0, 8.0}); + Tuple(&builder, {const1, const2, const1}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); auto result_status = client_->DeconstructTuple(*global_data); @@ -159,7 +159,7 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { TEST_F(DeconstructTupleTest, DeconstructNonTuple) { XlaBuilder builder(TestName()); - builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); + ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); auto result_status = client_->DeconstructTuple(*global_data); @@ -171,11 +171,11 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) { XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({3.14f, -100.25f}); + LiteralUtil::CreateR1({3.14f, -100.25f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2}), "param0"); - builder.Tuple({p}); + auto p = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "param0"); + Tuple(&builder, {p}); auto global_data = ExecuteAndCheckTransfer(&builder, {param0_data.get()}); auto result_status = client_->DeconstructTuple(*global_data); @@ -186,9 +186,9 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) { XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) { XlaBuilder builder(TestName()); - 
auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); - builder.Tuple({builder.Tuple({const1, const2}), const1}); + auto const1 = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0}); + auto const2 = ConstantR1(&builder, {2.0, 4.0, 6.0, 8.0}); + Tuple(&builder, {Tuple(&builder, {const1, const2}), const1}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); auto result_status = client_->DeconstructTuple(*global_data); diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc index 085a5105aca1c173a7cbc211aebbeb5b254b0753..3f3e8ab712fea14be9e4a7015effdf8ce518309b 100644 --- a/tensorflow/compiler/xla/tests/deep_graph_test.cc +++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" namespace xla { @@ -30,7 +30,7 @@ TEST_F(ClientLibraryTestBase, DeepGraph) { auto y_data = CreateR0Parameter(1, 1, "y", &b, &y); XlaOp z = x; for (int i = 0; i < kDepth; ++i) { - z = b.Add(z, y); + z = Add(z, y); } ComputeAndCompareR0(&b, /*expected=*/kDepth + 3, {x_data.get(), y_data.get()}); diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 0fd846cef8095a857dd7b2c12d8afdf409e2bd66..5873516442fa63de47360acaa353abb3a97fe881 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -16,10 +16,11 @@ limitations under the License. 
#include #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -67,15 +68,16 @@ XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) { XlaOp param; auto param_data = CreateParameterAndTransferLiteral( 0, - *Literal::MakeTuple({Literal::CreateR2({{1, 2}, {3, 4}}).get(), - Literal::CreateR2({{5, 6}, {7, 8}}).get()}), + *LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1, 2}, {3, 4}}).get(), + LiteralUtil::CreateR2({{5, 6}, {7, 8}}).get()}), "arg0", &builder, ¶m); - auto lhs = builder.GetTupleElement(param, 0); - auto rhs = builder.GetTupleElement(param, 1); - builder.Dot(lhs, rhs); + auto lhs = GetTupleElement(param, 0); + auto rhs = GetTupleElement(param, 1); + Dot(lhs, rhs); ComputeAndCompareLiteral(&builder, - *Literal::CreateR2({{19, 22}, {43, 50}}), + *LiteralUtil::CreateR2({{19, 22}, {43, 50}}), {param_data.get()}); } @@ -87,9 +89,9 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, ZeroElementVectorDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR1({}); - auto rhs = builder.ConstantR1({}); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR1(&builder, {}); + auto rhs = ConstantR1(&builder, {}); + Dot(lhs, rhs); this->template ComputeAndCompareR0(&builder, static_cast(0.0), {}, this->error_spec_); @@ -102,20 +104,20 @@ TYPED_TEST_CASE(DotOperationTest_F16F32F64, TypesF16F32F64); XLA_TYPED_TEST(DotOperationTest_F16F32F64, TrivialMatrixVectorDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR2FromArray2D({{3.0f, 4.0f}}); - auto rhs = 
builder.ConstantFromArray({3.0f, 4.0f}); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, {{3.0f, 4.0f}}); + auto rhs = ConstantFromArray(&builder, {3.0f, 4.0f}); + Dot(lhs, rhs); this->template ComputeAndCompareR1(&builder, {static_cast(25.0f)}, {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, OneElementVectorDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR1({static_cast(2.0f)}); - auto rhs = builder.ConstantR1({static_cast(3.0f)}); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR1(&builder, {static_cast(2.0f)}); + auto rhs = ConstantR1(&builder, {static_cast(3.0f)}); + Dot(lhs, rhs); this->template ComputeAndCompareR0(&builder, static_cast(6.0f), {}, this->error_spec_); @@ -124,9 +126,9 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, VectorDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantFromArray({1.0f, 2.5f, 42.0f}); - auto rhs = builder.ConstantFromArray({11.0f, -1.0f, 0.5f}); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantFromArray(&builder, {1.0f, 2.5f, 42.0f}); + auto rhs = ConstantFromArray(&builder, {11.0f, -1.0f, 0.5f}); + Dot(lhs, rhs); this->template ComputeAndCompareR0(&builder, static_cast(29.5f), {}, this->error_spec_); @@ -136,69 +138,69 @@ std::vector MinorToMajorForIsRowMajor(bool row_major) { return {row_major ? 1 : 0, row_major ? 
0 : 1}; } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_0x2_2x0) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 2)); - auto rhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 2)); + auto rhs = ConstantR2FromArray2D(&builder, Array2D(2, 0)); + Dot(lhs, rhs); this->template ComputeAndCompareR2(&builder, Array2D(0, 0), {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_0x2_2x3) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 2)); - auto rhs = builder.ConstantR2FromArray2D( - {{7.0f, 8.0f, 9.0f}, {42.0f, 77.0f, 101.0f}}); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 2)); + auto rhs = ConstantR2FromArray2D( + &builder, {{7.0f, 8.0f, 9.0f}, {42.0f, 77.0f, 101.0f}}); + Dot(lhs, rhs); this->template ComputeAndCompareR2(&builder, Array2D(0, 3), {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_3x2_2x0) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto lhs = builder.ConstantR2FromArray2D( - {{7.0f, 8.0f}, {9.0f, 42.0f}, {77.0f, 101.0f}}); - auto rhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR2FromArray2D( + &builder, {{7.0f, 8.0f}, {9.0f, 42.0f}, {77.0f, 101.0f}}); + auto rhs = ConstantR2FromArray2D(&builder, Array2D(2, 0)); + Dot(lhs, rhs); this->template ComputeAndCompareR2(&builder, Array2D(3, 0), {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_2x0_0x2) { using T = TypeParam; XlaBuilder 
builder(this->TestName()); - auto lhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); - auto rhs = builder.ConstantR2FromArray2D(Array2D(0, 2)); - auto result = builder.Dot(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(2, 0)); + auto rhs = ConstantR2FromArray2D(&builder, Array2D(0, 2)); + Dot(lhs, rhs); this->template ComputeAndCompareR2( &builder, Array2D(2, 2, static_cast(0.0f)), {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, FusedDot) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, FusedDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto param0 = - builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 4}), "arg0"); + Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 4}), "arg0"); auto param1 = - builder.Parameter(1, ShapeUtil::MakeShapeWithType({4, 1}), "arg1"); - auto exp0 = builder.Exp(param0); - auto result = builder.Dot(exp0, param1); + Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({4, 1}), "arg1"); + auto exp0 = Exp(param0); + Dot(exp0, param1); auto lhs_handle = this->client_ - ->TransferToServer(*Literal::CreateR2FromArray2D( + ->TransferToServer(*LiteralUtil::CreateR2FromArray2D( {{1.0f, 2.0f, 3.0f, 4.0f}, {-1.0f, -2.0f, -3.0f, -4.0f}})) .ConsumeValueOrDie(); auto rhs_handle = this->client_ - ->TransferToServer(*Literal::CreateR2FromArray2D( + ->TransferToServer(*LiteralUtil::CreateR2FromArray2D( {{1.0f}, {2.0f}, {3.0f}, {4.0f}})) .ConsumeValueOrDie(); @@ -217,23 +219,22 @@ class SquareMatrixDot : public DotOperationTest { void TestImpl(bool lhs_row_major, bool rhs_row_major) { auto lhs_handle = client_ - ->TransferToServer(*Literal::CreateFromArrayWithLayout( + ->TransferToServer(*LiteralUtil::CreateFromArrayWithLayout( {{1.0f, 2.0f}, {3.0f, -4.0f}}, LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(lhs_row_major)))) .ConsumeValueOrDie(); auto rhs_handle = client_ - ->TransferToServer(*Literal::CreateFromArrayWithLayout( + 
->TransferToServer(*LiteralUtil::CreateFromArrayWithLayout( {{1.0f, 6.0f}, {7.0f, -4.0f}}, LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(rhs_row_major)))) .ConsumeValueOrDie(); XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); - auto result = builder.Dot( - builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), "lhs"), - builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs")); + Dot(Parameter(&builder, 0, ShapeUtil::MakeShape(prim_type, {2, 2}), "lhs"), + Parameter(&builder, 1, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs")); Array2D expected({{15.0f, -2.0f}, {-25.0f, 34.0f}}); ComputeAndCompareR2(&builder, expected, @@ -261,16 +262,14 @@ string PrintDotTestParam( const ::testing::TestParamInfo& test_param) { const DotTestParam& param = test_param.param; if (param.has_addend) { - return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n, - "_MajorToMinor", - param.dot_lhs_row_major ? "T" : "F", - param.dot_rhs_row_major ? "T" : "F", - param.addend_row_major ? "T" : "F"); + return absl::StrCat(param.m, "x", param.k, "x", param.n, "_MajorToMinor", + param.dot_lhs_row_major ? "T" : "F", + param.dot_rhs_row_major ? "T" : "F", + param.addend_row_major ? "T" : "F"); } else { - return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n, - "_MajorToMinor", - param.dot_lhs_row_major ? "T" : "F", - param.dot_rhs_row_major ? "T" : "F"); + return absl::StrCat(param.m, "x", param.k, "x", param.n, "_MajorToMinor", + param.dot_lhs_row_major ? "T" : "F", + param.dot_rhs_row_major ? 
"T" : "F"); } } @@ -287,9 +286,10 @@ void ParametricDotTest::TestImpl() { std::unique_ptr> dot_lhs_data = MakeLinspaceArray2D(0.0, 1.0, param.m, param.k); - std::unique_ptr dot_lhs_lit = Literal::CreateR2FromArray2DWithLayout( - *dot_lhs_data, LayoutUtil::MakeLayout( - MinorToMajorForIsRowMajor(param.dot_lhs_row_major))); + std::unique_ptr dot_lhs_lit = + LiteralUtil::CreateR2FromArray2DWithLayout( + *dot_lhs_data, LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor( + param.dot_lhs_row_major))); std::unique_ptr dot_lhs_handle = client_->TransferToServer(*dot_lhs_lit).ConsumeValueOrDie(); @@ -298,7 +298,7 @@ void ParametricDotTest::TestImpl() { Layout rhs_layout = LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(param.dot_rhs_row_major)); std::unique_ptr dot_rhs_lit = - Literal::CreateR2FromArray2DWithLayout(*dot_rhs_data, rhs_layout); + LiteralUtil::CreateR2FromArray2DWithLayout(*dot_rhs_data, rhs_layout); std::unique_ptr dot_rhs_handle = client_->TransferToServer(*dot_rhs_lit).ConsumeValueOrDie(); @@ -308,7 +308,7 @@ void ParametricDotTest::TestImpl() { if (param.has_addend) { addend_data = MakeLinspaceArray2D(0.0, 1.0, param.m, param.n); - addend_lit = Literal::CreateR2FromArray2DWithLayout( + addend_lit = LiteralUtil::CreateR2FromArray2DWithLayout( *addend_data, LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(param.addend_row_major))); addend_handle = client_->TransferToServer(*addend_lit).ConsumeValueOrDie(); @@ -316,26 +316,26 @@ void ParametricDotTest::TestImpl() { XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); - auto result = builder.Dot( - builder.Parameter(0, - ShapeUtil::MakeShapeWithLayout( - prim_type, {param.m, param.k}, - MinorToMajorForIsRowMajor(param.dot_lhs_row_major)), - "dot_lhs"), - builder.Parameter(1, - ShapeUtil::MakeShapeWithLayout( - prim_type, {param.k, param.n}, - MinorToMajorForIsRowMajor(param.dot_rhs_row_major)), - "dot_rhs")); + auto result = + Dot(Parameter(&builder, 0, + 
ShapeUtil::MakeShapeWithLayout( + prim_type, {param.m, param.k}, + MinorToMajorForIsRowMajor(param.dot_lhs_row_major)), + "dot_lhs"), + Parameter(&builder, 1, + ShapeUtil::MakeShapeWithLayout( + prim_type, {param.k, param.n}, + MinorToMajorForIsRowMajor(param.dot_rhs_row_major)), + "dot_rhs")); if (param.has_addend) { - result = builder.Add( - result, builder.Parameter( - 2, - ShapeUtil::MakeShapeWithLayout( - prim_type, {param.m, param.n}, - MinorToMajorForIsRowMajor(param.addend_row_major)), - "addend")); + result = + Add(result, + Parameter(&builder, 2, + ShapeUtil::MakeShapeWithLayout( + prim_type, {param.m, param.n}, + MinorToMajorForIsRowMajor(param.addend_row_major)), + "addend")); } std::unique_ptr> expected; @@ -477,14 +477,14 @@ class NonsquareMatrixDot : public DotOperationTest { void TestImpl(bool lhs_row_major, bool rhs_row_major) { auto lhs_handle = client_ - ->TransferToServer(*Literal::CreateFromArrayWithLayout( + ->TransferToServer(*LiteralUtil::CreateFromArrayWithLayout( {{1.0f, 2.0f, 3.0f}, {3.0f, -4.0f, -1.0f}}, LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(lhs_row_major)))) .ConsumeValueOrDie(); auto rhs_handle = client_ - ->TransferToServer(*Literal::CreateFromArrayWithLayout( + ->TransferToServer(*LiteralUtil::CreateFromArrayWithLayout( {{1.0f, 6.0f}, {2.0f, 3.0f}, {7.0f, -4.0f}}, LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(rhs_row_major)))) @@ -492,9 +492,8 @@ class NonsquareMatrixDot : public DotOperationTest { XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); - auto result = builder.Dot( - builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 3}), "lhs"), - builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {3, 2}), "rhs")); + Dot(Parameter(&builder, 0, ShapeUtil::MakeShape(prim_type, {2, 3}), "lhs"), + Parameter(&builder, 1, ShapeUtil::MakeShape(prim_type, {3, 2}), "rhs")); Array2D expected({{26.0f, 0.0f}, {-12.0f, 10.0f}}); @@ -512,21 +511,20 @@ XLA_TYPED_TEST(NonsquareMatrixDot, 
TestTT) { this->TestImpl(true, true); } XLA_TEST_F(DotOperationTest, MatrixVectorC64) { auto lhs_handle = client_ - ->TransferToServer(*Literal::CreateR2WithLayout( + ->TransferToServer(*LiteralUtil::CreateR2WithLayout( {{1.0, 2.0, 3.0, -4.0}}, LayoutUtil::MakeLayout({1, 0}))) .ConsumeValueOrDie(); auto rhs_handle = client_ - ->TransferToServer(*Literal::CreateR2WithLayout( + ->TransferToServer(*LiteralUtil::CreateR2WithLayout( {{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}, {-4.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))) .ConsumeValueOrDie(); XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); - auto result = builder.Dot( - builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"), - builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {4, 2}), "rhs")); + Dot(Parameter(&builder, 0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"), + Parameter(&builder, 1, ShapeUtil::MakeShape(prim_type, {4, 2}), "rhs")); Array2D expected({{30.0, -2.0}}); @@ -534,15 +532,17 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) { &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, ConcurrentMatMult) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, ConcurrentMatMult) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto matrix1 = builder.ConstantR2FromArray2D({{1.0f, 2.0f}, {3.0f, 4.0f}}); - auto matrix2 = builder.ConstantR2FromArray2D({{5.0f, 6.0f}, {7.0f, 8.0f}}); - auto matrix12 = builder.Dot(matrix1, matrix2); - auto matrix21 = builder.Dot(matrix2, matrix1); - builder.Add(matrix12, matrix21); + auto matrix1 = + ConstantR2FromArray2D(&builder, {{1.0f, 2.0f}, {3.0f, 4.0f}}); + auto matrix2 = + ConstantR2FromArray2D(&builder, {{5.0f, 6.0f}, {7.0f, 8.0f}}); + auto matrix12 = Dot(matrix1, matrix2); + auto matrix21 = Dot(matrix2, matrix1); + Add(matrix12, matrix21); Array2D expected({{42.0f, 56.0f}, {74.0f, 96.0f}}); this->template ComputeAndCompareR2(&builder, expected, {}, 
@@ -559,32 +559,32 @@ TYPED_TEST_CASE(DotOperationTestForBatchMatMul, TypesF16F32F64); XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { using T = TypeParam; XlaBuilder builder(this->TestName()); - auto x = - builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), "x"); - auto y = - builder.Parameter(1, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), "y"); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "y"); - auto x_flat = builder.Reshape(x, {0, 1, 2, 3}, {4, 2, 2}); - auto y_flat = builder.Reshape(y, {0, 1, 2, 3}, {4, 2, 2}); + auto x_flat = Reshape(x, {0, 1, 2, 3}, {4, 2, 2}); + auto y_flat = Reshape(y, {0, 1, 2, 3}, {4, 2, 2}); // Slice batches into individual matrices and multiply them. std::vector out_slices; for (int i = 0; i < 4; ++i) { // Slice off individual matrices and reshape to 2D tensors. - auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); - x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2}); - auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); - y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2}); + auto x_slice = Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); + x_slice = Reshape(x_slice, {0, 1, 2}, {2, 2}); + auto y_slice = Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); + y_slice = Reshape(y_slice, {0, 1, 2}, {2, 2}); - auto out = builder.Dot(x_slice, y_slice); - out = builder.Reshape(out, {0, 1}, {1, 2, 2}); + auto out = Dot(x_slice, y_slice); + out = Reshape(out, {0, 1}, {1, 2, 2}); out_slices.push_back(out); } - auto out_flat = builder.ConcatInDim(out_slices, 0); - builder.Reshape(out_flat, {0, 1, 2}, {2, 2, 2, 2}); + auto out_flat = ConcatInDim(&builder, out_slices, 0); + Reshape(out_flat, {0, 1, 2}, {2, 2, 2, 2}); auto x_data = this->client_ - ->TransferToServer(*Literal::CreateR4FromArray4D( + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( 
{{{{1000.0f, 100.0f}, {10.0f, 1.0f}}, {{2000.0f, 200.0f}, {20.0f, 2.0f}}}, {{{3000.0f, 300.0f}, {30.0f, 3.0f}}, @@ -592,7 +592,7 @@ XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { .ConsumeValueOrDie(); auto y_data = this->client_ - ->TransferToServer(*Literal::CreateR4FromArray4D( + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, {{{11.0f, 22.0f}, {33.0f, 44.0f}}, {{55.0f, 66.0f}, {77.0f, 88.0f}}}})) @@ -611,14 +611,14 @@ XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { {x_data.get(), y_data.get()}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMul) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto x = - builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2, 2}), "x"); + Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2, 2}), "x"); auto y = - builder.Parameter(1, ShapeUtil::MakeShapeWithType({2, 2, 2}), "y"); + Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2, 2}), "y"); DotDimensionNumbers dnums; dnums.add_lhs_contracting_dimensions(2); @@ -626,17 +626,17 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { dnums.add_lhs_batch_dimensions(0); dnums.add_rhs_batch_dimensions(0); - auto out = builder.DotGeneral(x, y, dnums); + DotGeneral(x, y, dnums); auto x_data = this->client_ - ->TransferToServer(*Literal::CreateR3FromArray3D( + ->TransferToServer(*LiteralUtil::CreateR3FromArray3D( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}})) .ConsumeValueOrDie(); auto y_data = this->client_ - ->TransferToServer(*Literal::CreateR3FromArray3D( + ->TransferToServer(*LiteralUtil::CreateR3FromArray3D( {{{1.0f, 0.0f}, {0.0f, 1.0f}}, {{1.0f, 0.0f}, {0.0f, 1.0f}}})) .ConsumeValueOrDie(); @@ -647,7 +647,49 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { {x_data.get(), y_data.get()}, this->error_spec_); } 
-XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulMultipleBatch) { + using T = TypeParam; + + XlaBuilder builder(this->TestName()); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "y"); + + DotDimensionNumbers dnums; + dnums.add_lhs_contracting_dimensions(3); + dnums.add_rhs_contracting_dimensions(2); + dnums.add_lhs_batch_dimensions(0); + dnums.add_lhs_batch_dimensions(1); + dnums.add_rhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(1); + + DotGeneral(x, y, dnums); + + auto x_data = + this->client_ + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( + {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, + {{{9.0f, 10.0f}, {11.0f, 12.0f}}, + {{13.0f, 14.0f}, {15.0f, 16.0f}}}})) + .ConsumeValueOrDie(); + + auto y_data = + this->client_ + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( + {{{{1.0f, 0.0f}, {0.0f, 1.0f}}, {{1.0f, 0.0f}, {0.0f, 1.0f}}}, + {{{0.0f, 1.0f}, {1.0f, 0.0f}}, {{0.0f, 1.0f}, {1.0f, 0.0f}}}})) + .ConsumeValueOrDie(); + + this->template ComputeAndCompareR4( + &builder, + /*expected=*/ + {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, + {{{10.0f, 9.0f}, {12.0f, 11.0f}}, {{14.0f, 13.0f}, {16.0f, 15.0f}}}}, + {x_data.get(), y_data.get()}, this->error_spec_); +} + +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, TransposeFolding) { using T = TypeParam; for (bool transpose_lhs : {false, true}) { for (bool transpose_rhs : {false, true}) { @@ -665,32 +707,36 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { } auto lhs_handle = this->client_ - ->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( - *lhs, LayoutUtil::MakeLayout( - MinorToMajorForIsRowMajor(row_major)))) + ->TransferToServer( + *LiteralUtil::CreateR2FromArray2DWithLayout( + *lhs, LayoutUtil::MakeLayout( + 
MinorToMajorForIsRowMajor(row_major)))) .ConsumeValueOrDie(); auto rhs_handle = this->client_ - ->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( - *rhs, LayoutUtil::MakeLayout( - MinorToMajorForIsRowMajor(row_major)))) + ->TransferToServer( + *LiteralUtil::CreateR2FromArray2DWithLayout( + *rhs, LayoutUtil::MakeLayout( + MinorToMajorForIsRowMajor(row_major)))) .ConsumeValueOrDie(); XlaBuilder builder(this->TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); - auto lhs_arg = builder.Parameter( - 0, ShapeUtil::MakeShape(prim_type, {lhs->height(), lhs->width()}), + auto lhs_arg = Parameter( + &builder, 0, + ShapeUtil::MakeShape(prim_type, {lhs->height(), lhs->width()}), "lhs"); - auto rhs_arg = builder.Parameter( - 1, ShapeUtil::MakeShape(prim_type, {rhs->height(), rhs->width()}), + auto rhs_arg = Parameter( + &builder, 1, + ShapeUtil::MakeShape(prim_type, {rhs->height(), rhs->width()}), "rhs"); if (transpose_lhs) { - lhs_arg = builder.Transpose(lhs_arg, {1, 0}); + lhs_arg = Transpose(lhs_arg, {1, 0}); } if (transpose_rhs) { - rhs_arg = builder.Transpose(rhs_arg, {1, 0}); + rhs_arg = Transpose(rhs_arg, {1, 0}); } - auto result = builder.Dot(lhs_arg, rhs_arg); + Dot(lhs_arg, rhs_arg); Array2D expected({{26.0f, 0.0f}, {-12.0f, 10.0f}}); VLOG(1) << "TestTransposeFolding " << transpose_lhs << " " @@ -703,7 +749,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { } } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, DotOfConcatOptimizationWithConstLHS) { using T = TypeParam; auto prim_type = primitive_util::NativeToPrimitiveType(); @@ -713,15 +759,15 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, {6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}})); XlaBuilder builder(this->TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), - "rhs_arg_0"); - auto rhs_arg_1 = builder.Parameter(1, 
ShapeUtil::MakeShape(prim_type, {3, 2}), - "rhs_arg_1"); - auto rhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {1, 2}), - "rhs_arg_2"); - auto result = builder.Dot( - lhs_constant, builder.ConcatInDim({rhs_arg_0, rhs_arg_1, rhs_arg_2}, 0)); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_arg_0 = Parameter( + &builder, 0, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs_arg_0"); + auto rhs_arg_1 = Parameter( + &builder, 1, ShapeUtil::MakeShape(prim_type, {3, 2}), "rhs_arg_1"); + auto rhs_arg_2 = Parameter( + &builder, 2, ShapeUtil::MakeShape(prim_type, {1, 2}), "rhs_arg_2"); + Dot(lhs_constant, + ConcatInDim(&builder, {rhs_arg_0, rhs_arg_1, rhs_arg_2}, 0)); std::unique_ptr> arg_0_value_array( new Array2D({{1.0f, 2.0f}, {3.0f, 4.0f}})); @@ -732,15 +778,15 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TF_ASSERT_OK_AND_ASSIGN( auto arg_0_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_0_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_0_value_array))); TF_ASSERT_OK_AND_ASSIGN( auto arg_1_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_1_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_1_value_array))); TF_ASSERT_OK_AND_ASSIGN( auto arg_2_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_2_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_2_value_array))); Array2D expected({{53.0f, 74.0f}, {45.0f, 66.0f}}); this->template ComputeAndCompareR2( @@ -749,7 +795,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, DotOfConcatOptimizationWithConstRHS) { using T = TypeParam; std::unique_ptr> constant_rhs_array( @@ -761,15 +807,15 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, {2.0f, 1.0f}})); XlaBuilder builder(this->TestName()); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto 
lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2}), - "lhs_arg_0"); - auto lhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShapeWithType({2, 3}), - "lhs_arg_1"); - auto lhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShapeWithType({2, 1}), - "lhs_arg_2"); - auto result = builder.Dot( - builder.ConcatInDim({lhs_arg_0, lhs_arg_1, lhs_arg_2}, 1), rhs_constant); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto lhs_arg_0 = Parameter( + &builder, 0, ShapeUtil::MakeShapeWithType({2, 2}), "lhs_arg_0"); + auto lhs_arg_1 = Parameter( + &builder, 1, ShapeUtil::MakeShapeWithType({2, 3}), "lhs_arg_1"); + auto lhs_arg_2 = Parameter( + &builder, 2, ShapeUtil::MakeShapeWithType({2, 1}), "lhs_arg_2"); + Dot(ConcatInDim(&builder, {lhs_arg_0, lhs_arg_1, lhs_arg_2}, 1), + rhs_constant); std::unique_ptr> arg_0_value_array( new Array2D({{1.0f, 2.0f}, {3.0f, 4.0f}})); @@ -781,15 +827,15 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TF_ASSERT_OK_AND_ASSIGN( auto arg_0_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_0_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_0_value_array))); TF_ASSERT_OK_AND_ASSIGN( auto arg_1_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_1_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_1_value_array))); TF_ASSERT_OK_AND_ASSIGN( auto arg_2_value, this->client_->TransferToServer( - *Literal::CreateR2FromArray2D(*arg_2_value_array))); + *LiteralUtil::CreateR2FromArray2D(*arg_2_value_array))); Array2D expected({{38.0f, 36.0f}, {93.0f, 91.0f}}); this->template ComputeAndCompareR2( @@ -811,16 +857,15 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) { // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = 
builder.ConstantR1({1, 0}); - auto dynamic_slice = - builder.DynamicSlice(lhs_constant, start_constant, {1, 6}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {1, 0}); + auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); dot_dnums.add_rhs_contracting_dimensions(0); - auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + DotGeneral(dynamic_slice, rhs_constant, dot_dnums); Array2D expected({{96.0, 105.0, 114.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -839,25 +884,23 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) { // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({0, 1}); - auto dynamic_slice = - builder.DynamicSlice(rhs_constant, start_constant, {6, 1}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {0, 1}); + auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); dot_dnums.add_rhs_contracting_dimensions(0); - auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + DotGeneral(lhs_constant, dynamic_slice, dot_dnums); Array2D expected({{105.0}, {105.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER( - DotOfGatherOptimizationWithConstRHSReverseMM)))) { + + DotOfGatherOptimizationWithConstRHSReverseMM) { std::unique_ptr> constant_lhs_array( new Array2D({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, @@ -870,25 +913,21 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({0, 1}); - auto dynamic_slice = - builder.DynamicSlice(lhs_constant, start_constant, {6, 1}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {0, 1}); + auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(0); dot_dnums.add_rhs_contracting_dimensions(1); - auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + DotGeneral(dynamic_slice, rhs_constant, dot_dnums); Array2D expected({{105.0, 105.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
-XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER( - DotOfGatherOptimizationWithConstLHSReverseMM)))) { +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSReverseMM) { std::unique_ptr> constant_lhs_array( new Array2D({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, @@ -901,25 +940,21 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({1, 0}); - auto dynamic_slice = - builder.DynamicSlice(rhs_constant, start_constant, {1, 6}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {1, 0}); + auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(0); dot_dnums.add_rhs_contracting_dimensions(1); - auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + DotGeneral(lhs_constant, dynamic_slice, dot_dnums); Array2D expected({{96.0}, {105.0}, {114.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
-XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU( - DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSRows)))) { +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSRows) { std::unique_ptr> constant_lhs_array( new Array2D({{1.0, 2.0}, {3.0, 4.0}, @@ -937,25 +972,21 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({0, 1}); - auto dynamic_slice = - builder.DynamicSlice(lhs_constant, start_constant, {6, 1}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {0, 1}); + auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(0); dot_dnums.add_rhs_contracting_dimensions(0); - auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + DotGeneral(dynamic_slice, rhs_constant, dot_dnums); Array2D expected({{126.0, 129.0, 132.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
-XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU( - DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSRows)))) { +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSRows) { std::unique_ptr> constant_lhs_array( new Array2D({{1.0, 2.0}, {3.0, 4.0}, @@ -973,25 +1004,21 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({0, 1}); - auto dynamic_slice = - builder.DynamicSlice(rhs_constant, start_constant, {6, 1}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {0, 1}); + auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(0); dot_dnums.add_rhs_contracting_dimensions(0); - auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + DotGeneral(lhs_constant, dynamic_slice, dot_dnums); Array2D expected({{129.0}, {129.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
-XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU( - DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSCols)))) { +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSCols) { std::unique_ptr> constant_lhs_array(new Array2D( {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); std::unique_ptr> constant_rhs_array( @@ -1001,25 +1028,21 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({1, 0}); - auto dynamic_slice = - builder.DynamicSlice(lhs_constant, start_constant, {1, 6}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {1, 0}); + auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); dot_dnums.add_rhs_contracting_dimensions(1); - auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + DotGeneral(dynamic_slice, rhs_constant, dot_dnums); Array2D expected({{56.0, 168.0, 91.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
-XLA_TEST_F(DotOperationTest, - DISABLED_ON_CPU(DISABLED_ON_GPU( - DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSCols)))) { +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSCols) { std::unique_ptr> constant_lhs_array(new Array2D( {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); std::unique_ptr> constant_rhs_array( @@ -1029,19 +1052,41 @@ XLA_TEST_F(DotOperationTest, // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}} XlaBuilder builder(TestName()); - auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); - auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); - auto start_constant = builder.ConstantR1({1, 0}); - auto dynamic_slice = - builder.DynamicSlice(rhs_constant, start_constant, {1, 6}); + auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array); + auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array); + auto start_constant = ConstantR1(&builder, {1, 0}); + auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6}); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(1); dot_dnums.add_rhs_contracting_dimensions(1); - auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + DotGeneral(lhs_constant, dynamic_slice, dot_dnums); Array2D expected({{168.0}, {168.0}}); ComputeAndCompareR2(&builder, expected, {}, error_spec_); } + +XLA_TEST_F(DotOperationTest, DotRank2AndRank2NonDefaultContractionDims) { + XlaBuilder builder(TestName()); + + Array2D lhs_array({{1.0f, 2.0f}, {3.0f, 4.0f}}); + auto lhs_constant = ConstantR2FromArray2D(&builder, lhs_array); + + Array2D rhs_array({{5.0f, 6.0f}, {7.0f, 8.0f}}); + auto rhs_constant = ConstantR2FromArray2D(&builder, rhs_array); + + Shape shape = ShapeUtil::MakeShape(F32, {2, 2}); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(0); + dot_dnums.add_rhs_contracting_dimensions(0); + DotGeneral(lhs_constant, rhs_constant, 
dot_dnums); + + Array2D expected({ + {26.f, 30.f}, + {38.f, 44.f}, + }); + + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 49f3a10d227f2f9edfe76405ba13498fe822f8d8..9bf3767ca3e229cd3eb37c1f51c526c7dd2bf0f8 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" @@ -114,21 +114,21 @@ class DynamicSliceTest : public ClientLibraryTestBase { } template - void RunR1(tensorflow::gtl::ArraySlice input_values_int, + void RunR1(absl::Span input_values_int, const std::vector slice_starts, const std::vector& slice_sizes, - tensorflow::gtl::ArraySlice expected_values_int) { + absl::Span expected_values_int) { // bfloat16 has explicit constructors, so it does not implicitly convert the // way built-in types do, which is why we can't take the parameter as an - // ArraySlice. We also can't convert it to a vector, because - // vector is special so that it cannot be an ArraySlice, which + // Span. We also can't convert it to a vector, because + // vector is special so that it cannot be a Span, which // is what the code below wants. So instead we do this. 
Literal input_values = - std::move(*Literal::CreateR1(input_values_int) + std::move(*LiteralUtil::CreateR1(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR1(expected_values_int) + std::move(*LiteralUtil::CreateR1(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -138,8 +138,8 @@ class DynamicSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_values); - builder.DynamicSlice(input, starts, slice_sizes); + auto input = ConstantLiteral(&builder, input_values); + DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -150,11 +150,11 @@ class DynamicSliceTest : public ClientLibraryTestBase { const std::vector& slice_sizes, const Array2D& expected_values_int) { Literal input_values = - std::move(*Literal::CreateR2FromArray2D(input_values_int) + std::move(*LiteralUtil::CreateR2FromArray2D(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR2FromArray2D(expected_values_int) + std::move(*LiteralUtil::CreateR2FromArray2D(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -164,8 +164,8 @@ class DynamicSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. 
- auto input = builder.ConstantLiteral(input_values); - builder.DynamicSlice(input, starts, slice_sizes); + auto input = ConstantLiteral(&builder, input_values); + DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -176,11 +176,11 @@ class DynamicSliceTest : public ClientLibraryTestBase { const std::vector& slice_sizes, const Array3D& expected_values_int) { Literal input_values = - std::move(*Literal::CreateR3FromArray3D(input_values_int) + std::move(*LiteralUtil::CreateR3FromArray3D(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR3FromArray3D(expected_values_int) + std::move(*LiteralUtil::CreateR3FromArray3D(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -190,8 +190,8 @@ class DynamicSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_values); - builder.DynamicSlice(input, starts, slice_sizes); + auto input = ConstantLiteral(&builder, input_values); + DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. 
ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -202,18 +202,28 @@ XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } XLA_TEST_F(DynamicSliceTest, Int32R1OOB) { TestR1OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, UInt32R1OOB) { + RunR1({0, 1, 2, 3, 4}, {2147483648u}, {2}, {3, 4}); +} XLA_TEST_F(DynamicSliceTest, Int32R2BF16) { TestR2(); } XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } XLA_TEST_F(DynamicSliceTest, Int32R2OOB) { TestR2OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, UInt32R2OOB) { + RunR2({{0, 1}, {2, 3}}, {2147483648u, 0}, {1, 1}, {{2}}); +} XLA_TEST_F(DynamicSliceTest, Int32R3BF16) { TestR3(); } XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } XLA_TEST_F(DynamicSliceTest, Int32R3OOB) { TestR3OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, UInt32R3OOB) { + RunR3({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}, + {2147483648u, 0, 2147483648u}, {1, 1, 1}, {{{5}}}); +} XLA_TEST_F(DynamicSliceTest, Int32R1Pred) { // Slice at dimension start. 
@@ -349,15 +359,15 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { void RunR0(int input_value_int, int update_value_int, const std::vector slice_starts, int expected_value_int) { Literal input_value = - std::move(*Literal::CreateR0(input_value_int) + std::move(*LiteralUtil::CreateR0(input_value_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal update_value = - std::move(*Literal::CreateR0(update_value_int) + std::move(*LiteralUtil::CreateR0(update_value_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_value = - std::move(*Literal::CreateR0(expected_value_int) + std::move(*LiteralUtil::CreateR0(expected_value_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -367,28 +377,28 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_value); - auto update = builder.ConstantLiteral(update_value); - builder.DynamicUpdateSlice(input, update, starts); + auto input = ConstantLiteral(&builder, input_value); + auto update = ConstantLiteral(&builder, update_value); + DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. 
ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()}); } template - void RunR1(tensorflow::gtl::ArraySlice input_values_int, - tensorflow::gtl::ArraySlice update_values_int, + void RunR1(absl::Span input_values_int, + absl::Span update_values_int, const std::vector slice_starts, - tensorflow::gtl::ArraySlice expected_values_int) { + absl::Span expected_values_int) { Literal input_values = - std::move(*Literal::CreateR1(input_values_int) + std::move(*LiteralUtil::CreateR1(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal update_values = - std::move(*Literal::CreateR1(update_values_int) + std::move(*LiteralUtil::CreateR1(update_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR1(expected_values_int) + std::move(*LiteralUtil::CreateR1(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -398,9 +408,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_values); - auto update = builder.ConstantLiteral(update_values); - builder.DynamicUpdateSlice(input, update, starts); + auto input = ConstantLiteral(&builder, input_values); + auto update = ConstantLiteral(&builder, update_values); + DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. 
ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -411,15 +421,15 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { const std::vector slice_starts, const Array2D& expected_values_int) { Literal input_values = - std::move(*Literal::CreateR2FromArray2D(input_values_int) + std::move(*LiteralUtil::CreateR2FromArray2D(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal update_values = - std::move(*Literal::CreateR2FromArray2D(update_values_int) + std::move(*LiteralUtil::CreateR2FromArray2D(update_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR2FromArray2D(expected_values_int) + std::move(*LiteralUtil::CreateR2FromArray2D(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -429,9 +439,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_values); - auto update = builder.ConstantLiteral(update_values); - builder.DynamicUpdateSlice(input, update, starts); + auto input = ConstantLiteral(&builder, input_values); + auto update = ConstantLiteral(&builder, update_values); + DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. 
ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -442,15 +452,15 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { const std::vector slice_starts, const Array3D& expected_values_int) { Literal input_values = - std::move(*Literal::CreateR3FromArray3D(input_values_int) + std::move(*LiteralUtil::CreateR3FromArray3D(input_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal update_values = - std::move(*Literal::CreateR3FromArray3D(update_values_int) + std::move(*LiteralUtil::CreateR3FromArray3D(update_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); Literal expected_values = - std::move(*Literal::CreateR3FromArray3D(expected_values_int) + std::move(*LiteralUtil::CreateR3FromArray3D(expected_values_int) ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); @@ -460,9 +470,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantLiteral(input_values); - auto update = builder.ConstantLiteral(update_values); - builder.DynamicUpdateSlice(input, update, starts); + auto input = ConstantLiteral(&builder, input_values); + auto update = ConstantLiteral(&builder, update_values); + DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. 
ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()}); } @@ -508,8 +518,8 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { XlaOp update; std::unique_ptr update_data = CreateR3Parameter( update_values, 1, "update_values", &builder, &update); - auto starts = builder.ConstantR1({index, 0, 0}); - builder.DynamicUpdateSlice(input, update, starts); + auto starts = ConstantR1(&builder, {index, 0, 0}); + DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. ComputeAndCompareR3(&builder, expected_values, @@ -520,7 +530,7 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { template void DumpArray(const string& name, const Array3D values) { std::unique_ptr literal = - Literal::CreateR3FromArray3D(values); + LiteralUtil::CreateR3FromArray3D(values); LOG(INFO) << name << ":" << literal->ToString(); } }; @@ -530,21 +540,32 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R0) { TestR0(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0(); } -// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. 
XLA_TEST_F(DynamicUpdateSliceTest, Int32R1BF16) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt32R1OOB) { + RunR1({0, 1, 2, 3, 4}, {5, 6}, {2147483648u}, {0, 1, 2, 5, 6}); +} XLA_TEST_F(DynamicUpdateSliceTest, Int32R2BF16) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt32R2OOB) { + RunR2({{0, 1}, {2, 3}}, {{4}}, {2147483648u, 0}, + {{0, 1}, {4, 3}}); +} XLA_TEST_F(DynamicUpdateSliceTest, Int32R3BF16) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt32R3OOB) { + RunR3({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}, {{{8}}}, + {2147483648u, 0, 2147483648u}, + {{{0, 1}, {2, 3}}, {{4, 8}, {6, 7}}}); +} XLA_TEST_F(DynamicUpdateSliceTest, Int32OOBBF16) { TestOOB(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32OOB) { TestOOB(); } @@ -695,17 +716,17 @@ void BM_DynamicSlice(int num_iters) { XlaBuilder builder("DynamicSlice"); // Create input as a constant: shape [1, 2, 3, 4] - auto input_literal = Literal::CreateR4( + auto input_literal = LiteralUtil::CreateR4( {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}}); - auto input = builder.ConstantLiteral(*input_literal); + auto input = ConstantLiteral(&builder, *input_literal); // Create dynamic slice start indices as a parameter: shape [4] auto start_indices_shape = ShapeUtil::MakeShape(S32, {4}); auto start_indices = - builder.Parameter(0, start_indices_shape, "start_indices"); + Parameter(&builder, 0, start_indices_shape, "start_indices"); // Add 
DynamicSlice op to the computatation. - builder.DynamicSlice(input, start_indices, {1, 1, 1, 1}); + DynamicSlice(input, start_indices, {1, 1, 1, 1}); auto computation = builder.Build().ConsumeValueOrDie(); // Initialize and transfer parameter buffer. @@ -715,9 +736,11 @@ void BM_DynamicSlice(int num_iters) { start_indices_shape, &allocator, /*device_ordinal=*/0) .ConsumeValueOrDie(); - auto start_indices_literal = Literal::CreateR1({0, 1, 2, 3}); + auto start_indices_literal = LiteralUtil::CreateR1({0, 1, 2, 3}); + auto stream = + client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie(); ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice( - executors[device_ordinal], *start_indices_literal, buffer)); + stream.get(), *start_indices_literal, buffer)); std::unique_ptr executable = client diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc index a6ba6db5d3bf86de91f6fda022c46afee01281c2..5116e60ca63ef5f94b25b15e6616086fb9e44bbb 100644 --- a/tensorflow/compiler/xla/tests/execution_profile_test.cc +++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/platform/test.h" @@ -31,10 +31,10 @@ XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) { TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr input, client_->TransferToServer( - *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256))); + *LiteralUtil::CreateR2F32Linspace(1e0, 1e5, 256, 256))); XlaBuilder b(TestName() + ".add"); - b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1")); + Dot(Parameter(&b, 0, shape, "param_0"), Parameter(&b, 1, shape, "param_1")); TF_ASSERT_OK_AND_ASSIGN(XlaComputation dot_product, b.Build()); ExecutionProfile execution_profile; diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc index 0a37e4d423620122f2e109343a86a964f46d778f..bf1de02ba9dbd97db9ee31484402fe9b92385219 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -39,7 +39,7 @@ class ExhaustiveF32ElementwiseOpTest XlaBuilder builder(TestName()); std::unique_ptr input_literal = - Literal::CreateFromDimensions(F32, {input_size}); + LiteralUtil::CreateFromDimensions(F32, {input_size}); for (int64 i = begin; i < end; i++) { if (i >= known_incorrect_range.first && i < known_incorrect_range.second) { @@ -54,7 +54,7 @@ class ExhaustiveF32ElementwiseOpTest TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, client_->TransferToServer(*input_literal)); - auto input = builder.Parameter(0, input_literal->shape(), "input"); + auto input = Parameter(&builder, 0, input_literal->shape(), "input"); enqueue_op(&builder, input); std::vector expected_result; @@ -79,8 +79,8 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) { #endif ExhaustivelyTestF32Op( - [](XlaBuilder* builder, const XlaOp& input) { builder->Log(input); }, - std::log, known_incorrect_range); + [](XlaBuilder* builder, const XlaOp& input) { Log(input); }, std::log, + known_incorrect_range); } XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) { @@ -95,14 +95,14 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) { #endif ExhaustivelyTestF32Op( - [](XlaBuilder* builder, const XlaOp& input) { builder->Exp(input); }, - std::exp, known_incorrect_range); + [](XlaBuilder* builder, const XlaOp& input) { Exp(input); }, std::exp, + known_incorrect_range); } XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) { ExhaustivelyTestF32Op( - [](XlaBuilder* builder, const XlaOp& input) { builder->Tanh(input); }, - std::tanh, /*known_incorrect_range=*/{0, 0}); + [](XlaBuilder* builder, const XlaOp& 
input) { Tanh(input); }, std::tanh, + /*known_incorrect_range=*/{0, 0}); } std::vector> CreateExhaustiveParameters() { diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index 93d1c921c4a138cda55ed7338b8e3aa82518d114..dcb469087e0064d17ce3b04fdeaf0b6136069a55 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -76,6 +76,11 @@ StatusOr RunFileCheck(const string& input, const string& pattern) { XLA_LOG_LINES(tensorflow::WARNING, input); LOG(WARNING) << "FileCheck pattern was:"; XLA_LOG_LINES(tensorflow::WARNING, pattern); + } else if (!standard_error.empty()) { + LOG(INFO) << "FileCheck stderr:"; + XLA_LOG_LINES(tensorflow::INFO, standard_error); + LOG(INFO) << "FileCheck input was:"; + XLA_LOG_LINES(tensorflow::INFO, input); } return succeeded; } diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc index 71eb914a8e5eaef2e38b9e6e7d45b8a10ce1bd7a..3be9657db40a7ea073baca32d8a20ccd6fa8a274 100644 --- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc +++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc @@ -16,13 +16,13 @@ limitations under the License. 
#include #include +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -37,17 +37,16 @@ class FloorCeilTest : public ClientLibraryTestBase { }; // Runs a computation and comparison on expected vs f(input) - void TestR1F32(tensorflow::gtl::ArraySlice input, - tensorflow::gtl::ArraySlice expected, Function f) { - LOG(INFO) << "input: {" << tensorflow::str_util::Join(expected, ", ") - << "}"; + void TestR1F32(absl::Span input, + absl::Span expected, Function f) { + LOG(INFO) << "input: {" << absl::StrJoin(expected, ", ") << "}"; XlaBuilder builder(TestName()); - auto c = builder.ConstantR1(input); + auto c = ConstantR1(&builder, input); if (f == kCeil) { - builder.Ceil(c); + Ceil(c); } else { ASSERT_EQ(kFloor, f); - builder.Floor(c); + Floor(c); } ComputeAndCompareR1(&builder, expected, /*arguments=*/{}); } @@ -55,12 +54,12 @@ class FloorCeilTest : public ClientLibraryTestBase { void TestR0F32(float input, float expected, Function f) { LOG(INFO) << "input: " << expected; XlaBuilder builder(TestName()); - auto c = builder.ConstantR0(input); + auto c = ConstantR0(&builder, input); if (f == kCeil) { - builder.Ceil(c); + Ceil(c); } else { ASSERT_EQ(kFloor, f); - builder.Floor(c); + Floor(c); } ComputeAndCompareR0(&builder, expected, /*arguments=*/{}); } diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc index 
73f029b59bc56aa6c3e86200a49fcae0fd177101..c5bbbe778df15d63a2586bd6291a7a33fc82aa52 100644 --- a/tensorflow/compiler/xla/tests/fmax_test.cc +++ b/tensorflow/compiler/xla/tests/fmax_test.cc @@ -16,7 +16,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/platform/test.h" @@ -28,11 +28,11 @@ class FmaxSimpleTest : public ClientLibraryTestBase {}; TEST_F(FmaxSimpleTest, FmaxTenValues) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}); - auto y = builder.ConstantR1( - {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0}); - builder.Max(x, y); + auto x = ConstantR1( + &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}); + auto y = ConstantR1( + &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0}); + Max(x, y); std::vector expected = {-0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index e6f79b5ac55dddfbb213a36cadbee53bc9443d9d..7cb2f0cedfc2e74386bb3c01ca0b838e7cdcbce9 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -22,17 +22,19 @@ limitations under the License. 
#define EIGEN_USE_THREADS +#include "absl/memory/memory.h" +#include "absl/types/span.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -41,14 +43,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" -using tensorflow::gtl::ArraySlice; - namespace xla { namespace { @@ -89,7 +88,7 @@ class FusionTest : public HloTestBase { HloInstruction* hlos[4]; for (int i = 0; i < Arity; ++i) { hlos[i + 1] = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2FromArray2D(operand_data[i]))); + LiteralUtil::CreateR2FromArray2D(operand_data[i]))); } auto answer_shape = ShapeUtil::MakeShape(prim_type, {test_width, test_height}); @@ -112,10 +111,10 @@ class FusionTest : public HloTestBase { hlos[0] = builder.AddInstruction(std::move(root_hlo)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction( - ArraySlice(hlos, 0, Arity + 1), + absl::Span(hlos).subspan(0, Arity + 1), HloInstruction::FusionKind::kLoop); - auto expected = Literal::CreateR2FromArray2D(answer_data); + auto expected = LiteralUtil::CreateR2FromArray2D(answer_data); auto actual = ExecuteAndTransfer(std::move(hlo_module), {}); if (primitive_util::IsFloatingPointType(prim_type)) { EXPECT_TRUE(LiteralTestUtil::Near(*expected, *actual, ErrorSpec(1e-4))); @@ -126,12 +125,12 @@ class FusionTest : public HloTestBase { private: template - T ComputeElementwiseAnswer(HloOpcode opcode, ArraySlice xs); + T ComputeElementwiseAnswer(HloOpcode opcode, absl::Span xs); }; template <> float FusionTest::ComputeElementwiseAnswer(HloOpcode opcode, - ArraySlice xs) { + absl::Span xs) { switch (opcode) { case HloOpcode::kAdd: return xs[0] + xs[1]; @@ -156,7 +155,7 @@ float FusionTest::ComputeElementwiseAnswer(HloOpcode opcode, template <> bool FusionTest::ComputeElementwiseAnswer(HloOpcode opcode, 
- ArraySlice xs) { + absl::Span xs) { switch (opcode) { case HloOpcode::kEq: return xs[0] == xs[1]; @@ -186,27 +185,28 @@ XLA_TEST_F(FusionTest, Test) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0}, {2.0}, {3.0}}))); + LiteralUtil::CreateR2({{1.0}, {2.0}, {3.0}}))); auto const1 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-1.0}, {-1.0}, {-1.0}}))); + LiteralUtil::CreateR2({{-1.0}, {-1.0}, {-1.0}}))); auto add2 = builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(F32, {3, 1}), HloOpcode::kAdd, const0, const1)); auto reshape3 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {1, 3}), add2, {1, 0})); auto const4 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.62, 2.72, 3.14}}))); + LiteralUtil::CreateR2({{1.62, 2.72, 3.14}}))); auto concat5 = builder.AddInstruction(HloInstruction::CreateConcatenate( ShapeUtil::MakeShape(F32, {2, 3}), {reshape3, const4}, 0)); auto const6 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}))); + LiteralUtil::CreateR2({{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}))); auto negate7 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kNegate, const6)); auto add8 = builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kAdd, concat5, negate7)); auto const9 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}}))); - auto const10 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{true, false, true}, {false, true, false}}))); + LiteralUtil::CreateR2({{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}}))); + auto const10 = builder.AddInstruction( + 
HloInstruction::CreateConstant(LiteralUtil::CreateR2( + {{true, false, true}, {false, true, false}}))); auto select11 = builder.AddInstruction( HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kSelect, const10, add8, const9)); @@ -222,7 +222,7 @@ XLA_TEST_F(FusionTest, Test) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{0.5}, {2.72}}), + *LiteralUtil::CreateR2({{0.5}, {2.72}}), *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } @@ -233,11 +233,11 @@ XLA_TEST_F(FusionTest, Parameter) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 2.0, 3.0}}))); + LiteralUtil::CreateR2({{1.0, 2.0, 3.0}}))); auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary( ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kCopy, const0)); auto const2 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-2.0, -2.0, -2.0}}))); + LiteralUtil::CreateR2({{-2.0, -2.0, -2.0}}))); // add3 = copy1 + const2 = const0 + const2 = {1,2,3} + {-2,-2,-2} = {-1,0,+1} auto add3 = builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kAdd, copy1, const2)); @@ -248,7 +248,7 @@ XLA_TEST_F(FusionTest, Parameter) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{-1.0, 0.0, 1.0}}), + *LiteralUtil::CreateR2({{-1.0, 0.0, 1.0}}), *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } @@ -269,7 +269,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) { auto hlo_module = CreateNewModule(); auto two = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); auto x = builder.AddInstruction(HloInstruction::CreateBroadcast(shape, two, {})); auto y = 
builder.AddInstruction( @@ -292,9 +292,9 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({1.0, 2.0, 3.0}))); + LiteralUtil::CreateR1({1.0, 2.0, 3.0}))); auto const_array = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}}))); + LiteralUtil::CreateR2({{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}}))); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(const_array->shape(), const_vector, {1})); // add2 = broadcast(const_vector) + const_array @@ -308,7 +308,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Near( - *Literal::CreateR2({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}), + *LiteralUtil::CreateR2({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}), *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } @@ -316,14 +316,14 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto single_element_array = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({{5}}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR2({{5}}))); auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {}), single_element_array)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR0(5), + LiteralTestUtil::Equal(*LiteralUtil::CreateR0(5), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -331,14 +331,14 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = 
builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}))); + LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 2, 3}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR3({{{1, 2, 3}, {4, 5, 6}}}), + *LiteralUtil::CreateR3({{{1, 2, 3}, {4, 5, 6}}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -346,14 +346,14 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR3({{{1, 2, 3}, {4, 5, 6}}}))); + LiteralUtil::CreateR3({{{1, 2, 3}, {4, 5, 6}}}))); auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 2}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}), + *LiteralUtil::CreateR2({{1, 2}, {3, 4}, {5, 6}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -361,14 +361,14 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR3({{{7}}}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR3({{{7}}}))); auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - 
LiteralTestUtil::Equal(*Literal::CreateR0(7), + LiteralTestUtil::Equal(*LiteralUtil::CreateR0(7), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -376,14 +376,14 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(7))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 1, 1}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR3({{{7}}}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR3({{{7}}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -391,14 +391,14 @@ XLA_TEST_F(FusionTest, Reshape__) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(7))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(7))); auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR0(7), + LiteralTestUtil::Equal(*LiteralUtil::CreateR0(7), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -406,14 +406,14 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); auto reshape1 = 
builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 3}), const0)); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}), + *LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -421,14 +421,14 @@ XLA_TEST_F(FusionTest, Transpose_2by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {3, 2}), const0, {1, 0})); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{1, 4}, {2, 5}, {3, 6}}), + *LiteralUtil::CreateR2({{1, 4}, {2, 5}, {3, 6}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -436,14 +436,14 @@ XLA_TEST_F(FusionTest, Transpose_3by3) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {3, 3}), const0, {1, 0})); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}), + *LiteralUtil::CreateR2({{1, 4, 7}, {2, 5, 8}, {3, 6, 
9}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -451,7 +451,7 @@ XLA_TEST_F(FusionTest, Reverse) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3}))); auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse( ShapeUtil::MakeShape(S32, {3}), const0, {0})); hlo_module->AddEntryComputation(builder.Build()) @@ -459,7 +459,7 @@ XLA_TEST_F(FusionTest, Reverse) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR1({3, 2, 1}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({3, 2, 1}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -467,7 +467,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1, 2, 3}))); auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse( ShapeUtil::MakeShape(S32, {3}), const0, {0})); auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -477,7 +477,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR1({-3, -2, -1}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({-3, -2, -1}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -485,7 +485,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast( 
ShapeUtil::MakeShape(S32, {2}), const0, {})); auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -495,15 +495,15 @@ XLA_TEST_F(FusionTest, BroadcastNegate) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR1({-1, -1}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({-1, -1}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, SliceNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); - auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(S32, {2}), const0, {0}, {4}, {2})); auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -513,17 +513,17 @@ XLA_TEST_F(FusionTest, SliceNegate) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR1({-1, -3}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({-1, -3}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, DynamicSliceNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); - auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({1}))); auto dynamic_slice2 = builder.AddInstruction(HloInstruction::CreateDynamicSlice( ShapeUtil::MakeShape(S32, {2}), const0, const1, {2})); @@ -535,15 +535,15 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - 
LiteralTestUtil::Equal(*Literal::CreateR1({-2, -3}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({-2, -3}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, ReshapeNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); - auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3, 4}))); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 3, 4}))); auto reshape1 = builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {2, 2}), const0)); auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -552,17 +552,16 @@ XLA_TEST_F(FusionTest, ReshapeNegate) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reshape1}, HloInstruction::FusionKind::kLoop); - EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{-1, -2}, {-3, -4}}), - *ExecuteAndTransfer(std::move(hlo_module), {}))); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{-1, -2}, {-3, -4}}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } -// TODO(b/64070202): Investigate failure. 
-XLA_TEST_F(FusionTest, DISABLED_ON_GPU(TransposeNegate)) { +XLA_TEST_F(FusionTest, TransposeNegate) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2}, {3, 4}}))); + LiteralUtil::CreateR2({{1, 2}, {3, 4}}))); auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {2, 2}), const0, {1, 0})); auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary( @@ -571,9 +570,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_GPU(TransposeNegate)) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, transpose1}, HloInstruction::FusionKind::kLoop); - EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR2({{-1, -3}, {-2, -4}}), - *ExecuteAndTransfer(std::move(hlo_module), {}))); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{-1, -3}, {-2, -4}}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } std::unique_ptr MakeReduceTestComputation() { @@ -591,19 +590,19 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { auto hlo_module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); - auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 4, 8}))); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 4, 8}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce( ShapeUtil::MakeShape(S32, {}), const0, const1, {0}, hlo_module->AddEmbeddedComputation(MakeReduceTestComputation()))); hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce2}, - HloInstruction::FusionKind::kLoop); + HloInstruction::FusionKind::kInput); EXPECT_TRUE( - 
LiteralTestUtil::Equal(*Literal::CreateR0(15), + LiteralTestUtil::Equal(*LiteralUtil::CreateR0(15), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -611,10 +610,10 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { auto hlo_module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); - auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 4, 8}))); + auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({1, 2, 4, 8}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce( ShapeUtil::MakeShape(S32, {}), const0, const1, {0}, hlo_module->AddEmbeddedComputation(MakeReduceTestComputation()))); @@ -625,7 +624,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR0(-15), + LiteralTestUtil::Equal(*LiteralUtil::CreateR0(-15), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -633,9 +632,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); auto const0 = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}}))); + LiteralUtil::CreateR2({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))); Window window; ASSERT_TRUE( tensorflow::protobuf::TextFormat::ParseFromString("dimensions:{\n" @@ -675,7 +674,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { HloInstruction::FusionKind::kLoop); EXPECT_TRUE(LiteralTestUtil::Equal( - *Literal::CreateR2({{462, 2145}, {24871, 
62491}}), + *LiteralUtil::CreateR2({{462, 2145}, {24871, 62491}}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -687,9 +686,9 @@ XLA_TEST_F(FusionTest, SharedConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({0}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(LiteralUtil::CreateR1({2}))); auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0)); auto add2 = builder.AddInstruction(HloInstruction::CreateBinary( @@ -711,7 +710,7 @@ XLA_TEST_F(FusionTest, SharedConstant) { EXPECT_EQ(entry_comp->root_instruction()->fused_instruction_count(), 6); EXPECT_TRUE( - LiteralTestUtil::Equal(*Literal::CreateR1({8}), + LiteralTestUtil::Equal(*LiteralUtil::CreateR1({8}), *ExecuteAndTransfer(std::move(hlo_module), {}))); } @@ -765,6 +764,79 @@ XLA_TEST_F(FusionTest, Clamp2D) { TestElementwise2D(HloOpcode::kClamp); } +// TODO(b/73903144): Enable on interpreter once interpreter supports bitcast. 
+XLA_TEST_F(FusionTest, DISABLED_ON_INTERPRETER(FusionWithLayout)) { + const string hlo_text = R"( +HloModule Cluster + +fusion_c { + fusion.arg = f32[2,2]{1,0} parameter(0) + bitcast.0 = f32[2,2,1]{2,1,0} bitcast(fusion.arg) + tanh.0 = f32[2,2,1]{0,2,1} tanh(bitcast.0) + ROOT bitcast.2 = f32[2,2,1]{1,2,0} bitcast(tanh.0) +} + +ENTRY main { + arg = f32[2,2]{1,0} parameter(0) + ROOT fusion = f32[2,2,1]{1,2,0} fusion(arg), kind=kLoop, calls=fusion_c +} +)"; + + std::unique_ptr operand = + LiteralUtil::CreateR2({{0., 0.}, {1., 0.}}); + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text, config)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + test_runner_.Execute(std::move(module), {operand.get()}, + /*run_hlo_passes=*/false)); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR3({{{0.}, {0.76159415595}}, {{0.}, {0.}}}), + *result)); +} + +class FusionClientLibraryTest : public ClientLibraryTestBase {}; + +XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) { + // On the GPU backend, it's possible to have too many transposes within one + // fusion, causing the kernel to run out of shared memory and thus not compile. + // We want to check that doesn't happen. + // + // To do this, we create a computation that computes + // + // P0 + P0*P1*P1 + P0*P2*P2 ... + // + // where even parameters have layout 1 and odd parameters have layout 2. + // + // Our goal is to tempt the backend into creating one giant multi-output + // fusion for the whole computation, including the transposes. Currently + // multi-output fusion only fuses fusions, so each of the terms in the sum + // needs to be a fusion itself, thus the contortions above. + constexpr int kNumParams = 25; + XlaBuilder b("ManyLayoutTransformations"); + + // This test produces values that overflow int32, which is UB, so use uint32, + // where overflow is OK. 
+ Array2D arr(32, 32); + arr.FillUnique(); + std::unique_ptr l1 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout( + LayoutUtil::MakeLayout({0, 1})); + + std::unique_ptr l2 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout( + LayoutUtil::MakeLayout({1, 0})); + + XlaOp p0 = AddParam(*l1, &b); + XlaOp sum = p0; + for (int i = 1; i < kNumParams; ++i) { + auto pN = AddParam((i % 2 == 0 ? *l1 : *l2), &b); + sum = sum + p0 * pN * pN; + } + + ComputeAndCompare(&b, {}); +} + void BM_ParallelFusion(int num_iters) { // Simple element-wise computation to benchmark parallel task partitioning. tensorflow::testing::StopTiming(); @@ -793,31 +865,31 @@ void BM_ParallelFusion(int num_iters) { // Create computation. XlaBuilder builder("ParallelFusion"); Shape shape0 = ShapeUtil::MakeShape(F32, {param0_dim0, param0_dim1}); - auto param0 = builder.Parameter(0, shape0, "param0"); + auto param0 = Parameter(&builder, 0, shape0, "param0"); Shape shape1 = ShapeUtil::MakeShape(F32, {param1_dim0, param1_dim1}); - auto param1 = builder.Parameter(1, shape1, "param1"); + auto param1 = Parameter(&builder, 1, shape1, "param1"); Shape shape2 = ShapeUtil::MakeShape(F32, {param2_dim0, param2_dim1}); - auto param2 = builder.Parameter(2, shape2, "param2"); + auto param2 = Parameter(&builder, 2, shape2, "param2"); - auto x = builder.Mul(param0, param1); - auto y = builder.Add(x, param2); + auto x = Mul(param0, param1); + Add(x, param2); auto computation = builder.Build().ConsumeValueOrDie(); // Transfer literals to device. 
auto param0_literal = - Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1); + LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1); ScopedShapedBuffer buffer0 = client->LiteralToShapedBuffer(*param0_literal, device_ordinal) .ConsumeValueOrDie(); auto param1_literal = - Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1); + LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1); ScopedShapedBuffer buffer1 = client->LiteralToShapedBuffer(*param1_literal, device_ordinal) .ConsumeValueOrDie(); auto param2_literal = - Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1); + LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1); ScopedShapedBuffer buffer2 = client->LiteralToShapedBuffer(*param2_literal, device_ordinal) .ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 4854c649c15f2ab89bd3b343abd248be6e227c60..6d634980449268e509d87ee064fbaaaf59abd195 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -13,35 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" - -// NB! TODO(b/74360564): These tests do not test out of bounds behavior since -// that hasn't been specced yet. 
namespace xla { namespace { -using tensorflow::gtl::nullopt; +using absl::nullopt; class GatherOperationTest : public HloTestBase { protected: void RunTest(const string& hlo_text, Literal* operand, - Literal* gather_indices) { - RunTest(hlo_text, {operand, gather_indices}); + Literal* start_indices) { + RunTest(hlo_text, {operand, start_indices}); } - void RunTest(const string& hlo_text, - tensorflow::gtl::ArraySlice args) { + void RunTest(const string& hlo_text, absl::Span args) { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text, config)); + ParseHloString(hlo_text, config)); EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt)); } }; @@ -54,17 +51,17 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[2,3] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1, 3} + slice_sizes={1, 3} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, TensorFlowGatherV2) { @@ -75,17 +72,17 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[3,2] gather(operand, indices), - output_window_dims={0}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 
6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, TensorFlowGatherMultipleBatchDims) { @@ -96,18 +93,18 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,3,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3, 1} + slice_sizes={3, 1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 2}, {2, 1}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_0) { @@ -118,18 +115,18 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2,2] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, - window_bounds={1, 1} + slice_sizes={1, 1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR3({{{0, 2}, {2, 
1}}, {{1, 2}, {2, 0}}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_1) { @@ -140,18 +137,18 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2,2] parameter(1) ROOT gather = s32[2,1,1,2] gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=2, - window_bounds={1, 1} + slice_sizes={1, 1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, TensorFlowGatherNd) { @@ -162,20 +159,20 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1,2} + slice_sizes={1,1,2} } )"; std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 0}, {1, 0}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } 
XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdNonDefaultIndexVectorDim) { @@ -186,20 +183,20 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1,2} + slice_sizes={1,1,2} } )"; std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 0}, {1, 0}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, DynamicSlice) { @@ -210,17 +207,17 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[1,1] gather(operand, indices), - output_window_dims={0,1}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={0,1}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({1, 1}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({1, 1}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, BatchDynamicSlice) { @@ -231,18 +228,18 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) ROOT gather = s32[2,1,1] 
gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{2, 1}, {1, 1}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, ZeroDimBounds) { @@ -253,25 +250,21 @@ ENTRY main { operand = s32[3,0] parameter(0) indices = s32[2] parameter(1) ROOT gather = s32[2,0] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1, 0} + slice_sizes={1, 0} } )"; - std::unique_ptr operand = Literal::CreateR2({{}, {}, {}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + std::unique_ptr operand = LiteralUtil::CreateR2({{}, {}, {}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({0, 2}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, OutOfBoundsIndex) { // Out of bounds indices must not crash, and the indices in range should // produce the same values across all backends. - // - // TODO(b/74360564): Once we have a well defined semantics for OOB accesses, - // we should get rid of the mask and check that backends produce the same - // value for OOB indices too. 
const string hlo_text = R"( HloModule BatchDynamicSlice @@ -280,34 +273,50 @@ ENTRY main { operand = s32[3,3]{1,0} parameter(0) indices = s32[6,2]{1,0} parameter(1) gather = s32[6,1,1]{2,1,0} gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1} - gather_reshaped = s32[6]{0} reshape(gather) - in_bounds_mask = s32[6]{0} parameter(2) - ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask) + slice_sizes={1,1} + ROOT result = s32[6]{0} reshape(gather) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR2( + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR2( {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}}); - std::unique_ptr in_bounds_mask = - Literal::CreateR1({0, 1, 1, 0, 0, 1}); + RunTest(hlo_text, operand.get(), start_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, OutOfBoundsUnsignedIndex) { + // Out of bounds indices must not crash, and the indices in range should + // produce the same values across all backends. 
+ + const string hlo_text = R"( +HloModule BatchDynamicSlice - RunTest(hlo_text, - {operand.get(), gather_indices.get(), in_bounds_mask.get()}); +ENTRY main { + operand = s32[3,3]{1,0} parameter(0) + indices = u32[6,2]{1,0} parameter(1) + gather = s32[6,1,1]{2,1,0} gather(operand, indices), + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, + index_vector_dim=1, + slice_sizes={1,1} + ROOT result = s32[6]{0} reshape(gather) +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR2( + {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, NegativeIndex) { // Negative indices must not crash, and the indices in range should produce // the same values across all backends. - // - // TODO(b/74360564): Once we have a well defined semantics for negative - // accesses, we should get rid of the mask and check that backends produce the - // same value for negative indices too. 
const string hlo_text = R"( HloModule BatchDynamicSlice @@ -316,25 +325,45 @@ ENTRY main { operand = s32[3,3]{1,0} parameter(0) indices = s32[6,2]{1,0} parameter(1) gather = s32[6,1,1]{2,1,0} gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1} - gather_reshaped = s32[6]{0} reshape(gather) - in_bounds_mask = s32[6]{0} parameter(2) - ROOT result = s32[6]{0} multiply(gather_reshaped, in_bounds_mask) + slice_sizes={1,1} + ROOT result = s32[6]{0} reshape(gather) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR2( + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR2( {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}}); - std::unique_ptr in_bounds_mask = - Literal::CreateR1({0, 1, 1, 0, 0, 1}); + RunTest(hlo_text, operand.get(), start_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, NegativeIndexIntoUnsignedOperand) { + // Negative indices must not crash, and the indices in range should produce + // the same values across all backends. 
+ + const string hlo_text = R"( +HloModule BatchDynamicSlice - RunTest(hlo_text, - {operand.get(), gather_indices.get(), in_bounds_mask.get()}); +ENTRY main { + operand = u32[3,3]{1,0} parameter(0) + indices = s32[6,2]{1,0} parameter(1) + gather = u32[6,1,1]{2,1,0} gather(operand, indices), + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, + index_vector_dim=1, + slice_sizes={1,1} + ROOT result = u32[6]{0} reshape(gather) +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR2( + {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, OneScalarIndex) { @@ -345,17 +374,17 @@ ENTRY main { operand = s32[2,3,2]{2,1,0} parameter(0) index = s32[] parameter(1) ROOT gather = s32[1,3,2]{2,1,0} gather(operand, index), - output_window_dims={0,1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0}, + offset_dims={0,1,2}, + collapsed_slice_dims={}, + start_index_map={0}, index_vector_dim=0, - window_bounds={1,3,2} + slice_sizes={1,3,2} } )"; - std::unique_ptr operand = Literal::CreateR3( + std::unique_ptr operand = LiteralUtil::CreateR3( {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}); - std::unique_ptr gather_indices = Literal::CreateR0(1); - RunTest(hlo_text, operand.get(), gather_indices.get()); + std::unique_ptr start_indices = LiteralUtil::CreateR0(1); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, ScalarResult) { @@ -366,16 +395,16 @@ ENTRY main { operand = s32[4]{0} parameter(0) index = s32[] parameter(1) ROOT gather = s32[] gather(operand, index), - output_window_dims={}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=0, - window_bounds={1} + slice_sizes={1} } )"; - std::unique_ptr operand = 
Literal::CreateR1({1, 2, 3, 4}); - std::unique_ptr gather_indices = Literal::CreateR0(1); - RunTest(hlo_text, operand.get(), gather_indices.get()); + std::unique_ptr operand = LiteralUtil::CreateR1({1, 2, 3, 4}); + std::unique_ptr start_indices = LiteralUtil::CreateR0(1); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, ZeroSizedResult) { @@ -386,17 +415,17 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[0] parameter(1) ROOT gather = s32[0,3] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0}, - gather_dims_to_operand_dims={0}, + offset_dims={1}, + collapsed_slice_dims={0}, + start_index_map={0}, index_vector_dim=1, - window_bounds={1, 3} + slice_sizes={1, 3} } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherV2) { @@ -407,20 +436,20 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) gather = s32[3,2] gather(operand, indices), - output_window_dims={0}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={0}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=1, - window_bounds={3, 1} + slice_sizes={3, 1} one = s32[] constant(1) one_broadcasted = s32[3,2] broadcast(one), dimensions={} ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = 
LiteralUtil::CreateR1({0, 2}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherMultipleBatchDims) { @@ -431,21 +460,21 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,3,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={1}, - gather_dims_to_operand_dims={1}, + offset_dims={1}, + collapsed_slice_dims={1}, + start_index_map={1}, index_vector_dim=2, - window_bounds={3, 1} + slice_sizes={3, 1} one = s32[] constant(1) one_broadcasted = s32[2,3,2] broadcast(one), dimensions={} ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 2}, {2, 1}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNdMultipleBatchDims) { @@ -456,21 +485,21 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, - window_bounds={1, 1} + slice_sizes={1, 1} one = s32[] constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = - Literal::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr 
start_indices = + LiteralUtil::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNd) { @@ -481,23 +510,23 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, - window_bounds={1,1,2} + slice_sizes={1,1,2} one = s32[] constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - Literal::CreateR2({{0, 0}, {1, 0}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, @@ -509,23 +538,23 @@ ENTRY main { operand = s32[3,3,2] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,2] gather(operand, indices), - output_window_dims={1}, - elided_window_dims={0,1}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1}, + collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1,2} + slice_sizes={1,1,2} one = s32[] constant(1) one_broadcasted = s32[2,2] broadcast(one), dimensions={} ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // - {{-4, 4}, {-5, 5}, {-6, 6}}, // - {{-7, 7}, {-8, 8}, {-9, 9}}}); - std::unique_ptr gather_indices = - 
Literal::CreateR2({{0, 0}, {1, 0}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedDynamicSlice) { @@ -536,20 +565,20 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2] parameter(1) gather = s32[1,1] gather(operand, indices), - output_window_dims={0,1}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={0,1}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} one = s32[] constant(1) one_broadcasted = s32[1,1] broadcast(one), dimensions={} ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = Literal::CreateR1({1, 1}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = LiteralUtil::CreateR1({1, 1}); + RunTest(hlo_text, operand.get(), start_indices.get()); } XLA_TEST_F(GatherOperationTest, FusedBatchDynamicSlice) { @@ -560,21 +589,21 @@ ENTRY main { operand = s32[3,3] parameter(0) indices = s32[2,2] parameter(1) gather = s32[2,1,1] gather(operand, indices), - output_window_dims={1,2}, - elided_window_dims={}, - gather_dims_to_operand_dims={0,1}, + offset_dims={1,2}, + collapsed_slice_dims={}, + start_index_map={0,1}, index_vector_dim=0, - window_bounds={1,1} + slice_sizes={1,1} one = s32[] constant(1) one_broadcasted = s32[2,1,1] broadcast(one), dimensions={} ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted) } )"; std::unique_ptr operand = - Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - std::unique_ptr gather_indices = 
- Literal::CreateR2({{2, 1}, {1, 1}}); - RunTest(hlo_text, operand.get(), gather_indices.get()); + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr start_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); + RunTest(hlo_text, operand.get(), start_indices.get()); } class GatherClientLibraryTest : public ClientLibraryTestBase {}; @@ -586,11 +615,11 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { // operand = s32[3,3] parameter(0) // indices = s32[2] parameter(1) // ROOT gather = s32[2,3] gather(operand, indices), - // output_window_dims={1}, - // elided_window_dims={0}, - // gather_dims_to_operand_dims={0}, + // offset_dims={1}, + // collapsed_slice_dims={0}, + // start_index_map={0}, // index_vector_dim=1, - // window_bounds={1, 3} + // slice_sizes={1, 3} // } XlaBuilder builder("gather_basic"); @@ -598,22 +627,23 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3}); Shape indices_shape = ShapeUtil::MakeShape(S32, {2}); - auto operand = builder.Parameter(0, operand_shape, "operand"); - auto indices = builder.Parameter(1, indices_shape, "indices"); + auto operand = Parameter(&builder, 0, operand_shape, "operand"); + auto indices = Parameter(&builder, 1, indices_shape, "indices"); GatherDimensionNumbers dim_numbers; - dim_numbers.add_output_window_dims(1); - dim_numbers.add_elided_window_dims(0); - dim_numbers.add_gather_dims_to_operand_dims(0); + dim_numbers.add_offset_dims(1); + dim_numbers.add_collapsed_slice_dims(0); + dim_numbers.add_start_index_map(0); dim_numbers.set_index_vector_dim(1); - builder.Gather(operand, indices, dim_numbers, {1, 3}); + Gather(operand, indices, dim_numbers, {1, 3}); std::vector expected = {}; - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr operand_arg, - client_->TransferToServer(*Literal::CreateR2( - {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr operand_arg, + client_->TransferToServer( + 
*LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}))); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr indices_arg, - client_->TransferToServer(*Literal::CreateR1({0, 2}))); + client_->TransferToServer(*LiteralUtil::CreateR1({0, 2}))); TF_ASSERT_OK_AND_ASSIGN(std::vector devices, client_->GetDeviceHandles(1)); xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions(); @@ -629,8 +659,8 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { client_->ExecuteParallel(computation_instances)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, client_->Transfer(*(result_data[0]))); - EXPECT_TRUE(LiteralTestUtil::Equal( - *result_literal, *Literal::CreateR2({{1, 2, 3}, {7, 8, 9}}))); + LiteralTestUtil::ExpectR2Equal({{1, 2, 3}, {7, 8, 9}}, + *result_literal); } } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc index 76bf47845ca045b4eede9a3b47ae5c2ce93ce577..1115e50fe3120b7dbd891f07dedcacefa5ecf3ea 100644 --- a/tensorflow/compiler/xla/tests/half_test.cc +++ b/tensorflow/compiler/xla/tests/half_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -37,8 +37,7 @@ class HalfTestBase : public ClientLibraryTestBase { static const int kNumElements = 4; }; -using UnaryBuildFuncTy = - std::function; +using UnaryBuildFuncTy = std::function; struct UnaryOpTestParam { std::function compute_func; @@ -49,7 +48,8 @@ class UnaryOpTest : public HalfTestBase, public ::testing::WithParamInterface {}; XLA_TEST_P(UnaryOpTest, Ops) { - std::vector x({half(1.4), half(-2.3), half(3.2), half(-4.1)}); + std::vector x({half(1.4), half(-2.3), half(3.2), half(-4.1), half(9.0), + half(42.0), half(-9.0), half(-100.0)}); XlaBuilder builder(TestName()); XlaOp x_opnd; auto x_data = CreateR1Parameter(x, /*parameter_number=*/0, "x", @@ -62,7 +62,7 @@ XLA_TEST_P(UnaryOpTest, Ops) { } UnaryBuildFuncTy build_func = GetParam().build_func; - build_func(&builder, x_opnd); + build_func(x_opnd); ComputeAndCompareR1(&builder, expected, {x_data.get()}, error_spec_); } @@ -79,18 +79,17 @@ half round_imp(half value) { INSTANTIATE_TEST_CASE_P( half, UnaryOpTest, ::testing::Values( - UnaryOpTestParam{[](half x) { return abs(x); }, &XlaBuilder::Abs}, - UnaryOpTestParam{[](half x) { return round_imp(x); }, - &XlaBuilder::Round}, - UnaryOpTestParam{[](half x) { return ceil(x); }, &XlaBuilder::Ceil}, - UnaryOpTestParam{[](half x) { return cos(x); }, &XlaBuilder::Cos}, - UnaryOpTestParam{[](half x) { return exp(x); }, &XlaBuilder::Exp}, - UnaryOpTestParam{[](half x) { return floor(x); }, &XlaBuilder::Floor}, - UnaryOpTestParam{[](half x) { return log(x); }, &XlaBuilder::Log}, - UnaryOpTestParam{[](half x) { return -x; }, &XlaBuilder::Neg}, - UnaryOpTestParam{[](half x) { return 
sign_imp(x); }, &XlaBuilder::Sign}, - UnaryOpTestParam{[](half x) { return sin(x); }, &XlaBuilder::Sin}, - UnaryOpTestParam{[](half x) { return tanh(x); }, &XlaBuilder::Tanh} + UnaryOpTestParam{[](half x) { return abs(x); }, &Abs}, + UnaryOpTestParam{[](half x) { return round_imp(x); }, &Round}, + UnaryOpTestParam{[](half x) { return ceil(x); }, &Ceil}, + UnaryOpTestParam{[](half x) { return cos(x); }, &Cos}, + UnaryOpTestParam{[](half x) { return exp(x); }, &Exp}, + UnaryOpTestParam{[](half x) { return floor(x); }, &Floor}, + UnaryOpTestParam{[](half x) { return log(x); }, &Log}, + UnaryOpTestParam{[](half x) { return -x; }, &Neg}, + UnaryOpTestParam{[](half x) { return sign_imp(x); }, &Sign}, + UnaryOpTestParam{[](half x) { return sin(x); }, &Sin}, + UnaryOpTestParam{[](half x) { return tanh(x); }, &Tanh} )); @@ -118,19 +117,17 @@ XLA_TEST_P(UnaryPredTest, Ops) { } UnaryBuildFuncTy build_func = GetParam().build_func; - build_func(&builder, x_opnd); + build_func(x_opnd); ComputeAndCompareR1(&builder, expected, {x_data.get()}); } INSTANTIATE_TEST_CASE_P(half, UnaryPredTest, ::testing::Values(UnaryPredTestParam{ - [](half x) { return isfinite(x); }, - &XlaBuilder::IsFinite})); + [](half x) { return isfinite(x); }, &IsFinite})); using BinaryBuildFuncTy = std::function)>; + const xla::XlaOp& x, const xla::XlaOp& y, absl::Span)>; struct BinaryOpTestParam { std::function compute_func; @@ -159,7 +156,7 @@ XLA_TEST_P(BinaryOpTest, Ops) { } BinaryBuildFuncTy build_func = GetParam().build_func; - build_func(&builder, x_opnd, y_opnd, {}); + build_func(x_opnd, y_opnd, {}); ComputeAndCompareR1(&builder, expected, {x_data.get(), y_data.get()}, error_spec_); @@ -173,22 +170,15 @@ half atan2_imp(half x, half y) { INSTANTIATE_TEST_CASE_P( half, BinaryOpTest, ::testing::Values( - BinaryOpTestParam{[](half x, half y) { return x + y; }, - &XlaBuilder::Add}, + BinaryOpTestParam{[](half x, half y) { return x + y; }, &Add}, BinaryOpTestParam{[](half x, half y) { return atan2_imp(x, y); 
}, - &XlaBuilder::Atan2}, - BinaryOpTestParam{[](half x, half y) { return x / y; }, - &XlaBuilder::Div}, - BinaryOpTestParam{[](half x, half y) { return max(x, y); }, - &XlaBuilder::Max}, - BinaryOpTestParam{[](half x, half y) { return min(x, y); }, - &XlaBuilder::Min}, - BinaryOpTestParam{[](half x, half y) { return x * y; }, - &XlaBuilder::Mul}, - BinaryOpTestParam{[](half x, half y) { return pow(x, y); }, - &XlaBuilder::Pow}, - BinaryOpTestParam{[](half x, half y) { return x - y; }, - &XlaBuilder::Sub} + &Atan2}, + BinaryOpTestParam{[](half x, half y) { return x / y; }, &Div}, + BinaryOpTestParam{[](half x, half y) { return max(x, y); }, &Max}, + BinaryOpTestParam{[](half x, half y) { return min(x, y); }, &Min}, + BinaryOpTestParam{[](half x, half y) { return x * y; }, &Mul}, + BinaryOpTestParam{[](half x, half y) { return pow(x, y); }, &Pow}, + BinaryOpTestParam{[](half x, half y) { return x - y; }, &Sub} )); @@ -221,27 +211,22 @@ XLA_TEST_P(BinaryPredTest, Ops) { } BinaryBuildFuncTy build_func = GetParam().build_func; - build_func(&builder, x_opnd, y_opnd, {}); + build_func(x_opnd, y_opnd, {}); ComputeAndCompareR1(&builder, expected, {x_data.get(), y_data.get()}); } INSTANTIATE_TEST_CASE_P( half, BinaryPredTest, - ::testing::Values(BinaryPredTestParam{[](half x, half y) { return x == y; }, - &XlaBuilder::Eq}, - BinaryPredTestParam{[](half x, half y) { return x != y; }, - &XlaBuilder::Ne}, - BinaryPredTestParam{[](half x, half y) { return x >= y; }, - &XlaBuilder::Ge}, - BinaryPredTestParam{[](half x, half y) { return x > y; }, - &XlaBuilder::Gt}, - BinaryPredTestParam{[](half x, half y) { return x <= y; }, - &XlaBuilder::Le}, - BinaryPredTestParam{[](half x, half y) { return x < y; }, - &XlaBuilder::Lt} - - )); + ::testing::Values( + BinaryPredTestParam{[](half x, half y) { return x == y; }, &Eq}, + BinaryPredTestParam{[](half x, half y) { return x != y; }, &Ne}, + BinaryPredTestParam{[](half x, half y) { return x >= y; }, &Ge}, + BinaryPredTestParam{[](half 
x, half y) { return x > y; }, &Gt}, + BinaryPredTestParam{[](half x, half y) { return x <= y; }, &Le}, + BinaryPredTestParam{[](half x, half y) { return x < y; }, &Lt} + + )); } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc index cf971dd61b71ad329b20b0bb7c16166126562681..5511190caf95544e2ac48d91c0a138db06a2544c 100644 --- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc +++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" @@ -30,9 +30,9 @@ class HloMetadataTest : public LocalClientTestBase { } void BuildAddComputation(XlaBuilder* builder) { - auto x = builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder->Add(x, y); + auto x = Parameter(builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Add(x, y); } OpMetadata metadata_; diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 36e19e6507fa3b6f4a21949583f92716d2f44333..fc4c68246e62a4baa7a506ec37886102c35c4b3b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -20,17 +20,20 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -39,9 +42,8 @@ namespace xla { namespace { -using tensorflow::StringPiece; -using tensorflow::gtl::ArraySlice; -using tensorflow::gtl::optional; +using absl::optional; +using absl::string_view; constexpr char kInterpreter[] = "interpreter"; @@ -83,22 +85,42 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) { } // namespace -HloTestBase::HloTestBase() - : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {} +HloTestBase::HloTestBase(bool verifier_layout_sensitive, + bool allow_mixed_precision_in_hlo_verifier) + : HloTestBase(GetTestPlatform(), GetReferencePlatform(), + verifier_layout_sensitive, + allow_mixed_precision_in_hlo_verifier) {} HloTestBase::HloTestBase(se::Platform* test_platform, - se::Platform* reference_platform) + se::Platform* reference_platform, + bool verifier_layout_sensitive, + bool allow_mixed_precision_in_hlo_verifier) : test_runner_(test_platform), reference_runner_(reference_platform) { - hlo_verifier_ = 
MakeUnique(/*allow_mixed_precision=*/true); + hlo_verifier_ = absl::make_unique( + /*layout_sensitive=*/verifier_layout_sensitive, + /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier); } -/* static */ std::unique_ptr HloTestBase::CreateNewModule(const string& name) { - return MakeUnique(name, VersionedComputationHandle(), - GetModuleConfigForTest()); + return absl::make_unique(name, GetModuleConfigForTest()); } -/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() { +/* static */ +StatusOr HloTestBase::RunHloPass(HloPassInterface* hlo_pass, + HloModule* module) { + const string module_str_before_run = module->ToProto().ShortDebugString(); + const auto status_or = hlo_pass->Run(module); + if (status_or.status().ok()) { + const string module_str_after_run = module->ToProto().ShortDebugString(); + if (!status_or.ValueOrDie()) { + // Check that the proto remains same. + EXPECT_EQ(module_str_after_run, module_str_before_run); + } + } + return status_or; +} + +DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = legacy_flags::GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. 
debug_options.add_xla_disable_hlo_passes("constant_folding"); @@ -107,14 +129,12 @@ std::unique_ptr HloTestBase::CreateNewModule(const string& name) { } StatusOr> HloTestBase::Execute( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments) { + std::unique_ptr module, absl::Span arguments) { return test_runner_.Execute(std::move(module), arguments); } std::unique_ptr HloTestBase::ExecuteNoHloPasses( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments) { + std::unique_ptr module, absl::Span arguments) { return test_runner_ .Execute(std::move(module), arguments, /*run_hlo_passes=*/false) @@ -122,8 +142,7 @@ std::unique_ptr HloTestBase::ExecuteNoHloPasses( } std::unique_ptr HloTestBase::ExecuteAndTransfer( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments) { + std::unique_ptr module, absl::Span arguments) { return test_runner_.Execute(std::move(module), arguments).ValueOrDie(); } @@ -146,7 +165,8 @@ StatusOr> HloTestBase::MakeReferenceModule( } StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( - std::unique_ptr module, const ArraySlice arguments, + std::unique_ptr module, + const absl::Span arguments, const optional& error, bool run_hlo_passes, const std::function& reference_preprocessor) { TF_RETURN_IF_ERROR(hlo_verifier_->Run(module.get()).status()); @@ -165,7 +185,8 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } ::testing::AssertionResult HloTestBase::RunAndCompare( - std::unique_ptr module, const ArraySlice arguments, + std::unique_ptr module, + const absl::Span arguments, const optional& error, const std::function& reference_preprocessor) { auto result = @@ -178,7 +199,8 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses( - std::unique_ptr module, const ArraySlice arguments, + std::unique_ptr module, + const absl::Span arguments, const optional& error, const std::function& 
reference_preprocessor) { auto result = @@ -197,7 +219,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( MakeFakeArguments(module.get()).ConsumeValueOrDie(); std::vector fake_argument_ptrs; - c_transform( + absl::c_transform( fake_arguments, std::back_inserter(fake_argument_ptrs), [](const std::unique_ptr& literal) { return literal.get(); }); @@ -211,7 +233,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( const auto& fake_arguments = MakeFakeArguments(module.get()).ConsumeValueOrDie(); std::vector fake_argument_ptrs; - c_transform( + absl::c_transform( fake_arguments, std::back_inserter(fake_argument_ptrs), [](const std::unique_ptr& literal) { return literal.get(); }); @@ -220,8 +242,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } ::testing::AssertionResult HloTestBase::RunAndCompare( - const StringPiece hlo_string, - const tensorflow::gtl::optional& error, + string_view hlo_string, const absl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()); @@ -234,8 +255,31 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( reference_preprocessor); } +::testing::AssertionResult HloTestBase::Run(string_view hlo_string) { + auto module_or_status = + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()); + if (!module_or_status.ok()) { + return ::testing::AssertionFailure() + << "Error while parsing HLO text format: " + << module_or_status.status().ToString(); + } + const auto& fake_arguments = + MakeFakeArguments(module_or_status.ValueOrDie().get()) + .ConsumeValueOrDie(); + std::vector fake_argument_ptrs; + absl::c_transform( + fake_arguments, std::back_inserter(fake_argument_ptrs), + [](const std::unique_ptr& literal) { return literal.get(); }); + return test_runner_ + .Execute(std::move(module_or_status.ValueOrDie()), + 
fake_argument_ptrs, /*run_hlo_passes=*/true) + .ok() + ? ::testing::AssertionSuccess() + : ::testing::AssertionFailure(); +} + ::testing::AssertionResult HloTestBase::RunAndCompareFromFile( - const string& filename, const tensorflow::gtl::optional& error, + const string& filename, const absl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); @@ -248,8 +292,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses( - const StringPiece hlo_string, - const tensorflow::gtl::optional& error, + string_view hlo_string, const absl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()); @@ -263,7 +306,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile( - const string& filename, const tensorflow::gtl::optional& error, + const string& filename, const absl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); @@ -276,21 +319,23 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( } HloComputation* HloTestBase::FindComputation(HloModule* module, - tensorflow::StringPiece name) { - auto it = c_find_if(module->computations(), - [&](HloComputation* c) { return c->name() == name; }); - if (it == module->computations().end()) { + absl::string_view name) { + auto computations = module->computations(); + auto it = absl::c_find_if( + computations, [&](HloComputation* c) { return c->name() == name; }); + if (it == computations.end()) { return nullptr; } return *it; } HloInstruction* HloTestBase::FindInstruction(HloModule* module, - tensorflow::StringPiece 
name) { + absl::string_view name) { for (const HloComputation* c : module->computations()) { - auto it = c_find_if(c->instructions(), - [&](HloInstruction* i) { return i->name() == name; }); - if (it != c->instructions().end()) { + auto instructions = c->instructions(); + auto it = absl::c_find_if( + instructions, [&](HloInstruction* i) { return i->name() == name; }); + if (it != instructions.end()) { return *it; } } diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index eb3a2ea76a667a2afa2562f01d28f34384b84a21..4c88257bb27f5504588bba3ee0b14ac53c971225 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -31,8 +33,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test.h" @@ -66,35 +66,47 @@ namespace xla { // // For a more detailed example, see "../tests/sample_text_test.cc". class HloTestBase : public ::testing::Test { + public: + // Creates a new HLO module for a test. The module created will have + // TestName() for its name; it will also automatically populate its debug + // options from command-line flags. If you want a fresh HloModule object and + // then add HloComputations to it, it's recommended to use this method in your + // tests. 
+ std::unique_ptr CreateNewModule(const string& name = TestName()); + + // Runs the hlo_pass with the provided module and returns the result. This + // function also verifies that the module remains unchanged when hlo_pass + // returns false as the StatusOr value. + static StatusOr RunHloPass(HloPassInterface* hlo_pass, + HloModule* module); + protected: // This uses the interpreter backend as the reference backend and // automatically finds another supported backend as the test backend. If the // interpreter is the only supported backend, it will be both the test backend // and the reference backend. - HloTestBase(); + HloTestBase(bool verifier_layout_sensitive = false, + bool allow_mixed_precision_in_hlo_verifier = true); // If your test doesn't use interpreter as the reference backend, you can use // this constructor. Note that your test target is responsible for linking in // both needed backends. - HloTestBase(se::Platform* test_platform, se::Platform* reference_platform); + HloTestBase(se::Platform* test_platform, se::Platform* reference_platform, + bool verifier_layout_sensitive = false, + bool allow_mixed_precision_in_hlo_verifier = true); ~HloTestBase() override {} - // Creates a new HLO module for a test. The module created will have - // TestName() for its name; it will also automatically populate its debug - // options from command-line flags. If you want a fresh HloModule object and - // then add HloComputations to it, it's recommended to use this method in your - // tests. - static std::unique_ptr CreateNewModule( - const string& name = TestName()); - // Populates debug options from command-line flags and adjusts the options for // testing. It is recommended to use this when you need to pass in // DebugOptions, e.g. when creating a module from a string or a file. - static DebugOptions GetDebugOptionsForTest(); + // + // This function is virtual so tests can specify an alternative set of debug + // options (e.g. disabling additional passes). 
+ virtual DebugOptions GetDebugOptionsForTest(); // Gets an HloModuleConfig with options appropriate for tests. - static HloModuleConfig GetModuleConfigForTest() { + HloModuleConfig GetModuleConfigForTest() { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); return config; @@ -102,18 +114,15 @@ class HloTestBase : public ::testing::Test { // Executes the given module and return the result as a Literal. StatusOr> Execute( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments); + std::unique_ptr module, absl::Span arguments); // Same as above, except the module will be executed without running any HLO // passes on it. std::unique_ptr ExecuteNoHloPasses( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments); + std::unique_ptr module, absl::Span arguments); std::unique_ptr ExecuteAndTransfer( - std::unique_ptr module, - tensorflow::gtl::ArraySlice arguments); + std::unique_ptr module, absl::Span arguments); // Executes the given hlo module on two backends and compares results. // @@ -128,8 +137,8 @@ class HloTestBase : public ::testing::Test { // modified. ::testing::AssertionResult RunAndCompare( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - const tensorflow::gtl::optional& error, + const absl::Span arguments, + const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; @@ -137,23 +146,21 @@ class HloTestBase : public ::testing::Test { // optimization. ::testing::AssertionResult RunAndCompareNoHloPasses( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - const tensorflow::gtl::optional& error, + const absl::Span arguments, + const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; // Executes an hlo module with fake inputs and compares the results. 
::testing::AssertionResult RunAndCompare( - std::unique_ptr module, - const tensorflow::gtl::optional& error, + std::unique_ptr module, const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; // Same as above, except that the module will be executed without Hlo // optimization. ::testing::AssertionResult RunAndCompareNoHloPasses( - std::unique_ptr module, - const tensorflow::gtl::optional& error, + std::unique_ptr module, const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; @@ -161,21 +168,23 @@ class HloTestBase : public ::testing::Test { // input. Module can be passed in directly, or parsed from an hlo_string, // or loaded from a file. ::testing::AssertionResult RunAndCompare( - const tensorflow::StringPiece hlo_string, - const tensorflow::gtl::optional& error, + const absl::string_view hlo_string, + const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; + ::testing::AssertionResult Run(const absl::string_view hlo_string) + TF_MUST_USE_RESULT; ::testing::AssertionResult RunAndCompareFromFile( - const string& filename, const tensorflow::gtl::optional& error, + const string& filename, const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; ::testing::AssertionResult RunAndCompareNoHloPasses( - const tensorflow::StringPiece hlo_string, - const tensorflow::gtl::optional& error, + const absl::string_view hlo_string, + const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; ::testing::AssertionResult RunAndCompareNoHloPassesFromFile( - const string& filename, const tensorflow::gtl::optional& error, + const string& filename, const absl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; @@ -184,13 +193,9 @@ class HloTestBase : public ::testing::Test { // 'layout'. 
void ForceParameterLayout(HloModule* module, int64 param_no, const Layout& layout) { - ASSERT_LT( - param_no, - module->mutable_host_entry_computation_layout()->parameter_count()); - module->mutable_host_entry_computation_layout() - ->mutable_parameter_layout(param_no) - ->ResetLayout(layout); - module->mutable_device_entry_computation_layout() + ASSERT_LT(param_no, + module->mutable_entry_computation_layout()->parameter_count()); + module->mutable_entry_computation_layout() ->mutable_parameter_layout(param_no) ->ResetLayout(layout); } @@ -198,21 +203,22 @@ class HloTestBase : public ::testing::Test { // Convenience method to force the layout of the computation result in a // module. The result layout of 'module' is set to 'layout'. void ForceResultLayout(HloModule* module, const Layout& layout) { - module->mutable_host_entry_computation_layout() + module->mutable_entry_computation_layout() ->mutable_result_layout() ->ResetLayout(layout); - module->mutable_device_entry_computation_layout() + } + + void ForceResultLayout(HloModule* module, const Layout& layout, + ShapeIndexView shape_index) { + module->mutable_entry_computation_layout() ->mutable_result_layout() - ->ResetLayout(layout); + ->ResetLayout(layout, shape_index); } // Convenience method to clear the layout of the computation result in // 'module'. void ForceClearResultLayout(HloModule* module) { - module->mutable_host_entry_computation_layout() - ->mutable_result_layout() - ->Clear(); - module->mutable_device_entry_computation_layout() + module->mutable_entry_computation_layout() ->mutable_result_layout() ->Clear(); } @@ -221,10 +227,8 @@ class HloTestBase : public ::testing::Test { // // This is useful for tests which create HLOs from a string and then want to // inspect a particular computation or instruction. 
- HloComputation* FindComputation(HloModule* module, - tensorflow::StringPiece name); - HloInstruction* FindInstruction(HloModule* module, - tensorflow::StringPiece name); + HloComputation* FindComputation(HloModule* module, absl::string_view name); + HloInstruction* FindInstruction(HloModule* module, absl::string_view name); // Return an HLO verifier constructed for the test backend. HloVerifier& verifier() const { return *hlo_verifier_; } @@ -254,8 +258,8 @@ class HloTestBase : public ::testing::Test { // error happens before the results are computed, returns the error status. StatusOr<::testing::AssertionResult> RunAndCompareInternal( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - const tensorflow::gtl::optional& error, bool run_hlo_passes, + const absl::Span arguments, + const absl::optional& error, bool run_hlo_passes, const std::function& reference_preprocessor); }; diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc index da4cf4ae0c31bc194cd2ec9b845df36afbde69b0..8f86c528d0f346b0264948d592660911880f96d1 100644 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc @@ -15,17 +15,21 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" namespace xla { -HloVerifiedTestBase::HloVerifiedTestBase() - : shape_verifier_(MakeUnique()) {} +HloVerifiedTestBase::HloVerifiedTestBase(bool layout_sensitive, + bool allow_mixed_precision) + : HloTestBase( + /*verifier_layout_sensitive=*/layout_sensitive, + /*allow_mixed_precision_in_hlo_verifier=*/allow_mixed_precision) {} HloVerifiedTestBase::~HloVerifiedTestBase() { // We can't call the ASSERT or EXPECT test macros in destructors, so we @@ -41,14 +45,16 @@ void HloVerifiedTestBase::TearDown() { << "TearDown called more than once; it should be called exactly once."; tear_down_called_ = true; if (module_) { - VerifyModule(); + VerifyModule(module_.get()); + } + for (int i = 0; i < modules_.size(); ++i) { + VerifyModule(modules_.at(i).get()); } HloTestBase::TearDown(); } -void HloVerifiedTestBase::VerifyModule() { - HloVerifier verifier; - xla::StatusOr mutated = verifier.Run(module_.get()); +void HloVerifiedTestBase::VerifyModule(HloModule* module) { + xla::StatusOr mutated = verifier().Run(module); if (!mutated.ok()) { ADD_FAILURE() << "HloVerifier failed: " << mutated.status(); } else { @@ -59,15 +65,20 @@ void HloVerifiedTestBase::VerifyModule() { HloModule& HloVerifiedTestBase::module() { if (!module_) { - module_ = CreateNewModule(); + module_ = HloTestBase::CreateNewModule(); } return *module_; } -void HloVerifiedTestBase::ParseAndVerifyModule( - tensorflow::StringPiece hlo_text) { +HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) { + 
modules_.emplace_back(HloTestBase::CreateNewModule()); + return modules_.back().get(); +} + +void HloVerifiedTestBase::ParseAndVerifyModule(absl::string_view hlo_text, + const HloModuleConfig& config) { CHECK(!module_) << "Called ParseModule when test already has a module."; - TF_ASSERT_OK_AND_ASSIGN(module_, tools::Parse(hlo_text)); - VerifyModule(); + TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text, config)); + VerifyModule(module_.get()); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h index e5bb14a8839acbdef8fd2b79bb0f574c46ea3d40..8fbc4fa753ebf0c02b44ce10edf9251d28113f98 100644 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h @@ -29,7 +29,8 @@ namespace xla { // performs verification on that module on tear-down. class HloVerifiedTestBase : public HloTestBase { protected: - HloVerifiedTestBase(); + explicit HloVerifiedTestBase(bool layout_sensitive = false, + bool allow_mixed_precision = false); ~HloVerifiedTestBase() override; // Constructs a default shape verifier. @@ -44,19 +45,28 @@ class HloVerifiedTestBase : public HloTestBase { // Returns the default HloModule, lazily creating it if necessary via // HloTestBase::CreateNewModule(). HloModule& module(); - void ParseAndVerifyModule(tensorflow::StringPiece hlo_text); + void ParseAndVerifyModule(absl::string_view hlo_text, + const HloModuleConfig& config = HloModuleConfig()); - // Sets the shape-size function used during hlo verification. If this isn't - // called, a default ShapeVerifier is used instead. - void SetShapeVerifier(std::unique_ptr shape_verifier) { - shape_verifier_ = std::move(shape_verifier); - } + // Creates a new module for a test, and stores it in modules_ so it can be + // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent + // creation of unverified modules. 
+ HloModule* CreateNewModule(const string& name = TestName()); private: - std::unique_ptr module_; // Lazily populated. Access via module(). - std::unique_ptr shape_verifier_; + void VerifyModule(HloModule* module); + + // It is confusing to store modules created by module() and CreateNewModule() + // in different fields, but it allows us to migrate tests to + // HloVerifiedTestBase more easily, so it's a win because we can verify more + // modules. See b/80488902. + // + // Lazily populated. Access via module(). + std::unique_ptr module_; + // Populated by calls to CreateNewModule. + std::vector> modules_; + bool tear_down_called_ = false; - void VerifyModule(); }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..310f3495922250d68aa463fcbb24ef0b04603d09 --- /dev/null +++ b/tensorflow/compiler/xla/tests/iota_test.cc @@ -0,0 +1,117 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { +namespace { + +template +std::vector GetR1Expected(const int64 num_elements) { + std::vector result(num_elements); + std::iota(result.begin(), result.end(), 0); + return result; +} + +class IotaR1Test + : public ClientLibraryTestBase, + public ::testing::WithParamInterface> {}; + +TEST_P(IotaR1Test, DoIt) { + const auto& spec = GetParam(); + const auto element_type = std::get<0>(spec); + const int64 num_elements = std::get<1>(spec); + XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type)); + Iota(&builder, element_type, num_elements); + if (element_type == F32) { + ComputeAndCompareR1(&builder, GetR1Expected(num_elements), {}, + ErrorSpec{0.0001}); + } else if (element_type == U32) { + ComputeAndCompareR1(&builder, GetR1Expected(num_elements), + {}); + } else { + CHECK_EQ(element_type, S32); + ComputeAndCompareR1(&builder, GetR1Expected(num_elements), + {}); + } +} + +INSTANTIATE_TEST_CASE_P(IotaR1TestInstantiation, IotaR1Test, + ::testing::Combine(::testing::Values(F32, U32, S32), + ::testing::Range(/*start=*/10, + /*end=*/10001, + /*step=*/10))); + +class IotaR2Test : public ClientLibraryTestBase, + public ::testing::WithParamInterface< + std::tuple> {}; + +TEST_P(IotaR2Test, DoIt) { + const auto& spec = GetParam(); + const auto element_type = std::get<0>(spec); + const int64 num_elements = std::get<1>(spec); + const int64 iota_dim = std::get<2>(spec); + XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type)); + std::vector dimensions = {42}; + dimensions.insert(dimensions.begin() + iota_dim, num_elements); + Iota(&builder, ShapeUtil::MakeShape(element_type, dimensions), iota_dim); + if (primitive_util::IsFloatingPointType(element_type)) { 
+ ComputeAndCompare(&builder, {}, ErrorSpec{0.0001}); + } else { + ComputeAndCompare(&builder, {}); + } +} + +INSTANTIATE_TEST_CASE_P(IotaR2TestInstantiation, IotaR2Test, + ::testing::Combine(::testing::Values(F32, S32), + ::testing::Range(/*start=*/10, + /*end=*/1001, + /*step=*/10), + ::testing::Values(0, 1))); + +class IotaR3Test : public ClientLibraryTestBase, + public ::testing::WithParamInterface< + std::tuple> {}; + +TEST_P(IotaR3Test, DoIt) { + const auto& spec = GetParam(); + const auto element_type = std::get<0>(spec); + const int64 num_elements = std::get<1>(spec); + const int64 iota_dim = std::get<2>(spec); + XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type)); + std::vector dimensions = {42, 19}; + dimensions.insert(dimensions.begin() + iota_dim, num_elements); + Iota(&builder, ShapeUtil::MakeShape(element_type, dimensions), iota_dim); + if (primitive_util::IsFloatingPointType(element_type)) { + ComputeAndCompare(&builder, {}, ErrorSpec{0.0001}); + } else { + ComputeAndCompare(&builder, {}); + } +} + +INSTANTIATE_TEST_CASE_P(IotaR3TestInstantiation, IotaR3Test, + ::testing::Combine(::testing::Values(F32, S32), + ::testing::Range(/*start=*/10, + /*end=*/1001, + /*step=*/10), + ::testing::Values(0, 1, 2))); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index cde1dcd9cd10c86107f495a92be42b57bf6a085b..554eb24d44168caa7d7252015e3d99f2d567df9b 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -15,9 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal_comparison.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -35,8 +35,7 @@ void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) { int64 now_usec = tensorflow::Env::Default()->NowMicros(); string filename = tensorflow::io::JoinPath( tensorflow::testing::TmpDir(), - tensorflow::strings::Printf("tempfile-%s-%llx-%s", get_hostname().c_str(), - now_usec, name.c_str())); + absl::StrFormat("tempfile-%s-%x-%s", get_hostname(), now_usec, name)); TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, literal.ToProto())); LOG(ERROR) << "wrote to " << name << " file: " << filename; @@ -94,7 +93,7 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual, /* static */ ::testing::AssertionResult LiteralTestUtil::NearOrEqual( const LiteralSlice& expected, const LiteralSlice& actual, - const tensorflow::gtl::optional& error) { + const absl::optional& error) { if (error.has_value()) { VLOG(1) << "Expects near"; return StatusToAssertion(literal_comparison::Near( diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h index d1b8a6cf0b2552f1b7d95a2560d502da14ddc39a..96f72212f35f5e6e98e2dc24fd9a87891a326e8f 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.h +++ b/tensorflow/compiler/xla/tests/literal_test_util.h @@ -21,18 +21,19 @@ limitations under the License. 
#include #include +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/error_spec.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -61,7 +62,7 @@ class LiteralTestUtil { static void ExpectR0Equal(NativeT expected, const LiteralSlice& actual); template - static void ExpectR1Equal(tensorflow::gtl::ArraySlice expected, + static void ExpectR1Equal(absl::Span expected, const LiteralSlice& actual); template static void ExpectR2Equal( @@ -101,7 +102,7 @@ class LiteralTestUtil { const ErrorSpec& error); template - static void ExpectR1Near(tensorflow::gtl::ArraySlice expected, + static void ExpectR1Near(absl::Span expected, const LiteralSlice& actual, const ErrorSpec& error); template @@ -145,7 +146,7 @@ class LiteralTestUtil { // will be compared recursively. 
static ::testing::AssertionResult NearOrEqual( const LiteralSlice& expected, const LiteralSlice& actual, - const tensorflow::gtl::optional& error) TF_MUST_USE_RESULT; + const absl::optional& error) TF_MUST_USE_RESULT; private: TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil); @@ -154,20 +155,20 @@ class LiteralTestUtil { template /* static */ void LiteralTestUtil::ExpectR0Equal(NativeT expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR0(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR0(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR1Equal( - tensorflow::gtl::ArraySlice expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR1(expected), actual)); + absl::Span expected, const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*LiteralUtil::CreateR1(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR2Equal( std::initializer_list> expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR2(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR2(expected), actual)); } template @@ -175,46 +176,46 @@ template std::initializer_list>> expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR3(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR3(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR2EqualArray2D( const Array2D& expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR2FromArray2D(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR2FromArray2D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR3EqualArray3D( const Array3D& expected, const LiteralSlice& actual) { - EXPECT_TRUE(Equal(*Literal::CreateR3FromArray3D(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR3FromArray3D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR4EqualArray4D( const Array4D& expected, const LiteralSlice& actual) { - 
EXPECT_TRUE(Equal(*Literal::CreateR4FromArray4D(expected), actual)); + EXPECT_TRUE(Equal(*LiteralUtil::CreateR4FromArray4D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR0Near(NativeT expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR0(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR0(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR1Near( - tensorflow::gtl::ArraySlice expected, const LiteralSlice& actual, + absl::Span expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR1(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR1(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR2Near( std::initializer_list> expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR2(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR2(expected), actual, error)); } template @@ -222,7 +223,7 @@ template std::initializer_list>> expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR3(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR3(expected), actual, error)); } template @@ -231,28 +232,28 @@ template std::initializer_list>>> expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR4(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR4(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR2NearArray2D( const Array2D& expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR2FromArray2D(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR2FromArray2D(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR3NearArray3D( const Array3D& expected, const LiteralSlice& actual, const 
ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR3FromArray3D(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR3FromArray3D(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR4NearArray4D( const Array4D& expected, const LiteralSlice& actual, const ErrorSpec& error) { - EXPECT_TRUE(Near(*Literal::CreateR4FromArray4D(expected), actual, error)); + EXPECT_TRUE(Near(*LiteralUtil::CreateR4FromArray4D(expected), actual, error)); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index bbac7285aefbb1f028fad152e4b7fe6af01e9f6d..4151bfae0332ffc706ba730d181c487eabab856f 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -20,9 +20,9 @@ limitations under the License. #include +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -31,8 +31,9 @@ namespace xla { namespace { TEST(LiteralTestUtilTest, ComparesEqualTuplesEqual) { - std::unique_ptr literal = Literal::MakeTuple({ - Literal::CreateR0(42).get(), Literal::CreateR0(64).get(), + std::unique_ptr literal = LiteralUtil::MakeTuple({ + LiteralUtil::CreateR0(42).get(), + LiteralUtil::CreateR0(64).get(), }); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *literal)); } @@ -42,11 +43,13 @@ TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) { // un-fail an assertion failure. The CHECK-failure is death, so we can make a // death assertion. 
auto unequal_things_are_equal = [] { - std::unique_ptr lhs = Literal::MakeTuple({ - Literal::CreateR0(42).get(), Literal::CreateR0(64).get(), + std::unique_ptr lhs = LiteralUtil::MakeTuple({ + LiteralUtil::CreateR0(42).get(), + LiteralUtil::CreateR0(64).get(), }); - std::unique_ptr rhs = Literal::MakeTuple({ - Literal::CreateR0(64).get(), Literal::CreateR0(42).get(), + std::unique_ptr rhs = LiteralUtil::MakeTuple({ + LiteralUtil::CreateR0(64).get(), + LiteralUtil::CreateR0(42).get(), }); CHECK(LiteralTestUtil::Equal(*lhs, *rhs)) << "LHS and RHS are unequal"; }; @@ -55,8 +58,8 @@ TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) { TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { auto dummy_lambda = [] { - auto two = Literal::CreateR0(2); - auto four = Literal::CreateR0(4); + auto two = LiteralUtil::CreateR0(2); + auto four = LiteralUtil::CreateR0(4); ErrorSpec error(0.001); CHECK(LiteralTestUtil::Near(*two, *four, error)) << "two is not near four"; }; @@ -77,7 +80,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { std::vector results; TF_CHECK_OK(env->GetMatchingPaths(pattern, &results)); - LOG(INFO) << "results: [" << tensorflow::str_util::Join(results, ", ") << "]"; + LOG(INFO) << "results: [" << absl::StrJoin(results, ", ") << "]"; EXPECT_EQ(3, results.size()); for (const string& result : results) { LiteralProto literal_proto; @@ -98,34 +101,37 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { } TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) { - auto expected = Literal::CreateR1({1, 2, 3}); - auto actual = Literal::CreateR1({4, 5, 6}); + auto expected = LiteralUtil::CreateR1({1, 2, 3}); + auto actual = LiteralUtil::CreateR1({4, 5, 6}); ::testing::AssertionResult result = LiteralTestUtil::Equal(*expected, *actual); - EXPECT_THAT(result.message(), ::testing::HasSubstr("expected: {1, 2, 3}")); - EXPECT_THAT(result.message(), ::testing::HasSubstr("actual: 
{4, 5, 6}")); + EXPECT_THAT(result.message(), + ::testing::HasSubstr("Expected literal:\n{1, 2, 3}")); + EXPECT_THAT(result.message(), + ::testing::HasSubstr("Actual literal:\n{4, 5, 6}")); } TEST(LiteralTestUtilTest, NearComparatorR1) { - auto a = - Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); - auto b = - Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto a = LiteralUtil::CreateR1( + {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = LiteralUtil::CreateR1( + {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); } TEST(LiteralTestUtilTest, NearComparatorR1Nan) { - auto a = - Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); - auto b = - Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + auto a = LiteralUtil::CreateR1( + {0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + auto b = LiteralUtil::CreateR1( + {0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); } TEST(LiteralTestUtil, NearComparatorDifferentLengths) { - auto a = - Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); - auto b = Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7}); + auto a = LiteralUtil::CreateR1( + {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = + LiteralUtil::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7}); EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001})); } diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 2f46ee0be216d7dabf1c476d3cfb7d528f8ab6a4..8d658695576035cdc34a213847460dd80de5f67e 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -14,9 +14,11 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/llvm_compiler.h" +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h" +#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -64,7 +66,7 @@ class LLVMCompilerTest : public ::testing::Test { // Create HLO module, and run the compiler. auto builder = HloComputation::Builder(TestName()); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); auto hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); @@ -86,7 +88,7 @@ class LLVMCompilerTest : public ::testing::Test { void TestMultiModuleCompilation(LLVMCompiler *compiler) { HloComputation::Builder builder(TestName()); builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0))); std::unique_ptr hlo_module = CreateNewModule(); hlo_module->AddEntryComputation(builder.Build()); @@ -124,8 +126,7 @@ class LLVMCompilerTest : public ::testing::Test { static std::unique_ptr CreateNewModule() { HloModuleConfig config; config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); - return MakeUnique(TestName(), VersionedComputationHandle(), - config); + return absl::make_unique(TestName(), config); } }; @@ -145,7 +146,7 @@ TEST_F(CpuCompilerTest, HooksTest) { } TEST_F(GpuCompilerTest, HooksTest) { - gpu::GpuCompiler compiler; + gpu::NVPTXCompiler compiler; TestCompilerHooks(&compiler); } @@ -155,7 +156,7 @@ 
TEST_F(CpuCompilerTest, MultiModuleCompilation) { } TEST_F(GpuCompilerTest, MultModuleCompilation) { - gpu::GpuCompiler compiler; + gpu::NVPTXCompiler compiler; TestMultiModuleCompilation(&compiler); } } // namespace diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc index 2c45f19c090d2690878430363bf0d20252b2f3df..0487d314094edcab61a92de32f14113dd19673fa 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/tests/filecheck.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,57 +26,69 @@ limitations under the License. namespace xla { -void LLVMIRGenTestBase::SetIrHook(bool match_optimized_ir) { +void LlvmIrGenTestBase::SetIrHook(bool match_optimized_ir) { auto llvm_compiler = GetLLVMCompiler(); using std::placeholders::_1; // Add the IR inspection hook to the LLVM compiler. 
if (match_optimized_ir) { llvm_compiler->SetPostOptimizationHook( - std::bind(&LLVMIRGenTestBase::IrHook, this, _1)); + std::bind(&LlvmIrGenTestBase::IrHook, this, _1)); } else { llvm_compiler->SetPreOptimizationHook( - std::bind(&LLVMIRGenTestBase::IrHook, this, _1)); + std::bind(&LlvmIrGenTestBase::IrHook, this, _1)); } } -void LLVMIRGenTestBase::ResetIrHook() { +void LlvmIrGenTestBase::ResetIrHook() { auto llvm_compiler = GetLLVMCompiler(); llvm_compiler->RemovePreOptimizationHook(); llvm_compiler->RemovePostOptimizationHook(); } -void LLVMIRGenTestBase::CompileAndVerifyIr( +void LlvmIrGenTestBase::CompileAndVerifyIr( std::unique_ptr hlo_module, const string& pattern, bool match_optimized_ir) { SetIrHook(match_optimized_ir); - TF_ASSERT_OK(CompileToExecutable(std::move(hlo_module)).status()); + Status status = CompileToExecutable(std::move(hlo_module)).status(); ResetIrHook(); + TF_ASSERT_OK(status); StatusOr filecheck_result = RunFileCheck(ir_, pattern); TF_ASSERT_OK(filecheck_result.status()); EXPECT_TRUE(filecheck_result.ValueOrDie()); } -void LLVMIRGenTestBase::CompileAheadOfTimeAndVerifyIr( +void LlvmIrGenTestBase::CompileAndVerifyIr(const string& hlo_text, + const string& expected_llvm_ir, + bool match_optimized_ir) { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text, config)); + CompileAndVerifyIr(std::move(module), expected_llvm_ir, match_optimized_ir); +} + +void LlvmIrGenTestBase::CompileAheadOfTimeAndVerifyIr( std::unique_ptr hlo_module, const AotCompilationOptions& options, const string& pattern, bool match_optimized_ir) { SetIrHook(match_optimized_ir); - TF_ASSERT_OK( - CompileToAotCompilationResult(std::move(hlo_module), options).status()); + Status status = + CompileToAotCompilationResult(std::move(hlo_module), options).status(); ResetIrHook(); + TF_ASSERT_OK(status); StatusOr filecheck_result = RunFileCheck(ir_, pattern); 
ASSERT_TRUE(filecheck_result.ok()); EXPECT_TRUE(filecheck_result.ValueOrDie()); } -LLVMCompiler* LLVMIRGenTestBase::GetLLVMCompiler() { +LLVMCompiler* LlvmIrGenTestBase::GetLLVMCompiler() { return static_cast(backend().compiler()); } -Status LLVMIRGenTestBase::IrHook(const llvm::Module& module) { +Status LlvmIrGenTestBase::IrHook(const llvm::Module& module) { ir_ = llvm_ir::DumpModuleToString(module); return Status::OK(); } diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h index 74cbb5f5df662992046a5b0f9a31e52879f375ad..018f9546afc3e408686a9ac75a74320a05b27182 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.h @@ -24,7 +24,7 @@ limitations under the License. namespace xla { // Tests that verify IR emitted by the CPU/GPU backend is as expected. -class LLVMIRGenTestBase : public CodegenTestBase { +class LlvmIrGenTestBase : public CodegenTestBase { protected: // Compiles the given HLO module to LLVM IR and verifies the IR matches the // given pattern. `pattern` is in the FileCheck pattern matching syntax @@ -38,6 +38,12 @@ class LLVMIRGenTestBase : public CodegenTestBase { void CompileAndVerifyIr(std::unique_ptr hlo_module, const string& pattern, bool match_optimized_ir); + // A thin wrapper around CompileAndVerifyIr that parses `hlo_text` to create + // an HLO module. + void CompileAndVerifyIr(const string& hlo_text, + const string& expected_llvm_ir, + bool match_optimized_ir = false); + // Compiles the given HLO module to LLVM IR and verifies the IR matches the // given pattern. `pattern` is in the FileCheck pattern matching syntax // (http://llvm.org/docs/CommandGuide/FileCheck.html). 
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc index f21f83992ffb7c07dff31c68a7e9e3f7944bf512..237a4a361e386e24c2897c42602eb60ca7234731 100644 --- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc @@ -15,16 +15,16 @@ limitations under the License. #include +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -38,14 +38,14 @@ class LocalClientAllocationTest : public LocalClientTestBase { XLA_TEST_F(LocalClientAllocationTest, AddVectors) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); - auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - builder.Add(x, y); + auto x = ConstantR1(&builder, {0.0f, 1.0f, 2.0f}); + auto y = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + Add(x, y); TestAllocator* allocator = GetOrCreateAllocator(local_client_->platform()); auto x_array = - LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); + LiteralToShapedBuffer(*LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f})); int64 allocation_count_before = allocator_->allocation_count(); @@ -53,7 +53,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) { // 
deallocation happen on the right allocator. ExecutableRunOptions options; options.set_allocator(allocator); - tensorflow::gtl::optional result = + absl::optional result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), options); @@ -74,9 +74,9 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) { // Run a computation on every device on the system. Verify that allocation // occurs on the proper device. XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); - auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - builder.Add(x, y); + auto x = ConstantR1(&builder, {0.0f, 1.0f, 2.0f}); + auto y = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + Add(x, y); auto computation = builder.Build().ConsumeValueOrDie(); TestAllocator* allocator = GetOrCreateAllocator(local_client_->platform()); diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test.cc b/tensorflow/compiler/xla/tests/local_client_aot_test.cc index 47cab796041e9669affaebd7866d0d80100730f1..115448c908ac9e7f0b01772ce348d23bf4d838ed 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc @@ -42,13 +42,12 @@ extern "C" void SumStructElements(float* out, void** parameters) { TEST_F(LocalClientAotTest, Constant) { xla::ExecutableRunOptions run_options; OpaqueData opaque_data{100, 20, 3}; - void* parameters[] = {&opaque_data}; float out = 0; - void* temporary_buffers[] = {nullptr, &out}; - SumAndDouble(&out, &run_options, parameters, temporary_buffers); + void* temporary_buffers[] = {&opaque_data, &out}; + SumAndDouble(&out, &run_options, nullptr, temporary_buffers); EXPECT_EQ(out, 246.0f); opaque_data = {1, 2, 3}; - SumAndDouble(&out, &run_options, parameters, temporary_buffers); + SumAndDouble(&out, &run_options, nullptr, temporary_buffers); EXPECT_EQ(out, 12.0f); } diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc 
b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index a366afe8262e1f537b225e395bba9cb2fc22683a..60eb21aafd23a8d724d1f08d5c87098b7c3dcd6b 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -21,8 +21,8 @@ limitations under the License. #include "llvm/ADT/Triple.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" @@ -37,8 +37,8 @@ using xla::string; xla::XlaComputation Doubler() { xla::XlaBuilder builder("doubler"); auto r0f32 = xla::ShapeUtil::MakeShape(xla::F32, {}); - auto x = builder.Parameter(0, r0f32, "x"); - builder.Mul(x, builder.ConstantR0(2.0)); + auto x = xla::Parameter(&builder, 0, r0f32, "x"); + xla::Mul(x, xla::ConstantR0(&builder, 2.0)); return std::move(builder.Build().ValueOrDie()); } @@ -51,10 +51,10 @@ int main(int argc, char** argv) { xla::XlaBuilder builder("aot_test_helper"); auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape(); - auto opaque_param = builder.Parameter(0, opaque_shape, "x"); + auto opaque_param = Parameter(&builder, 0, opaque_shape, "x"); auto r0f32 = xla::ShapeUtil::MakeShape(xla::F32, {}); - auto sum = builder.CustomCall("SumStructElements", {opaque_param}, r0f32); - builder.Call(Doubler(), {sum}); + auto sum = CustomCall(&builder, "SumStructElements", {opaque_param}, r0f32); + Call(&builder, Doubler(), {sum}); if (argc != 2) { LOG(FATAL) << "local_client_aot_test_helper TARGET_CPU"; @@ -92,9 +92,10 @@ int main(int argc, char** argv) { // It's lame to hard-code the buffer assignments, but we 
need // local_client_aot_test.cc to be able to easily invoke the function. CHECK_EQ(result->result_buffer_index(), 1); - CHECK_EQ(result->buffer_sizes().size(), 2); - CHECK_EQ(result->buffer_sizes()[0], -1); // param buffer - CHECK_EQ(result->buffer_sizes()[1], sizeof(float)); // result buffer + CHECK_EQ(result->buffer_infos().size(), 3); + CHECK(result->buffer_infos()[0].is_entry_parameter()); // param buffer + CHECK_EQ(result->buffer_infos()[1].size(), sizeof(float)); // result buffer + CHECK(result->buffer_infos()[2].is_constant()); // const buffer if (triple.isOSBinFormatELF()) { // Check the ELF magic. CHECK_EQ(result->object_file_data()[0], 0x7F); diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 96858c00d6bbe59b673a34e7d5ca261756709596..1a823cf189b310c62c735419936544ea99fcfbaf 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -54,7 +54,7 @@ class LocalClientExecuteTest : public LocalClientTestBase { XLA_TEST_F(LocalClientExecuteTest, Constant) { XlaBuilder builder(TestName()); - auto y = builder.ConstantR0(123.0f); + ConstantR0(&builder, 123.0f); ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); @@ -64,11 +64,11 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) { XLA_TEST_F(LocalClientExecuteTest, AddScalars) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.ConstantR0(123.0f); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = ConstantR0(&builder, 123.0f); + Add(x, y); - auto x_value = LiteralToShapedBuffer(*Literal::CreateR0(42.0f)); + auto x_value = LiteralToShapedBuffer(*LiteralUtil::CreateR0(42.0f)); ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value}); LiteralTestUtil::ExpectR0Near(165.f, *ShapedBufferToLiteral(result), @@ -77,11 +77,11 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) { XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "x"); - auto y = builder.ConstantR1({}); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {0}), "x"); + auto y = ConstantR1(&builder, {}); + Add(x, y); - 
auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({})); + auto x_array = LiteralToShapedBuffer(*LiteralUtil::CreateR1({})); ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array}); LiteralTestUtil::ExpectR1Near({}, *ShapedBufferToLiteral(result), @@ -90,12 +90,12 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) { XLA_TEST_F(LocalClientExecuteTest, AddVectors) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); - auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3}), "x"); + auto y = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + Add(x, y); auto x_array = - LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); + LiteralToShapedBuffer(*LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f})); ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array}); LiteralTestUtil::ExpectR1Near( @@ -104,12 +104,12 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) { XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); - auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3}), "x"); + auto y = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + Add(x, y); auto x_array = - LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); + LiteralToShapedBuffer(*LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f})); ExecutionProfile profile; ScopedShapedBuffer result = ExecuteLocallyOrDie( builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(), @@ -122,19 +122,19 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), 
"x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); + Add(x, y); auto computation = builder.Build().ConsumeValueOrDie(); // Create x as a col-major array. - auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout( + auto x_array = LiteralToShapedBuffer(*LiteralUtil::CreateR2WithLayout( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1}))); EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(), LayoutUtil::MakeLayout({0, 1}))); // Create y as a row-major array. - auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout( + auto y_array = LiteralToShapedBuffer(*LiteralUtil::CreateR2WithLayout( {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0}))); EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(), LayoutUtil::MakeLayout({1, 0}))); @@ -155,15 +155,15 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); + Add(x, y); auto computation = builder.Build().ConsumeValueOrDie(); auto x_array = LiteralToShapedBuffer( - *Literal::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); + *LiteralUtil::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); auto y_array = LiteralToShapedBuffer( - *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); + *LiteralUtil::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); // Run with col-major result layout. 
ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie( @@ -192,15 +192,15 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { XLA_TEST_F(LocalClientExecuteTest, TupleResult) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); - builder.Tuple({x, y, x}); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); + Tuple(&builder, {x, y, x}); auto computation = builder.Build().ConsumeValueOrDie(); auto x_array = LiteralToShapedBuffer( - *Literal::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); + *LiteralUtil::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); auto y_array = LiteralToShapedBuffer( - *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); + *LiteralUtil::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&x_array, &y_array}); @@ -209,27 +209,26 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) { EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape())); std::unique_ptr result_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0})); - LiteralTestUtil::ExpectR2Equal( - {{10.0f, 20.0f}, {30.0f, 40.0f}}, - LiteralSlice(*result_literal, {1})); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {2})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {0})); + LiteralTestUtil::ExpectR2Equal({{10.0f, 20.0f}, {30.0f, 40.0f}}, + LiteralSlice(*result_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {2})); } XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, 
ShapeUtil::MakeShape(F32, {2, 2}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); - auto inner_tuple = builder.Tuple({x, y, x}); - builder.Tuple({inner_tuple, x}); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); + auto inner_tuple = Tuple(&builder, {x, y, x}); + Tuple(&builder, {inner_tuple, x}); auto computation = builder.Build().ConsumeValueOrDie(); auto x_array = LiteralToShapedBuffer( - *Literal::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); + *LiteralUtil::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); auto y_array = LiteralToShapedBuffer( - *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); + *LiteralUtil::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&x_array, &y_array}); @@ -238,28 +237,25 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape())); std::unique_ptr result_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1})); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, - LiteralSlice(*result_literal, {0, 0})); - LiteralTestUtil::ExpectR2Equal( - {{10.0f, 20.0f}, {30.0f, 40.0f}}, - LiteralSlice(*result_literal, {0, 1})); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, - LiteralSlice(*result_literal, {0, 2})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {0, 0})); + LiteralTestUtil::ExpectR2Equal({{10.0f, 20.0f}, {30.0f, 40.0f}}, + LiteralSlice(*result_literal, {0, 1})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {0, 2})); } XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { 
// Verify setting the result layout of a computation with a tuple output. XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); - builder.Tuple({x, y}); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); + Tuple(&builder, {x, y}); auto array = LiteralToShapedBuffer( - *Literal::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); + *LiteralUtil::CreateR2({{1.0f, 2.0f}, {3.0f, 4.0f}})); ExecutableBuildOptions options = DefaultExecutableBuildOptions(); Shape shape_with_layout = ShapeUtil::MakeTupleShape( @@ -273,10 +269,10 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { options, DefaultExecutableRunOptions()); std::unique_ptr result_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0})); - LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {0})); + LiteralTestUtil::ExpectR2Equal({{1.0f, 2.0f}, {3.0f, 4.0f}}, + LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { @@ -291,23 +287,23 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { // Computation adds the respective array and vector elements from each tuple // argument and returns the results as a tuple. 
XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, tuple_shape0, "x"); - auto y = builder.Parameter(1, tuple_shape1, "y"); - auto x_0 = builder.GetTupleElement(x, 0); - auto x_1 = builder.GetTupleElement(x, 1); - auto y_0 = builder.GetTupleElement(y, 0); - auto y_1 = builder.GetTupleElement(y, 1); - auto array_sum = builder.Add(x_0, y_1); - auto vector_diff = builder.Sub(x_1, y_0); - builder.Tuple({array_sum, vector_diff}); + auto x = Parameter(&builder, 0, tuple_shape0, "x"); + auto y = Parameter(&builder, 1, tuple_shape1, "y"); + auto x_0 = GetTupleElement(x, 0); + auto x_1 = GetTupleElement(x, 1); + auto y_0 = GetTupleElement(y, 0); + auto y_1 = GetTupleElement(y, 1); + auto array_sum = Add(x_0, y_1); + auto vector_diff = Sub(x_1, y_0); + Tuple(&builder, {array_sum, vector_diff}); auto computation = builder.Build().ConsumeValueOrDie(); - auto x_literal = Literal::MakeTuple( - {Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), - Literal::CreateR1({42.0, 75.0, 123.0}).get()}); - auto y_literal = Literal::MakeTuple( - {Literal::CreateR1({2.0, 4.0, 6.0}).get(), - Literal::CreateR2({{55.0, 44.0}, {33.0, 22.0}}).get()}); + auto x_literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), + LiteralUtil::CreateR1({42.0, 75.0, 123.0}).get()}); + auto y_literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1({2.0, 4.0, 6.0}).get(), + LiteralUtil::CreateR2({{55.0, 44.0}, {33.0, 22.0}}).get()}); auto x_buffer = LiteralToShapedBuffer(*x_literal); auto y_buffer = LiteralToShapedBuffer(*y_literal); @@ -319,11 +315,10 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape())); std::unique_ptr result_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR2Equal( - {{56.0f, 46.0f}, {36.0f, 26.0f}}, - LiteralSlice(*result_literal, {0})); - LiteralTestUtil::ExpectR1Equal( - {40.0f, 71.0f, 117.0f}, LiteralSlice(*result_literal, {1})); + 
LiteralTestUtil::ExpectR2Equal({{56.0f, 46.0f}, {36.0f, 26.0f}}, + LiteralSlice(*result_literal, {0})); + LiteralTestUtil::ExpectR1Equal({40.0f, 71.0f, 117.0f}, + LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { @@ -338,32 +333,32 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { // Computation negates the array element and sums the two vector elements in // the nested tuple. The resulting array and vector are returned as a tuple. XlaBuilder builder(TestName()); - auto param = builder.Parameter(0, nested_tuple_shape, "param"); - auto inner_tuple = builder.GetTupleElement(param, 0); - auto inner_array = builder.GetTupleElement(inner_tuple, 0); - auto inner_vector = builder.GetTupleElement(inner_tuple, 1); - auto outer_vector = builder.GetTupleElement(param, 1); - - auto negate_array = builder.Neg(inner_array); - auto vector_sum = builder.Add(inner_vector, outer_vector); - builder.Tuple({negate_array, vector_sum}); + auto param = Parameter(&builder, 0, nested_tuple_shape, "param"); + auto inner_tuple = GetTupleElement(param, 0); + auto inner_array = GetTupleElement(inner_tuple, 0); + auto inner_vector = GetTupleElement(inner_tuple, 1); + auto outer_vector = GetTupleElement(param, 1); + + auto negate_array = Neg(inner_array); + auto vector_sum = Add(inner_vector, outer_vector); + Tuple(&builder, {negate_array, vector_sum}); auto computation = builder.Build().ConsumeValueOrDie(); - auto arg_literal = Literal::MakeTuple( - {Literal::MakeTuple( - {Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), - Literal::CreateR1({42.0, 75.0, 123.0}).get()}) + auto arg_literal = LiteralUtil::MakeTuple( + {LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), + LiteralUtil::CreateR1({42.0, 75.0, 123.0}).get()}) .get(), - Literal::CreateR1({222.0, -2.0, 10.0}).get()}); + LiteralUtil::CreateR1({222.0, -2.0, 10.0}).get()}); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); ScopedShapedBuffer 
result = ExecuteLocallyOrDie(computation, {&arg_buffer}); std::unique_ptr result_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR2Equal( - {{-1.0, -2.0}, {-3.0, -4}}, LiteralSlice(*result_literal, {0})); - LiteralTestUtil::ExpectR1Equal( - {264.0, 73.0, 133.0}, LiteralSlice(*result_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{-1.0, -2.0}, {-3.0, -4}}, + LiteralSlice(*result_literal, {0})); + LiteralTestUtil::ExpectR1Equal({264.0, 73.0, 133.0}, + LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { @@ -376,31 +371,30 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { ShapeUtil::MakeTupleShape({array_shape, array_shape}); XlaBuilder builder(TestName()); - auto param = builder.Parameter(0, tuple_shape, "param"); - auto element_0 = builder.GetTupleElement(param, 0); - auto element_1 = builder.GetTupleElement(param, 1); - builder.Tuple({builder.Neg(element_0), builder.Add(element_1, element_1)}); + auto param = Parameter(&builder, 0, tuple_shape, "param"); + auto element_0 = GetTupleElement(param, 0); + auto element_1 = GetTupleElement(param, 1); + Tuple(&builder, {Neg(element_0), Add(element_1, element_1)}); auto computation = builder.Build().ConsumeValueOrDie(); - auto arg_literal = Literal::MakeTuple( - {Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), - Literal::CreateR2({{11.0, 3.0}, {4.0, 5.0}}).get()}); + auto arg_literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}).get(), + LiteralUtil::CreateR2({{11.0, 3.0}, {4.0, 5.0}}).get()}); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer}); std::unique_ptr result_0_literal = ShapedBufferToLiteral(result_0); - LiteralTestUtil::ExpectR2Equal( - {{-1.0, -2.0}, {-3.0, -4.0}}, - LiteralSlice(*result_0_literal, {0})); - LiteralTestUtil::ExpectR2Equal( - {{22.0, 6.0}, {8.0, 10}}, 
LiteralSlice(*result_0_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{-1.0, -2.0}, {-3.0, -4.0}}, + LiteralSlice(*result_0_literal, {0})); + LiteralTestUtil::ExpectR2Equal({{22.0, 6.0}, {8.0, 10}}, + LiteralSlice(*result_0_literal, {1})); ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0}); std::unique_ptr result_1_literal = ShapedBufferToLiteral(result_1); - LiteralTestUtil::ExpectR2Equal( - {{1.0, 2.0}, {3.0, 4.0}}, LiteralSlice(*result_1_literal, {0})); - LiteralTestUtil::ExpectR2Equal( - {{44.0, 12.0}, {16.0, 20}}, LiteralSlice(*result_1_literal, {1})); + LiteralTestUtil::ExpectR2Equal({{1.0, 2.0}, {3.0, 4.0}}, + LiteralSlice(*result_1_literal, {0})); + LiteralTestUtil::ExpectR2Equal({{44.0, 12.0}, {16.0, 20}}, + LiteralSlice(*result_1_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { @@ -420,26 +414,25 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { const Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes); XlaBuilder builder(TestName()); - auto param = builder.Parameter(0, tuple_shape, "param"); + auto param = Parameter(&builder, 0, tuple_shape, "param"); // Add each element's tuple index value to every element. std::vector result_elements; for (int i = 0; i < kElementCount; ++i) { - auto element = builder.GetTupleElement(param, i); - result_elements.push_back( - builder.Add(element, builder.ConstantR0(i))); + auto element = GetTupleElement(param, i); + result_elements.push_back(Add(element, ConstantR0(&builder, i))); } - builder.Tuple(result_elements); + Tuple(&builder, result_elements); auto computation = builder.Build().ConsumeValueOrDie(); // Feed in a tuple where each two-element vector element is {tuple_index, // -tuple_index}. 
std::vector> arg_elements; for (int i = 0; i < kElementCount; ++i) { - arg_elements.push_back(Literal::CreateR1({1.0f * i, -1.0f * i})); + arg_elements.push_back(LiteralUtil::CreateR1({1.0f * i, -1.0f * i})); } std::unique_ptr arg_literal = - Literal::MakeTupleOwned(std::move(arg_elements)); + LiteralUtil::MakeTupleOwned(std::move(arg_elements)); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); @@ -447,8 +440,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { for (int i = 0; i < kElementCount; ++i) { LiteralTestUtil::ExpectR1Near( - {2.0f * i, 0.0f}, LiteralSlice(*result_literal, {i}), - error_spec_); + {2.0f * i, 0.0f}, LiteralSlice(*result_literal, {i}), error_spec_); } } @@ -465,22 +457,22 @@ XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) { const Shape tuple_shape = ShapeUtil::MakeTupleShape(inner_tuple_shapes); XlaBuilder builder(TestName()); - auto param = builder.Parameter(0, tuple_shape, "param"); + auto param = Parameter(&builder, 0, tuple_shape, "param"); // The computation increments each leaf value by an amount equal to the leaf's // ordinal position in a traversal of the tuple. 
std::vector result_elements; for (int i = 0; i < kFanout; ++i) { - auto outer_element = builder.GetTupleElement(param, i); + auto outer_element = GetTupleElement(param, i); std::vector inner_result_elements; for (int j = 0; j < kFanout; ++j) { - auto inner_element = builder.GetTupleElement(outer_element, j); - inner_result_elements.push_back(builder.Add( - inner_element, builder.ConstantR0(i * kFanout + j))); + auto inner_element = GetTupleElement(outer_element, j); + inner_result_elements.push_back( + Add(inner_element, ConstantR0(&builder, i * kFanout + j))); } - result_elements.push_back(builder.Tuple(inner_result_elements)); + result_elements.push_back(Tuple(&builder, inner_result_elements)); } - builder.Tuple(result_elements); + Tuple(&builder, result_elements); auto computation = builder.Build().ConsumeValueOrDie(); // Construct the argument to pass to the computation. @@ -488,12 +480,13 @@ XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) { for (int i = 0; i < kFanout; ++i) { std::vector> inner_tuple_elements; for (int j = 0; j < kFanout; ++j) { - inner_tuple_elements.push_back(Literal::CreateR0(i + j)); + inner_tuple_elements.push_back(LiteralUtil::CreateR0(i + j)); } outer_tuple_elements.push_back( - Literal::MakeTupleOwned(std::move(inner_tuple_elements))); + LiteralUtil::MakeTupleOwned(std::move(inner_tuple_elements))); } - auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements)); + auto arg_literal = + LiteralUtil::MakeTupleOwned(std::move(outer_tuple_elements)); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); @@ -520,23 +513,23 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { } XlaBuilder builder(TestName()); - auto element = builder.Parameter(0, shape, "param"); + auto element = Parameter(&builder, 0, shape, "param"); for (int i = 0; i < kTupleDepth; ++i) { - element = builder.GetTupleElement(element, 0); + element = GetTupleElement(element, 0); 
} - auto output = builder.Add(element, builder.ConstantR0(42.0)); + auto output = Add(element, ConstantR0(&builder, 42.0)); for (int i = 0; i < kTupleDepth; ++i) { - output = builder.Tuple({output}); + output = Tuple(&builder, {output}); } auto computation = builder.Build().ConsumeValueOrDie(); // Construct the argument to pass to the computation. - std::unique_ptr arg_literal = Literal::CreateR0(123.0); + std::unique_ptr arg_literal = LiteralUtil::CreateR0(123.0); for (int i = 0; i < kTupleDepth; ++i) { std::vector> arg_vector; arg_vector.push_back(std::move(arg_literal)); - arg_literal = Literal::MakeTupleOwned(std::move(arg_vector)); + arg_literal = LiteralUtil::MakeTupleOwned(std::move(arg_vector)); } auto arg_buffer = LiteralToShapedBuffer(*arg_literal); @@ -547,19 +540,19 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { for (int i = 0; i < kTupleDepth; ++i) { index.push_back(0); } - LiteralTestUtil::ExpectR0Equal( - 165.0, LiteralSlice(*result_literal, index)); + LiteralTestUtil::ExpectR0Equal(165.0, + LiteralSlice(*result_literal, index)); } XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { // Test passing in an invalid number of arguments. XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {3}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {3}), "y"); + Add(x, y); auto x_array = - LiteralToShapedBuffer(*Literal::CreateR1({1.0f, 2.0f, 3.0f})); + LiteralToShapedBuffer(*LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f})); auto execute_status = ExecuteLocally(builder.Build().ValueOrDie(), {&x_array}); @@ -571,11 +564,11 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { // Test passing in an argument with the wrong shape. 
XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); - builder.Neg(x); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3}), "x"); + Neg(x); auto x_array = LiteralToShapedBuffer( - *Literal::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); + *LiteralUtil::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); auto execute_status = ExecuteLocally(builder.Build().ValueOrDie(), {&x_array}); @@ -588,11 +581,11 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) { // Test passing in an invalid result layout parameter. XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); - builder.Neg(x); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); + Neg(x); auto x_array = LiteralToShapedBuffer( - *Literal::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); + *LiteralUtil::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); auto execute_status = ExecuteLocally( builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions().set_result_layout( @@ -611,7 +604,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { // Try to run a trivial computation on every device on the system. If a // specific device is not supported, check that the right error is returned. XlaBuilder builder(TestName()); - builder.ConstantR0(42.0f); + ConstantR0(&builder, 42.0f); auto computation = builder.Build().ConsumeValueOrDie(); for (int d = 0; d < local_client_->device_count(); ++d) { if (!local_client_->device_ordinal_supported(d)) { @@ -638,7 +631,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { // Try running computations on devices with device ordinal values which do not // exist. 
XlaBuilder builder(TestName()); - builder.ConstantR0(42.0f); + ConstantR0(&builder, 42.0f); auto computation = builder.Build().ConsumeValueOrDie(); auto execute_status = @@ -655,7 +648,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { XLA_TEST_F(LocalClientExecuteTest, RunOnStream) { // Run a computation on a specific stream on each device on the system. XlaBuilder builder(TestName()); - builder.ConstantR0(42.0f); + ConstantR0(&builder, 42.0f); auto computation = builder.Build().ConsumeValueOrDie(); for (int d = 0; d < local_client_->device_count(); ++d) { @@ -691,7 +684,7 @@ XLA_TEST_F(LocalClientExecuteTest, wrong_stream.Init(); XlaBuilder builder(TestName()); - builder.ConstantR0(42.0f); + ConstantR0(&builder, 42.0f); auto execute_status = ExecuteLocally( builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions().set_stream(&wrong_stream)); @@ -708,7 +701,7 @@ XLA_TEST_F(LocalClientExecuteTest, TestAllocator allocator(wrong_platform); XlaBuilder builder(TestName()); - auto y = builder.ConstantR0(123.0f); + ConstantR0(&builder, 123.0f); auto execute_status = ExecuteLocally( builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), @@ -721,7 +714,7 @@ XLA_TEST_F(LocalClientExecuteTest, XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) { // Try to run a computation on a stream that has not been initialized. 
XlaBuilder builder(TestName()); - builder.ConstantR0(42.0f); + ConstantR0(&builder, 42.0f); LOG(INFO) << "default device = " << local_client_->default_device_ordinal(); se::StreamExecutor* executor = @@ -744,26 +737,26 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto tuple12 = builder.Tuple( - {builder.ConstantR1(vec1), builder.ConstantR1(vec2)}); - auto tuple21 = builder.Tuple( - {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); - builder.Select(builder.ConstantR0(false), tuple12, tuple21); + auto tuple12 = Tuple(&builder, {ConstantR1(&builder, vec1), + ConstantR1(&builder, vec2)}); + auto tuple21 = Tuple(&builder, {ConstantR1(&builder, vec2), + ConstantR1(&builder, vec1)}); + Select(ConstantR0(&builder, false), tuple12, tuple21); ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); std::unique_ptr tuple_literal = ShapedBufferToLiteral(result); - LiteralTestUtil::ExpectR1Equal( - {2.0f, 4.0f, 6.0f}, LiteralSlice(*tuple_literal, {0})); - LiteralTestUtil::ExpectR1Equal( - {1.0f, 2.0f, 3.0f}, LiteralSlice(*tuple_literal, {1})); + LiteralTestUtil::ExpectR1Equal({2.0f, 4.0f, 6.0f}, + LiteralSlice(*tuple_literal, {0})); + LiteralTestUtil::ExpectR1Equal({1.0f, 2.0f, 3.0f}, + LiteralSlice(*tuple_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); - auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3}), "x"); + auto y = ConstantR1(&builder, {2.0f, 3.0f, 4.0f}); + Add(x, y); Shape argument_layout = ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{3}, {0}); @@ -775,10 +768,14 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { executable_status.ConsumeValueOrDie(); auto x_array = - 
LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); + LiteralToShapedBuffer(*LiteralUtil::CreateR1({0.0f, 1.0f, 2.0f})); ScopedShapedBuffer result = executable->Run({&x_array}, DefaultExecutableRunOptions()) .ConsumeValueOrDie(); + ASSERT_IS_OK(local_client_->mutable_backend() + ->BorrowStream(0) + .ValueOrDie() + ->BlockHostUntilDone()); LiteralTestUtil::ExpectR1Near( {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_); @@ -799,29 +796,29 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) { }; // Array shapes. - test_to_device_and_back(*Literal::CreateR0(42.0)); - test_to_device_and_back(*Literal::CreateR0(true)); - test_to_device_and_back(*Literal::CreateR1({1.0, 42.0, 744.4})); + test_to_device_and_back(*LiteralUtil::CreateR0(42.0)); + test_to_device_and_back(*LiteralUtil::CreateR0(true)); + test_to_device_and_back(*LiteralUtil::CreateR1({1.0, 42.0, 744.4})); test_to_device_and_back( - *Literal::CreateR2({{1.0, 2.0, 3.0}, {44.0, 0.1, -3}})); - test_to_device_and_back(*Literal::CreateR2({{2, 1}, {4444, 56}})); + *LiteralUtil::CreateR2({{1.0, 2.0, 3.0}, {44.0, 0.1, -3}})); + test_to_device_and_back(*LiteralUtil::CreateR2({{2, 1}, {4444, 56}})); // Null shape (empty tuple). - test_to_device_and_back(*Literal::MakeTuple({})); + test_to_device_and_back(*LiteralUtil::MakeTuple({})); // Non-nested tuples. test_to_device_and_back( - *Literal::MakeTuple({Literal::CreateR0(12223.0).get()})); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(12223.0).get()})); test_to_device_and_back( - *Literal::MakeTuple({Literal::CreateR1({1.0, -42.0}).get(), - Literal::CreateR0(123456.0).get()})); + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1.0, -42.0}).get(), + LiteralUtil::CreateR0(123456.0).get()})); // Nested tuple. 
- test_to_device_and_back(*Literal::MakeTuple( - {Literal::MakeTuple({Literal::CreateR1({1.0, -42.0}).get(), - Literal::CreateR0(123456.0).get()}) + test_to_device_and_back(*LiteralUtil::MakeTuple( + {LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1.0, -42.0}).get(), + LiteralUtil::CreateR0(123456.0).get()}) .get(), - Literal::CreateR0(false).get()})); + LiteralUtil::CreateR0(false).get()})); } XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) { @@ -839,24 +836,47 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) { }; test_to_device_and_back( - *Literal::CreateR2({{1.0, 2.0, 3.0}, {44.0, 0.1, -3}})); - test_to_device_and_back(*Literal::CreateR2({{2, 1}, {4444, 56}})); + *LiteralUtil::CreateR2({{1.0, 2.0, 3.0}, {44.0, 0.1, -3}})); + test_to_device_and_back(*LiteralUtil::CreateR2({{2, 1}, {4444, 56}})); test_to_device_and_back( - *Literal::CreateR2({{20000000000ULL, 1}, {4444, 56}})); - test_to_device_and_back( - *Literal::MakeTuple({Literal::CreateR1({1.0, -42.0}).get(), - Literal::CreateR0(123456789000LL).get()})); + *LiteralUtil::CreateR2({{20000000000ULL, 1}, {4444, 56}})); + test_to_device_and_back(*LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1({1.0, -42.0}).get(), + LiteralUtil::CreateR0(123456789000LL).get()})); } -// TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel. -// 2017-10-18. 
-XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) { +XLA_TEST_F(LocalClientExecuteTest, InfeedTest) { XlaBuilder builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {3}); - auto in = builder.Infeed(shape); - auto constant = builder.ConstantR1({1.0f, 2.0f, 3.0f}); - auto sum = builder.Add(in, constant); - builder.Outfeed(sum, shape, /*outfeed_config=*/""); + auto in = Infeed(&builder, shape); + auto constant = ConstantR1(&builder, {1.0f, 2.0f, 3.0f}); + Add(in, constant); + + std::unique_ptr result; + std::unique_ptr thread( + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions(), "execute_thread", [&] { + result = ShapedBufferToLiteral(ExecuteLocallyOrDie( + builder.Build().ValueOrDie(), /*arguments=*/{})); + })); + + ASSERT_IS_OK(local_client_->TransferToInfeedLocal( + *LiteralUtil::CreateR1({-5.0, 123.0, 42.0}), + local_client_->default_device_ordinal())); + + // Join the thread. + thread.reset(); + + LiteralTestUtil::ExpectR1Equal({-4.0, 125.0, 45.0}, *result); +} + +XLA_TEST_F(LocalClientExecuteTest, InfeedOutfeedTest) { + XlaBuilder builder(TestName()); + const Shape shape = ShapeUtil::MakeShape(F32, {3}); + auto in = Infeed(&builder, shape); + auto constant = ConstantR1(&builder, {1.0f, 2.0f, 3.0f}); + auto sum = Add(in, constant); + Outfeed(sum, shape, /*outfeed_config=*/""); std::unique_ptr thread( tensorflow::Env::Default()->StartThread( @@ -864,7 +884,7 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) { [&] { ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); })); ASSERT_IS_OK(local_client_->TransferToInfeedLocal( - *Literal::CreateR1({-5.0, 123.0, 42.0}), + *LiteralUtil::CreateR1({-5.0, 123.0, 42.0}), local_client_->default_device_ordinal())); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, @@ -891,17 +911,19 @@ void BM_LocalClientOverhead(int num_iters) { // Use a tiny add operation as the computation. 
XlaBuilder builder("Add"); auto shape = ShapeUtil::MakeShape(F32, {2, 3}); - auto x = builder.Parameter(0, shape, "x"); - builder.Add(x, x); + auto x = Parameter(&builder, 0, shape, "x"); + Add(x, x); auto computation = builder.Build().ConsumeValueOrDie(); auto buffer = transfer_manager ->AllocateScopedShapedBuffer(shape, &allocator, /*device_ordinal=*/0) .ConsumeValueOrDie(); - auto literal = Literal::CreateR2({{0, 0, 0}, {0, 0, 0}}); - ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice( - executors[device_ordinal], *literal, buffer)); + auto literal = LiteralUtil::CreateR2({{0, 0, 0}, {0, 0, 0}}); + auto stream = + client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie(); + ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(stream.get(), *literal, + buffer)); const int kWarmups = 2; @@ -911,11 +933,8 @@ void BM_LocalClientOverhead(int num_iters) { std::unique_ptr executable = executable_status.ConsumeValueOrDie(); - se::Stream stream(executors[client->default_device_ordinal()]); - stream.Init(); - ExecutableRunOptions run_options; - run_options.set_allocator(&allocator).set_stream(&stream); + run_options.set_allocator(&allocator).set_stream(stream.get()); for (int i = 0; i < kWarmups; ++i) { auto result = executable->Run({&buffer}, run_options); diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 88797a7d0a7d0567b3a380c5fb1ad0c0ee875587..a8c68fc7fdbad30068af44606f559ca96603fe66 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -18,10 +18,11 @@ limitations under the License. 
#include +#include "absl/memory/memory.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -155,7 +156,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const { ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()) .ConsumeValueOrDie(); @@ -163,7 +164,7 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { return ExecuteLocally(computation, arguments, build_options, run_options) @@ -172,14 +173,14 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( StatusOr LocalClientTestBase::ExecuteLocally( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments) { + absl::Span arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()); } StatusOr LocalClientTestBase::ExecuteLocally( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { std::vector argument_layouts(arguments.size()); @@ -189,7 +190,19 @@ StatusOr LocalClientTestBase::ExecuteLocally( TF_ASSIGN_OR_RETURN( std::unique_ptr 
executable, local_client_->Compile(computation, argument_layouts, build_options)); - return executable->Run(arguments, run_options); + TF_ASSIGN_OR_RETURN(auto ret, executable->Run(arguments, run_options)); + + auto device_ordinal = + build_options.device_ordinal() == -1 ? 0 : build_options.device_ordinal(); + auto* stream = run_options.stream(); + if (!stream) { + stream = local_client_->mutable_backend() + ->BorrowStream(device_ordinal) + .ValueOrDie() + .get(); + } + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + return std::move(ret); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 258226523d830b40ecaa761df95988dc90f5ca47..90095c5d410f1561a1303a0f62f44d22ed5340f9 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -20,9 +20,10 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -31,7 +32,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -93,19 +93,19 @@ class LocalClientTestBase : public ::testing::Test { // options. 
StatusOr ExecuteLocally( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); StatusOr ExecuteLocally( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); ScopedShapedBuffer ExecuteLocallyOrDie( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments); + absl::Span arguments); ScopedShapedBuffer ExecuteLocallyOrDie( const XlaComputation& computation, - tensorflow::gtl::ArraySlice arguments, + absl::Span arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc index c0c02e584c2348f64a9d7d0800038f5ca67a2171..2d622242e657ce032a17f7b26c94227d343e2a38 100644 --- a/tensorflow/compiler/xla/tests/log_test.cc +++ b/tensorflow/compiler/xla/tests/log_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -30,8 +30,8 @@ class LogTest : public ClientLibraryTestBase {}; XLA_TEST_F(LogTest, LogZeroValues) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR3FromArray3D(Array3D(3, 0, 0)); - builder.Log(x); + auto x = ConstantR3FromArray3D(&builder, Array3D(3, 0, 0)); + Log(x); ComputeAndCompareR3(&builder, Array3D(3, 0, 0), {}, ErrorSpec(0.0001)); @@ -42,8 +42,8 @@ TEST_F(LogTest, LogTenValues) { 5.0, 6.0, -7.0, -8.0, 9.0}; XlaBuilder builder(TestName()); - auto x = builder.ConstantR1(input); - builder.Log(x); + auto x = ConstantR1(&builder, input); + Log(x); std::vector expected; expected.reserve(input.size()); diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc index 7df45bebebdd3eb2e71f27d831a8e2ac9e3b5f7c..0732e195d44d738b264361e43d38259c26a4116e 100644 --- a/tensorflow/compiler/xla/tests/map_test.cc +++ b/tensorflow/compiler/xla/tests/map_test.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -52,9 +52,9 @@ class MapTest : public ClientLibraryTestBase { // 1.0f ---------/ XlaComputation CreateAdderToOne() { XlaBuilder mapped_builder(TestName()); - auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto one = mapped_builder.ConstantR0(1.0); - mapped_builder.Add(x, one); + auto x = Parameter(&mapped_builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto one = ConstantR0(&mapped_builder, 1.0); + Add(x, one); auto computation_status = mapped_builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -62,9 +62,9 @@ class MapTest : public ClientLibraryTestBase { XlaComputation CreateMax() { XlaBuilder b(TestName()); - auto lhs = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto rhs = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - b.Max(lhs, rhs); + auto lhs = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto rhs = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Max(lhs, rhs); auto computation_status = b.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -75,8 +75,8 @@ class MapTest : public ClientLibraryTestBase { template XlaComputation CreateScalarOne() { XlaBuilder mapped_builder("scalar_one"); - (void)mapped_builder.Parameter(0, 
ShapeUtil::MakeShape(F32, {}), "x"); - mapped_builder.ConstantR0(1); + (void)Parameter(&mapped_builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + ConstantR0(&mapped_builder, 1); auto computation_status = mapped_builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -89,9 +89,9 @@ class MapTest : public ClientLibraryTestBase { // 2.0f ---------/ XlaComputation CreateMulByTwo() { XlaBuilder mapped_builder(TestName()); - auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto two = mapped_builder.ConstantR0(2.0); - mapped_builder.Mul(x, two); + auto x = Parameter(&mapped_builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto two = ConstantR0(&mapped_builder, 2.0); + Mul(x, two); auto computation_status = mapped_builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -107,10 +107,10 @@ class MapTest : public ClientLibraryTestBase { // 1.0f ---------/ XlaComputation CreateAdderToOneTimesItself() { XlaBuilder mapped_builder(TestName()); - auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto one = mapped_builder.ConstantR0(1.0); - auto adder_to_one = mapped_builder.Add(x, one); - mapped_builder.Mul(x, adder_to_one); + auto x = Parameter(&mapped_builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto one = ConstantR0(&mapped_builder, 1.0); + auto adder_to_one = Add(x, one); + Mul(x, adder_to_one); auto computation_status = mapped_builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -125,10 +125,10 @@ class MapTest : public ClientLibraryTestBase { XlaComputation CreateMapPlusN(const XlaComputation& embedded_computation, float n) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto map = builder.Map({x}, embedded_computation, {}); - auto constant_n = builder.ConstantR0(n); - builder.Add(map, constant_n); + 
auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto map = Map(&builder, {x}, embedded_computation, {}); + auto constant_n = ConstantR0(&builder, n); + Add(map, constant_n); auto computation_status = builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -138,9 +138,9 @@ class MapTest : public ClientLibraryTestBase { // defined by (x, y) -> x > y. XlaComputation CreateGt() { XlaBuilder b("Gt"); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - b.Gt(x, y); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Gt(x, y); auto computation_status = b.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -155,11 +155,11 @@ class MapTest : public ClientLibraryTestBase { // z {R0F32} ---------------/ XlaComputation CreateTernaryAdder() { XlaBuilder mapped_builder("TernaryAdder"); - auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = mapped_builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - auto z = mapped_builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "z"); - auto xy = mapped_builder.Add(x, y); - mapped_builder.Add(xy, z); + auto x = Parameter(&mapped_builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&mapped_builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + auto z = Parameter(&mapped_builder, 2, ShapeUtil::MakeShape(F32, {}), "z"); + auto xy = Add(x, y); + Add(xy, z); auto computation_status = mapped_builder.Build(); TF_CHECK_OK(computation_status.status()); return computation_status.ConsumeValueOrDie(); @@ -169,12 +169,12 @@ class MapTest : public ClientLibraryTestBase { TEST_F(MapTest, MapEachElemPlusOneR0) { // Applies lambda (x) (+ x 1)) to an input scalar. 
XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR0(42.0); + std::unique_ptr param0_literal = LiteralUtil::CreateR0(42.0); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateAdderToOne(), {}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateAdderToOne(), {}); ComputeAndCompareR0(&builder, 43.0, {param0_data.get()}, ErrorSpec(0.01f)); @@ -183,12 +183,12 @@ TEST_F(MapTest, MapEachElemPlusOneR0) { XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) { // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0. XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR1({}); + std::unique_ptr param0_literal = LiteralUtil::CreateR1({}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateAdderToOne(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateAdderToOne(), {0}); ComputeAndCompareR1(&builder, {}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -198,12 +198,12 @@ TEST_F(MapTest, MapEachElemPlusOneR1S4) { // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4. 
XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateAdderToOne(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateAdderToOne(), {0}); ComputeAndCompareR1(&builder, {3.2f, 4.3f, 5.4f, 6.5f}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -212,12 +212,12 @@ TEST_F(MapTest, MapEachElemPlusOneR1S4) { TEST_F(MapTest, MapEachF32ElementToS32Constant) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateScalarOne(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateScalarOne(), {0}); ComputeAndCompareR1(&builder, {1, 1, 1, 1}, {param0_data.get()}); } @@ -225,12 +225,12 @@ TEST_F(MapTest, MapEachF32ElementToS32Constant) { TEST_F(MapTest, MapEachF32ElementToU32Constant) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateScalarOne(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateScalarOne(), {0}); ComputeAndCompareR1(&builder, {1, 1, 1, 1}, {param0_data.get()}); } @@ -239,12 +239,12 @@ 
TEST_F(MapTest, MapEachElemLongerChainR1) { // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector. XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f}); + LiteralUtil::CreateR1({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateAdderToOneTimesItself(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateAdderToOneTimesItself(), {0}); ComputeAndCompareR1( &builder, {9.36f, 20.91f, 0.11f, 0.24f, 999000.0f, 65535.75f}, @@ -255,13 +255,13 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) { // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then // maps (lambda (x) (* x 2)) on the result. XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR1({}); + std::unique_ptr param0_literal = LiteralUtil::CreateR1({}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - auto map1 = builder.Map({param}, CreateAdderToOne(), {0}); - builder.Map({map1}, CreateMulByTwo(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto map1 = Map(&builder, {param}, CreateAdderToOne(), {0}); + Map(&builder, {map1}, CreateMulByTwo(), {0}); ComputeAndCompareR1(&builder, {}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -272,13 +272,13 @@ TEST_F(MapTest, MapMultipleMapsR1S4) { // maps (lambda (x) (* x 2)) on the result. 
XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - auto map1 = builder.Map({param}, CreateAdderToOne(), {0}); - builder.Map({map1}, CreateMulByTwo(), {0}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto map1 = Map(&builder, {param}, CreateAdderToOne(), {0}); + Map(&builder, {map1}, CreateMulByTwo(), {0}); ComputeAndCompareR1(&builder, {6.4f, 8.6f, 10.8f, 13.0f}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -287,13 +287,13 @@ TEST_F(MapTest, MapMultipleMapsR1S4) { TEST_F(MapTest, MapEachElemPlusOneR2) { // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector. XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR2( + std::unique_ptr param0_literal = LiteralUtil::CreateR2( {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param}, CreateAdderToOne(), {0, 1}); + auto param = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param}, CreateAdderToOne(), {0, 1}); Array2D expected_array( {{14.25f, 15.0f}, {-6.1f, -6.2f}, {-7.8f, 9.8f}}); @@ -319,10 +319,10 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) { auto embed3 = CreateMapPlusN(embed1, 4.0); XlaBuilder embed4_builder("embed4"); - auto embed4_param = embed4_builder.Parameter(0, scalar_shape, "x"); - auto embed4_map_lhs = embed4_builder.Map({embed4_param}, embed2, {}); - auto embed4_map_rhs = embed4_builder.Map({embed4_param}, embed3, {}); - embed4_builder.Add(embed4_map_lhs, embed4_map_rhs); + auto embed4_param = Parameter(&embed4_builder, 0, scalar_shape, "x"); + auto embed4_map_lhs = 
Map(&embed4_builder, {embed4_param}, embed2, {}); + auto embed4_map_rhs = Map(&embed4_builder, {embed4_param}, embed3, {}); + Add(embed4_map_lhs, embed4_map_rhs); auto embed4_status = embed4_builder.Build(); ASSERT_IS_OK(embed4_status.status()); auto embed4 = embed4_status.ConsumeValueOrDie(); @@ -330,11 +330,11 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) { auto embed5 = CreateMapPlusN(embed2, 6.0); XlaBuilder builder(TestName()); - auto constant_42 = builder.ConstantR0(42.0); - auto constant_7 = builder.ConstantR0(7.0); - auto map_42 = builder.Map({constant_42}, embed5, {}); - auto map_7 = builder.Map({constant_7}, embed4, {}); - builder.Add(map_42, map_7); + auto constant_42 = ConstantR0(&builder, 42.0); + auto constant_7 = ConstantR0(&builder, 7.0); + auto map_42 = Map(&builder, {constant_42}, embed5, {}); + auto map_7 = Map(&builder, {constant_7}, embed4, {}); + Add(map_42, map_7); ComputeAndCompareR0(&builder, 73.0, {}, ErrorSpec(0.01f)); } @@ -343,17 +343,18 @@ TEST_F(MapTest, MapBinaryAdder) { // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors. 
XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); + LiteralUtil::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, CreateScalarAddComputation(F32, &builder), {0}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, CreateScalarAddComputation(F32, &builder), + {0}); ComputeAndCompareR1(&builder, {7.3f, 7.7, 4.3f, 0}, {param0_data.get(), param1_data.get()}, @@ -364,20 +365,20 @@ TEST_F(MapTest, MapBinaryAdder) { // for Map that used to fail in shape inference (b/28989438). 
XLA_TEST_F(MapTest, AddWithMixedLayouts) { XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR2WithLayout( + std::unique_ptr param0_literal = LiteralUtil::CreateR2WithLayout( {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0})); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - std::unique_ptr param1_literal = Literal::CreateR2WithLayout( + std::unique_ptr param1_literal = LiteralUtil::CreateR2WithLayout( {{10, 20}, {30, 40}}, LayoutUtil::MakeLayout({0, 1})); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder), - {0, 1}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, CreateScalarAddComputation(S32, &builder), + {0, 1}); Array2D expected(2, 2); expected(0, 0) = 11; @@ -391,19 +392,19 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) { XLA_TEST_F(MapTest, AddR3_3x0x2) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR3FromArray3D(Array3D(3, 0, 2)); + LiteralUtil::CreateR3FromArray3D(Array3D(3, 0, 2)); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR3FromArray3D(Array3D(3, 0, 2)); + LiteralUtil::CreateR3FromArray3D(Array3D(3, 0, 2)); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder), - {0, 1, 2}); + auto param0 = 
Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, CreateScalarAddComputation(S32, &builder), + {0, 1, 2}); ComputeAndCompareR3(&builder, Array3D(3, 0, 2), {param0_data.get(), param1_data.get()}); @@ -413,22 +414,22 @@ TEST_F(MapTest, MapTernaryAdder) { // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors. XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); + LiteralUtil::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); std::unique_ptr param2_literal = - Literal::CreateR1({-10.0f, -100.0f, -900.0f, -400.0f}); + LiteralUtil::CreateR1({-10.0f, -100.0f, -900.0f, -400.0f}); std::unique_ptr param2_data = client_->TransferToServer(*param2_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - auto param2 = builder.Parameter(2, param2_literal->shape(), "param2"); - builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + auto param2 = Parameter(&builder, 2, param2_literal->shape(), "param2"); + Map(&builder, {param0, param1, param2}, CreateTernaryAdder(), {0}); ComputeAndCompareR1( &builder, {-2.7f, -92.3f, -895.7f, -400.0f}, @@ -440,7 +441,8 @@ TEST_F(MapTest, MapGt) { // Maps (x,y) -> x > y onto two R1F32 vectors. 
XlaBuilder b(TestName()); auto gt = CreateGt(); - b.Map({b.ConstantR1({1, 20}), b.ConstantR1({10, 2})}, gt, {0}); + Map(&b, {ConstantR1(&b, {1, 20}), ConstantR1(&b, {10, 2})}, gt, + {0}); ComputeAndCompareR1(&b, {false, true}, {}); } @@ -449,15 +451,15 @@ TEST_F(MapTest, NestedBinaryMap) { { // max_with_square(x) = do max(x, x^2) via a map. XlaBuilder b("max_with_square"); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - b.Map({x, b.Mul(x, x)}, CreateMax(), {}); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x"); + Map(&b, {x, Mul(x, x)}, CreateMax(), {}); auto computation_status = b.Build(); ASSERT_IS_OK(computation_status.status()); max_with_square = computation_status.ConsumeValueOrDie(); } XlaBuilder b(TestName()); - auto input = b.ConstantR1({0.1f, 0.5f, -0.5f, 1.0f, 2.0f}); - b.Map({input}, max_with_square, {0}); + auto input = ConstantR1(&b, {0.1f, 0.5f, -0.5f, 1.0f, 2.0f}); + Map(&b, {input}, max_with_square, {0}); ComputeAndCompareR1(&b, {0.1f, 0.5f, 0.25f, 1.0f, 4.0f}, {}); } @@ -468,30 +470,29 @@ TEST_F(MapTest, MapOperantionWithBuildError) { XlaBuilder builder(TestName()); auto sub_builder = builder.CreateSubBuilder("ErrorAdd"); - auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(U16, {}), "y"); - sub_builder->Add(x, y); + auto x = Parameter(sub_builder.get(), 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(sub_builder.get(), 1, ShapeUtil::MakeShape(U16, {}), "y"); + Add(x, y); auto error_add = sub_builder->BuildAndNoteError(); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 3.3f, 4.4f, 5.5f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_literal = - Literal::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); + LiteralUtil::CreateR1({5.1f, 4.4f, -0.1f, -5.5f}); std::unique_ptr param1_data = 
client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, error_add, {0}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, error_add, {0}); StatusOr computation_status = builder.Build(); ASSERT_TRUE(!computation_status.ok()); - EXPECT_THAT( - computation_status.status().ToString(), - ::testing::HasSubstr("error from: ErrorAdd: Binary op BINOP_ADD with " - "different element types: f32[] and u16[]")); + EXPECT_THAT(computation_status.status().ToString(), + ::testing::HasSubstr("error from: ErrorAdd: Binary op add with " + "different element types: f32[] and u16[]")); } // MapTest disables inline and algsimp. MapTestWithFullOpt runs all @@ -507,21 +508,21 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) { XlaBuilder builder(TestName()); auto sub_builder = builder.CreateSubBuilder("power"); - auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - sub_builder->Pow(x, y); + auto x = Parameter(sub_builder.get(), 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(sub_builder.get(), 1, ShapeUtil::MakeShape(F32, {}), "y"); + Pow(x, y); auto power = sub_builder->BuildAndNoteError(); - std::unique_ptr param0_literal = Literal::CreateR0(2.0f); - std::unique_ptr param1_literal = Literal::CreateR0(5.0f); + std::unique_ptr param0_literal = LiteralUtil::CreateR0(2.0f); + std::unique_ptr param1_literal = LiteralUtil::CreateR0(5.0f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, 
param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, power, {}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, power, {}); ComputeAndCompareR0(&builder, 32.0f, {param0_data.get(), param1_data.get()}, @@ -534,21 +535,21 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) { XlaBuilder builder(TestName()); auto sub_builder = builder.CreateSubBuilder("power"); - auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - sub_builder->Sub(y, x); // note that this is y - x, not x - y + auto x = Parameter(sub_builder.get(), 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(sub_builder.get(), 1, ShapeUtil::MakeShape(F32, {}), "y"); + Sub(y, x); // note that this is y - x, not x - y auto sub_opposite = sub_builder->BuildAndNoteError(); - std::unique_ptr param0_literal = Literal::CreateR0(2.0f); - std::unique_ptr param1_literal = Literal::CreateR0(5.0f); + std::unique_ptr param0_literal = LiteralUtil::CreateR0(2.0f); + std::unique_ptr param1_literal = LiteralUtil::CreateR0(5.0f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); std::unique_ptr param1_data = client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - auto param1 = builder.Parameter(1, param1_literal->shape(), "param1"); - builder.Map({param0, param1}, sub_opposite, {}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + auto param1 = Parameter(&builder, 1, param1_literal->shape(), "param1"); + Map(&builder, {param0, param1}, sub_opposite, {}); ComputeAndCompareR0( &builder, 3.0f, {param0_data.get(), param1_data.get()}, ErrorSpec(0.01f)); 
@@ -560,16 +561,16 @@ TEST_F(MapTestWithFullOpt, MapSquare) { XlaBuilder builder(TestName()); auto sub_builder = builder.CreateSubBuilder("power"); - auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - sub_builder->Mul(x, x); + auto x = Parameter(sub_builder.get(), 0, ShapeUtil::MakeShape(F32, {}), "x"); + Mul(x, x); auto square = sub_builder->BuildAndNoteError(); - std::unique_ptr param0_literal = Literal::CreateR0(10.0f); + std::unique_ptr param0_literal = LiteralUtil::CreateR0(10.0f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); - builder.Map({param0}, square, {}); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); + Map(&builder, {param0}, square, {}); ComputeAndCompareR0(&builder, 100.0f, {param0_data.get()}, ErrorSpec(0.01f)); diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc index 27fd36e06acdc589f3a84ad561164e4a33b93506..edb592f43ec778a3fe6e5ef936827dd612791760 100644 --- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc @@ -17,12 +17,14 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -32,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -56,15 +57,15 @@ TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32); XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { using T = TypeParam; XlaBuilder builder("exp_2x2"); - auto data = builder.ConstantR2FromArray2D({ - {1.0f, 0.0f}, // row 0 - {-1.0f, 0.5f}, // row 1 - }); - builder.Exp(data); + auto data = ConstantR2FromArray2D(&builder, { + {1.0f, 0.0f}, // row 0 + {-1.0f, 0.5f}, // row 1 + }); + Exp(data); std::unique_ptr expected = - Literal::CreateR2FromArray2D({{2.71828f, 1.00000f}, // row 0 - {0.36788f, 1.64872f}}); // row 1 + LiteralUtil::CreateR2FromArray2D({{2.71828f, 1.00000f}, // row 0 + {0.36788f, 1.64872f}}); // row 1 this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5)); } @@ -76,43 +77,43 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { // add_half(x) = x + 
0.5 XlaBuilder builder("add_half"); auto x_value = - builder.Parameter(0, ShapeUtil::MakeShapeWithType({}), "x_value"); - auto half = builder.ConstantR0(static_cast(0.5)); - builder.Add(x_value, half); + Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({}), "x_value"); + auto half = ConstantR0(&builder, static_cast(0.5)); + Add(x_value, half); auto computation_status = builder.Build(); ASSERT_IS_OK(computation_status.status()); add_half = computation_status.ConsumeValueOrDie(); } XlaBuilder builder("map_2x2"); - auto data = builder.ConstantR2FromArray2D({ - {1.0f, 0.0f}, // row 0 - {-1.0f, 0.5f}, // row 1 - }); - auto map = builder.Map({data}, add_half, {0, 1}); + auto data = ConstantR2FromArray2D(&builder, { + {1.0f, 0.0f}, // row 0 + {-1.0f, 0.5f}, // row 1 + }); + Map(&builder, {data}, add_half, {0, 1}); std::unique_ptr expected = - Literal::CreateR2FromArray2D({{1.5f, 0.5f}, // row 0 - {-0.5f, 1.0f}}); // row 1 + LiteralUtil::CreateR2FromArray2D({{1.5f, 0.5f}, // row 0 + {-0.5f, 1.0f}}); // row 1 this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5)); } XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) { using T = TypeParam; XlaBuilder builder("max_2x2"); - auto lhs = builder.ConstantR2FromArray2D({ - {7.0f, 2.0f}, // row 0 - {3.0f, -4.0f}, // row 1 - }); - auto rhs = builder.ConstantR2FromArray2D({ - {5.0f, 6.0f}, // row 0 - {1.0f, -8.0f}, // row 1 - }); - auto max = builder.Max(lhs, rhs); + auto lhs = ConstantR2FromArray2D(&builder, { + {7.0f, 2.0f}, // row 0 + {3.0f, -4.0f}, // row 1 + }); + auto rhs = ConstantR2FromArray2D(&builder, { + {5.0f, 6.0f}, // row 0 + {1.0f, -8.0f}, // row 1 + }); + Max(lhs, rhs); std::unique_ptr expected = - Literal::CreateR2FromArray2D({{7.0f, 6.0f}, // row 0 - {3.0f, -4.0f}}); // row 1 + LiteralUtil::CreateR2FromArray2D({{7.0f, 6.0f}, // row 0 + {3.0f, -4.0f}}); // row 1 this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6)); } @@ -133,13 +134,12 @@ class TestLinspaceMaxParametric 
float from = -128.0, to = 256.0; std::unique_ptr> alhs = MakeLinspaceArray2D(from, to, rows, cols); - auto arhs = MakeUnique>(rows, cols, static_cast(1.0f)); + auto arhs = absl::make_unique>(rows, cols, static_cast(1.0f)); - XlaBuilder builder( - tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols)); - auto lhs = builder.ConstantR2FromArray2D(*alhs); - auto rhs = builder.ConstantR2FromArray2D(*arhs); - auto max = builder.Max(lhs, rhs); + XlaBuilder builder(absl::StrFormat("max_%dx%d_linspace", rows, cols)); + auto lhs = ConstantR2FromArray2D(&builder, *alhs); + auto rhs = ConstantR2FromArray2D(&builder, *arhs); + Max(lhs, rhs); Array2D expected(rows, cols); for (int row = 0; row < rows; ++row) { @@ -158,7 +158,7 @@ class TestLinspaceMaxParametric string PrintTestLinspaceMaxParam( const ::testing::TestParamInfo& test_param) { const TestLinspaceMaxParam& param = test_param.param; - return tensorflow::strings::StrCat(param.rows, "r", param.cols, "c"); + return absl::StrCat(param.rows, "r", param.cols, "c"); } #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16 @@ -200,31 +200,33 @@ class MatOpsDotAddTest TF_ASSERT_OK_AND_ASSIGN( auto lhs_handle, - client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( - lhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); + client_->TransferToServer( + *LiteralUtil::CreateR2FromArray2DWithLayout( + lhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); TF_ASSERT_OK_AND_ASSIGN( auto rhs_handle, - client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( - rhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); + client_->TransferToServer( + *LiteralUtil::CreateR2FromArray2DWithLayout( + rhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); XlaBuilder builder(TestName()); - auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs"); + auto lhs_arg = Parameter(&builder, 0, lhs_shape, "lhs"); auto lhs_mat_arg = lhs_arg; if (transpose) { - lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0}); + 
lhs_mat_arg = Transpose(lhs_mat_arg, {1, 0}); } - auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs"); - auto result = builder.Dot(lhs_mat_arg, rhs_arg); + auto rhs_arg = Parameter(&builder, 1, rhs_shape, "rhs"); + auto result = Dot(lhs_mat_arg, rhs_arg); Array2D expected; if (add_lhs) { - result = builder.Add(result, lhs_arg); + result = Add(result, lhs_arg); if (transpose) { expected = Array2D({{47.0f, 52.0f}, {71.0f, 78.0f}}); } else { expected = Array2D({{35.0f, 39.0f}, {81.0f, 89.0f}}); } } else { - result = builder.Add(result, rhs_arg); + result = Add(result, rhs_arg); if (transpose) { expected = Array2D({{56.0f, 61.0f}, {80.0f, 87.0f}}); } else { diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc index 0791a71aacf7614286fe964623a3172a174d4722..955dbef6dcd28421fb351c6ee064ac53eda1fd08 100644 --- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc +++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc @@ -20,7 +20,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -33,9 +33,10 @@ class SliceTest : public ClientLibraryTestBase {}; XLA_TEST_F(SliceTest, Slice2D) { XlaBuilder builder("slice_2d"); - auto original = builder.ConstantR2( + auto original = ConstantR2( + &builder, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}); - builder.Slice(original, {2, 1}, {4, 3}, {1, 1}); + Slice(original, {2, 1}, {4, 3}, {1, 1}); Array2D expected({{8.0f, 9.0f}, {11.0f, 12.0f}}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); @@ -45,8 +46,8 @@ XLA_TEST_F(SliceTest, Slice3D) { XlaBuilder builder("slice_3d"); Array3D array_3d( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}); - auto original = builder.ConstantR3FromArray3D(array_3d); - builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1}); + auto original = ConstantR3FromArray3D(&builder, array_3d); + Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1}); Array3D expected_3d({{{2.0f}}, {{6.0f}}}); ComputeAndCompareR3(&builder, expected_3d, {}, ErrorSpec(0.000001)); diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc index 3cbb2452fb245b6703d3bcd5771a51f6e30aa593..05f90ba9fb7d781f64bd52008423f603397ce628 100644 --- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc +++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc @@ -19,10 +19,12 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -36,7 +38,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" @@ -46,21 +47,30 @@ limitations under the License. namespace xla { namespace { -using ::tensorflow::gtl::ArraySlice; class MultiOutputFusionTest : public HloTestBase { protected: MultiOutputFusionTest() { error_spec_ = ErrorSpec{0.0001, 1e-2}; } + // Layout assignment assumes that there are no fusions in the input graph. + // Since the purpose of this test is to send pre-fused graphs to XLA, we have + // to do layout assignment ourselves. 
+ DebugOptions GetDebugOptionsForTest() override { + auto opts = HloTestBase::GetDebugOptionsForTest(); + opts.add_xla_disable_hlo_passes("layout-assignment"); + return opts; + } + void RunTest2D(bool manual_fusion, int64 size) { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); - const Shape elem_shape0 = ShapeUtil::MakeShape(F32, {}); - const Shape elem_shape2 = ShapeUtil::MakeShape(F32, {size, size}); + const Shape elem_shape0 = ShapeUtil::MakeShapeWithLayout(F32, {}, {}); + const Shape elem_shape2 = + ShapeUtil::MakeShapeWithLayout(F32, {size, size}, {1, 0}); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(8.0f))); + HloInstruction::CreateConstant(LiteralUtil::CreateR0(8.0f))); auto param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, elem_shape0, "0")); @@ -85,8 +95,8 @@ class MultiOutputFusionTest : public HloTestBase { auto computation = hlo_module->AddEntryComputation(builder.Build(dot)); if (manual_fusion) { - auto tuple = computation->AddInstruction(HloInstruction::CreateTuple( - ArraySlice({sub, add2}, 0, 2))); + auto tuple = + computation->AddInstruction(HloInstruction::CreateTuple({sub, add2})); auto gte0 = computation->AddInstruction( HloInstruction::CreateGetTupleElement(elem_shape2, tuple, 0)); auto gte1 = computation->AddInstruction( @@ -100,13 +110,14 @@ class MultiOutputFusionTest : public HloTestBase { nullptr); } - Literal arg1(ShapeUtil::MakeShape(F32, {size, size})); + Literal arg1(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, size})); arg1.PopulateWithValue(2.5f); - Literal expect(ShapeUtil::MakeShape(F32, {size, size})); + Literal expect(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, size})); expect.PopulateWithValue(size * 1.5f * 3.5f); - auto actual = ExecuteAndTransfer( - std::move(hlo_module), {Literal::CreateR0(-9.0f).get(), &arg1}); + auto actual = + ExecuteAndTransfer(std::move(hlo_module), + 
{LiteralUtil::CreateR0(-9.0f).get(), &arg1}); EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_)); } @@ -114,8 +125,10 @@ class MultiOutputFusionTest : public HloTestBase { auto builder = HloComputation::Builder(TestName()); auto hlo_module = CreateNewModule(); - const Shape elem_shape_F32 = ShapeUtil::MakeShape(F32, {size}); - const Shape elem_shape_U8 = ShapeUtil::MakeShape(F64, {size}); + const Shape elem_shape_F32 = + ShapeUtil::MakeShapeWithDescendingLayout(F32, {size}); + const Shape elem_shape_U8 = + ShapeUtil::MakeShapeWithDescendingLayout(F64, {size}); auto param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, elem_shape_F32, "0")); auto param1 = builder.AddInstruction( @@ -135,17 +148,18 @@ class MultiOutputFusionTest : public HloTestBase { HloInstruction* reshape = builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {size, 1}), add)); + ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, 1}), add)); DotDimensionNumbers dot_dnums; dot_dnums.add_lhs_contracting_dimensions(0); dot_dnums.add_rhs_contracting_dimensions(0); HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( - ShapeUtil::MakeShape(F32, {1}), sub, reshape, dot_dnums)); + ShapeUtil::MakeShapeWithDescendingLayout(F32, {1}), sub, reshape, + dot_dnums)); auto computation = hlo_module->AddEntryComputation(builder.Build(dot)); if (manual_fusion) { - auto tuple = computation->AddInstruction(HloInstruction::CreateTuple( - ArraySlice({sub_U8, add}, 0, 2))); + auto tuple = computation->AddInstruction( + HloInstruction::CreateTuple({sub_U8, add})); auto gte0 = computation->AddInstruction( HloInstruction::CreateGetTupleElement(elem_shape_U8, tuple, 0)); @@ -160,12 +174,13 @@ class MultiOutputFusionTest : public HloTestBase { nullptr); } - Literal input0(ShapeUtil::MakeShape(F32, {size})); + Literal input0(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size})); input0.PopulateWithValue(2.5f); - Literal 
input1(ShapeUtil::MakeShape(F64, {size})); + Literal input1(ShapeUtil::MakeShapeWithDescendingLayout(F64, {size})); input1.PopulateWithValue(1.); - Literal expect = std::move(*Literal::CreateR1({size * 1.5f * 3.5f})); + Literal expect = + std::move(*LiteralUtil::CreateR1({size * 1.5f * 3.5f})); auto actual = ExecuteAndTransfer(std::move(hlo_module), {&input0, &input1}); EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_)); } @@ -198,16 +213,16 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) { auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::MakeTupleOwned( - Literal::MakeTupleOwned( - Literal::MakeTupleOwned(Literal::CreateR0(42)), - Literal::CreateR0(1.0)), - Literal::MakeTupleOwned(Literal::CreateR0(3.0), - Literal::CreateR0(4))); - TF_ASSERT_OK_AND_ASSIGN(auto result, - Execute(std::move(module), {param.get()})); + auto param = LiteralUtil::MakeTupleOwned( + LiteralUtil::MakeTupleOwned( + LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0(42)), + LiteralUtil::CreateR0(1.0)), + LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0(3.0), + LiteralUtil::CreateR0(4))); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( - *result, *Literal::MakeTupleOwned(Literal::CreateR0(42)))); + *LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0(42)), *result)); } XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) { @@ -232,11 +247,10 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) { auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::CreateR1({1.0, 2.0, 3.0, -1.0}); - TF_ASSERT_OK_AND_ASSIGN(auto result, - Execute(std::move(module), {param.get()})); - EXPECT_TRUE(LiteralTestUtil::Equal( - *result, *Literal::CreateR1({0.0, 4.0, 9.0, 1.0}))); + auto param = LiteralUtil::CreateR1({1.0, 2.0, 3.0, -1.0}); + std::unique_ptr result = + 
ExecuteNoHloPasses(std::move(module), {param.get()}); + LiteralTestUtil::ExpectR1Equal({0.0, 4.0, 9.0, 1.0}, *result); } XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) { @@ -266,11 +280,10 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) { auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::CreateR1({1.0, 2.0, 3.0}); - TF_ASSERT_OK_AND_ASSIGN(auto result, - Execute(std::move(module), {param.get()})); - EXPECT_TRUE(LiteralTestUtil::Equal( - *result, *Literal::CreateR1({0.0, 4.0, 9.0}))); + auto param = LiteralUtil::CreateR1({1.0, 2.0, 3.0}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); + LiteralTestUtil::ExpectR1Equal({0.0, 4.0, 9.0}, *result); } const char* const kScalarOps = R"( @@ -291,7 +304,7 @@ const char* const kScalarOps = R"( XLA_TEST_F(MultiOutputFusionTest, DISABLED_ON_CPU(MultiOutputReduceFusionMinor)) { - const string testcase = tensorflow::strings::StrCat(kScalarOps, R"( + const string testcase = absl::StrCat(kScalarOps, R"( fused_reduce { p0 = f32[2,2,2]{2,1,0} parameter(0) c0 = f32[] constant(0) @@ -310,18 +323,20 @@ XLA_TEST_F(MultiOutputFusionTest, auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); - TF_ASSERT_OK_AND_ASSIGN(auto result, - Execute(std::move(module), {param.get()})); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( - *result, - *Literal::MakeTupleOwned(Literal::CreateR2({{3, 7}, {11, 15}}), - Literal::CreateR2({{5, 16}, {36, 64}})))); + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR2({{3, 7}, {11, 15}}), + LiteralUtil::CreateR2({{5, 16}, {36, 64}})), + *result)); } XLA_TEST_F(MultiOutputFusionTest, 
DISABLED_ON_CPU(MultiOutputReduceFusionMajor)) { - const string testcase = tensorflow::strings::StrCat(kScalarOps, R"( + const string testcase = absl::StrCat(kScalarOps, R"( fused_reduce { p0 = f32[2,2,2]{2,1,0} parameter(0) c0 = f32[] constant(0) @@ -340,26 +355,28 @@ XLA_TEST_F(MultiOutputFusionTest, auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); - TF_ASSERT_OK_AND_ASSIGN(auto result, - Execute(std::move(module), {param.get()})); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( - *result, *Literal::MakeTupleOwned( - Literal::CreateR2({{6, 8}, {10, 12}}), - Literal::CreateR2({{25, 36}, {49, 64}})))); + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR2({{6, 8}, {10, 12}}), + LiteralUtil::CreateR2({{25, 36}, {49, 64}})), + *result)); } XLA_TEST_F(MultiOutputFusionTest, DISABLED_ON_CPU(MultiOutputReduceFusionScalar)) { - const string testcase = tensorflow::strings::StrCat(kScalarOps, R"( + const string testcase = absl::StrCat(kScalarOps, R"( fused_reduce { p0 = f32[2,2,2]{2,1,0} parameter(0) c0 = f32[] constant(0) r1 = f32[2]{0} reduce(p0, c0), dimensions={0,2}, to_apply=Add mul = f32[2,2,2]{2,1,0} multiply(p0, p0) - c1 = f32[] constant(5) + c1 = f32[] constant(1.17549e-38) r2 = f32[2]{0} reduce(mul, c1), dimensions={0,2}, to_apply=Max - r3 = f32[2]{0} reduce(mul, c1), dimensions={0,2}, to_apply=Add + r3 = f32[2]{0} reduce(mul, c0), dimensions={0,2}, to_apply=Add ROOT tuple = (f32[2]{0}, f32[2]{0}, f32[2]{0}) tuple(r1, r2, r3) } @@ -371,13 +388,196 @@ XLA_TEST_F(MultiOutputFusionTest, auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); - TF_ASSERT_OK_AND_ASSIGN(auto result, - 
Execute(std::move(module), {param.get()})); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({14, 22}), + LiteralUtil::CreateR1({36, 64}), + LiteralUtil::CreateR1({66, 138})), + *result)); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionMinorWithExtraOutput)) { + const string testcase = absl::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={2}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(5) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=Max + ROOT tuple = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) + tuple(p0, r1, r2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), + kind=kInput, calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}), + LiteralUtil::CreateR2({{3, 7}, {11, 15}}), + LiteralUtil::CreateR2({{5, 16}, {36, 64}})), + *result)); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionMajorWithExtraOutput)) { + const string testcase = absl::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={0}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(5) + r2 = f32[2,2]{1,0} 
reduce(mul, c1), dimensions={0}, to_apply=Max + ROOT tuple = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) + tuple(r1, mul, r2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) fusion(p), + kind=kInput, calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR2({{6, 8}, {10, 12}}), + LiteralUtil::CreateR3( + {{{1, 4}, {9, 16}}, {{25, 36}, {49, 64}}}), + LiteralUtil::CreateR2({{25, 36}, {49, 64}})), + *result)); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionScalarWithExtraOutput)) { + const string testcase = absl::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2]{0} reduce(p0, c0), dimensions={0,2}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(5) + b1 = f32[2,2,2]{2,1,0} broadcast(c1), dimensions={} + mul2 = f32[2,2,2]{2,1,0} multiply(p0, b1) + ROOT tuple = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) + tuple(r1, mul, mul2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) fusion(p), + kind=kInput, calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = + LiteralUtil::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1({14, 22}), + LiteralUtil::CreateR3( + {{{1, 4}, {9, 16}}, {{25, 36}, {49, 
64}}}), + LiteralUtil::CreateR3( + {{{5, 10}, {15, 20}}, {{25, 30}, {35, 40}}})), + *result)); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionNonConstInit)) { + const string testcase = absl::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + init1 = f32[] parameter(1) + init2 = f32[] parameter(2) + r1 = f32[2,2]{1,0} reduce(p0, init1), dimensions={2}, to_apply=Add + r2 = f32[2,2]{1,0} reduce(p0, init2), dimensions={2}, to_apply=Max + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + i = f32[] parameter(1) + j = f32[] parameter(2) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p, i, j), kind=kInput, + calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = + LiteralUtil::CreateR3({{{0, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + auto init1 = LiteralUtil::CreateR0(5); + auto init2 = LiteralUtil::CreateR0(6); + std::unique_ptr result = ExecuteNoHloPasses( + std::move(module), {param.get(), init1.get(), init2.get()}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR2({{167, 172}, {176, 180}}), + LiteralUtil::CreateR2({{6, 6}, {6, 8}})), + *result)); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionDifferentElementTypes)) { + const string testcase = absl::StrCat(kScalarOps, R"( + fused_reduce (p0: f16[2,2,2]) -> (f32[2,2], f32[2,2], f16[2,2,2]) { + p0 = f16[2,2,2]{2,1,0} parameter(0) + convert = f32[2,2,2]{2,1,0} convert(p0) + c0 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(convert, c0), dimensions={2}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(convert, convert) + c1 = f32[] constant(5) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=Max + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) + tuple(r1, r2, p0) + } + + ENTRY reduce { + p = 
f16[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) fusion(p), + kind=kInput, calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = LiteralUtil::CreateR3( + {{{Eigen::half(1), Eigen::half(2)}, {Eigen::half(3), Eigen::half(4)}}, + {{Eigen::half(5), Eigen::half(6)}, {Eigen::half(7), Eigen::half(8)}}}); + std::unique_ptr result = + ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( - *result, *Literal::MakeTupleOwned(Literal::CreateR1({14, 22}), - Literal::CreateR1({36, 64}), - Literal::CreateR1({391, 463})))); + *LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR2({{3, 7}, {11, 15}}), + LiteralUtil::CreateR2({{5, 16}, {36, 64}}), + LiteralUtil::CreateR3( + {{{Eigen::half(1), Eigen::half(2)}, + {Eigen::half(3), Eigen::half(4)}}, + {{Eigen::half(5), Eigen::half(6)}, + {Eigen::half(7), Eigen::half(8)}}})), + *result)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a0426adcbc1b5b89be0841fa2c4204e2b65abf4 --- /dev/null +++ b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc @@ -0,0 +1,169 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/local_client_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +// Tests that ensure outfeed instructions that are contained in nested +// computations in non-root positions are executed. + +class OutfeedInNestedComputationTest : public LocalClientTestBase {}; + +XLA_TEST_F(OutfeedInNestedComputationTest, OutfeedInWhile) { + XlaBuilder b(TestName()); + + Shape state_tuple_array_shape = ShapeUtil::MakeShape(xla::S32, {10, 5}); + Shape int_shape = ShapeUtil::MakeShape(xla::S32, {}); + Shape state_tuple_shape = + ShapeUtil::MakeTupleShape({int_shape, state_tuple_array_shape}); + Shape xfeed_shape = ShapeUtil::MakeShape(xla::S32, {2}); + + XlaOp some_buffer = Broadcast(ConstantR0(&b, 0), {10, 5}); + XlaOp num_iter = Infeed(&b, int_shape); + XlaOp init_tuple = Tuple(&b, {num_iter, some_buffer}); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_cond, [&] { + // Condition: iteration variable > 0 + XlaBuilder cond_builder("loop_condition"); + XlaOp state_tuple = Parameter(&cond_builder, 0, state_tuple_shape, "state"); + XlaOp loop_counter = GetTupleElement(state_tuple, 0); + Outfeed(loop_counter, int_shape, ""); + Gt(loop_counter, ConstantR0(&cond_builder, 0)); + return cond_builder.Build(); + }()); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_body, [&] { + XlaBuilder body_builder("loop_body"); + XlaOp state_tuple = Parameter(&body_builder, 0, state_tuple_shape, "state"); + XlaOp loop_counter = GetTupleElement(state_tuple, 0); + XlaOp buffer_inside = GetTupleElement(state_tuple, 1); + + // Read some stuff from Infeed. 
+ XlaOp some_input = Infeed(&body_builder, xfeed_shape); + XlaOp sum = Add(some_input, Broadcast(loop_counter, {2})); + Outfeed(sum, xfeed_shape, ""); + + XlaOp iter_left = Sub(loop_counter, ConstantR0(&body_builder, 1)); + + Tuple(&body_builder, {iter_left, buffer_inside}); + return body_builder.Build(); + }()); + + // Build loop. + XlaOp result_tuple = While(loop_cond, loop_body, init_tuple); + GetTupleElement(result_tuple, 0); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build()); + + std::unique_ptr comp_result; + std::unique_ptr thread( + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions(), "execute_thread", [&] { + comp_result = local_client_->ExecuteAndTransfer(computation, {}) + .ConsumeValueOrDie(); + })); + + VLOG(1) << "Transferring trip count to computation"; + // Transfer number of iterations to Infeed. + TF_ASSERT_OK( + local_client_->TransferToInfeed(*LiteralUtil::CreateR0(1))); + + // Pick up value from outfeed + { + VLOG(1) << "Reading from condition outfeed"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr r, + local_client_->TransferFromOutfeed(&int_shape)); + EXPECT_EQ(r->Get({}), 1); + } + + VLOG(1) << "Writing data to infeed"; + // Transfer some stuff to Infeed for use inside of loop. 
+ TF_ASSERT_OK(local_client_->TransferToInfeed( + *LiteralUtil::CreateR1({10, 20}))); + + // Pick up value from outfeed + { + VLOG(1) << "Reading from body outfeed"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr r, + local_client_->TransferFromOutfeed(&xfeed_shape)); + EXPECT_EQ(r->Get({0}), 11); + EXPECT_EQ(r->Get({1}), 21); + } + + { + VLOG(1) << "Reading from condition outfeed"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr r, + local_client_->TransferFromOutfeed(&int_shape)); + EXPECT_EQ(r->Get({}), 0); + } + + // Joins the thread + thread.reset(); + + EXPECT_EQ(comp_result->Get({}), 0); +} + +XLA_TEST_F(OutfeedInNestedComputationTest, OutfeedInConditional) { + XlaBuilder b(TestName()); + + Shape condition_shape = ShapeUtil::MakeShape(xla::PRED, {}); + Shape result_shape = ShapeUtil::MakeShape(xla::PRED, {}); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation true_computation, [&] { + XlaBuilder inner_builder("true_computation"); + XlaOp param = Parameter(&inner_builder, 0, result_shape, "param"); + Outfeed(param, result_shape, ""); + Or(param, param); + return inner_builder.Build(); + }()); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation false_computation, [&] { + XlaBuilder inner_builder("false_computation"); + Parameter(&inner_builder, 0, result_shape, "param"); + return inner_builder.Build(); + }()); + + XlaOp pred = Infeed(&b, condition_shape); + Conditional(/*predicate=*/pred, /*true_operand=*/pred, + /*true_computation=*/true_computation, /*false_operand=*/pred, + /*false_computation=*/false_computation); + + TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build()); + + std::unique_ptr comp_result; + std::unique_ptr thread( + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions(), "execute_thread", [&] { + comp_result = local_client_->ExecuteAndTransfer(computation, {}) + .ConsumeValueOrDie(); + })); + + TF_ASSERT_OK( + local_client_->TransferToInfeed(*LiteralUtil::CreateR0(true))); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr r, + 
local_client_->TransferFromOutfeed(&result_shape)); + + EXPECT_EQ(r->Get({}), true); + + // Join the thread + thread.reset(); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc index ce295b832d79e4f00656f2893c2ba1162693dd73..cbeddffacfa4a0fc560e8b9f9a8d7bd23ff32e55 100644 --- a/tensorflow/compiler/xla/tests/pad_test.cc +++ b/tensorflow/compiler/xla/tests/pad_test.cc @@ -16,12 +16,12 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -93,8 +93,8 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(0); - b.Pad(AddParam(*Literal::CreateR1({}), &b), - AddParam(*Literal::CreateR0(0.1), &b), padding_config); + Pad(AddParam(*LiteralUtil::CreateR1({}), &b), + AddParam(*LiteralUtil::CreateR0(0.1), &b), padding_config); ComputeAndCompareR1(&b, {}, {}, DefaultErrorSpec()); } @@ -108,8 +108,8 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) { dimension->set_edge_padding_high(4); dimension->set_interior_padding(7); - b.Pad(AddParam(*Literal::CreateR1({}), &b), - AddParam(*Literal::CreateR0(0.1), &b), padding_config); + Pad(AddParam(*LiteralUtil::CreateR1({}), &b), + AddParam(*LiteralUtil::CreateR0(0.1), &b), padding_config); ComputeAndCompareR1(&b, std::vector(5, 0.1), {}, DefaultErrorSpec()); } @@ -123,23 +123,24 @@ XLA_TEST_P(PadTestFloat, 
Pad1DS3Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(1); - b.Pad(AddParam(*Literal::CreateR1({1, 2, 3}), &b), - AddParam(*Literal::CreateR0(0.1), &b), padding_config); + Pad(AddParam(*LiteralUtil::CreateR1({1, 2, 3}), &b), + AddParam(*LiteralUtil::CreateR0(0.1), &b), padding_config); std::vector expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3}); ComputeAndCompareR1(&b, expected, {}, DefaultErrorSpec()); } XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) { XlaBuilder b(TestName()); - b.Pad(AddParam(Array4D(2, 0, 3, 2), &b), - AddParam(*Literal::CreateR0(1.5), &b), r4_padding_on_dim0_dim1_); + Pad(AddParam(Array4D(2, 0, 3, 2), &b), + AddParam(*LiteralUtil::CreateR0(1.5), &b), + r4_padding_on_dim0_dim1_); ComputeAndCompareR4(&b, Array4D(5, 2, 3, 2, 1.5f), {}, DefaultErrorSpec()); } TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) { XlaBuilder b(TestName()); - auto input = MakeUnique>(1, 1, 3, 2); + auto input = absl::make_unique>(1, 1, 3, 2); Array2D input_xy({ {1.0f, 2.0f}, // row 0 {3.0f, 4.0f}, // row 1 @@ -147,10 +148,10 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) { }); input->FillWithYX(input_xy); - b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0(1.5), &b), - r4_padding_on_dim0_dim1_); + Pad(AddParam(*input, &b), AddParam(*LiteralUtil::CreateR0(1.5), &b), + r4_padding_on_dim0_dim1_); - auto expected = MakeUnique>(2, 3, 3, 2); + auto expected = absl::make_unique>(2, 3, 3, 2); expected->Fill(1.5); (*expected)(1, 0, 0, 0) = 1.0f; (*expected)(1, 0, 0, 1) = 2.0f; @@ -166,10 +167,11 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) { const float pad_value = 1.5f; Array4D input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6}); - b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0(pad_value), &b), - r4_padding_on_dim0_dim1_); + Pad(AddParam(input, &b), + AddParam(*LiteralUtil::CreateR0(pad_value), &b), + r4_padding_on_dim0_dim1_); - auto expected = MakeUnique>(8, 5, 1, 1); + auto expected = absl::make_unique>(8, 5, 1, 1); 
expected->Fill(pad_value); (*expected)(1, 0, 0, 0) = 1.0f; (*expected)(1, 2, 0, 0) = 2.0f; @@ -205,11 +207,11 @@ TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) { const float pad_value = -5.123f; Array4D input_array(1, 1, 2, 3, {1, 2, 3, 4, 5, 6}); - auto input = Literal::CreateR4FromArray4D(input_array); + auto input = LiteralUtil::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(AddParam(*input, &b), - AddParam(*Literal::CreateR0(pad_value), &b), padding_config); + Pad(AddParam(*input, &b), + AddParam(*LiteralUtil::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 1, 5, 8); expected_array.Fill(pad_value); @@ -251,11 +253,11 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { input_array(0, 0, 0, 0) = 1.0f; input_array(0, 24, 6, 6) = 2.0f; input_array(0, 17, 2, 5) = 3.0f; - auto input = Literal::CreateR4FromArray4D(input_array); + auto input = LiteralUtil::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(AddParam(*input, &b), - AddParam(*Literal::CreateR0(pad_value), &b), padding_config); + Pad(AddParam(*input, &b), + AddParam(*LiteralUtil::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 25, 17, 11); expected_array.Fill(pad_value); @@ -267,7 +269,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { XLA_TEST_F(PadTest, Pad4DU8Array) { XlaBuilder b(TestName()); - auto input = MakeUnique>(1, 1, 3, 2); + auto input = absl::make_unique>(1, 1, 3, 2); Array2D input_xy({ {1, 2}, // row 0 {3, 4}, // row 1 @@ -275,10 +277,10 @@ XLA_TEST_F(PadTest, Pad4DU8Array) { }); input->FillWithYX(input_xy); - b.Pad(AddParam(*input, &b), b.ConstantR0(35), - r4_padding_on_dim0_dim1_); + Pad(AddParam(*input, &b), ConstantR0(&b, 35), + r4_padding_on_dim0_dim1_); - auto expected = MakeUnique>(2, 3, 3, 2); + auto expected = absl::make_unique>(2, 3, 3, 2); expected->Fill(35); (*expected)(1, 0, 0, 0) = 1; (*expected)(1, 0, 0, 1) = 2; @@ 
-294,18 +296,18 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { // Since bool is currently not well supported, use Broadcast operation to // create the operand for Pad. - auto input = b.Broadcast(b.ConstantR0(true), {1, 1, 3, 2}); + auto input = Broadcast(ConstantR0(&b, true), {1, 1, 3, 2}); auto padded = - b.Pad(input, b.ConstantR0(false), r4_padding_on_dim0_dim1_); + Pad(input, ConstantR0(&b, false), r4_padding_on_dim0_dim1_); // For the same reason, use Select to convert boolean values to int32. - auto zeros = MakeUnique>(2, 3, 3, 2); - auto ones = MakeUnique>(2, 3, 3, 2); + auto zeros = absl::make_unique>(2, 3, 3, 2); + auto ones = absl::make_unique>(2, 3, 3, 2); zeros->Fill(0); ones->Fill(1); - b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b)); + Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b)); - auto expected = MakeUnique>(2, 3, 3, 2); + auto expected = absl::make_unique>(2, 3, 3, 2); expected->Fill(0); (*expected)(1, 0, 0, 0) = 1; (*expected)(1, 0, 0, 1) = 1; @@ -319,7 +321,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { XLA_TEST_P(PadTestFloat, Large2DPad) { XlaBuilder b(TestName()); - auto ones = MakeUnique>(4, 4); + auto ones = absl::make_unique>(4, 4); ones->Fill(1.0f); auto input = AddParam(*ones, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); @@ -329,7 +331,7 @@ XLA_TEST_P(PadTestFloat, Large2DPad) { padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 + 100 * dim); } - b.Pad(input, AddParam(*Literal::CreateR0(0.0f), &b), padding_config); + Pad(input, AddParam(*LiteralUtil::CreateR0(0.0f), &b), padding_config); auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f); ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); @@ -340,7 +342,7 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) { constexpr int64 in_rows = 35; constexpr int64 in_cols = 35; - auto operand = MakeUnique>(in_rows, in_cols); + auto operand = absl::make_unique>(in_rows, in_cols); operand->FillUnique(0.0f); auto input = AddParam(*operand, 
&b); @@ -351,7 +353,8 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) { padding_config.mutable_dimensions(1)->set_edge_padding_low(6); padding_config.mutable_dimensions(1)->set_edge_padding_high(4); padding_config.mutable_dimensions(1)->set_interior_padding(2); - b.Pad(input, AddParam(*Literal::CreateR0(3.14f), &b), padding_config); + Pad(input, AddParam(*LiteralUtil::CreateR0(3.14f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f); ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); @@ -365,7 +368,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) { constexpr int64 low_padding = 0; int64 high_padding[2] = {5, 7}; constexpr int64 interior_padding = 0; - auto operand = MakeUnique>(in_rows, in_cols); + auto operand = absl::make_unique>(in_rows, in_cols); operand->FillUnique(1.0f); auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); @@ -376,7 +379,8 @@ XLA_TEST_P(PadTestFloat, High2DPad) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), padding_config); + Pad(input, AddParam(*LiteralUtil::CreateR0(2.718f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); @@ -391,7 +395,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) { int64 low_padding[2] = {-1, -2}; int64 high_padding[2] = {-3, 4}; constexpr int64 interior_padding = 0; - auto operand = MakeUnique>(in_rows, in_cols); + auto operand = absl::make_unique>(in_rows, in_cols); operand->FillUnique(1.0f); auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); @@ -403,7 +407,8 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), padding_config); + Pad(input, AddParam(*LiteralUtil::CreateR0(2.718f), &b), + padding_config); auto 
expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); @@ -418,7 +423,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) { int64 low_padding[2] = {4, -1}; int64 high_padding[2] = {-2, -4}; int64 interior_padding[2] = {1, 2}; - auto operand = MakeUnique>(in_rows, in_cols); + auto operand = absl::make_unique>(in_rows, in_cols); operand->FillUnique(1.0f); auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); @@ -430,7 +435,8 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding[dim]); } - b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), padding_config); + Pad(input, AddParam(*LiteralUtil::CreateR0(2.718f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); @@ -440,18 +446,19 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) { // Regression test for b/31827337. XLA_TEST_P(PadTestFloat, ReducePad) { XlaBuilder b(TestName()); - auto ones = MakeUnique>(2, 2, 2, 2); + auto ones = absl::make_unique>(2, 2, 2, 2); ones->Fill(1.0); auto input = AddParam(*ones, &b); XlaComputation add = CreateScalarAddComputation(FloatType(), &b); auto reduce = - b.Reduce(input, AddParam(*Literal::CreateR0(0.0), &b), add, {0}); + Reduce(input, AddParam(*LiteralUtil::CreateR0(0.0), &b), add, {0}); PaddingConfig padding_config = MakeNoPaddingConfig(3); padding_config.mutable_dimensions(0)->set_edge_padding_low(1); padding_config.mutable_dimensions(0)->set_edge_padding_high(1); - b.Pad(reduce, AddParam(*Literal::CreateR0(0.0f), &b), padding_config); + Pad(reduce, AddParam(*LiteralUtil::CreateR0(0.0f), &b), + padding_config); Array3D expected({{{0.0, 0.0}, {0.0, 0.0}}, {{2.0, 2.0}, {2.0, 2.0}}, diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index 838f1b4e2f0f0e0871ec717bdeefcbbc653397e3..f6c762e7a4bee91a26c4c2e033c3717fef6d91d0 
100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -21,10 +21,10 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -42,11 +42,12 @@ class ParamsTest : public ClientLibraryTestBase {}; XLA_TEST_F(ParamsTest, ConstantR0F32Param) { XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR0(3.14159f); + std::unique_ptr param0_literal = + LiteralUtil::CreateR0(3.14159f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "param0"); ComputeAndCompareR0(&builder, 3.14159f, {param0_data.get()}, ErrorSpec(0.0001f)); @@ -54,11 +55,11 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) { XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR1({}); + std::unique_ptr param0_literal = LiteralUtil::CreateR1({}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {0}), "param0"); 
ComputeAndCompareR1(&builder, {}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -67,11 +68,11 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({3.14f, -100.25f}); + LiteralUtil::CreateR1({3.14f, -100.25f}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "param0"); ComputeAndCompareR1(&builder, {3.14f, -100.25f}, {param0_data.get()}, ErrorSpec(0.01f)); @@ -80,12 +81,13 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { XLA_TEST_F(ParamsTest, ConstantR1U8Param) { XlaBuilder builder(TestName()); string str("hello world"); - std::unique_ptr param0_literal = Literal::CreateR1U8(str); + std::unique_ptr param0_literal = LiteralUtil::CreateR1U8(str); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter( - 0, ShapeUtil::MakeShape(U8, {static_cast(str.size())}), "param0"); + Parameter(&builder, 0, + ShapeUtil::MakeShape(U8, {static_cast(str.size())}), + "param0"); ComputeAndCompareR1U8(&builder, str, {param0_data.get()}); } @@ -93,11 +95,11 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) { XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR2FromArray2D(Array2D(3, 0)); + LiteralUtil::CreateR2FromArray2D(Array2D(3, 0)); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3, 0}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3, 0}), "param0"); ComputeAndCompareR2(&builder, Array2D(3, 0), {param0_data.get()}, ErrorSpec(0.01f)); @@ -105,12 +107,12 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { 
XLA_TEST_F(ParamsTest, ConstantR2F32Param) { XlaBuilder builder(TestName()); - std::unique_ptr param0_literal = Literal::CreateR2( + std::unique_ptr param0_literal = LiteralUtil::CreateR2( {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); - auto p = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3, 2}), "param0"); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {3, 2}), "param0"); Array2D expected_array( {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}}); @@ -121,28 +123,28 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) { XLA_TEST_F(ParamsTest, TwoParameters) { XlaBuilder builder(TestName()); - std::unique_ptr literal0 = Literal::CreateR1({1, 2}); + std::unique_ptr literal0 = LiteralUtil::CreateR1({1, 2}); std::unique_ptr param0_data = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, literal0->shape(), "param0"); + auto param0 = Parameter(&builder, 0, literal0->shape(), "param0"); - std::unique_ptr literal1 = Literal::CreateR1({10, 20}); + std::unique_ptr literal1 = LiteralUtil::CreateR1({10, 20}); std::unique_ptr param1_data = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param1 = builder.Parameter(1, literal1->shape(), "param1"); + auto param1 = Parameter(&builder, 1, literal1->shape(), "param1"); // Use both parameters // // {1, 2} + {10, 20} = {11, 22} - auto sum = builder.Add(param0, param1); - sum = builder.Add(param0, param1); + auto sum = Add(param0, param1); + sum = Add(param0, param1); // Use only the second parameter again, to show that it can be used // twice and to make the computation asymmetric in the two // parameters to test that the parameters are not swapped. 
// // {11, 22} * {10, 20} = {110, 440} - auto prod = builder.Mul(sum, param1); + Mul(sum, param1); ComputeAndCompareR1(&builder, {110, 440}, {param0_data.get(), param1_data.get()}, @@ -152,12 +154,12 @@ XLA_TEST_F(ParamsTest, TwoParameters) { XLA_TEST_F(ParamsTest, MissingParameter) { // Test that an error is returned when a computation with an incomplete set of // parameters (parameter numbers not contiguous from 0) is executed. - std::unique_ptr literal = Literal::CreateR0(3.14159f); + std::unique_ptr literal = LiteralUtil::CreateR0(3.14159f); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); XlaBuilder builder(TestName()); - auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2"); + Parameter(&builder, 2, ShapeUtil::MakeShape(F32, {}), "param2"); auto computation_status = builder.Build(); ASSERT_NE(computation_status.status(), Status::OK()); @@ -166,15 +168,15 @@ XLA_TEST_F(ParamsTest, MissingParameter) { XLA_TEST_F(ParamsTest, UnusedParameter) { XlaBuilder builder(TestName()); - std::unique_ptr literal0 = Literal::CreateR1({1, 2}); + std::unique_ptr literal0 = LiteralUtil::CreateR1({1, 2}); std::unique_ptr param0_data = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, literal0->shape(), "param0"); + Parameter(&builder, 0, literal0->shape(), "param0"); - std::unique_ptr literal1 = Literal::CreateR1({10, 20}); + std::unique_ptr literal1 = LiteralUtil::CreateR1({10, 20}); std::unique_ptr param1_data = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param1 = builder.Parameter(1, literal1->shape(), "param1"); + Parameter(&builder, 1, literal1->shape(), "param1"); ComputeAndCompareR1(&builder, {10, 20}, {param0_data.get(), param1_data.get()}, @@ -186,22 +188,23 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) { // unused expression. 
XlaBuilder builder(TestName()); - std::unique_ptr literal0 = Literal::CreateR1({1, 2}); + std::unique_ptr literal0 = LiteralUtil::CreateR1({1, 2}); std::unique_ptr param0_data = client_->TransferToServer(*literal0).ConsumeValueOrDie(); - std::unique_ptr literal1 = Literal::CreateR1({10, 20, 30}); + std::unique_ptr literal1 = + LiteralUtil::CreateR1({10, 20, 30}); std::unique_ptr param1_data = client_->TransferToServer(*literal1).ConsumeValueOrDie(); - auto param0 = builder.Parameter(0, literal0->shape(), "param0"); - auto param1 = builder.Parameter(1, literal1->shape(), "param1"); - auto param2 = builder.Parameter(2, literal1->shape(), "param2"); + auto param0 = Parameter(&builder, 0, literal0->shape(), "param0"); + auto param1 = Parameter(&builder, 1, literal1->shape(), "param1"); + auto param2 = Parameter(&builder, 2, literal1->shape(), "param2"); // This add is unused. - builder.Add(param1, param2); + Add(param1, param2); - builder.Neg(param0); + Neg(param0); ComputeAndCompareR1( &builder, {-1, -2}, @@ -215,7 +218,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { std::vector init_value = {{0, 1}}; init_value.resize(size); - XlaOp sum_handle = builder.ConstantR1(init_value); + XlaOp sum_handle = ConstantR1(&builder, init_value); std::vector sum = {{0, 1}}; sum.resize(size); @@ -230,11 +233,11 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { std::vector sum_value = {{entry0, entry1}}; sum_value.resize(size); - std::unique_ptr literal = Literal::CreateR1(sum_value); + std::unique_ptr literal = LiteralUtil::CreateR1(sum_value); param_data_owner.push_back( client_->TransferToServer(*literal).ConsumeValueOrDie()); - XlaOp param = builder.Parameter(i, literal->shape(), "param"); - sum_handle = builder.Add(sum_handle, param); + XlaOp param = Parameter(&builder, i, literal->shape(), "param"); + sum_handle = Add(sum_handle, param); } std::vector param_data; @@ -260,16 +263,16 @@ XLA_TEST_F(ParamsTest, XlaBuilder builder(TestName()); std::vector> 
param_data_owner; - XlaOp sum_handle = builder.ConstantR0(0.0f); + XlaOp sum_handle = ConstantR0(&builder, 0.0f); float target = 0.0; constexpr int kParamCount = 3000; for (int i = 0; i < kParamCount; ++i) { target += i; - std::unique_ptr literal = Literal::CreateR0(i); + std::unique_ptr literal = LiteralUtil::CreateR0(i); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - XlaOp param = builder.Parameter(i, literal->shape(), "param"); - sum_handle = builder.Add(sum_handle, param); + XlaOp param = Parameter(&builder, i, literal->shape(), "param"); + sum_handle = Add(sum_handle, param); } std::vector param_data; @@ -291,26 +294,26 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( XlaBuilder builder(TestName()); std::vector> param_data_owner; - XlaOp sum_handle = builder.ConstantR1({0, 0}); + XlaOp sum_handle = ConstantR1(&builder, {0, 0}); int32 target = 0; constexpr int kParamCount = 3000; std::vector params; for (int i = 0; i < kParamCount; ++i) { target += i; - std::unique_ptr literal = Literal::CreateR1({i, i}); + std::unique_ptr literal = LiteralUtil::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - XlaOp param = builder.Parameter(i, literal->shape(), "param"); + XlaOp param = Parameter(&builder, i, literal->shape(), "param"); params.push_back(param); - sum_handle = builder.Add(sum_handle, param); + sum_handle = Add(sum_handle, param); } std::vector outputs; for (int i = 0; i < kParamCount; ++i) { - outputs.push_back(builder.Add(params[i], sum_handle)); + outputs.push_back(Add(params[i], sum_handle)); } - builder.Tuple(outputs); + Tuple(&builder, outputs); std::vector param_data; param_data.reserve(param_data_owner.size()); @@ -321,10 +324,10 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( std::vector> elements; std::vector ptrs; for (int i = 0; i < kParamCount; ++i) { - elements.push_back(Literal::CreateR1({target + i, target + i})); + 
elements.push_back(LiteralUtil::CreateR1({target + i, target + i})); ptrs.push_back(elements.back().get()); } - ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data); + ComputeAndCompareTuple(&builder, *LiteralUtil::MakeTuple(ptrs), param_data); } // Test large number of parameters flowing into a while-loop. @@ -353,25 +356,25 @@ XLA_TEST_F(ParamsTest, std::vector params; std::vector parameter_shapes; for (int i = 0; i < kParamCount; ++i) { - std::unique_ptr literal = Literal::CreateR1({i, i}); + std::unique_ptr literal = LiteralUtil::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - XlaOp param = builder.Parameter(i, literal->shape(), "param"); + XlaOp param = Parameter(&builder, i, literal->shape(), "param"); params.push_back(param); parameter_shapes.push_back(literal->shape()); } // Add bool parameter for the loop condition. Use a parameter HLO instead of a // constant because DCE may eliminate the while-body otherwise. - std::unique_ptr bool_literal = Literal::CreateR0(false); + std::unique_ptr bool_literal = LiteralUtil::CreateR0(false); param_data_owner.push_back( std::move(client_->TransferToServer(*bool_literal)).ValueOrDie()); XlaOp bool_param = - builder.Parameter(kParamCount, bool_literal->shape(), "bool_param"); + Parameter(&builder, kParamCount, bool_literal->shape(), "bool_param"); params.push_back(bool_param); parameter_shapes.push_back(bool_literal->shape()); - auto init = builder.Tuple(params); + auto init = Tuple(&builder, params); // Create a computation for the condition: while(bool_param). 
Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes); @@ -379,8 +382,8 @@ XLA_TEST_F(ParamsTest, { XlaBuilder builder("condition"); auto condition_parameter = - builder.Parameter(0, while_shape, "condition_parameter"); - builder.GetTupleElement(condition_parameter, kParamCount); + Parameter(&builder, 0, while_shape, "condition_parameter"); + GetTupleElement(condition_parameter, kParamCount); condition = builder.Build().ConsumeValueOrDie(); } @@ -389,27 +392,27 @@ XLA_TEST_F(ParamsTest, XlaComputation body; { XlaBuilder builder("body"); - auto body_parameter = builder.Parameter(0, while_shape, "body_parameter"); + auto body_parameter = Parameter(&builder, 0, while_shape, "body_parameter"); std::vector updates; for (int i = 0; i < kParamCount; ++i) { - auto add = builder.Add(builder.GetTupleElement(body_parameter, i), - builder.ConstantR1({1, 1})); + auto add = Add(GetTupleElement(body_parameter, i), + ConstantR1(&builder, {1, 1})); updates.push_back(add); } // Add bool parameter. 
- updates.push_back(builder.GetTupleElement(body_parameter, kParamCount)); + updates.push_back(GetTupleElement(body_parameter, kParamCount)); - builder.Tuple(updates); + Tuple(&builder, updates); body = builder.Build().ConsumeValueOrDie(); } - auto loop = builder.While(condition, body, init); + auto loop = While(condition, body, init); std::vector outputs; for (int i = 0; i < kParamCount; ++i) { - outputs.push_back(builder.GetTupleElement(loop, i)); + outputs.push_back(GetTupleElement(loop, i)); } - builder.Tuple(outputs); + Tuple(&builder, outputs); std::vector param_data; param_data.reserve(param_data_owner.size()); @@ -420,10 +423,10 @@ XLA_TEST_F(ParamsTest, std::vector> elements; std::vector ptrs; for (int i = 0; i < kParamCount; ++i) { - elements.push_back(Literal::CreateR1({i, i})); + elements.push_back(LiteralUtil::CreateR1({i, i})); ptrs.push_back(elements.back().get()); } - ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data); + ComputeAndCompareTuple(&builder, *LiteralUtil::MakeTuple(ptrs), param_data); } #endif @@ -433,16 +436,16 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3}); Shape tuple_shape = ShapeUtil::MakeTupleShape({r1f32_3, r1f32_3}); - auto input = builder.Parameter(0, tuple_shape, "input"); - auto lhs = builder.GetTupleElement(input, 0); - auto rhs = builder.GetTupleElement(input, 1); - builder.Add(lhs, rhs); + auto input = Parameter(&builder, 0, tuple_shape, "input"); + auto lhs = GetTupleElement(input, 0); + auto rhs = GetTupleElement(input, 1); + Add(lhs, rhs); std::unique_ptr data = client_ - ->TransferToServer(*Literal::MakeTuple({ - Literal::CreateR1({1, 2, 3}).get(), - Literal::CreateR1({4, 5, 6}).get(), + ->TransferToServer(*LiteralUtil::MakeTuple({ + LiteralUtil::CreateR1({1, 2, 3}).get(), + LiteralUtil::CreateR1({4, 5, 6}).get(), })) .ConsumeValueOrDie(); @@ -454,10 +457,10 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { // Verifies that 
passing a 2x2 with {0, 1} layout returns the same value back // when (transferred to the server and) passed through a parameter. XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { - std::unique_ptr literal = Literal::CreateR2WithLayout( + std::unique_ptr literal = LiteralUtil::CreateR2WithLayout( {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1})); XlaBuilder builder(TestName()); - builder.Parameter(0, literal->shape(), "input"); + Parameter(&builder, 0, literal->shape(), "input"); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -466,10 +469,10 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { // As above, but for {1, 0} layout. XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) { - std::unique_ptr literal = Literal::CreateR2WithLayout( + std::unique_ptr literal = LiteralUtil::CreateR2WithLayout( {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0})); XlaBuilder builder(TestName()); - builder.Parameter(0, literal->shape(), "input"); + Parameter(&builder, 0, literal->shape(), "input"); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -477,8 +480,9 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) { } XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { - std::unique_ptr literal = Literal::CreateR2({ - {1, 3}, {2, 4}, + std::unique_ptr literal = LiteralUtil::CreateR2({ + {1, 3}, + {2, 4}, }); const Shape original = literal->shape(); { @@ -494,9 +498,9 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { } // Use the original shape in building the computation. XlaBuilder builder(TestName()); - auto input = builder.Parameter(0, original, "input"); + auto input = Parameter(&builder, 0, original, "input"); // Use the slice operator to get an off-diagonal element. 
- builder.Slice(input, {0, 1}, {1, 2}, {1, 1}); + Slice(input, {0, 1}, {1, 2}, {1, 1}); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc index 77159efb26f3b7dd4918f24305f7269a2d6ff647..58539e6b061b0cec1cc660b52e78894e5deeea56 100644 --- a/tensorflow/compiler/xla/tests/pred_test.cc +++ b/tensorflow/compiler/xla/tests/pred_test.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -29,64 +29,63 @@ namespace { class PredTest : public ClientLibraryTestBase { protected: - void TestCompare( - bool lhs, bool rhs, bool expected, - XlaOp (XlaBuilder::*op)(const xla::XlaOp&, const xla::XlaOp&, - tensorflow::gtl::ArraySlice)) { + void TestCompare(bool lhs, bool rhs, bool expected, + std::function)> + op) { XlaBuilder builder(TestName()); - XlaOp lhs_op = builder.ConstantR0(lhs); - XlaOp rhs_op = builder.ConstantR0(rhs); - XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); + XlaOp lhs_op = ConstantR0(&builder, lhs); + XlaOp rhs_op = ConstantR0(&builder, rhs); + op(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } }; TEST_F(PredTest, ConstantR0PredTrue) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR0(true); + ConstantR0(&builder, true); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, ConstantR0PredFalse) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR0(false); + ConstantR0(&builder, false); ComputeAndCompareR0(&builder, false, {}); } 
TEST_F(PredTest, ConstantR0PredCompareEq) { - TestCompare(true, false, false, &XlaBuilder::Eq); + TestCompare(true, false, false, &Eq); } TEST_F(PredTest, ConstantR0PredCompareNe) { - TestCompare(true, false, true, &XlaBuilder::Ne); + TestCompare(true, false, true, &Ne); } TEST_F(PredTest, ConstantR0PredCompareLe) { - TestCompare(true, false, false, &XlaBuilder::Le); + TestCompare(true, false, false, &Le); } TEST_F(PredTest, ConstantR0PredCompareLt) { - TestCompare(true, false, false, &XlaBuilder::Lt); + TestCompare(true, false, false, &Lt); } TEST_F(PredTest, ConstantR0PredCompareGe) { - TestCompare(true, false, true, &XlaBuilder::Ge); + TestCompare(true, false, true, &Ge); } TEST_F(PredTest, ConstantR0PredCompareGt) { - TestCompare(true, false, true, &XlaBuilder::Gt); + TestCompare(true, false, true, &Gt); } TEST_F(PredTest, ConstantR1Pred) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({true, false, false, true}); + ConstantR1(&builder, {true, false, false, true}); ComputeAndCompareR1(&builder, {true, false, false, true}, {}); } TEST_F(PredTest, ConstantR2Pred) { XlaBuilder builder(TestName()); - auto a = - builder.ConstantR2({{false, true, true}, {true, false, false}}); + ConstantR2(&builder, {{false, true, true}, {true, false, false}}); const string expected = R"(pred[2,3] { { 011 }, { 100 } @@ -96,44 +95,44 @@ TEST_F(PredTest, ConstantR2Pred) { TEST_F(PredTest, AnyR1True) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({true, false}); - TF_ASSERT_OK(Any(a, &builder).status()); + auto a = ConstantR1(&builder, {true, false}); + Any(a); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, AnyR1False) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR1({false, false}); - TF_ASSERT_OK(Any(a, &builder).status()); + auto a = ConstantR1(&builder, {false, false}); + Any(a); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR1VacuouslyFalse) { XlaBuilder builder(TestName()); - auto a = 
builder.ConstantR1({}); - TF_ASSERT_OK(Any(a, &builder).status()); + auto a = ConstantR1(&builder, {}); + Any(a); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR2True) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({ - {false, false, false}, - {false, false, false}, - {false, false, true}, - }); - TF_ASSERT_OK(Any(a, &builder).status()); + auto a = ConstantR2(&builder, { + {false, false, false}, + {false, false, false}, + {false, false, true}, + }); + Any(a); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, AnyR2False) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({ - {false, false, false}, - {false, false, false}, - {false, false, false}, - }); - TF_ASSERT_OK(Any(a, &builder).status()); + auto a = ConstantR2(&builder, { + {false, false, false}, + {false, false, false}, + {false, false, false}, + }); + Any(a); ComputeAndCompareR0(&builder, false, {}); } diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 1a2de6937c3e134852a730f62f7b56417cf49b28..5f322b768d8620cb64a79bb8fca5fecf282f28f5 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -16,9 +16,10 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" @@ -26,7 +27,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -37,8 +37,7 @@ namespace { class PrngTest : public ClientLibraryTestBase { protected: template - std::unique_ptr UniformTest(T a, T b, - tensorflow::gtl::ArraySlice dims, + std::unique_ptr UniformTest(T a, T b, absl::Span dims, int64 seed = 42); // Computes the χ² statistic of a sample of the discrete uniform distribution @@ -50,18 +49,19 @@ class PrngTest : public ClientLibraryTestBase { }; template -std::unique_ptr PrngTest::UniformTest( - T a, T b, tensorflow::gtl::ArraySlice dims, int64 seed) { +std::unique_ptr PrngTest::UniformTest(T a, T b, + absl::Span dims, + int64 seed) { XlaBuilder builder(TestName()); - builder.RngUniform( - builder.ConstantR0(a), builder.ConstantR0(b), + RngUniform( + ConstantR0(&builder, a), ConstantR0(&builder, b), ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), dims)); SetSeed(seed); auto actual = ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions())); - actual->EachCell([=](tensorflow::gtl::ArraySlice, T value) { + actual->EachCell([=](absl::Span, T value) { EXPECT_LE(a, value); EXPECT_LT(value, b); }); @@ -117,7 +117,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) { for (int64 seed = 0; seed < count; ++seed) { auto result = UniformTest(low, high, {}, /*seed=*/seed); result->Literal::EachCell( - [&](tensorflow::gtl::ArraySlice, bfloat16 value) { + [&](absl::Span, bfloat16 value) { int64 index = static_cast((value - low) / interval); counts[index]++; }); @@ -141,16 +141,16 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count, int32 sample_size = 
range_size * expected_count; XlaBuilder builder(TestName()); - builder.RngUniform(builder.ConstantR0(0), - builder.ConstantR0(range_size), - ShapeUtil::MakeShape(S32, {sample_size})); + RngUniform(ConstantR0(&builder, 0), + ConstantR0(&builder, range_size), + ShapeUtil::MakeShape(S32, {sample_size})); SetSeed(seed); auto actual = ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); std::vector counts(range_size, 0); - actual->EachCell([&counts](tensorflow::gtl::ArraySlice, - int32 value) { ++counts[value]; }); + actual->EachCell( + [&counts](absl::Span, int32 value) { ++counts[value]; }); int64 sum = 0; for (int32 i = 0; i < range_size; ++i) { sum += Square(static_cast(counts[i] - expected_count)); @@ -177,28 +177,29 @@ XLA_TEST_F(PrngTest, Uniformity108) { EXPECT_LT(UniformChiSquared(108, 256), 132.144); } XLA_TEST_F(PrngTest, Uniformity256) { - EXPECT_LT(UniformChiSquared(256, 256), 293.248); + EXPECT_LT(UniformChiSquared(256, 512), 293.248); } XLA_TEST_F(PrngTest, MapUsingRng) { // Build a x -> (x + U[0,1)) computation. 
- auto build_sum_rng = [this](XlaBuilder& builder) { + auto build_sum_rng = [](XlaBuilder& builder) { auto b = builder.CreateSubBuilder("sum_with_rng"); - auto x = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "input"); - b->Add(x, b->RngUniform(b->ConstantR0(0), b->ConstantR0(1), - ShapeUtil::MakeShape(F32, {}))); + auto x = Parameter(b.get(), 0, ShapeUtil::MakeShape(F32, {}), "input"); + Add(x, + RngUniform(ConstantR0(b.get(), 0), ConstantR0(b.get(), 1), + ShapeUtil::MakeShape(F32, {}))); return b->BuildAndNoteError(); }; XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR1({2.2f, 5.3f, 4.4f, 5.5f}); + LiteralUtil::CreateR1({2.2f, 5.3f, 4.4f, 5.5f}); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr param0_data, client_->TransferToServer(*param0_literal)); - auto param0 = builder.Parameter(0, param0_literal->shape(), "param0"); + auto param0 = Parameter(&builder, 0, param0_literal->shape(), "param0"); auto fn = build_sum_rng(builder); - builder.Map({param0}, fn, {0}); + Map(&builder, {param0}, fn, {0}); TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); @@ -226,9 +227,8 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { // Build a U[0,1) computation. 
auto build_computation = [this]() { XlaBuilder builder(TestName()); - builder.RngUniform(builder.ConstantR0(0), - builder.ConstantR0(1), - ShapeUtil::MakeShape(F32, {10})); + RngUniform(ConstantR0(&builder, 0), ConstantR0(&builder, 1), + ShapeUtil::MakeShape(F32, {10})); return builder.Build(); }; @@ -282,8 +282,8 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { XLA_TEST_F(PrngTest, TenValuesN01) { XlaBuilder builder(TestName()); - builder.RngNormal(builder.ConstantR0(0), builder.ConstantR0(1), - ShapeUtil::MakeShape(F32, {10})); + RngNormal(ConstantR0(&builder, 0), ConstantR0(&builder, 1), + ShapeUtil::MakeShape(F32, {10})); SetSeed(42); ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); @@ -294,9 +294,9 @@ XLA_TEST_F(PrngTest, RngUniformCrash) { XlaBuilder builder(TestName()); // This used to crash XLA during LLVM IR generation for CPUs. - auto rng_uniform = builder.RngUniform(builder.ConstantR0(0), - builder.ConstantR0(1000 * 1000), - ShapeUtil::MakeShape(S32, {})); + RngUniform(ConstantR0(&builder, 0), + ConstantR0(&builder, 1000 * 1000), + ShapeUtil::MakeShape(S32, {})); SetSeed(0); ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); } diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc index f95e75648343aa88bd7c39de4ee9f387f2b60506..fab2a65de109c670a6854c0fc1118162acf3d312 100644 --- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc +++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -31,8 +31,8 @@ class QueryInferredShapeTest : public ClientLibraryTestBase {}; TEST_F(QueryInferredShapeTest, OnePlusOneShape) { XlaBuilder builder("one_plus_one"); - auto one = builder.ConstantR0(1.0); - auto result = builder.Add(one, one); + auto one = ConstantR0(&builder, 1.0); + auto result = Add(one, one); StatusOr shape_status = builder.GetShape(result); ASSERT_IS_OK(shape_status.status()); auto shape = shape_status.ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc index c0a2c0ca4cb8414e0771a541b9f963f9aedc8376..9af9ea4a2229bb6ca7c3561350f11837f5072a2c 100644 --- a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc @@ -15,11 +15,11 @@ limitations under the License. #include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -29,16 +29,13 @@ limitations under the License. 
namespace xla { namespace { -namespace str_util = tensorflow::str_util; -namespace strings = tensorflow::strings; - struct ReduceLayout { std::array input_minor_to_major; std::array output_minor_to_major; string ToString() const { - return strings::StrCat(str_util::Join(input_minor_to_major, "x"), "_", - str_util::Join(output_minor_to_major, "x")); + return absl::StrCat(absl::StrJoin(input_minor_to_major, "x"), "_", + absl::StrJoin(output_minor_to_major, "x")); } }; @@ -73,7 +70,7 @@ ENTRY reduce.1 { } )"; - return tools::Parse(hlo_string); + return ParseHloString(hlo_string); } // TODO(b/72454718): XLA:GPU does not support executing code compiled without @@ -95,21 +92,21 @@ XLA_TEST_P(ReduceWithLayoutTest, DISABLED_ON_GPU(Reduce)) { *reduce_input_shape->mutable_layout() = LayoutUtil::MakeLayout(reduce_layout.input_minor_to_major); - std::unique_ptr reduce_input = - Literal::CreateR4({{ /*i0=0*/ - {/*i1=0*/ - {-0.246092796, -0.179497838, -0.161181688}, - {-0.151643038, -0.240213156, -0.198156}}, - {/*i1=1*/ - {-0.14222312, -0.162200093, -0.193907976}, - {-0.239411, -0.198166847, -0.172471642}}}, - { /*i0=1*/ - {/*i1=0*/ - {-0.22965157, -0.218723893, -0.129257083}, - {-0.188762426, -0.16123569, -0.181166649}}, - {/*i1=1*/ - {-0.241772294, -0.245131493, -0.160247207}, - {-0.179881215, -0.23383224, -0.121976733}}}}); + std::unique_ptr reduce_input = LiteralUtil::CreateR4( + {{ /*i0=0*/ + {/*i1=0*/ + {-0.246092796, -0.179497838, -0.161181688}, + {-0.151643038, -0.240213156, -0.198156}}, + {/*i1=1*/ + {-0.14222312, -0.162200093, -0.193907976}, + {-0.239411, -0.198166847, -0.172471642}}}, + { /*i0=1*/ + {/*i1=0*/ + {-0.22965157, -0.218723893, -0.129257083}, + {-0.188762426, -0.16123569, -0.181166649}}, + {/*i1=1*/ + {-0.241772294, -0.245131493, -0.160247207}, + {-0.179881215, -0.23383224, -0.121976733}}}}); EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec(1e-5))); } diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc 
b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index b311785449f1774c3bc1e4d7ad35c2866e3b4061..0916a07f4fa99af6cf25441fa8558a558bfa032f 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -19,12 +19,13 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -57,8 +58,8 @@ static const int mantissa_sizes[] = {23, 10, 23, 10}; string TestDataToString(const ::testing::TestParamInfo data) { int i = data.param; - return tensorflow::strings::StrCat(exponent_sizes[i], "_exponent_bits_", - mantissa_sizes[i], "_mantissa_bits"); + return absl::StrCat(exponent_sizes[i], "_exponent_bits_", mantissa_sizes[i], + "_mantissa_bits"); } // The FPVAL macro allows us to write out the binary representation of the @@ -230,12 +231,13 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({input_values}); + std::unique_ptr a_literal = + LiteralUtil::CreateR1({input_values}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); - builder.ReducePrecision(a, exponent_bits, mantissa_bits); + ReducePrecision(a, exponent_bits, mantissa_bits); 
ComputeAndCompareR1(&builder, expected_values, {a_data.get()}); } @@ -253,18 +255,18 @@ XLA_TEST_F(ReducePrecisionInsertionTest, DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({1.00001}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); // Abs doesn't affect resolution. - auto abs = builder.Abs(a); + auto abs = Abs(a); // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the // reduce-precision operation showed up in the correct place in the // graph. - builder.Log(abs); + Log(abs); // Insert precision-reduction after the Abs(x) operation, rounding that // result to exactly 1.0f. @@ -282,14 +284,14 @@ XLA_TEST_F(ReducePrecisionInsertionTest, DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({1.00001}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); // These two operations should be fused by any reasonable backend. - auto abs = builder.Abs(a); - builder.Neg(abs); + auto abs = Abs(a); + Neg(abs); // Add a pass after operation fusion, suffixing kAbs operations. 
This // should not see into the fusion nodes and thus should not affect the @@ -308,14 +310,14 @@ XLA_TEST_F(ReducePrecisionInsertionTest, DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({1.00001}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); // These two operations should be fused by any reasonable backend. - auto abs = builder.Abs(a); - builder.Neg(abs); + auto abs = Abs(a); + Neg(abs); // Add a pass after operation fusion, suffixing kFusion operations. auto reduce_precision_pass = execution_options_.mutable_debug_options() @@ -332,14 +334,14 @@ XLA_TEST_F(ReducePrecisionInsertionTest, DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({1.00001}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); // These two operations should be fused by any reasonable backend. - auto abs = builder.Abs(a); - builder.Neg(abs); + auto abs = Abs(a); + Neg(abs); // Add a pass suffixing fusion nodes containing kCos operations. This // should have no effect. 
@@ -357,14 +359,14 @@ XLA_TEST_F(ReducePrecisionInsertionTest, DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_literal = LiteralUtil::CreateR1({1.00001}); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); - auto a = builder.Parameter(0, a_literal->shape(), "a"); + auto a = Parameter(&builder, 0, a_literal->shape(), "a"); // These two operations should be fused by any reasonable backend. - auto abs = builder.Abs(a); - builder.Neg(abs); + auto abs = Abs(a); + Neg(abs); // Add a pass suffixing fusion nodes containing kAbs operations. This // should see the kAbs operation within the above fusion node. diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index d671d40456a276a44b462f390c95aa4af301263a..8c62adea231d1d3197c6e483d58008b1577b156d 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -32,13 +32,16 @@ limitations under the License. #include #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -51,7 +54,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -67,12 +69,12 @@ class ReduceTest : public ClientLibraryTestBase { ReduceTest() { // Implementation note: laid out z >> y >> x by default. // clang-format off - literal_2d_ = Literal::CreateR2({ + literal_2d_ = LiteralUtil::CreateR2({ // x0 x1 x2 { 1.f, 2.f, 3.f}, // y0 { 4.f, 5.f, 6.f}, // y1 }); - literal_3d_ = Literal::CreateR3Projected({ + literal_3d_ = LiteralUtil::CreateR3Projected({ // x0 x1 x2 { 1.f, 2.f, 3.f}, // y0 { 4.f, 5.f, 6.f}, // y1 @@ -89,9 +91,9 @@ class ReduceTest : public ClientLibraryTestBase { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - builder.Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0}); std::vector input_data(element_count); for (int64 i = 0; i < element_count; ++i) { @@ -101,7 +103,7 @@ class ReduceTest : public ClientLibraryTestBase { } } std::unique_ptr input_literal = - Literal::CreateR1(AsSlice(input_data)); + LiteralUtil::CreateR1(AsSlice(input_data)); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -113,27 +115,26 @@ class ReduceTest : public ClientLibraryTestBase { ErrorSpec(0.001)); } - void RunR1ToR0PredTest(bool and_reduce, - tensorflow::gtl::ArraySlice input_data) { + void RunR1ToR0PredTest(bool and_reduce, absl::Span input_data) { const int element_count = input_data.size(); XlaBuilder 
builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count}); - auto input_par = builder.Parameter(0, input_shape, "input"); + auto input_par = Parameter(&builder, 0, input_shape, "input"); auto pred_values = - builder.Eq(input_par, builder.ConstantR1(element_count, 1)); + Eq(input_par, ConstantR1(&builder, element_count, 1)); XlaOp init_value; XlaComputation reduce; if (and_reduce) { - init_value = builder.ConstantR0(true); - reduce = CreateScalarAndComputation(&builder); + init_value = ConstantR0(&builder, true); + reduce = CreateScalarAndComputation(PRED, &builder); } else { - init_value = builder.ConstantR0(false); - reduce = CreateScalarOrComputation(&builder); + init_value = ConstantR0(&builder, false); + reduce = CreateScalarOrComputation(PRED, &builder); } - builder.Reduce(pred_values, init_value, reduce, - /*dimensions_to_reduce=*/{0}); + Reduce(pred_values, init_value, reduce, + /*dimensions_to_reduce=*/{0}); - std::unique_ptr input_literal = Literal::CreateR1(input_data); + std::unique_ptr input_literal = LiteralUtil::CreateR1(input_data); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -156,26 +157,26 @@ class ReduceTest : public ClientLibraryTestBase { int64 major = 0) { XlaBuilder builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto input_pred = builder.Eq(input, builder.ConstantR0(1)); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto input_pred = Eq(input, ConstantR0(&builder, 1)); XlaOp init_value; XlaComputation reduce_op; if (and_reduce) { - init_value = builder.ConstantR0(true); - reduce_op = CreateScalarAndComputation(&builder); + init_value = ConstantR0(&builder, true); + reduce_op = CreateScalarAndComputation(PRED, &builder); } else { - init_value = builder.ConstantR0(false); - reduce_op = CreateScalarOrComputation(&builder); + init_value = 
ConstantR0(&builder, false); + reduce_op = CreateScalarOrComputation(PRED, &builder); } - builder.Reduce(input_pred, init_value, reduce_op, - /*dimensions_to_reduce=*/{0}); + Reduce(input_pred, init_value, reduce_op, + /*dimensions_to_reduce=*/{0}); Array2D input_data(rows, cols); input_data.FillRandom(0, 1); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({minor, major})); std::unique_ptr input_global_data = @@ -202,14 +203,14 @@ class ReduceTest : public ClientLibraryTestBase { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - builder.Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0, 1}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0, 1}); Array2D input_data(rows, cols); input_data.FillRandom(3.14f, 0.04); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({minor, major})); std::unique_ptr input_global_data = @@ -230,14 +231,14 @@ class ReduceTest : public ClientLibraryTestBase { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - builder.Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + Reduce(input, zero, add_f32, 
/*dimensions_to_reduce=*/{0}); Array2D input_data(rows, cols); input_data.FillRandom(3.14f, 0.04); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({minor, major})); std::unique_ptr input_global_data = @@ -259,8 +260,8 @@ class ReduceTest : public ClientLibraryTestBase { void ComputeAndCompareGeneric( typename std::enable_if::value, XlaBuilder>::type* builder, - tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span expected, + absl::Span arguments) { ComputeAndCompareR1(builder, expected, arguments, ErrorSpec(0.01, 1e-4)); } @@ -269,8 +270,8 @@ class ReduceTest : public ClientLibraryTestBase { void ComputeAndCompareGeneric( typename std::enable_if::value, XlaBuilder>::type* builder, - tensorflow::gtl::ArraySlice expected, - tensorflow::gtl::ArraySlice arguments) { + absl::Span expected, + absl::Span arguments) { ComputeAndCompareR1(builder, expected, arguments); } @@ -287,22 +288,22 @@ class ReduceTest : public ClientLibraryTestBase { XlaComputation reduction_function = reduction_function_generator(&builder); const Shape input_shape = ShapeUtil::MakeShape( xla::primitive_util::NativeToPrimitiveType(), {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(initial_value); - builder.Reduce(input, zero, reduction_function, - /*dimensions_to_reduce=*/{0}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, initial_value); + Reduce(input, zero, reduction_function, + /*dimensions_to_reduce=*/{0}); Array2D input_data(rows, cols); input_data.FillUnique(initial_value); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({minor, major})); std::unique_ptr input_global_data 
= client_->TransferToServer(*input_literal).ConsumeValueOrDie(); // NativeT can be bool, and std::vector does not convert to - // ArraySlice. + // Span. std::unique_ptr expected(new NativeT[cols]); for (int64 colno = 0; colno < cols; ++colno) { NativeT column_result = initial_value; @@ -314,7 +315,7 @@ class ReduceTest : public ClientLibraryTestBase { } ComputeAndCompareGeneric( - &builder, tensorflow::gtl::ArraySlice(expected.get(), cols), + &builder, absl::Span(expected.get(), cols), {input_global_data.get()}); } @@ -442,15 +443,15 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - auto log_ = builder.Log(input); - builder.Reduce(log_, zero, add_f32, /*dimensions_to_reduce=*/{0}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + auto log_ = Log(input); + Reduce(log_, zero, add_f32, /*dimensions_to_reduce=*/{0}); Array2D input_data(rows, cols); input_data.FillRandom(3.14f, 0.04); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({0, 1})); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -473,16 +474,16 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - auto log_ = builder.Log(input); - auto transpose = builder.Transpose(log_, {1, 0}); - builder.Reduce(transpose, 
zero, add_f32, /*dimensions_to_reduce=*/{1}); + auto input = Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + auto log_ = Log(input); + auto transpose = Transpose(log_, {1, 0}); + Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{1}); Array2D input_data(rows, cols); input_data.FillRandom(3.14f, 0.04); std::unique_ptr input_literal = - Literal::CreateR2FromArray2D(input_data); + LiteralUtil::CreateR2FromArray2D(input_data); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout({0, 1})); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -505,10 +506,10 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50}); - XlaOp input = builder.Parameter(0, input_shape, "input"); - XlaOp zero = builder.ConstantR0(0.0); - XlaOp transpose = builder.Transpose(input, /*permutation=*/{1, 0, 2}); - builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0}); + XlaOp input = Parameter(&builder, 0, input_shape, "input"); + XlaOp zero = ConstantR0(&builder, 0.0); + XlaOp transpose = Transpose(input, /*permutation=*/{1, 0, 2}); + Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0}); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, MakeFakeLiteral(input_shape)); @@ -522,16 +523,16 @@ XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) { XlaBuilder builder(TestName()); XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2}); - auto input = builder.Parameter(0, input_shape, "input"); - auto zero = builder.ConstantR0(0.0); - auto log_ = builder.Tanh(input); - auto reshape = builder.Reshape(log_, {rows, cols}); - builder.Reduce(reshape, zero, add_f32, /*dimensions_to_reduce=*/{0}); + auto input = 
Parameter(&builder, 0, input_shape, "input"); + auto zero = ConstantR0(&builder, 0.0); + auto log_ = Tanh(input); + auto reshape = Reshape(log_, {rows, cols}); + Reduce(reshape, zero, add_f32, /*dimensions_to_reduce=*/{0}); Array3D input_data(rows, 2, cols / 2); input_data.FillRandom(3.14f, 0.04); std::unique_ptr input_literal = - Literal::CreateR3FromArray3D(input_data); + LiteralUtil::CreateR3FromArray3D(input_data); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); @@ -556,21 +557,20 @@ struct BoundsLayout { }; void PrintTo(const BoundsLayout& spec, std::ostream* os) { - *os << tensorflow::strings::Printf( - "R%luToR%lu%s_%s_Reduce%s", spec.bounds.size(), - spec.bounds.size() - spec.reduce_dims.size(), - tensorflow::str_util::Join(spec.bounds, "x").c_str(), - tensorflow::str_util::Join(spec.layout, "").c_str(), - tensorflow::str_util::Join(spec.reduce_dims, "").c_str()); + *os << absl::StrFormat("R%uToR%u%s_%s_Reduce%s", spec.bounds.size(), + spec.bounds.size() - spec.reduce_dims.size(), + absl::StrJoin(spec.bounds, "x"), + absl::StrJoin(spec.layout, ""), + absl::StrJoin(spec.reduce_dims, "")); } // Add-reduces a broadcasted scalar matrix among dimension 1 and 0. 
XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); - auto scalar = builder.ConstantR0(42.0); - auto broadcasted = builder.Broadcast(scalar, {500, 500}); - builder.Reduce(broadcasted, builder.ConstantR0(0.0f), add, {0, 1}); + auto scalar = ConstantR0(&builder, 42.0); + auto broadcasted = Broadcast(scalar, {500, 500}); + Reduce(broadcasted, ConstantR0(&builder, 0.0f), add, {0, 1}); float expected = 42.0f * static_cast(500 * 500); ComputeAndCompareR0(&builder, expected, {}, ErrorSpec(0.0001)); @@ -580,9 +580,9 @@ XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) { XlaBuilder builder(TestName()); auto max = CreateScalarMaxComputation(F32, &builder); - auto scalar = builder.ConstantR0(42.0); - auto broadcasted = builder.Broadcast(scalar, {500, 500}); - builder.Reduce(broadcasted, builder.ConstantR0(0.0f), max, {0, 1}); + auto scalar = ConstantR0(&builder, 42.0); + auto broadcasted = Broadcast(scalar, {500, 500}); + Reduce(broadcasted, ConstantR0(&builder, 0.0f), max, {0, 1}); float expected = 42.0f; ComputeAndCompareR0(&builder, expected, {}, ErrorSpec(0.0001)); @@ -594,9 +594,9 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) { auto max = CreateScalarMaxComputation(F32, &builder); Array2D input(300, 250); input.FillRandom(214.0f); - auto input_literal = Literal::CreateR2FromArray2D(input); - builder.Reduce(builder.ConstantLiteral(*input_literal), - builder.ConstantR0(FLT_MIN), max, {0, 1}); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input); + Reduce(ConstantLiteral(&builder, *input_literal), + ConstantR0(&builder, FLT_MIN), max, {0, 1}); auto input_max = FLT_MIN; input.Each( [&](int64, int64, float* v) { input_max = std::max(input_max, *v); }); @@ -609,9 +609,9 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) { auto min = CreateScalarMinComputation(F32, &builder); Array2D input(150, 130); input.FillRandom(214.0f); - auto input_literal = 
Literal::CreateR2FromArray2D(input); - builder.Reduce(builder.ConstantLiteral(*input_literal), - builder.ConstantR0(FLT_MAX), min, {0, 1}); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input); + Reduce(ConstantLiteral(&builder, *input_literal), + ConstantR0(&builder, FLT_MAX), min, {0, 1}); auto input_min = FLT_MAX; input.Each( @@ -623,12 +623,11 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) { XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto min = CreateScalarMinComputation(U32, &builder); - auto input_literal = Literal::CreateR2FromArray2D(input); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input); auto initial_value = - builder.ConstantR0(std::numeric_limits::max()); + ConstantR0(&builder, std::numeric_limits::max()); - builder.Reduce(builder.ConstantLiteral(*input_literal), initial_value, min, - {0, 1}); + Reduce(ConstantLiteral(&builder, *input_literal), initial_value, min, {0, 1}); ComputeAndCompareR0(&builder, 1, {}); } @@ -636,21 +635,20 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) { XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto max = CreateScalarMaxComputation(U32, &builder); - auto input_literal = Literal::CreateR2FromArray2D(input); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input); auto initial_value = - builder.ConstantR0(std::numeric_limits::min()); + ConstantR0(&builder, std::numeric_limits::min()); - builder.Reduce(builder.ConstantLiteral(*input_literal), initial_value, max, - {0, 1}); + Reduce(ConstantLiteral(&builder, *input_literal), initial_value, max, {0, 1}); ComputeAndCompareR0(&builder, 2, {}); } // Reduces a matrix among dimension 1. 
XLA_TEST_F(ReduceTest, Reduce2DAmong1) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_2d_); + auto m = ConstantLiteral(&builder, *literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {1}); std::vector expected = {6.f, 15.f}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -659,9 +657,9 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong1) { XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Reduce a matrix among dimensions 0 and 1 (sum it up to a scalar). XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_2d_); + auto m = ConstantLiteral(&builder, *literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {0, 1}); ComputeAndCompareR0(&builder, 21.0f, {}, ErrorSpec(0.0001, 1e-4)); } @@ -669,9 +667,9 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Tests 2D matrix ReduceToRow operation. 
XLA_TEST_F(ReduceTest, Reduce2DAmongY) { XlaBuilder builder("reduce_among_y"); - auto m = builder.ConstantLiteral(*literal_2d_); + auto m = ConstantLiteral(&builder, *literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {0}); std::vector expected = {5.f, 7.f, 9.f}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -679,9 +677,9 @@ XLA_TEST_F(ReduceTest, Reduce2DAmongY) { XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {1, 2}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {1, 2}); std::vector expected = {21.f, 21.f, 21.f, 21.f}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -689,9 +687,9 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {0, 1}); std::vector expected = {20.f, 28.f, 36.f}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); @@ -699,9 +697,9 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { XLA_TEST_F(ReduceTest, ReduceR3ToR0) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1, 2}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {0, 1, 2}); float expected = 21.0f * 4.0; ComputeAndCompareR0(&builder, expected, {}, 
ErrorSpec(0.0001)); @@ -709,9 +707,9 @@ XLA_TEST_F(ReduceTest, ReduceR3ToR0) { XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {0}); // clang-format off Array2D expected({ @@ -724,9 +722,9 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {1}); // clang-format off Array2D expected({ @@ -741,9 +739,9 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) { XlaBuilder builder(TestName()); - auto m = builder.ConstantLiteral(*literal_3d_); + auto m = ConstantLiteral(&builder, *literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); - builder.Reduce(m, builder.ConstantR0(0.0f), add, {2}); + Reduce(m, ConstantR0(&builder, 0.0f), add, {2}); // clang-format off Array2D expected({ @@ -800,13 +798,17 @@ XLA_TEST_F(ReduceTest, VectorizedReduce_Min) { XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) { RunVectorizedReduceTestForType( - static_cast(CreateScalarAndComputation), + static_cast([](XlaBuilder* builder) { + return CreateScalarAndComputation(PRED, builder); + }), [](bool a, bool b) { return a && b; }, true); } XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) { RunVectorizedReduceTestForType( - static_cast(CreateScalarOrComputation), + static_cast([](XlaBuilder* builder) { + return CreateScalarOrComputation(PRED, builder); + }), [](bool a, bool b) { return a || b; }, false); } @@ -820,17 +822,17 @@ 
XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) { // input_array.FillRandom(3.14f, 0.05); input_array.Fill(1.0f); - auto input_literal = Literal::CreateR3FromArray3D(input_array); + auto input_literal = LiteralUtil::CreateR3FromArray3D(input_array); input_literal = input_literal->Relayout(LayoutUtil::MakeLayout(GetParam().layout)); std::unique_ptr input_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); auto input_activations = - builder.Parameter(0, input_literal->shape(), "input"); + Parameter(&builder, 0, input_literal->shape(), "input"); XlaComputation add = CreateScalarAddComputation(F32, &builder); - auto sum = builder.Reduce(input_activations, builder.ConstantR0(0.0f), - add, GetParam().reduce_dims); + Reduce(input_activations, ConstantR0(&builder, 0.0f), add, + GetParam().reduce_dims); auto expected = ReferenceUtil::Reduce3DTo2D(input_array, 0.0f, GetParam().reduce_dims, @@ -871,14 +873,15 @@ XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) { XlaBuilder builder(TestName()); XlaComputation max_f32 = CreateScalarMaxComputation(F32, &builder); - auto a = builder.ConstantR0(2.0f); - auto a2 = builder.Abs(a); + auto a = ConstantR0(&builder, 2.0f); + auto a2 = Abs(a); - std::unique_ptr b_literal = Literal::CreateR1({1.0f, 4.0f}); + std::unique_ptr b_literal = + LiteralUtil::CreateR1({1.0f, 4.0f}); std::unique_ptr b_data = client_->TransferToServer(*b_literal).ConsumeValueOrDie(); - auto b = builder.Parameter(0, b_literal->shape(), "b"); - auto max = builder.Reduce(b, a2, max_f32, {0}); + auto b = Parameter(&builder, 0, b_literal->shape(), "b"); + Reduce(b, a2, max_f32, {0}); ComputeAndCompareR0(&builder, 4.0f, {b_data.get()}); } @@ -900,13 +903,13 @@ class ReduceInitializerTest : public ReduceTest { XlaComputation max_fn = CreateScalarMaxComputation( primitive_util::NativeToPrimitiveType(), &builder); - auto init = builder.ConstantR0(initializer); + auto init = ConstantR0(&builder, initializer); std::vector 
input_arr(num_elems, std::numeric_limits::lowest()); - auto input_literal = Literal::CreateR1(input_arr); + auto input_literal = LiteralUtil::CreateR1(input_arr); auto input_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - builder.Reduce(builder.Parameter(0, input_literal->shape(), "input"), init, - max_fn, {0}); + Reduce(Parameter(&builder, 0, input_literal->shape(), "input"), init, + max_fn, {0}); ComputeAndCompareR0(&builder, initializer, {input_data.get()}); } @@ -939,23 +942,24 @@ XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) { XLA_TEST_F(ReduceTest, ReduceIdentity) { XlaBuilder builder(TestName()); Shape single_float = ShapeUtil::MakeShape(F32, {}); - builder.Parameter(0, single_float, "lhs-unused"); - builder.Parameter(1, single_float, "rhs-used"); + Parameter(&builder, 0, single_float, "lhs-unused"); + Parameter(&builder, 1, single_float, "rhs-used"); auto computation_status = builder.Build(); TF_ASSERT_OK(computation_status.status()); Shape operand_shape = ShapeUtil::MakeShape(F32, {1}); - builder.Reduce(builder.Parameter(0, operand_shape, "operand"), - builder.Parameter(1, single_float, "init"), - computation_status.ValueOrDie(), {0}); + Reduce(Parameter(&builder, 0, operand_shape, "operand"), + Parameter(&builder, 1, single_float, "init"), + computation_status.ValueOrDie(), {0}); float operand[] = {42.0f}; float init = 58.5f; float expected = 42.0f; - std::unique_ptr input_literal = Literal::CreateR1(operand); + std::unique_ptr input_literal = + LiteralUtil::CreateR1(operand); std::unique_ptr input_global_data = client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - std::unique_ptr input_literal2 = Literal::CreateR0(init); + std::unique_ptr input_literal2 = LiteralUtil::CreateR0(init); std::unique_ptr input_global_data2 = client_->TransferToServer(*input_literal2).ConsumeValueOrDie(); ComputeAndCompareR0( @@ -963,5 +967,32 @@ XLA_TEST_F(ReduceTest, ReduceIdentity) { ErrorSpec(0.0001)); } 
+XLA_TEST_F(ReduceTest, AndReduceU64) { + XlaBuilder builder(TestName()); + Array2D initializer = {{0x123456789ABCDEF0LL, 0x3BCDEF12A4567890LL}, + {0XFFFFFFFFFFFFFFD6LL, 101}, + {1, 0XFFFFFFFFFFFFFFFFLL}}; + auto reducer = CreateScalarAndComputation(U64, &builder); + auto m = ConstantR2FromArray2D(&builder, initializer); + Reduce(m, ConstantR0(&builder, 0xFFFFFFFFFFFFFFFFLL), reducer, {1}); + + std::vector expected = {0x1204461080145890LL, 68, 1}; + ComputeAndCompareR1(&builder, expected, {}); +} + +XLA_TEST_F(ReduceTest, OrReduceU64) { + XlaBuilder builder(TestName()); + Array2D initializer = {{0x123456789ABCDEF0LL, 0x3BCDEF12A4567890LL}, + {0xFFFFFFFFFFFFFFD6LL, 101}, + {1, 0xCAFEBEEFABABABABLL}}; + auto reducer = CreateScalarOrComputation(U64, &builder); + auto m = ConstantR2FromArray2D(&builder, initializer); + Reduce(m, ConstantR0(&builder, 0), reducer, {1}); + + std::vector expected = {0X3BFDFF7ABEFEFEF0LL, 0XFFFFFFFFFFFFFFF7LL, + 0xCAFEBEEFABABABABLL}; + ComputeAndCompareR1(&builder, expected, {}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 266760e8202fddc48792ac66dda334255e428808..997880a018a264de7b0623d27997defdfc68f14a 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -18,14 +18,18 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -35,7 +39,6 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -54,7 +57,7 @@ class ReduceWindowTestBase : public ClientLibraryTestBase { public: ErrorSpec DefaultErrorSpec() const { if (use_bfloat16()) { - return ErrorSpec(1e-1, 5e-2); + return ErrorSpec(2e-1, 6e-2); } else { return ErrorSpec(1e-3, 1e-3); } @@ -67,34 +70,36 @@ class ReduceWindowTest : public ::testing::WithParamInterface, ReduceWindowTest() : builder_(TestName()) { set_use_bfloat16(GetParam()); } void ReduceWindowAdd(const XlaOp& input, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding) { - auto init = - CreateConstantFromLiteral(*Literal::CreateR0(0.0f), &builder_); - builder_.ReduceWindow(input, init, - CreateScalarAddComputation(FloatType(), 
&builder_), - window_dimensions, window_strides, padding); + auto init = CreateConstantFromLiteral(*LiteralUtil::CreateR0(0.0f), + &builder_); + ReduceWindow(input, init, + CreateScalarAddComputation(FloatType(), &builder_), + window_dimensions, window_strides, padding); } void ReduceWindowMax(const XlaOp& input, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding) { - auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_); - builder_.ReduceWindow(input, init, - CreateScalarMaxComputation(FloatType(), &builder_), - window_dimensions, window_strides, padding); + auto init = + CreateConstantFromLiteral(LiteralUtil::MinValue(F32), &builder_); + ReduceWindow(input, init, + CreateScalarMaxComputation(FloatType(), &builder_), + window_dimensions, window_strides, padding); } void ReduceWindowMin(const XlaOp& input, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, + absl::Span window_dimensions, + absl::Span window_strides, Padding padding) { - auto init = CreateConstantFromLiteral(Literal::MaxValue(F32), &builder_); - builder_.ReduceWindow(input, init, - CreateScalarMinComputation(FloatType(), &builder_), - window_dimensions, window_strides, padding); + auto init = + CreateConstantFromLiteral(LiteralUtil::MaxValue(F32), &builder_); + ReduceWindow(input, init, + CreateScalarMinComputation(FloatType(), &builder_), + window_dimensions, window_strides, padding); } XlaBuilder builder_; @@ -102,14 +107,14 @@ class ReduceWindowTest : public ::testing::WithParamInterface, TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) { const auto input = CreateConstantFromLiteral( - *Literal::CreateR1({1, 1, 1, 1}), &builder_); + *LiteralUtil::CreateR1({1, 1, 1, 1}), &builder_); const auto init_value = - CreateConstantFromLiteral(*Literal::CreateR0(0), &builder_); + 
CreateConstantFromLiteral(*LiteralUtil::CreateR0(0), &builder_); TF_ASSERT_OK(builder_.first_error()); - builder_.ReduceWindow(input, init_value, - CreateScalarAddComputation(FloatType(), &builder_), - /*window_dimensions=*/{1, 2}, - /*window_strides=*/{1}, Padding::kValid); + ReduceWindow(input, init_value, + CreateScalarAddComputation(FloatType(), &builder_), + /*window_dimensions=*/{1, 2}, + /*window_strides=*/{1}, Padding::kValid); ASSERT_EQ(builder_.first_error().code(), tensorflow::error::INVALID_ARGUMENT) << builder_.first_error(); ASSERT_THAT(builder_.first_error().error_message(), @@ -119,33 +124,32 @@ TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) { // Regression test for b/68964348. TEST_P(ReduceWindowTest, R0ReduceWindow) { const auto input = - CreateConstantFromLiteral(*Literal::CreateR0(42.0), &builder_); + CreateConstantFromLiteral(*LiteralUtil::CreateR0(42.0), &builder_); const auto init = - CreateConstantFromLiteral(*Literal::CreateR0(1.0), &builder_); - builder_.ReduceWindow(input, init, - CreateScalarAddComputation(FloatType(), &builder_), - /*window_dimensions=*/{}, - /*window_strides=*/{}, Padding::kSame); - ComputeAndCompareLiteral(&builder_, *Literal::CreateR0(43.0), {}, + CreateConstantFromLiteral(*LiteralUtil::CreateR0(1.0), &builder_); + ReduceWindow(input, init, CreateScalarAddComputation(FloatType(), &builder_), + /*window_dimensions=*/{}, + /*window_strides=*/{}, Padding::kSame); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateR0(43.0), {}, ErrorSpec(0.00001)); } TEST_P(ReduceWindowTest, Min3In5Stride2) { const auto input = CreateConstantFromLiteral( - *Literal::CreateR1({10000, 1000, 100, 10, 1}), &builder_); + *LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); ReduceWindowMin(input, {3}, {2}, Padding::kValid); - ComputeAndCompareLiteral(&builder_, *Literal::CreateR1({100, 1}), {}, - ErrorSpec(0.00001)); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateR1({100, 1}), + {}, ErrorSpec(0.00001)); } 
TEST_P(ReduceWindowTest, Min3In5Stride1WithSamePadding) { const auto input = CreateConstantFromLiteral( - *Literal::CreateR1({10000, 1000, 100, 10, 1}), &builder_); + *LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); ReduceWindowMin(input, /*window_dimensions=*/{3}, /*window_strides=*/{1}, Padding::kSame); ComputeAndCompareLiteral(&builder_, - *Literal::CreateR1({1000, 100, 10, 1, 1}), {}, - ErrorSpec(0.00001)); + *LiteralUtil::CreateR1({1000, 100, 10, 1, 1}), + {}, ErrorSpec(0.00001)); } XLA_TEST_P(ReduceWindowTest, ZeroElementSmall) { @@ -157,7 +161,7 @@ XLA_TEST_P(ReduceWindowTest, ZeroElementSmall) { auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1}, {1, 1, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {}, DefaultErrorSpec()); } @@ -172,7 +176,7 @@ TEST_P(ReduceWindowTest, NonSquareSmall) { auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1}, {1, 1, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {}, DefaultErrorSpec()); } @@ -186,7 +190,7 @@ TEST_P(ReduceWindowTest, MiddleDimsSmall) { auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 1, 1}, {1, 2, 2, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {}, DefaultErrorSpec()); } @@ -203,7 +207,7 @@ TEST_P(ReduceWindowTest, Along2ndMinorDim) { auto res = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {}, DefaultErrorSpec()); } @@ -225,8 +229,8 @@ TEST_P(ReduceWindowTest, 
AmongMajor2Dims) { input_array, 0.0f, {win_len, win_len, 1, 1}, {win_stride, win_stride, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {}, - DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*result), + {}, DefaultErrorSpec()); } TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) { @@ -248,8 +252,8 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) { input_array, 0.0f, {win_len, win_len, 1, 1}, {win_stride, win_stride, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {}, - DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*result), + {}, DefaultErrorSpec()); } // Tests the super windowing logic w.r.t handling prime number of windows in a @@ -273,8 +277,8 @@ TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) { input_array, 0.0f, {win_len, win_len, 1, 1}, {win_stride, win_stride, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {}, - DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*result), + {}, DefaultErrorSpec()); } TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) { @@ -290,8 +294,8 @@ TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) { auto result = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {1, 1, 1, 11}, {1, 1, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {}, - DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*result), + {}, DefaultErrorSpec()); } // Tests a reduction function that is not a simple add/min/max/etc. 
@@ -306,15 +310,15 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) { Padding padding = Padding::kValid; const Shape scalar = ShapeUtil::MakeShape(FloatType(), {}); auto b = builder_.CreateSubBuilder("unusual"); - auto lhs = b->Parameter(0, scalar, "lhs"); - auto rhs = b->Parameter(1, scalar, "rhs"); - b->Min(b->Add(lhs, rhs), - CreateConstantFromLiteral(*Literal::CreateR0(8.0f), b.get())); + auto lhs = Parameter(b.get(), 0, scalar, "lhs"); + auto rhs = Parameter(b.get(), 1, scalar, "rhs"); + Min(Add(lhs, rhs), + CreateConstantFromLiteral(*LiteralUtil::CreateR0(8.0f), b.get())); XlaComputation reduce_fn = b->BuildAndNoteError(); - builder_.ReduceWindow( + ReduceWindow( input, - CreateConstantFromLiteral(*Literal::CreateR0(0.0f), &builder_), + CreateConstantFromLiteral(*LiteralUtil::CreateR0(0.0f), &builder_), reduce_fn, /*window_dimensions=*/{1, 1, 2, 1}, /*window_strides=*/{1, 1, 1, 1}, padding); @@ -328,15 +332,15 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) { /*window=*/{1, 1, 2, 1}, /*stride=*/{1, 1, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*expected), {}, - DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*expected), + {}, DefaultErrorSpec()); } TEST_P(ReduceWindowTest, R4UnitWindow) { Array4D input_array(13, 12, 8, 15); input_array.FillRandom(2.f, 2.f); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({0, 3, 2, 1})); XlaOp input; auto input_data = CreateParameterAndTransferLiteral( @@ -348,7 +352,7 @@ TEST_P(ReduceWindowTest, R4UnitWindow) { auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 7, 1}, {1, 4, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {input_data.get()}, DefaultErrorSpec()); } @@ -356,7 +360,7 @@ 
XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) { std::vector input_dims(6, 8); auto shape = ShapeUtil::MakeShape(F32, input_dims); - auto arg_literal = MakeUnique(shape); + auto arg_literal = absl::make_unique(shape); arg_literal->PopulateWithValue(1.0f); const auto input = CreateConstantFromLiteral(*arg_literal, &builder_); @@ -367,7 +371,7 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) { std::vector output_dims = {6, 8, 6, 6, 8, 8}; Shape result_shape = ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout); - auto expected = MakeUnique(result_shape); + auto expected = absl::make_unique(result_shape); expected->PopulateWithValue(27.0f); ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec()); } @@ -377,7 +381,7 @@ XLA_TEST_P(ReduceWindowTest, R6Add) { auto shape = ShapeUtil::MakeShape(F32, input_dims); std::unique_ptr arg_literal = - Literal::CreateFullWithDescendingLayout(input_dims, 1.0f); + LiteralUtil::CreateFullWithDescendingLayout(input_dims, 1.0f); const auto input = CreateConstantFromLiteral(*arg_literal, &builder_); @@ -386,7 +390,7 @@ XLA_TEST_P(ReduceWindowTest, R6Add) { std::vector output_dims = {8, 8, 6, 6, 8, 8}; std::unique_ptr expected = - Literal::CreateFullWithDescendingLayout(output_dims, 9.0f); + LiteralUtil::CreateFullWithDescendingLayout(output_dims, 9.0f); ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec()); } @@ -395,7 +399,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) { Array4D input_array(2, 1, 27, 119); input_array.FillRandom(2.0f); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; auto input_data = CreateParameterAndTransferLiteral( @@ -409,7 +413,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) { auto res = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding); - 
ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {input_data.get()}, DefaultErrorSpec()); } @@ -417,7 +421,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) { Array4D input_array(3, 2, 4, 64); input_array.FillRandom(2.0f); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; auto input_data = CreateParameterAndTransferLiteral( @@ -431,7 +435,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) { auto res = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {input_data.get()}, DefaultErrorSpec()); } @@ -439,7 +443,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) { Array4D input_array(1, 3, 12, 200); input_array.FillRandom(2.0f); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; auto input_data = CreateParameterAndTransferLiteral( @@ -453,7 +457,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) { auto res = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*res), {input_data.get()}, DefaultErrorSpec()); } @@ -474,18 +478,18 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMultipleMinor) { auto result = ReferenceUtil::ReduceWindow4DAdd( input_array, 0.0f, {win_len, win_len, 1, 1}, {win_stride, win_stride, 1, 1}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {}, - 
DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateFromArray(*result), + {}, DefaultErrorSpec()); } XLA_TEST_P(ReduceWindowTest, Add24In1152_NoOverlap) { std::vector input_vector(128 * 9, 1); const auto input = CreateConstantFromLiteral( - *Literal::CreateR1(input_vector), &builder_); + *LiteralUtil::CreateR1(input_vector), &builder_); ReduceWindowAdd(input, {32}, {128}, Padding::kValid); ComputeAndCompareLiteral( &builder_, - *Literal::CreateR1({32, 32, 32, 32, 32, 32, 32, 32, 32}), {}, + *LiteralUtil::CreateR1({32, 32, 32, 32, 32, 32, 32, 32, 32}), {}, DefaultErrorSpec()); } @@ -500,9 +504,9 @@ XLA_TEST_P(ReduceWindowTest, Add128In128Stride128) { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; const auto input = CreateConstantFromLiteral( - *Literal::CreateR1(input_vector), &builder_); + *LiteralUtil::CreateR1(input_vector), &builder_); ReduceWindowAdd(input, {128}, {128}, Padding::kValid); - ComputeAndCompareLiteral(&builder_, *Literal::CreateR1({1088}), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateR1({1088}), {}, DefaultErrorSpec()); } @@ -517,9 +521,9 @@ XLA_TEST_P(ReduceWindowTest, Add128In128) { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; const auto input = CreateConstantFromLiteral( - *Literal::CreateR1(input_vector), &builder_); + *LiteralUtil::CreateR1(input_vector), &builder_); ReduceWindowAdd(input, {128}, {1}, Padding::kValid); - ComputeAndCompareLiteral(&builder_, *Literal::CreateR1({1088}), {}, + ComputeAndCompareLiteral(&builder_, *LiteralUtil::CreateR1({1088}), {}, DefaultErrorSpec()); } @@ -536,14 +540,15 @@ TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) { auto res = ReferenceUtil::ReduceWindow2DAdd( input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), - {}, 
DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, + *LiteralUtil::CreateFromArray(*res), {}, + DefaultErrorSpec()); } TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) { Array2D input_array(6, 4, 1.0f); - XlaOp input = builder_.Broadcast( - CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4}); + XlaOp input = Broadcast( + CreateConstantFromLiteral(LiteralUtil::One(F32), &builder_), {6, 4}); Padding padding = Padding::kSame; ReduceWindowAdd(input, {4, 2}, {3, 3}, padding); @@ -551,8 +556,9 @@ TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) { auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3}, padding); - ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), - {}, DefaultErrorSpec()); + ComputeAndCompareLiteral(&builder_, + *LiteralUtil::CreateFromArray(*res), {}, + DefaultErrorSpec()); } INSTANTIATE_TEST_CASE_P(ReduceWindowTestInstance, ReduceWindowTest, @@ -575,21 +581,20 @@ string R4ReduceWindowTestDataToString( const ::testing::TestParamInfo< ::testing::tuple>& data) { const auto& param = ::testing::get<0>(data.param); - string str = tensorflow::strings::StrCat( - "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"), // - "__window_bounds_", - tensorflow::str_util::Join(param.window_bounds, "x"), // - "__strides_", tensorflow::str_util::Join(param.strides, "x"), // - "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"), // - "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"), // - "__layout_", tensorflow::str_util::Join(param.layout, "_"), // + string str = absl::StrCat( + "base_bounds_", absl::StrJoin(param.base_bounds, "x"), // + "__window_bounds_", absl::StrJoin(param.window_bounds, "x"), // + "__strides_", absl::StrJoin(param.strides, "x"), // + "__pad_low_", absl::StrJoin(param.pad_low, "x"), // + "__pad_high_", absl::StrJoin(param.pad_high, "x"), // + "__layout_", absl::StrJoin(param.layout, "_"), // (param.reducer == kAdd) 
? "_add" : "_max"); CHECK(param.reducer == kAdd || param.reducer == kMax); // Test names are not allowed to contain the '-' character. std::replace(str.begin(), str.end(), '-', 'n'); if (::testing::get<1>(data.param)) { - str = tensorflow::strings::StrCat(str, "_bfloat16"); + str = absl::StrCat(str, "_bfloat16"); } return str; } @@ -610,7 +615,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase, param.base_bounds[2], param.base_bounds[3]); input.FillIota(1); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0", @@ -622,12 +627,12 @@ class R4ReduceWindowTest : public ReduceWindowTestBase, } auto init_value = - CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b); + CreateConstantFromLiteral(*LiteralUtil::CreateR0(kInitValue), &b); CHECK(param.reducer == kAdd || param.reducer == kMax); auto computation = param.reducer == kAdd ? 
CreateScalarAddComputation(FloatType(), &b) : CreateScalarMaxComputation(FloatType(), &b); - b.ReduceWindowWithGeneralPadding( + ReduceWindowWithGeneralPadding( /*operand=*/parameter, /*init_value=*/init_value, /*computation=*/computation, @@ -648,7 +653,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase, /*stride=*/param.strides, /*padding=*/padding); std::unique_ptr expected_literal = - Literal::CreateFromArray(*expected); + LiteralUtil::CreateFromArray(*expected); const Shape& expected_shape_with_layout = ShapeUtil::MakeShapeWithLayout( input_literal->shape().element_type(), AsInt64Slice(expected_literal->shape().dimensions()), param.layout); @@ -931,15 +936,15 @@ string R3ReduceWindowTestDataToString( const ::testing::TestParamInfo< ::testing::tuple>& data) { const auto& param = ::testing::get<0>(data.param); - string str = tensorflow::strings::StrCat( - "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"), - "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"), - "__strides_", tensorflow::str_util::Join(param.strides, "x"), - "__padding_", param.padding == Padding::kSame ? "same" : "valid", - "__layout_", param.layout[0], "_", param.layout[1], "_", param.layout[2], - "__reducer_", param.reducer == kAdd ? "add" : "max"); + string str = absl::StrCat( + "base_bounds_", absl::StrJoin(param.base_bounds, "x"), "__window_bounds_", + absl::StrJoin(param.window_bounds, "x"), "__strides_", + absl::StrJoin(param.strides, "x"), "__padding_", + param.padding == Padding::kSame ? "same" : "valid", "__layout_", + param.layout[0], "_", param.layout[1], "_", param.layout[2], "__reducer_", + param.reducer == kAdd ? 
"add" : "max"); if (::testing::get<1>(data.param)) { - str = tensorflow::strings::StrCat(str, "_bfloat16"); + str = absl::StrCat(str, "_bfloat16"); } return str; } @@ -960,25 +965,25 @@ TEST_P(R3ReduceWindowTest, Add) { Array3D input(param.base_bounds[0], param.base_bounds[1], param.base_bounds[2], 1.0f); std::unique_ptr input_literal = - Literal::CreateR3FromArray3DWithLayout( + LiteralUtil::CreateR3FromArray3DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0", &b, ¶meter); auto init_value = - CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b); - b.ReduceWindow(/*operand=*/parameter, - /*init_value=*/init_value, - /*computation=*/CreateScalarAddComputation(FloatType(), &b), - /*window_dimensions=*/param.window_bounds, - /*window_strides=*/param.strides, /*padding=*/param.padding); + CreateConstantFromLiteral(*LiteralUtil::CreateR0(kInitValue), &b); + ReduceWindow(/*operand=*/parameter, + /*init_value=*/init_value, + /*computation=*/CreateScalarAddComputation(FloatType(), &b), + /*window_dimensions=*/param.window_bounds, + /*window_strides=*/param.strides, /*padding=*/param.padding); auto expected = ReferenceUtil::ReduceWindow3DAdd( /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds, /*stride=*/param.strides, /*padding=*/param.padding); - ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected), + ComputeAndCompareLiteral(&b, *LiteralUtil::CreateFromArray(*expected), {input_arg.get()}, DefaultErrorSpec()); } @@ -1065,17 +1070,16 @@ string R2ReduceWindowTestDataToString( const ::testing::TestParamInfo< ::testing::tuple>& data) { const auto& param = ::testing::get<0>(data.param); - string str = tensorflow::strings::StrCat( - "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"), // - "__window_bounds_", - tensorflow::str_util::Join(param.window_bounds, "x"), // - "__strides_", tensorflow::str_util::Join(param.strides, 
"x"), // - "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"), - "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"), - "__layout_", param.layout[0], "_", param.layout[1], // + string str = absl::StrCat( + "base_bounds_", absl::StrJoin(param.base_bounds, "x"), // + "__window_bounds_", absl::StrJoin(param.window_bounds, "x"), // + "__strides_", absl::StrJoin(param.strides, "x"), // + "__pad_low_", absl::StrJoin(param.pad_low, "x"), "__pad_high_", + absl::StrJoin(param.pad_high, "x"), "__layout_", param.layout[0], "_", + param.layout[1], // "__reducer_", param.reducer == kAdd ? "add" : "max"); if (::testing::get<1>(data.param)) { - str = tensorflow::strings::StrCat(str, "_bfloat16"); + str = absl::StrCat(str, "_bfloat16"); } return str; } @@ -1094,7 +1098,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, const float kInitValue = 0.0f; Array2D input(param.base_bounds[0], param.base_bounds[1], 1.0f); std::unique_ptr input_literal = - Literal::CreateR2FromArray2DWithLayout( + LiteralUtil::CreateR2FromArray2DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; @@ -1108,8 +1112,8 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, ? 
CreateScalarAddComputation(FloatType(), &b) : CreateScalarMaxComputation(FloatType(), &b); auto init_value = - CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b); - b.ReduceWindowWithGeneralPadding( + CreateConstantFromLiteral(*LiteralUtil::CreateR0(kInitValue), &b); + ReduceWindowWithGeneralPadding( /*operand=*/parameter, /*init_value=*/init_value, /*computation=*/computation, @@ -1124,7 +1128,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, /*window=*/param.window_bounds, /*stride=*/param.strides, /*padding=*/padding); - ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected), + ComputeAndCompareLiteral(&b, *LiteralUtil::CreateFromArray(*expected), {input_arg.get()}, DefaultErrorSpec()); } }; @@ -1258,21 +1262,27 @@ struct R1ReduceWindowTestData { /*pad_low=*/{5}, /*pad_high=*/{0}, /*reducer=*/Reducer::kAdd}, + + {/*base_bounds=*/{4096}, /*window_bounds=*/{4096}, + /*strides=*/{1}, + /*pad_low=*/{4095}, + /*pad_high=*/{0}, + /*reducer=*/Reducer::kMax}, }; string R1ReduceWindowTestDataToString( const ::testing::TestParamInfo< ::testing::tuple>& data) { const auto& param = ::testing::get<0>(data.param); - string str = tensorflow::strings::StrCat( - "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"), - "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"), - "__strides_", tensorflow::str_util::Join(param.strides, "x"), - "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"), - "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"), - "__reducer_", param.reducer == kAdd ? "add" : "max"); + string str = + absl::StrCat("base_bounds_", absl::StrJoin(param.base_bounds, "x"), + "__window_bounds_", absl::StrJoin(param.window_bounds, "x"), + "__strides_", absl::StrJoin(param.strides, "x"), + "__pad_low_", absl::StrJoin(param.pad_low, "x"), + "__pad_high_", absl::StrJoin(param.pad_high, "x"), + "__reducer_", param.reducer == kAdd ? 
"add" : "max"); if (::testing::get<1>(data.param)) { - str = tensorflow::strings::StrCat(str, "_bfloat16"); + str = absl::StrCat(str, "_bfloat16"); } return str; } @@ -1293,7 +1303,7 @@ TEST_P(R1ReduceWindowTest, DoIt) { std::vector input_vector(param.base_bounds[0]); std::iota(std::begin(input_vector), std::end(input_vector), 0); std::unique_ptr input_literal = - Literal::CreateR1(tensorflow::gtl::ArraySlice(input_vector)); + LiteralUtil::CreateR1(absl::Span(input_vector)); XlaOp parameter; auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0", &b, ¶meter); @@ -1305,8 +1315,8 @@ TEST_P(R1ReduceWindowTest, DoIt) { ? CreateScalarAddComputation(FloatType(), &b) : CreateScalarMaxComputation(FloatType(), &b); auto init_value = - CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b); - b.ReduceWindowWithGeneralPadding( + CreateConstantFromLiteral(*LiteralUtil::CreateR0(kInitValue), &b); + ReduceWindowWithGeneralPadding( /*operand=*/parameter, /*init_value=*/init_value, /*computation=*/computation, @@ -1317,14 +1327,14 @@ TEST_P(R1ReduceWindowTest, DoIt) { ? +[](float a, float b) { return a + b; } : +[](float a, float b) { return std::max(a, b); }; auto expected = ReferenceUtil::ReduceWindow1DGeneric( - /*operand=*/tensorflow::gtl::ArraySlice(input_vector), + /*operand=*/absl::Span(input_vector), /*init=*/kInitValue, /*reduce_func=*/reduce_func, /*window=*/param.window_bounds, /*stride=*/param.strides, /*padding=*/padding); - ComputeAndCompareLiteral(&b, *Literal::CreateR1(*expected), + ComputeAndCompareLiteral(&b, *LiteralUtil::CreateR1(*expected), {input_arg.get()}, DefaultErrorSpec()); } @@ -1338,7 +1348,7 @@ INSTANTIATE_TEST_CASE_P( // results on the interpreter backend. 
class ReduceWindowTextTest : public HloTestBase {}; -TEST_F(ReduceWindowTextTest, R2General256x384) { +XLA_TEST_F(ReduceWindowTextTest, R2General256x384) { const string hlo_string = R"( HloModule R2Window mul { @@ -1355,7 +1365,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2General256x384Layout01) { +XLA_TEST_F(ReduceWindowTextTest, R2General256x384Layout01) { const string hlo_string = R"( HloModule R2Window mul { @@ -1372,7 +1382,7 @@ ROOT reduce-window = f32[256,384]{0,1} reduce-window(operand, constant), window= EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2General2x5) { +XLA_TEST_F(ReduceWindowTextTest, R2General2x5) { const string hlo_string = R"( HloModule R2Window mul { @@ -1389,7 +1399,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2EffectiveScalar) { +XLA_TEST_F(ReduceWindowTextTest, R2EffectiveScalar) { const string hlo_string = R"( HloModule R2Window mul { @@ -1407,7 +1417,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R3EffectiveScalar) { +XLA_TEST_F(ReduceWindowTextTest, R3EffectiveScalar) { const string hlo_string = R"( HloModule R3Window mul { @@ -1425,7 +1435,7 @@ ENTRY R3Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(HloTestBase, ReduceWindowIdentity) { +XLA_TEST_F(HloTestBase, ReduceWindowIdentity) { const string hlo_string = R"( HloModule ReduceWindowIdentity identity.pad_to_reduce_window { @@ -1439,10 +1449,10 @@ ENTRY reduce-window-identity { } )"; - EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); + EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt)); } -TEST_F(HloTestBase, ReduceWindowS32) { +XLA_TEST_F(HloTestBase, ReduceWindowS32) { const string hlo_string = R"( HloModule reduce-window @@ -1458,7 +1468,26 @@ ENTRY %reduce-window (parameter.0: s32[81,8], 
parameter.1: s32[]) -> s32[82,8] { } )"; - EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); + EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt)); +} + +XLA_TEST_F(HloTestBase, ReduceWindowF16) { + const string hlo_string = R"( +HloModule reduce-window + +%identity.pad_to_reduce_window (param0: f16[], param1: f16[]) -> f16[] { + %param0 = f16[] parameter(0) + ROOT %param1 = f16[] parameter(1) +} + +ENTRY %reduce-window (parameter.0: f16[81,8], parameter.1: f16[]) -> f16[82,8] { + %parameter.0 = f16[81,8]{1,0} parameter(0) + %parameter.1 = f16[] parameter(1) + ROOT %reduce-window = f16[82,8]{1,0} reduce-window(f16[81,8]{1,0} %parameter.0, f16[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window +} + +)"; + EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 36d763b0f7f4267ede076c0b25cfaf9654e96e0d..d8914513819415368a628eab1f482f9644dd46b1 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -17,9 +17,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -39,8 +39,8 @@ class ReplayTest : public ClientLibraryTestBase {}; TEST_F(ReplayTest, TwoPlusTwoReplay) { // Make 2+2 computation. 
XlaBuilder builder(TestName()); - auto two = builder.ConstantR0(2); - builder.Add(two, two); + auto two = ConstantR0(&builder, 2); + Add(two, two); XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. @@ -70,9 +70,9 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) { XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { // Make computation. XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(S32, {}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(S32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(S32, {}), "y"); + Add(x, y); XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. @@ -91,10 +91,10 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { // Run it. std::unique_ptr x_data = - client_->TransferToServer(*Literal::CreateR0(2)) + client_->TransferToServer(*LiteralUtil::CreateR0(2)) .ConsumeValueOrDie(); std::unique_ptr y_data = - client_->TransferToServer(*Literal::CreateR0(3)) + client_->TransferToServer(*LiteralUtil::CreateR0(3)) .ConsumeValueOrDie(); std::unique_ptr literal = client_ @@ -111,13 +111,13 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { // As above, but with map(+2) over some constant array. 
XlaBuilder plus_two_builder("plus two"); auto input = - plus_two_builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "input"); - plus_two_builder.Add(input, plus_two_builder.ConstantR0(2)); + Parameter(&plus_two_builder, 0, ShapeUtil::MakeShape(S32, {}), "input"); + Add(input, ConstantR0(&plus_two_builder, 2)); XlaComputation plus_two = plus_two_builder.Build().ConsumeValueOrDie(); XlaBuilder mapper_builder(TestName()); - auto original = mapper_builder.ConstantR1({1, 2, 3}); - mapper_builder.Map({original}, plus_two, {0}); + auto original = ConstantR1(&mapper_builder, {1, 2, 3}); + Map(&mapper_builder, {original}, plus_two, {0}); XlaComputation computation = mapper_builder.Build().ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc index da1b588ec41cef711412367e89b2a9b1029bca71..ae24eb5eb4822a2057e34a1aec8b7d64604d8984 100644 --- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc @@ -18,13 +18,14 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -33,7 +34,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -44,11 +44,11 @@ using ReshapeMotionTest = ClientLibraryTestBase; TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) { XlaBuilder builder(TestName()); - auto a = builder.ConstantR2({{2, 3, 5}, {7, 11, 13}}); - auto b = builder.ConstantR2({{17, 19}, {23, 29}, {31, 37}}); - auto c = builder.Reshape(a, {6}); - auto d = builder.Reshape(b, {6}); - auto e = builder.Mul(c, d); + auto a = ConstantR2(&builder, {{2, 3, 5}, {7, 11, 13}}); + auto b = ConstantR2(&builder, {{17, 19}, {23, 29}, {31, 37}}); + auto c = Reshape(a, {6}); + auto d = Reshape(b, {6}); + Mul(c, d); ComputeAndCompareR1(&builder, {34, 57, 115, 203, 341, 481}, {}); } diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index a4580cd71d46ad0a0186eddd51291f9c322b6f49..17d12715f60f624c35169048121ca139d78a544f 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -18,12 +18,13 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -35,7 +36,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -55,39 +55,39 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) { XlaBuilder builder(TestName()); Array2D input_array(1, 1); input_array.Fill(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(input_array); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({1.0f}); + auto expected_literal = LiteralUtil::CreateR1({1.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR1({1.0f}); + auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter", 
&builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{}); + Collapse(/*operand=*/parameter, /*dimensions=*/{}); - auto expected_literal = Literal::CreateR1({1.0f}); + auto expected_literal = LiteralUtil::CreateR1({1.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR1({1.0f}); + auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0}); - auto expected_literal = Literal::CreateR1({1.0f}); + auto expected_literal = LiteralUtil::CreateR1({1.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -97,15 +97,15 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) { XlaBuilder builder(TestName()); Array2D input_array(1, 1); input_array.Fill(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(input_array); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter", &builder, ¶meter); - auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, - /*new_sizes=*/{}); + auto reshape = Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, + /*new_sizes=*/{}); auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie(); - auto expected_literal = Literal::CreateR0(1.0f); + auto expected_literal = LiteralUtil::CreateR0(1.0f); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -113,63 +113,54 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) { XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) { XlaBuilder builder(TestName()); - std::unique_ptr 
param0_literal = Literal::CreateR0(1.0f); + std::unique_ptr param0_literal = LiteralUtil::CreateR0(1.0f); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0", &builder, ¶meter); - auto a = builder.Neg(parameter); - builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1}); + auto a = Neg(parameter); + Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1}); - auto expected_literal = Literal::CreateR1({-1.0f}); + auto expected_literal = LiteralUtil::CreateR1({-1.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) { +XLA_TEST_P(ReshapeTest, Trivial0x3) { XlaBuilder builder(TestName()); Array2D input_array(0, 3); - auto input_literal = Literal::CreateR2FromArray2D(input_array); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-05-15 -// with an incorrect result rank. 
-XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) { +XLA_TEST_P(ReshapeTest, Trivial0x3WithParameter) { XlaBuilder builder(TestName()); std::unique_ptr param0_literal = - Literal::CreateR2FromArray2D(Array2D(0, 3)); + LiteralUtil::CreateR2FromArray2D(Array2D(0, 3)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) { +XLA_TEST_P(ReshapeTest, Trivial3x0) { XlaBuilder builder(TestName()); Array2D input_array(3, 0); - auto input_literal = Literal::CreateR2FromArray2D(input_array); + auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -177,12 +168,12 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) { // Collapses a 2-dimensional row vector to 1 dimension. 
XLA_TEST_P(ReshapeTest, Trivial1x3) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR2({{1.0f, 2.0f, 3.0f}}); + auto input_literal = LiteralUtil::CreateR2({{1.0f, 2.0f, 3.0f}}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({1.0f, 2.0f, 3.0f}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -190,30 +181,26 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) { // Collapses a 2-dimensional column vector to 1 dimension. XLA_TEST_P(ReshapeTest, Trivial3x1) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR2({{1.0f}, {2.0f}, {3.0f}}); + auto input_literal = LiteralUtil::CreateR2({{1.0f}, {2.0f}, {3.0f}}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); - auto expected_literal = Literal::CreateR1({1.0f, 2.0f, 3.0f}); + Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); + auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -// // Splits an empty vector into an empty matrix. 
-XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) { +XLA_TEST_P(ReshapeTest, R1ToR2_0_To_2x0) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateR1({}); + auto input_literal = LiteralUtil::CreateR1({}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0}, - /*new_sizes=*/{2, 0}); - auto expected_literal = Literal::CreateR2({{}, {}}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0}, + /*new_sizes=*/{2, 0}); + auto expected_literal = LiteralUtil::CreateR2({{}, {}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -222,32 +209,28 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) { XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) { XlaBuilder builder(TestName()); auto input_literal = - Literal::CreateR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0}, - /*new_sizes=*/{2, 3}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0}, + /*new_sizes=*/{2, 3}); auto expected_literal = - Literal::CreateR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}); + LiteralUtil::CreateR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -// // Transposes a 2x0 array to a 0x2 array. 
-XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) { +XLA_TEST_P(ReshapeTest, Reshape0x2To2x0) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(Array2D(0, 2)); + auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 2)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, - /*new_sizes=*/{2, 0}); - auto expected_literal = Literal::CreateR2({{}, {}}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, + /*new_sizes=*/{2, 0}); + auto expected_literal = LiteralUtil::CreateR2({{}, {}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -256,15 +239,15 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) { XLA_TEST_P(ReshapeTest, ReshapeRowToCol) { XlaBuilder builder(TestName()); auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3); - auto input_literal = Literal::CreateFromArray(*simple); + auto input_literal = LiteralUtil::CreateFromArray(*simple); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, - /*new_sizes=*/{3, 1}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, + /*new_sizes=*/{3, 1}); auto expected = ReferenceUtil::TransposeArray2D(*simple); - auto expected_literal = Literal::CreateFromArray(*expected); + auto expected_literal = LiteralUtil::CreateFromArray(*expected); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -273,32 +256,28 @@ XLA_TEST_P(ReshapeTest, ReshapeRowToCol) { XLA_TEST_P(ReshapeTest, TransposeAsReshape) { XlaBuilder builder(TestName()); auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); - auto input_literal = Literal::CreateFromArray(*a4x3); + auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; auto input = 
CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, - /*new_sizes=*/{3, 4}); + Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, + /*new_sizes=*/{3, 4}); auto expected = ReferenceUtil::TransposeArray2D(*a4x3); - auto expected_literal = Literal::CreateFromArray(*expected); + auto expected_literal = LiteralUtil::CreateFromArray(*expected); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -// // Transposes a 0x4 array with XlaBuilder::Transpose. -XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) { +XLA_TEST_P(ReshapeTest, Transpose0x4) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(Array2D(0, 4)); + auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 4)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Transpose(parameter, {1, 0}); - auto expected_literal = Literal::CreateR2({{}, {}, {}, {}}); + Transpose(parameter, {1, 0}); + auto expected_literal = LiteralUtil::CreateR2({{}, {}, {}, {}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -307,49 +286,43 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) { XLA_TEST_P(ReshapeTest, Transpose4x3) { XlaBuilder builder(TestName()); auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); - auto input_literal = Literal::CreateFromArray(*a4x3); + auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Transpose(parameter, {1, 0}); + Transpose(parameter, {1, 0}); auto expected = ReferenceUtil::TransposeArray2D(*a4x3); - 
auto expected_literal = Literal::CreateFromArray(*expected); + auto expected_literal = LiteralUtil::CreateFromArray(*expected); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. -// // Reshapes an empty 2-dimensional array with dimensions that are not just a // rearrangement of the originals (split), but no reordering (no shuffle). -XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) { +XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffleZeroElements) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(Array2D(6, 0)); + auto input_literal = LiteralUtil::CreateFromArray(Array2D(6, 0)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, - /*new_sizes=*/{2, 3, 0, 0}); - auto expected_literal = Literal::CreateFromArray(Array4D(2, 3, 0, 0)); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, + /*new_sizes=*/{2, 3, 0, 0}); + auto expected_literal = + LiteralUtil::CreateFromArray(Array4D(2, 3, 0, 0)); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. 
-XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) { +XLA_TEST_P(ReshapeTest, ReshapeR4ToR2ZeroElements) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(Array4D(2, 3, 4, 0)); + auto input_literal = LiteralUtil::CreateFromArray(Array4D(2, 3, 4, 0)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, - /*new_sizes=*/{24, 0}); - auto expected_literal = Literal::CreateFromArray(Array2D(24, 0)); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, + /*new_sizes=*/{24, 0}); + auto expected_literal = LiteralUtil::CreateFromArray(Array2D(24, 0)); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -359,32 +332,28 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) { XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) { XlaBuilder builder(TestName()); auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); - auto input_literal = Literal::CreateFromArray(*a4x3); + auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, - /*new_sizes=*/{2, 6}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, + /*new_sizes=*/{2, 6}); auto expected = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6); - auto expected_literal = Literal::CreateFromArray(*expected); + auto expected_literal = LiteralUtil::CreateFromArray(*expected); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } -// TODO(b/29185393): Make this work with the GPU backend. The GPU backend -// does not handle zero-sized shapes correctly. Failed last on 2017-11-30 -// with an incorrect result rank. 
-// -XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) { +XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffleZeroElements) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(Array2D(0, 6)); + auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 6)); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, - /*new_sizes=*/{3, 0}); - auto expected_literal = Literal::CreateFromArray(Array2D(3, 0)); + Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, + /*new_sizes=*/{3, 0}); + auto expected_literal = LiteralUtil::CreateFromArray(Array2D(3, 0)); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -394,15 +363,15 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) { XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) { XlaBuilder builder(TestName()); auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); - auto input_literal = Literal::CreateFromArray(*a4x3); + auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, - /*new_sizes=*/{2, 6}); + Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, + /*new_sizes=*/{2, 6}); Array2D expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f}, {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}}); - auto expected_literal = Literal::CreateFromArray(expected); + auto expected_literal = LiteralUtil::CreateFromArray(expected); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -420,13 +389,13 @@ static Array3D ArrayForDocR3Tests() { XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests()); + auto 
input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, - /*new_sizes=*/{24}); - auto expected_literal = Literal::CreateR1( + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, + /*new_sizes=*/{24}); + auto expected_literal = LiteralUtil::CreateR1( {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27, 30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, @@ -435,33 +404,33 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) { XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests()); + auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, - /*new_sizes=*/{8, 3}); - auto expected_literal = Literal::CreateR2({{10, 11, 12}, - {15, 16, 17}, - {20, 21, 22}, - {25, 26, 27}, - {30, 31, 32}, - {35, 36, 37}, - {40, 41, 42}, - {45, 46, 47}}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, + /*new_sizes=*/{8, 3}); + auto expected_literal = LiteralUtil::CreateR2({{10, 11, 12}, + {15, 16, 17}, + {20, 21, 22}, + {25, 26, 27}, + {30, 31, 32}, + {35, 36, 37}, + {40, 41, 42}, + {45, 46, 47}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests()); + auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - 
builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, - /*new_sizes=*/{24}); - auto expected_literal = Literal::CreateR1( + Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, + /*new_sizes=*/{24}); + auto expected_literal = LiteralUtil::CreateR1( {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42, 15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, @@ -470,33 +439,33 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) { XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests()); + auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, - /*new_sizes=*/{8, 3}); - auto expected_literal = Literal::CreateR2({{10, 20, 30}, - {40, 11, 21}, - {31, 41, 12}, - {22, 32, 42}, - {15, 25, 35}, - {45, 16, 26}, - {36, 46, 17}, - {27, 37, 47}}); + Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, + /*new_sizes=*/{8, 3}); + auto expected_literal = LiteralUtil::CreateR2({{10, 20, 30}, + {40, 11, 21}, + {31, 41, 12}, + {22, 32, 42}, + {15, 25, 35}, + {45, 16, 26}, + {36, 46, 17}, + {27, 37, 47}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) { XlaBuilder builder(TestName()); - auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests()); + auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, - /*new_sizes=*/{2, 6, 2}); - auto expected_literal = Literal::CreateR3( + Reshape(/*operand=*/parameter, 
/*dimensions=*/{1, 2, 0}, + /*new_sizes=*/{2, 6, 2}); + auto expected_literal = LiteralUtil::CreateR3( {{{10, 20}, {30, 40}, {11, 21}, {31, 41}, {12, 22}, {32, 42}}, {{15, 25}, {35, 45}, {16, 26}, {36, 46}, {17, 27}, {37, 47}}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, @@ -523,12 +492,12 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) { Array4D t2x2x2x3(2, 2, 2, 3); auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3); t2x2x2x3.FillWithYX(*filler2x3); - auto input_literal = Literal::CreateFromArray(t2x2x2x3); + auto input_literal = LiteralUtil::CreateFromArray(t2x2x2x3); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3}); - auto expected_literal = Literal::CreateR2( + Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3}); + auto expected_literal = LiteralUtil::CreateR2( {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}}); @@ -548,15 +517,15 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) { t(1, 0, 0, 1) = 5; t(1, 0, 1, 0) = 6; t(1, 0, 1, 1) = 7; - auto input_literal = Literal::CreateFromArray(t); + auto input_literal = LiteralUtil::CreateFromArray(t); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, - /*new_sizes=*/{2, 4}); + Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, + /*new_sizes=*/{2, 4}); auto expected_literal = - Literal::CreateR2({{0, 1, 2, 3}, {4, 5, 6, 7}}); + LiteralUtil::CreateR2({{0, 1, 2, 3}, {4, 5, 6, 7}}); ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()}, zero_error_spec_); } @@ -575,9 +544,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) { XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, input_literal, 
"input", &b, ¶meter); - b.Reshape(parameter, dimensions, {}); + Reshape(parameter, dimensions, {}); - auto expected_literal = Literal::CreateR0(83.0f); + auto expected_literal = LiteralUtil::CreateR0(83.0f); ComputeAndCompareLiteral(&b, *expected_literal, {input.get()}, zero_error_spec_); } @@ -585,11 +554,11 @@ XLA_TEST_P(ReshapeTest, ToScalar) { XLA_TEST_P(ReshapeTest, BadDimensions) { XlaBuilder b(TestName()); - auto input_literal = Literal::CreateR1({1.0f}); + auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b, ¶meter); - b.Reshape(parameter, {}, {}); + Reshape(parameter, {}, {}); EXPECT_THAT( ExecuteToString(&b, {}), ::testing::HasSubstr("not a permutation of the operand dimensions")); @@ -597,11 +566,11 @@ XLA_TEST_P(ReshapeTest, BadDimensions) { XLA_TEST_P(ReshapeTest, BadNewSizes) { XlaBuilder b(TestName()); - auto input_literal = Literal::CreateR1({1.0f, 2.0f}); + auto input_literal = LiteralUtil::CreateR1({1.0f, 2.0f}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b, ¶meter); - b.Reshape(parameter, {1}, {}); + Reshape(parameter, {1}, {}); EXPECT_THAT(ExecuteToString(&b, {}), ::testing::HasSubstr("mismatched element counts")); } @@ -609,7 +578,8 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) { XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { XlaBuilder builder(TestName()); // clang-format off - auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D{ + auto input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( + Array4D{ { { {0, 1}, @@ -637,7 +607,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8}); + Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8}); Array2D expected_array({ {0, 1, 
2, 3, 100, 101, 102, 103}, @@ -654,16 +624,16 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { ->ExecuteAndTransfer(computation, {input.get()}, &execution_options) .ConsumeValueOrDie(); std::unique_ptr expected = - Literal::CreateR2FromArray2D(expected_array); + LiteralUtil::CreateR2FromArray2D(expected_array); if (use_bfloat16()) { - expected = Literal::ConvertF32ToBF16(*expected); + expected = LiteralUtil::ConvertF32ToBF16(*expected); } EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *actual)); } XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { XlaBuilder builder(TestName()); - std::unique_ptr input_literal = Literal::CreateR2({ + std::unique_ptr input_literal = LiteralUtil::CreateR2({ {0, 1, 2, 3, 4, 5, 6, 7}, {100, 101, 102, 103, 104, 105, 106, 107}, {200, 201, 202, 203, 204, 205, 206, 207}, @@ -671,10 +641,10 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4}); + Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off - auto expected_literal = Literal::CreateR4({ + auto expected_literal = LiteralUtil::CreateR4({ {{{0, 1, 2, 3}}, {{4, 5, 6, 7}}}, {{{100, 101, 102, 103}}, @@ -690,7 +660,7 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { // Tests R2->R4 reshape with the reshape dimensions {1, 0}. 
XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) { XlaBuilder builder(TestName()); - std::unique_ptr input_literal = Literal::CreateR2({ + std::unique_ptr input_literal = LiteralUtil::CreateR2({ {0, 1, 2, 3, 4, 5, 6, 7}, {100, 101, 102, 103, 104, 105, 106, 107}, {200, 201, 202, 203, 204, 205, 206, 207}, @@ -698,10 +668,10 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) { XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4}); + Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off - auto expected_literal = Literal::CreateR4({ + auto expected_literal = LiteralUtil::CreateR4({ {{{0, 100, 200, 1}}, {{101, 201, 2, 102}}}, {{{202, 3, 103, 203}}, @@ -719,19 +689,18 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) { std::mt19937 rng; std::uniform_real_distribution distribution; Array4D input(2, 1, 1, 1); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1}); + Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1}); std::unique_ptr expected = - Literal::ReshapeSlice({2, 1}, {1, 0}, *input_literal); + LiteralUtil::ReshapeSlice({2, 1}, {1, 0}, *input_literal); ComputeAndCompareLiteral(&builder, *expected, {input_data.get()}, zero_error_spec_); } @@ -741,19 +710,18 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) { 
std::mt19937 rng; std::uniform_real_distribution distribution; Array4D input(2, 1, 4, 1); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2}); + Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2}); std::unique_ptr expected = - Literal::ReshapeSlice({4, 2}, {1, 0}, *input_literal); + LiteralUtil::ReshapeSlice({4, 2}, {1, 0}, *input_literal); ComputeAndCompareLiteral(&builder, *expected, {input_data.get()}, zero_error_spec_); } @@ -764,24 +732,23 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) { std::mt19937 rng; std::uniform_real_distribution distribution; Array4D input(5, 10, 2, 3); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3}, - /*new_sizes=*/{5, 60}); + Reshape(parameter, /*dimensions=*/{0, 2, 1, 3}, + /*new_sizes=*/{5, 60}); Array2D expected_array(5, 60); - input.Each([&](tensorflow::gtl::ArraySlice indices, float* cell) { + input.Each([&](absl::Span indices, 
float* cell) { expected_array(indices[0], indices[2] * 30 + indices[1] * 3 + indices[3]) = *cell; }); - auto expected = Literal::CreateR2FromArray2D(expected_array); + auto expected = LiteralUtil::CreateR2FromArray2D(expected_array); ComputeAndCompareLiteral(&builder, *expected, {input_data.get()}, zero_error_spec_); } @@ -792,16 +759,16 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { std::uniform_real_distribution distribution; Array4D input_array(2, 3, 5, 7); input_array.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, + [&rng, &distribution](absl::Span /* indices */, float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({1, 2, 3, 0})); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2}, - /*new_sizes=*/{7, 2, 3, 5}); + Reshape(parameter, /*dimensions=*/{3, 0, 1, 2}, + /*new_sizes=*/{7, 2, 3, 5}); XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecutionOptions execution_options = execution_options_; @@ -817,7 +784,7 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { // Since the reshape is a no-op, verify that it does not change the underlying // data. 
if (use_bfloat16()) { - auto expected = Literal::ConvertF32ToBF16(*input_literal); + auto expected = LiteralUtil::ConvertF32ToBF16(*input_literal); EXPECT_EQ(expected->data(), output_literal->data()); } else { EXPECT_EQ(input_literal->data(), output_literal->data()); @@ -826,21 +793,21 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) { XlaBuilder builder(TestName()); - auto literal_1x2x3x4 = Literal::CreateR4( + auto literal_1x2x3x4 = LiteralUtil::CreateR4( {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}}); XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, - /*new_sizes=*/{1, 2, 3, 4}); + Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, + /*new_sizes=*/{1, 2, 3, 4}); ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {input.get()}); } XLA_TEST_P(ReshapeTest, R4ToR4Reshape) { - auto literal_1x2x3x4 = Literal::CreateR4( + auto literal_1x2x3x4 = LiteralUtil::CreateR4( {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}, {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}}); @@ -848,11 +815,11 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) { XlaOp parameter; auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0}, - /*new_sizes=*/{2, 4, 3, 1}); + Reshape(parameter, /*dimensions=*/{1, 3, 2, 0}, + /*new_sizes=*/{2, 4, 3, 1}); // clang-format off - auto expected_2x4x3x1 = Literal::CreateR4( + auto expected_2x4x3x1 = LiteralUtil::CreateR4( {{{{1}, {5}, {9}}, {{2}, {6}, {10}}, {{3}, {7}, {11}}, @@ -872,21 +839,20 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) { std::vector bounds = {2, 2, 2, 2}; std::vector new_bounds = {bounds[0], bounds[1], bounds[3], bounds[2]}; Array4D input(bounds[0], bounds[1], bounds[2], bounds[3]); - input.Each( - [&rng, 
&distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, - /*new_sizes=*/new_bounds); + Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, + /*new_sizes=*/new_bounds); std::unique_ptr expected = - Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) + LiteralUtil::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -901,21 +867,20 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) { std::vector bounds = {1, 1, 250, 300}; std::vector new_bounds = {bounds[0], bounds[1], bounds[3], bounds[2]}; Array4D input(bounds[0], bounds[1], bounds[2], bounds[3]); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, - /*new_sizes=*/new_bounds); + Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, + /*new_sizes=*/new_bounds); std::unique_ptr expected = - 
Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) + LiteralUtil::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -930,21 +895,20 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) { std::vector bounds = {5, 5, 1, 10}; std::vector new_bounds = {bounds[0], bounds[1], bounds[3], bounds[2]}; Array4D input(bounds[0], bounds[1], bounds[2], bounds[3]); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, - /*new_sizes=*/new_bounds); + Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, + /*new_sizes=*/new_bounds); std::unique_ptr expected = - Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) + LiteralUtil::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -960,21 +924,20 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) { std::vector bounds = {5, 5, 10, 1}; std::vector new_bounds = {bounds[0], bounds[1], bounds[3], bounds[2]}; Array4D input(bounds[0], bounds[1], bounds[2], bounds[3]); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* 
cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, - /*new_sizes=*/new_bounds); + Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, + /*new_sizes=*/new_bounds); std::unique_ptr expected = - Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) + LiteralUtil::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -989,21 +952,20 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) { std::vector bounds = {3, 3, 1, 3}; std::vector new_bounds = {bounds[1], bounds[0], bounds[2], bounds[3]}; Array4D input(bounds[0], bounds[1], bounds[2], bounds[3]); - input.Each( - [&rng, &distribution](tensorflow::gtl::ArraySlice /* indices */, - float* cell) { *cell = distribution(rng); }); + input.Each([&rng, &distribution](absl::Span /* indices */, + float* cell) { *cell = distribution(rng); }); std::unique_ptr input_literal = - Literal::CreateR4FromArray4DWithLayout( + LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({0, 1, 2, 3})); XlaBuilder builder(TestName()); XlaOp parameter; auto input_data = CreateParameterAndTransferLiteral( 0, *input_literal, "input", &builder, ¶meter); - builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3}, - /*new_sizes=*/new_bounds); + Reshape(parameter, /*dimensions=*/{1, 0, 2, 3}, + /*new_sizes=*/new_bounds); std::unique_ptr expected = - Literal::ReshapeSlice(new_bounds, {1, 0, 2, 3}, *input_literal) + LiteralUtil::ReshapeSlice(new_bounds, {1, 0, 2, 3}, *input_literal) ->Relayout(input_literal->shape().layout()); // Specify the 
requested output shape explicitly to ensure that this reshape diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc index e7bd142dc9ddefbd8bebfb77d72218d662645c31..74ded82ddfae10c21fe98ec2e250b4eaecf95222 100644 --- a/tensorflow/compiler/xla/tests/reverse_test.cc +++ b/tensorflow/compiler/xla/tests/reverse_test.cc @@ -15,10 +15,12 @@ limitations under the License. #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -37,16 +39,14 @@ static std::array use_bfloat16_params{false}; #endif struct ReverseSpec { - tensorflow::gtl::ArraySlice input_dims; - tensorflow::gtl::ArraySlice reversal; + absl::Span input_dims; + absl::Span reversal; bool use_bfloat16; string ToTestCaseName() const { - return tensorflow::strings::Printf( - "reverse_%s_in_dims_%s_%s", - tensorflow::str_util::Join(input_dims, "x").c_str(), - tensorflow::str_util::Join(reversal, "x").c_str(), - use_bfloat16 ? "bf16" : "f32"); + return absl::StrFormat( + "reverse_%s_in_dims_%s_%s", absl::StrJoin(input_dims, "x"), + absl::StrJoin(reversal, "x"), use_bfloat16 ? 
"bf16" : "f32"); } }; @@ -82,26 +82,25 @@ TEST_P(FloatReverseTest, Reverses) { std::vector input_vector( ShapeUtil::ElementsIn(ShapeUtil::MakeShape(F32, spec.input_dims))); std::iota(input_vector.begin(), input_vector.end(), 0.0); - auto r1_literal = Literal::CreateR1(input_vector); + auto r1_literal = LiteralUtil::CreateR1(input_vector); auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie(); XlaBuilder builder(TestName()); auto a = AddParam(*input_literal, &builder); - builder.Rev(a, spec.reversal); + Rev(a, spec.reversal); std::unique_ptr expected = input_literal->CloneToUnique(); std::vector output_indices(spec.input_dims.size()); - expected->EachCell( - [&](tensorflow::gtl::ArraySlice indices, float) { - for (int64 i = 0; i < indices.size(); ++i) { - output_indices[i] = indices[i]; - } - float value = input_literal->Get(indices); - for (int64 dim : spec.reversal) { - output_indices[dim] = (spec.input_dims[dim] - 1) - indices[dim]; - } - expected->Set(output_indices, value); - }); + expected->EachCell([&](absl::Span indices, float) { + for (int64 i = 0; i < indices.size(); ++i) { + output_indices[i] = indices[i]; + } + float value = input_literal->Get(indices); + for (int64 dim : spec.reversal) { + output_indices[dim] = (spec.input_dims[dim] - 1) - indices[dim]; + } + expected->Set(output_indices, value); + }); ComputeAndCompareLiteral(&builder, *expected, {}); } @@ -127,7 +126,7 @@ XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) { }}); // clang-format on - b.Rev(b.ConstantR4FromArray4D(input), {0, 3}); + Rev(ConstantR4FromArray4D(&b, input), {0, 3}); // clang-format off Array4D expected({{ @@ -163,7 +162,7 @@ TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) { }); // clang-format on - b.Rev(b.ConstantR4FromArray4D(input), {0, 1}); + Rev(ConstantR4FromArray4D(&b, input), {0, 1}); // clang-format off Array4D expected({ diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc 
b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc index 7cfca781acda15879075f4386c2096e537877aac..e692b8c5d5e661587bac16a2992e35f92c4c0bd9 100644 --- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc +++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc @@ -15,10 +15,11 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/packed_literal_reader.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -27,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/casts.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -47,8 +47,7 @@ class RoundTripPackedLiteralTest : public ClientLibraryTestBase { TEST_F(RoundTripPackedLiteralTest, RoundTripsR1F32Length2) { string data(sizeof(float) * 2, 0); - tensorflow::gtl::MutableArraySlice floats( - tensorflow::bit_cast(data.data()), 2); + absl::Span floats(tensorflow::bit_cast(data.data()), 2); floats[0] = 42.0; floats[1] = 24.0; @@ -70,8 +69,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR1F32Length2) { TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) { string data(sizeof(float) * 4, 0); - tensorflow::gtl::MutableArraySlice floats( - tensorflow::bit_cast(data.data()), 4); + absl::Span floats(tensorflow::bit_cast(data.data()), 4); // With x as the minor dimension, these will become: floats[0] = 42.0; // y=0,x=0 floats[1] = 24.0; // y=0,x=1 @@ -105,8 +103,7 
@@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) { TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) { string data(sizeof(float) * 4, 0); - tensorflow::gtl::MutableArraySlice floats( - tensorflow::bit_cast(data.data()), 4); + absl::Span floats(tensorflow::bit_cast(data.data()), 4); // With y as the minor dimension, these will become: floats[0] = 42.0; // y=0,x=0 floats[1] = 24.0; // y=1,x=0 diff --git a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc index f334a8c1318a59bbfdd27dd1a63ed162600089ce..a8193c2eac05ba4f0df339909f3e82a28ac35253 100644 --- a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc +++ b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -46,61 +46,62 @@ class RoundTripTransferTest : public ClientLibraryTestBase { }; TEST_F(RoundTripTransferTest, R0S32) { - RoundTripTest(*Literal::CreateR0(42)); + RoundTripTest(*LiteralUtil::CreateR0(42)); } TEST_F(RoundTripTransferTest, R0F32) { - RoundTripTest(*Literal::CreateR0(42.0)); + RoundTripTest(*LiteralUtil::CreateR0(42.0)); } TEST_F(RoundTripTransferTest, R1F32_Len0) { - RoundTripTest(*Literal::CreateR1({})); + RoundTripTest(*LiteralUtil::CreateR1({})); } TEST_F(RoundTripTransferTest, R1F32_Len2) { - RoundTripTest(*Literal::CreateR1({42.0, 64.0})); + RoundTripTest(*LiteralUtil::CreateR1({42.0, 64.0})); } TEST_F(RoundTripTransferTest, R1F32_Len256) { std::vector values(256); std::iota(values.begin(), 
values.end(), 1.0); - RoundTripTest(*Literal::CreateR1(values)); + RoundTripTest(*LiteralUtil::CreateR1(values)); } TEST_F(RoundTripTransferTest, R1F32_Len1024) { std::vector values(1024); std::iota(values.begin(), values.end(), 1.0); - RoundTripTest(*Literal::CreateR1(values)); + RoundTripTest(*LiteralUtil::CreateR1(values)); } TEST_F(RoundTripTransferTest, R1F32_Len1025) { std::vector values(1025); std::iota(values.begin(), values.end(), 1.0); - RoundTripTest(*Literal::CreateR1(values)); + RoundTripTest(*LiteralUtil::CreateR1(values)); } TEST_F(RoundTripTransferTest, R1F32_Len4096) { std::vector values(4096); std::iota(values.begin(), values.end(), 1.0); - RoundTripTest(*Literal::CreateR1(values)); + RoundTripTest(*LiteralUtil::CreateR1(values)); } TEST_F(RoundTripTransferTest, R2F32_Len10x0) { - RoundTripTest(*Literal::CreateR2FromArray2D(Array2D(10, 0))); + RoundTripTest( + *LiteralUtil::CreateR2FromArray2D(Array2D(10, 0))); } TEST_F(RoundTripTransferTest, R2F32_Len2x2) { - RoundTripTest(*Literal::CreateR2({{42.0, 64.0}, {77.0, 88.0}})); + RoundTripTest(*LiteralUtil::CreateR2({{42.0, 64.0}, {77.0, 88.0}})); } TEST_F(RoundTripTransferTest, R3F32) { RoundTripTest( - *Literal::CreateR3({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}}, - {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}})); + *LiteralUtil::CreateR3({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}}, + {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}})); } TEST_F(RoundTripTransferTest, R4F32) { - RoundTripTest(*Literal::CreateR4({{ + RoundTripTest(*LiteralUtil::CreateR4({{ {{10, 11, 12, 13}, {14, 15, 16, 17}}, {{18, 19, 20, 21}, {22, 23, 24, 25}}, {{26, 27, 28, 29}, {30, 31, 32, 33}}, @@ -108,33 +109,36 @@ TEST_F(RoundTripTransferTest, R4F32) { } TEST_F(RoundTripTransferTest, EmptyTuple) { - RoundTripTest(*Literal::MakeTuple({})); + RoundTripTest(*LiteralUtil::MakeTuple({})); } TEST_F(RoundTripTransferTest, TupleOfR1F32) { - RoundTripTest(*Literal::MakeTuple({Literal::CreateR1({1, 2}).get(), - Literal::CreateR1({3, 4}).get()})); + RoundTripTest( + 
*LiteralUtil::MakeTuple({LiteralUtil::CreateR1({1, 2}).get(), + LiteralUtil::CreateR1({3, 4}).get()})); } TEST_F(RoundTripTransferTest, TupleOfR1F32_Len0_Len2) { - RoundTripTest(*Literal::MakeTuple({Literal::CreateR1({}).get(), - Literal::CreateR1({3, 4}).get()})); + RoundTripTest( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR1({}).get(), + LiteralUtil::CreateR1({3, 4}).get()})); } TEST_F(RoundTripTransferTest, TupleOfR0F32AndR1S32) { - RoundTripTest(*Literal::MakeTuple({Literal::CreateR0(1.0).get(), - Literal::CreateR1({2, 3}).get()})); + RoundTripTest( + *LiteralUtil::MakeTuple({LiteralUtil::CreateR0(1.0).get(), + LiteralUtil::CreateR1({2, 3}).get()})); } // Below two tests are added to identify the cost of large data transfers. TEST_F(RoundTripTransferTest, R2F32_Large) { - RoundTripTest(*Literal::CreateR2F32Linspace(-1.0f, 1.0f, 512, 512)); + RoundTripTest(*LiteralUtil::CreateR2F32Linspace(-1.0f, 1.0f, 512, 512)); } TEST_F(RoundTripTransferTest, R4F32_Large) { Array4D array4d(2, 2, 256, 256); array4d.FillWithMultiples(1.0f); - RoundTripTest(*Literal::CreateR4FromArray4D(array4d)); + RoundTripTest(*LiteralUtil::CreateR4FromArray4D(array4d)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/sample_text_test.cc b/tensorflow/compiler/xla/tests/sample_text_test.cc index b4f2b74e3dc9e80f50454b28eb6f2502cef3e681..2b03a0b0b22eb0ae4777417f6640c5f90171d808 100644 --- a/tensorflow/compiler/xla/tests/sample_text_test.cc +++ b/tensorflow/compiler/xla/tests/sample_text_test.cc @@ -19,18 +19,18 @@ limitations under the License. 
#include #include +#include "absl/types/optional.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace { -using tensorflow::gtl::nullopt; +using absl::nullopt; class SampleTextTest : public HloTestBase {}; diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc index 308d3fc78a51e63c0e3db8c0cda18caf11f665bd..07460a7e01a5497aa6411ddb6866dddfc70f2068 100644 --- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc +++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc @@ -17,10 +17,13 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -29,8 +32,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -44,74 +45,73 @@ class ScalarComputationsTest : public ClientLibraryTestBase { protected: // A template for building and running a binary comparison test. template - void TestCompare( - NativeT lhs, NativeT rhs, bool expected, - XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&, - tensorflow::gtl::ArraySlice)) { + void TestCompare(NativeT lhs, NativeT rhs, bool expected, + const std::function)>& op) { XlaBuilder builder(TestName()); - XlaOp lhs_op = builder.ConstantR0(lhs); - XlaOp rhs_op = builder.ConstantR0(rhs); - XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); + XlaOp lhs_op = ConstantR0(&builder, lhs); + XlaOp rhs_op = ConstantR0(&builder, rhs); + op(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } template void TestMinMax(NativeT lhs, NativeT rhs, NativeT expected, - XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&, - tensorflow::gtl::ArraySlice)) { + const std::function)>& op) { XlaBuilder builder(TestName()); - XlaOp lhs_op = builder.ConstantR0(lhs); - XlaOp rhs_op = builder.ConstantR0(rhs); - XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); + XlaOp lhs_op = ConstantR0(&builder, lhs); + XlaOp rhs_op = ConstantR0(&builder, rhs); + op(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } }; XLA_TEST_F(ScalarComputationsTest, ReturnScalarF32) { XlaBuilder builder(TestName()); - builder.ConstantR0(2.1f); + ConstantR0(&builder, 2.1f); ComputeAndCompareR0(&builder, 2.1f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, NegateScalarF32) { XlaBuilder builder(TestName()); - builder.Neg(builder.ConstantR0(2.1f)); + Neg(ConstantR0(&builder, 2.1f)); 
ComputeAndCompareR0(&builder, -2.1f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, NegateScalarS32) { XlaBuilder builder(TestName()); - builder.Neg(builder.ConstantR0(2)); + Neg(ConstantR0(&builder, 2)); ComputeAndCompareR0(&builder, -2, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF32) { XlaBuilder builder(TestName()); - builder.Add(builder.ConstantR0(2.1f), builder.ConstantR0(5.5f)); + Add(ConstantR0(&builder, 2.1f), ConstantR0(&builder, 5.5f)); ComputeAndCompareR0(&builder, 7.6f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS32) { XlaBuilder builder(TestName()); - builder.Add(builder.ConstantR0(2), builder.ConstantR0(5)); + Add(ConstantR0(&builder, 2), ConstantR0(&builder, 5)); ComputeAndCompareR0(&builder, 7, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU32) { XlaBuilder builder(TestName()); - builder.Add(builder.ConstantR0(35), builder.ConstantR0(57)); + Add(ConstantR0(&builder, 35), ConstantR0(&builder, 57)); ComputeAndCompareR0(&builder, 92, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU8) { XlaBuilder builder(TestName()); - builder.Add(builder.ConstantR0(35), builder.ConstantR0(57)); + Add(ConstantR0(&builder, 35), ConstantR0(&builder, 57)); ComputeAndCompareR0(&builder, 92, {}); } @@ -120,7 +120,7 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU64) { XlaBuilder builder(TestName()); const uint64 a = static_cast(1) << 63; const uint64 b = a + 1; - builder.Add(builder.ConstantR0(a), builder.ConstantR0(b)); + Add(ConstantR0(&builder, a), ConstantR0(&builder, b)); ComputeAndCompareR0(&builder, a + b, {}); } @@ -129,40 +129,39 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS64) { XlaBuilder builder(TestName()); const int64 a = static_cast(1) << 62; const int64 b = a - 1; - builder.Add(builder.ConstantR0(a), builder.ConstantR0(b)); + Add(ConstantR0(&builder, a), ConstantR0(&builder, b)); ComputeAndCompareR0(&builder, a + b, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) { 
XlaBuilder builder(TestName()); - builder.Add(builder.ConstantR0(0.25), - builder.ConstantR0(3.5)); + Add(ConstantR0(&builder, 0.25), ConstantR0(&builder, 3.5)); ComputeAndCompareR0(&builder, 3.75, {}); } XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsF32) { XlaBuilder builder(TestName()); - builder.Sub(builder.ConstantR0(2.1f), builder.ConstantR0(5.5f)); + Sub(ConstantR0(&builder, 2.1f), ConstantR0(&builder, 5.5f)); ComputeAndCompareR0(&builder, -3.4f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsS32) { XlaBuilder builder(TestName()); - builder.Sub(builder.ConstantR0(2), builder.ConstantR0(5)); + Sub(ConstantR0(&builder, 2), ConstantR0(&builder, 5)); ComputeAndCompareR0(&builder, -3, {}); } XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) { XlaBuilder builder(TestName()); - auto a = builder.Parameter(0, ShapeUtil::MakeShape(S64, {}), "a"); - builder.ConvertElementType(a, F32); + auto a = Parameter(&builder, 0, ShapeUtil::MakeShape(S64, {}), "a"); + ConvertElementType(a, F32); int64 value = 3LL << 35; - std::unique_ptr a_literal = Literal::CreateR0(value); + std::unique_ptr a_literal = LiteralUtil::CreateR0(value); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); ComputeAndCompareR0(&builder, static_cast(value), @@ -171,9 +170,8 @@ XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) { XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32) { XlaBuilder builder(TestName()); - builder.Mul(builder.Mul(builder.ConstantR0(2.1f), - builder.ConstantR0(5.5f)), - builder.ConstantR0(0.5f)); + Mul(Mul(ConstantR0(&builder, 2.1f), ConstantR0(&builder, 5.5f)), + ConstantR0(&builder, 0.5f)); ComputeAndCompareR0(&builder, 5.775f, {}, error_spec_); } @@ -190,7 +188,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsS32) { for (int32 x : data) { for (int32 y : data) { XlaBuilder builder(TestName()); - builder.Mul(builder.ConstantR0(x), builder.ConstantR0(y)); + Mul(ConstantR0(&builder, x), ConstantR0(&builder, y)); 
// Signed integer overflow is undefined behavior in C++. Convert the input // integers to unsigned, perform the multiplication unsigned, and convert @@ -209,7 +207,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) { for (uint32 x : data) { for (uint32 y : data) { XlaBuilder builder(TestName()); - builder.Mul(builder.ConstantR0(x), builder.ConstantR0(y)); + Mul(ConstantR0(&builder, x), ConstantR0(&builder, y)); uint32 expected = x * y; ComputeAndCompareR0(&builder, expected, {}); @@ -219,18 +217,17 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) { XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) { XlaBuilder builder(TestName()); - builder.Mul( - builder.Mul(builder.ConstantR0(2), builder.ConstantR0(5)), - builder.ConstantR0(1)); + Mul(Mul(ConstantR0(&builder, 2), ConstantR0(&builder, 5)), + ConstantR0(&builder, 1)); ComputeAndCompareR0(&builder, 10, {}); } XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { XlaBuilder builder(TestName()); - std::unique_ptr a_literal = Literal::CreateR0(2.1f); - std::unique_ptr b_literal = Literal::CreateR0(5.5f); - std::unique_ptr c_literal = Literal::CreateR0(0.5f); + std::unique_ptr a_literal = LiteralUtil::CreateR0(2.1f); + std::unique_ptr b_literal = LiteralUtil::CreateR0(5.5f); + std::unique_ptr c_literal = LiteralUtil::CreateR0(0.5f); std::unique_ptr a_data = client_->TransferToServer(*a_literal).ConsumeValueOrDie(); @@ -239,10 +236,10 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { std::unique_ptr c_data = client_->TransferToServer(*c_literal).ConsumeValueOrDie(); - XlaOp a = builder.Parameter(0, a_literal->shape(), "a"); - XlaOp b = builder.Parameter(1, b_literal->shape(), "b"); - XlaOp c = builder.Parameter(2, c_literal->shape(), "c"); - builder.Mul(builder.Mul(a, b), c); + XlaOp a = Parameter(&builder, 0, a_literal->shape(), "a"); + XlaOp b = Parameter(&builder, 1, b_literal->shape(), "b"); + XlaOp c = Parameter(&builder, 2, c_literal->shape(), "c"); + Mul(Mul(a, b), c); 
ComputeAndCompareR0(&builder, 5.775f, {a_data.get(), b_data.get(), c_data.get()}, @@ -251,14 +248,14 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsF32) { XlaBuilder builder(TestName()); - builder.Div(builder.ConstantR0(5.0f), builder.ConstantR0(2.5f)); + Div(ConstantR0(&builder, 5.0f), ConstantR0(&builder, 2.5f)); ComputeAndCompareR0(&builder, 2.0f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsF32) { XlaBuilder builder(TestName()); - builder.Rem(builder.ConstantR0(2.5f), builder.ConstantR0(5.0f)); + Rem(ConstantR0(&builder, 2.5f), ConstantR0(&builder, 5.0f)); ComputeAndCompareR0(&builder, 2.5f, {}, error_spec_); } @@ -281,8 +278,8 @@ class DivS32Test : public ClientLibraryTestBase, XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) { DivS32Params p = GetParam(); XlaBuilder builder(TestName()); - builder.Div(builder.ConstantR0(p.dividend), - builder.ConstantR0(p.divisor)); + Div(ConstantR0(&builder, p.dividend), + ConstantR0(&builder, p.divisor)); ComputeAndCompareR0(&builder, p.quotient, {}); } @@ -290,8 +287,8 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) { XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) { DivS32Params p = GetParam(); XlaBuilder builder(TestName()); - builder.Rem(builder.ConstantR0(p.dividend), - builder.ConstantR0(p.divisor)); + Rem(ConstantR0(&builder, p.dividend), + ConstantR0(&builder, p.divisor)); ComputeAndCompareR0(&builder, p.remainder, {}); } @@ -305,7 +302,7 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) { CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); auto divisord = CreateR0Parameter(p.divisor, 1, "divisor", &builder, &divisor); - builder.Div(dividend, divisor); + Div(dividend, divisor); ComputeAndCompareR0(&builder, p.quotient, {dividendd.get(), divisord.get()}); @@ -320,7 +317,7 @@ XLA_TEST_P(DivS32Test, RemainderTwoScalarsNonConstDivisorS32) { CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); auto divisord = 
CreateR0Parameter(p.divisor, 1, "divisor", &builder, &divisor); - builder.Rem(dividend, divisor); + Rem(dividend, divisor); ComputeAndCompareR0(&builder, p.remainder, {dividendd.get(), divisord.get()}); @@ -367,18 +364,18 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) { XlaBuilder builder(TestName()); XlaOp dividend = - builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); + Parameter(&builder, 0, ShapeUtil::MakeShape(U32, {}), "dividend"); XlaOp divisor = - builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); - builder.Div(dividend, divisor); + Parameter(&builder, 1, ShapeUtil::MakeShape(U32, {}), "divisor"); + Div(dividend, divisor); TF_ASSERT_OK_AND_ASSIGN(div_computation, builder.Build()); } for (uint32 divisor : vals) { if (divisor != 0) { for (uint32 dividend : vals) { - auto dividend_literal = Literal::CreateR0(dividend); - auto divisor_literal = Literal::CreateR0(divisor); + auto dividend_literal = LiteralUtil::CreateR0(dividend); + auto divisor_literal = LiteralUtil::CreateR0(divisor); TF_ASSERT_OK_AND_ASSIGN(auto dividend_data, client_->TransferToServer(*dividend_literal)); TF_ASSERT_OK_AND_ASSIGN(auto divisor_data, @@ -389,7 +386,8 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) { {dividend_data.get(), divisor_data.get()}, &execution_options_) .ConsumeValueOrDie(); - auto expected_literal = Literal::CreateR0(dividend / divisor); + auto expected_literal = + LiteralUtil::CreateR0(dividend / divisor); EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } } @@ -408,18 +406,18 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) { XlaBuilder builder(TestName()); XlaOp dividend = - builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); + Parameter(&builder, 0, ShapeUtil::MakeShape(U32, {}), "dividend"); XlaOp divisor = - builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); - builder.Rem(dividend, divisor); + Parameter(&builder, 1, ShapeUtil::MakeShape(U32, {}), "divisor"); + Rem(dividend, divisor); 
TF_ASSERT_OK_AND_ASSIGN(rem_computation, builder.Build()); } for (uint32 divisor : vals) { if (divisor != 0) { for (uint32 dividend : vals) { - auto dividend_literal = Literal::CreateR0(dividend); - auto divisor_literal = Literal::CreateR0(divisor); + auto dividend_literal = LiteralUtil::CreateR0(dividend); + auto divisor_literal = LiteralUtil::CreateR0(divisor); TF_ASSERT_OK_AND_ASSIGN(auto dividend_data, client_->TransferToServer(*dividend_literal)); TF_ASSERT_OK_AND_ASSIGN(auto divisor_data, @@ -430,7 +428,8 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) { {dividend_data.get(), divisor_data.get()}, &execution_options_) .ConsumeValueOrDie(); - auto expected_literal = Literal::CreateR0(dividend % divisor); + auto expected_literal = + LiteralUtil::CreateR0(dividend % divisor); EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } } @@ -439,10 +438,10 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) { XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) { XlaBuilder builder(TestName()); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); - builder.Rem(x, builder.ConstantR0(80000)); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(S32, {}), "x"); + Rem(x, ConstantR0(&builder, 80000)); - std::unique_ptr literal = Literal::CreateR0(87919); + std::unique_ptr literal = LiteralUtil::CreateR0(87919); TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(*literal)); ComputeAndCompareR0(&builder, 7919, {input_data.get()}); } @@ -451,15 +450,15 @@ XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) { XlaBuilder builder(TestName()); // This verifies 0xFFFFFFFE / 2 = 0x7FFFFFFF. If XLA incorrectly treated U32 // as S32, it would output -2 / 2 = -1 (0xFFFFFFFF). 
- builder.Div(builder.ConstantR0(0xFFFFFFFE), - builder.ConstantR0(2)); + Div(ConstantR0(&builder, 0xFFFFFFFE), + ConstantR0(&builder, 2)); ComputeAndCompareR0(&builder, 0x7FFFFFFF, {}); } XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) { XlaBuilder builder(TestName()); - builder.Rem(builder.ConstantR0(11), builder.ConstantR0(3)); + Rem(ConstantR0(&builder, 11), ConstantR0(&builder, 3)); ComputeAndCompareR0(&builder, 2, {}); } @@ -468,7 +467,7 @@ XLA_TEST_F(ScalarComputationsTest, AndBool) { for (bool x : {false, true}) { for (bool y : {false, true}) { XlaBuilder builder(TestName()); - builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); + And(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x && y, {}); } @@ -479,7 +478,7 @@ XLA_TEST_F(ScalarComputationsTest, AndS32) { for (int32 x : {0, 8}) { for (int32 y : {1, -16}) { XlaBuilder builder(TestName()); - builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); + And(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x & y, {}); } @@ -490,7 +489,7 @@ XLA_TEST_F(ScalarComputationsTest, AndU32) { for (uint32 x : {0, 8}) { for (uint32 y : {1, 16}) { XlaBuilder builder(TestName()); - builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); + And(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x & y, {}); } @@ -501,7 +500,7 @@ XLA_TEST_F(ScalarComputationsTest, OrBool) { for (bool x : {false, true}) { for (bool y : {false, true}) { XlaBuilder builder(TestName()); - builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); + Or(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x || y, {}); } @@ -512,7 +511,7 @@ XLA_TEST_F(ScalarComputationsTest, OrS32) { for (int32 x : {0, 8}) { for (int32 y : {1, -16}) { XlaBuilder builder(TestName()); - builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); + Or(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x | 
y, {}); } @@ -523,7 +522,7 @@ XLA_TEST_F(ScalarComputationsTest, OrU32) { for (uint32 x : {0, 8}) { for (uint32 y : {1, 16}) { XlaBuilder builder(TestName()); - builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); + Or(ConstantR0(&builder, x), ConstantR0(&builder, y)); ComputeAndCompareR0(&builder, x | y, {}); } @@ -533,7 +532,7 @@ XLA_TEST_F(ScalarComputationsTest, OrU32) { XLA_TEST_F(ScalarComputationsTest, NotBool) { for (bool x : {false, true}) { XlaBuilder builder(TestName()); - builder.Not(builder.ConstantR0(x)); + Not(ConstantR0(&builder, x)); ComputeAndCompareR0(&builder, !x, {}); } @@ -542,7 +541,7 @@ XLA_TEST_F(ScalarComputationsTest, NotBool) { XLA_TEST_F(ScalarComputationsTest, NotS32) { for (int32 x : {-1, 0, 1}) { XlaBuilder builder(TestName()); - builder.Not(builder.ConstantR0(x)); + Not(ConstantR0(&builder, x)); ComputeAndCompareR0(&builder, ~x, {}); } @@ -551,7 +550,7 @@ XLA_TEST_F(ScalarComputationsTest, NotS32) { XLA_TEST_F(ScalarComputationsTest, NotU32) { for (uint32 x : {0, 1, 2}) { XlaBuilder builder(TestName()); - builder.Not(builder.ConstantR0(x)); + Not(ConstantR0(&builder, x)); ComputeAndCompareR0(&builder, ~x, {}); } @@ -559,18 +558,18 @@ XLA_TEST_F(ScalarComputationsTest, NotU32) { XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) { XlaBuilder builder(TestName()); - builder.Select(builder.ConstantR0(true), // The predicate. - builder.ConstantR0(123.0f), // The value on true. - builder.ConstantR0(42.0f)); // The value on false. + Select(ConstantR0(&builder, true), // The predicate. + ConstantR0(&builder, 123.0f), // The value on true. + ConstantR0(&builder, 42.0f)); // The value on false. ComputeAndCompareR0(&builder, 123.0f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) { XlaBuilder builder(TestName()); - builder.Select(builder.ConstantR0(false), // The predicate. - builder.ConstantR0(123.0f), // The value on true. - builder.ConstantR0(42.0f)); // The value on false. 
+ Select(ConstantR0(&builder, false), // The predicate. + ConstantR0(&builder, 123.0f), // The value on true. + ConstantR0(&builder, 42.0f)); // The value on false. ComputeAndCompareR0(&builder, 42.0f, {}, error_spec_); } @@ -579,313 +578,311 @@ XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) { // templatized comparison tests. XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) { XlaBuilder builder(TestName()); - builder.Gt(builder.ConstantR0(2.0f), builder.ConstantR0(1.0f)); + Gt(ConstantR0(&builder, 2.0f), ConstantR0(&builder, 1.0f)); ComputeAndCompareR0(&builder, true, {}); } // S32 comparisons. XLA_TEST_F(ScalarComputationsTest, CompareEqS32Greater) { - TestCompare(2, 1, false, &XlaBuilder::Eq); + TestCompare(2, 1, false, &Eq); } XLA_TEST_F(ScalarComputationsTest, CompareEqS32Equal) { - TestCompare(3, 3, true, &XlaBuilder::Eq); + TestCompare(3, 3, true, &Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeS32) { - TestCompare(2, 1, true, &XlaBuilder::Ne); + TestCompare(2, 1, true, &Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeS32) { - TestCompare(2, 1, true, &XlaBuilder::Ge); + TestCompare(2, 1, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtS32) { - TestCompare(1, 5, false, &XlaBuilder::Gt); + TestCompare(1, 5, false, &Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeS32) { - TestCompare(2, 1, false, &XlaBuilder::Le); + TestCompare(2, 1, false, &Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtS32) { - TestCompare(9, 7, false, &XlaBuilder::Lt); + TestCompare(9, 7, false, &Lt); TestCompare(std::numeric_limits::min(), - std::numeric_limits::max(), true, &XlaBuilder::Lt); + std::numeric_limits::max(), true, &Lt); } // U32 comparisons. 
XLA_TEST_F(ScalarComputationsTest, CompareEqU32False) { - TestCompare(2, 1, false, &XlaBuilder::Eq); + TestCompare(2, 1, false, &Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeU32) { - TestCompare(2, 1, true, &XlaBuilder::Ne); + TestCompare(2, 1, true, &Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeU32Greater) { - TestCompare(2, 1, true, &XlaBuilder::Ge); + TestCompare(2, 1, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeU32Equal) { - TestCompare(3, 3, true, &XlaBuilder::Ge); + TestCompare(3, 3, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtU32) { - TestCompare(1, 5, false, &XlaBuilder::Gt); - TestCompare(5, 5, false, &XlaBuilder::Gt); - TestCompare(5, 1, true, &XlaBuilder::Gt); + TestCompare(1, 5, false, &Gt); + TestCompare(5, 5, false, &Gt); + TestCompare(5, 1, true, &Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeU32) { - TestCompare(2, 1, false, &XlaBuilder::Le); + TestCompare(2, 1, false, &Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtU32) { - TestCompare(9, 7, false, &XlaBuilder::Lt); - TestCompare(0, std::numeric_limits::max(), true, - &XlaBuilder::Lt); + TestCompare(9, 7, false, &Lt); + TestCompare(0, std::numeric_limits::max(), true, &Lt); } // F32 comparisons. 
XLA_TEST_F(ScalarComputationsTest, CompareEqF32False) { - TestCompare(2.0, 1.3, false, &XlaBuilder::Eq); + TestCompare(2.0, 1.3, false, &Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeF32) { - TestCompare(2.0, 1.3, true, &XlaBuilder::Ne); + TestCompare(2.0, 1.3, true, &Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32Greater) { - TestCompare(2.0, 1.9, true, &XlaBuilder::Ge); + TestCompare(2.0, 1.9, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32Equal) { - TestCompare(3.5, 3.5, true, &XlaBuilder::Ge); + TestCompare(3.5, 3.5, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtF32) { - TestCompare(1.0, 5.2, false, &XlaBuilder::Gt); + TestCompare(1.0, 5.2, false, &Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeF32) { - TestCompare(2.0, 1.2, false, &XlaBuilder::Le); + TestCompare(2.0, 1.2, false, &Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32) { - TestCompare(9.0, 7.2, false, &XlaBuilder::Lt); + TestCompare(9.0, 7.2, false, &Lt); } // F32 comparisons with exceptional values. The test names encode the // left/right operands at the end, and use Minf and Mzero for -inf and -0.0. XLA_TEST_F(ScalarComputationsTest, CompareLtF32MinfMzero) { - TestCompare(-INFINITY, -0.0, true, &XlaBuilder::Lt); + TestCompare(-INFINITY, -0.0, true, &Lt); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32MzeroZero) { // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754. - TestCompare(-0.0, 0.0, false, &XlaBuilder::Lt); + TestCompare(-0.0, 0.0, false, &Lt); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32ZeroInf) { - TestCompare(0.0, INFINITY, true, &XlaBuilder::Lt); + TestCompare(0.0, INFINITY, true, &Lt); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32MinfMzero) { - TestCompare(-INFINITY, -0.0, false, &XlaBuilder::Ge); + TestCompare(-INFINITY, -0.0, false, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32MzeroZero) { // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754. 
- TestCompare(-0.0, 0.0, true, &XlaBuilder::Ge); + TestCompare(-0.0, 0.0, true, &Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32ZeroInf) { - TestCompare(0.0, INFINITY, false, &XlaBuilder::Ge); + TestCompare(0.0, INFINITY, false, &Ge); } XLA_TEST_F(ScalarComputationsTest, ExpScalar) { XlaBuilder builder(TestName()); - builder.Exp(builder.ConstantR0(2.0f)); + Exp(ConstantR0(&builder, 2.0f)); ComputeAndCompareR0(&builder, 7.3890562, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, LogScalar) { XlaBuilder builder("log"); - builder.Log(builder.ConstantR0(2.0f)); + Log(ConstantR0(&builder, 2.0f)); ComputeAndCompareR0(&builder, 0.6931471, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, TanhScalar) { XlaBuilder builder(TestName()); - builder.Tanh(builder.ConstantR0(2.0f)); + Tanh(ConstantR0(&builder, 2.0f)); ComputeAndCompareR0(&builder, 0.96402758, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, TanhDoubleScalar) { XlaBuilder builder(TestName()); - builder.Tanh(builder.ConstantR0(2.0)); + Tanh(ConstantR0(&builder, 2.0)); ComputeAndCompareR0(&builder, 0.96402758, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, PowScalar) { XlaBuilder builder(TestName()); - builder.Pow(builder.ConstantR0(2.0f), builder.ConstantR0(3.0f)); + Pow(ConstantR0(&builder, 2.0f), ConstantR0(&builder, 3.0f)); ComputeAndCompareR0(&builder, 8.0, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(-1), // The lower bound. - builder.ConstantR0(5), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, -1), // The lower bound. + ConstantR0(&builder, 5), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, 3, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(-1), // The lower bound. 
- builder.ConstantR0(2), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, -1), // The lower bound. + ConstantR0(&builder, 2), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, 2, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(-1), // The lower bound. - builder.ConstantR0(-5), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, -1), // The lower bound. + ConstantR0(&builder, -5), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, -1, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(1), // The lower bound. - builder.ConstantR0(5), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, 1), // The lower bound. + ConstantR0(&builder, 5), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, 3, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(1), // The lower bound. - builder.ConstantR0(2), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, 1), // The lower bound. + ConstantR0(&builder, 2), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, 2, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(1), // The lower bound. - builder.ConstantR0(0), // The operand to be clamped. - builder.ConstantR0(3)); // The upper bound. + Clamp(ConstantR0(&builder, 1), // The lower bound. 
+ ConstantR0(&builder, 0), // The operand to be clamped. + ConstantR0(&builder, 3)); // The upper bound. ComputeAndCompareR0(&builder, 1, {}); } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. - builder.ConstantR0(5.0f), // The operand to be clamped. - builder.ConstantR0(3.0f)); // The upper bound. + Clamp(ConstantR0(&builder, 2.0f), // The lower bound. + ConstantR0(&builder, 5.0f), // The operand to be clamped. + ConstantR0(&builder, 3.0f)); // The upper bound. ComputeAndCompareR0(&builder, 3.0, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. - builder.ConstantR0(2.5f), // The operand to be clamped. - builder.ConstantR0(3.0f)); // The upper bound. + Clamp(ConstantR0(&builder, 2.0f), // The lower bound. + ConstantR0(&builder, 2.5f), // The operand to be clamped. + ConstantR0(&builder, 3.0f)); // The upper bound. ComputeAndCompareR0(&builder, 2.5, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) { XlaBuilder builder(TestName()); - builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. - builder.ConstantR0(-5.0f), // The operand to be clamped. - builder.ConstantR0(3.0f)); // The upper bound. + Clamp(ConstantR0(&builder, 2.0f), // The lower bound. + ConstantR0(&builder, -5.0f), // The operand to be clamped. + ConstantR0(&builder, 3.0f)); // The upper bound. 
ComputeAndCompareR0(&builder, 2.0, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, MinS32Above) { - TestMinMax(10, 3, 3, &XlaBuilder::Min); + TestMinMax(10, 3, 3, &Min); } XLA_TEST_F(ScalarComputationsTest, MinS32Below) { - TestMinMax(-100, 3, -100, &XlaBuilder::Min); + TestMinMax(-100, 3, -100, &Min); } XLA_TEST_F(ScalarComputationsTest, MaxS32Above) { - TestMinMax(10, 3, 10, &XlaBuilder::Max); + TestMinMax(10, 3, 10, &Max); } XLA_TEST_F(ScalarComputationsTest, MaxS32Below) { - TestMinMax(-100, 3, 3, &XlaBuilder::Max); + TestMinMax(-100, 3, 3, &Max); } XLA_TEST_F(ScalarComputationsTest, MinU32Above) { const uint32 large = std::numeric_limits::max(); - TestMinMax(large, 3, 3, &XlaBuilder::Min); + TestMinMax(large, 3, 3, &Min); } XLA_TEST_F(ScalarComputationsTest, MinU32Below) { - TestMinMax(0, 5, 0, &XlaBuilder::Min); + TestMinMax(0, 5, 0, &Min); } XLA_TEST_F(ScalarComputationsTest, MaxU32Above) { const uint32 large = std::numeric_limits::max(); - TestMinMax(large, 3, large, &XlaBuilder::Max); + TestMinMax(large, 3, large, &Max); } XLA_TEST_F(ScalarComputationsTest, MaxU32Below) { - TestMinMax(0, 5, 5, &XlaBuilder::Max); + TestMinMax(0, 5, 5, &Max); } XLA_TEST_F(ScalarComputationsTest, MinF32Above) { - TestMinMax(10.1f, 3.1f, 3.1f, &XlaBuilder::Min); + TestMinMax(10.1f, 3.1f, 3.1f, &Min); } XLA_TEST_F(ScalarComputationsTest, MinF32Below) { - TestMinMax(-100.1f, 3.1f, -100.1f, &XlaBuilder::Min); + TestMinMax(-100.1f, 3.1f, -100.1f, &Min); } XLA_TEST_F(ScalarComputationsTest, MinPropagatesNan) { SetFastMathDisabled(true); - TestMinMax(NAN, 3.1f, NAN, &XlaBuilder::Min); - TestMinMax(-3.1f, NAN, NAN, &XlaBuilder::Min); + TestMinMax(NAN, 3.1f, NAN, &Min); + TestMinMax(-3.1f, NAN, NAN, &Min); } XLA_TEST_F(ScalarComputationsTest, MaxF32Above) { - TestMinMax(10.1f, 3.1f, 10.1f, &XlaBuilder::Max); + TestMinMax(10.1f, 3.1f, 10.1f, &Max); } XLA_TEST_F(ScalarComputationsTest, MaxF32Below) { - TestMinMax(-100.1f, 3.1f, 3.1f, &XlaBuilder::Max); + TestMinMax(-100.1f, 3.1f, 
3.1f, &Max); } XLA_TEST_F(ScalarComputationsTest, MaxPropagatesNan) { SetFastMathDisabled(true); - TestMinMax(NAN, 3.1f, NAN, &XlaBuilder::Max); - TestMinMax(-3.1f, NAN, NAN, &XlaBuilder::Max); + TestMinMax(NAN, 3.1f, NAN, &Max); + TestMinMax(-3.1f, NAN, NAN, &Max); } XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) { // Compute the expression (1 * (3 - 1) * (7 + 0) - 4) / 20. XlaBuilder b(TestName()); - b.Div( - b.Sub(b.Mul(b.ConstantR0(1), - b.Mul(b.Sub(b.ConstantR0(3), b.ConstantR0(1)), - b.Add(b.ConstantR0(7), b.ConstantR0(0)))), - b.ConstantR0(4)), - b.ConstantR0(20)); + Div(Sub(Mul(ConstantR0(&b, 1), + Mul(Sub(ConstantR0(&b, 3), ConstantR0(&b, 1)), + Add(ConstantR0(&b, 7), ConstantR0(&b, 0)))), + ConstantR0(&b, 4)), + ConstantR0(&b, 20)); ComputeAndCompareR0(&b, 0.5, {}, error_spec_); } @@ -893,30 +890,18 @@ XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) { XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) { // Compute the expression 1 * (3 - 1) * (7 + 0) - 4. 
XlaBuilder b(TestName()); - b.Sub(b.Mul(b.ConstantR0(1), - b.Mul(b.Sub(b.ConstantR0(3), b.ConstantR0(1)), - b.Add(b.ConstantR0(7), b.ConstantR0(0)))), - b.ConstantR0(4)); + Sub(Mul(ConstantR0(&b, 1), + Mul(Sub(ConstantR0(&b, 3), ConstantR0(&b, 1)), + Add(ConstantR0(&b, 7), ConstantR0(&b, 0)))), + ConstantR0(&b, 4)); ComputeAndCompareR0(&b, 10, {}); } -XLA_TEST_F(ScalarComputationsTest, SqrtF320) { - XlaBuilder builder(TestName()); - Literal zero_literal = Literal::Zero(PrimitiveType::F32); - - std::unique_ptr zero_data = - client_->TransferToServer(zero_literal).ConsumeValueOrDie(); - - XlaOp zero = builder.Parameter(0, zero_literal.shape(), "zero"); - builder.SqrtF32(zero); - - ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); -} XLA_TEST_F(ScalarComputationsTest, RoundScalar) { XlaBuilder builder(TestName()); - builder.Round(builder.ConstantR0(1.4f)); + Round(ConstantR0(&builder, 1.4f)); ComputeAndCompareR0(&builder, 1.0f, {}, error_spec_); } diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1858dcea61241a2aeee11592a9b09f200763b25a --- /dev/null +++ b/tensorflow/compiler/xla/tests/scatter_test.cc @@ -0,0 +1,614 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +using absl::nullopt; + +class ScatterTest : public HloTestBase { + protected: + void RunTest(const string& hlo_text, Literal* operand, + Literal* scatter_indices, Literal* updates) { + RunTest(hlo_text, {operand, scatter_indices, updates}); + } + + void RunTest(const string& hlo_text, absl::Span args) { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text, config)); + EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt)); + } +}; + +XLA_TEST_F(ScatterTest, TensorFlowScatterV1_Update) { + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatterV2_Update) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV2 + +update_s32 
(lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={0}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 30}, {40, 60}, {70, 90}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatter_Add) { + const string hlo_text = R"( +HloModule TensorFlowScatter_Add + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatter_Mul) { + const string hlo_text = R"( +HloModule TensorFlowScatter_Mul + +mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = 
s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=mul_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatter_F32) { + const string hlo_text = R"( +HloModule TensorFlowScatter_F32 + +add_f32 (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(f32[] lhs, f32[] rhs) +} + +ENTRY main { + operand = f32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = f32[2,3] parameter(2) + ROOT scatter = f32[3,3] scatter(operand, indices, updates), + to_apply=add_f32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR2( + {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({2, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatter_RepeatedIndices) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + 
scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatter_MultipleBatchDims) { + const char* hlo_text = R"( +HloModule TensorFlowScatterMultipleBatchDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=2 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatterNd) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNd + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, 
{-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{-10, 10}, {-40, 40}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, TensorFlowScatterNd_NonDefaultIndexVectorDim) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNdNonDefaultIndexVectorDim + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{-10, 10}, {-20, 20}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, DynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule DynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={0,1}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates 
= LiteralUtil::CreateR2({{10}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, BatchDynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule BatchDynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); + std::unique_ptr updates = + LiteralUtil::CreateR3({{{10}}, {{20}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, ZeroDimBounds) { + const char* hlo_text = R"( +HloModule TensorFlowScatter_ZeroDimBounds + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,0] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,0] parameter(2) + ROOT scatter = s32[3,0] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR2({{}, {}, {}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = LiteralUtil::CreateR2({{}, {}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, NoUpdateWindowDims) { + const string hlo_text = R"( +HloModule Scatter_NoUpdateWindowDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs 
= s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3] parameter(0) + indices = s32[2,2,1] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR1({0, 1, 2}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR3({{{0}, {1}}, {{2}, {1}}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20}, {30, 40}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, OutOfBoundsIndex) { + const string hlo_text = R"( +HloModule BatchDynamicSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3]{1,0} parameter(0) + indices = s32[6,2]{1,0} parameter(1) + updates = s32[6,1,1]{2,1,0} parameter(2) + ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR2( + {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, OutOfBoundsUnsignedIndex) { + const string hlo_text = R"( +HloModule BatchDynamicSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3]{1,0} parameter(0) + indices = u32[6,2]{1,0} parameter(1) + updates = s32[6,1,1]{2,1,0} 
parameter(2) + ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR2( + {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, NegativeIndex) { + const string hlo_text = R"( +HloModule BatchDynamicSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3]{1,0} parameter(0) + indices = s32[6,2]{1,0} parameter(1) + updates = s32[6,1,1]{2,1,0} parameter(2) + ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR2( + {{2, 7}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, OneScalarIndex) { + const char* hlo_text = R"( +HloModule OneScalarIndex + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[2,3,2]{2,1,0} parameter(0) + index = s32[] parameter(1) + updates = s32[1,3,2]{2,1,0} parameter(2) + ROOT scatter = s32[2,3,2]{2,1,0} scatter(operand, index, updates), + 
to_apply=update_s32, + update_window_dims={0,1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=0 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR0(1); + std::unique_ptr updates = + LiteralUtil::CreateR3({{{10, 20}, {30, 40}, {50, 60}}}); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, ScalarUpdate) { + const char* hlo_text = R"( +HloModule ScalarUpdate + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[4]{0} parameter(0) + index = s32[] parameter(1) + updates = s32[] parameter(2) + ROOT scatter = s32[4]{0} scatter(operand, index, updates), + to_apply=update_s32, + update_window_dims={}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=0 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR1({1, 2, 3, 4}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR0(1); + std::unique_ptr updates = LiteralUtil::CreateR0(25); + RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get()); +} + +XLA_TEST_F(ScatterTest, EmptyIndices) { + const string hlo_text = R"( +HloModule EmptyIndices + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3] parameter(0) + indices = s32[0] parameter(1) + updates = s32[0] parameter(2) + ROOT scatter = s32[3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + std::unique_ptr operand = LiteralUtil::CreateR1({1, 2, 3}); + std::unique_ptr scatter_indices = LiteralUtil::CreateR1({}); + std::unique_ptr updates = LiteralUtil::CreateR1({}); + RunTest(hlo_text, operand.get(), 
scatter_indices.get(), updates.get()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc index 7015e5a6a31f506d30c2629d7735482cf354455a..f737b5158b3622d677aea5bf64a421a56e2c42dd 100644 --- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc +++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc @@ -22,10 +22,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -42,8 +42,8 @@ struct SelectAndScatterTestParam { std::vector operand_shape; std::vector source_shape; Padding padding_type; - tensorflow::gtl::ArraySlice window_dimensions; - tensorflow::gtl::ArraySlice window_strides; + absl::Span window_dimensions; + absl::Span window_strides; }; class SelectAndScatterTest @@ -73,16 +73,16 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) { auto operand_shape = GetParam().operand_shape; Array o(operand_shape); o.FillRandom(1.5f); - auto operand = builder_.ConstantFromArray(o); + auto operand = ConstantFromArray(&builder_, o); auto source_shape = GetParam().source_shape; Array s(source_shape); s.FillRandom(12.0f); - auto source = builder_.ConstantFromArray(s); + auto source = ConstantFromArray(&builder_, s); - 
builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions, - GetParam().window_strides, GetParam().padding_type, - source, builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions, + GetParam().window_strides, GetParam().padding_type, source, + ConstantR0(&builder_, 0.0f), add_f32_); ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5)); } @@ -197,110 +197,110 @@ INSTANTIATE_TEST_CASE_P( // Test for F32 1D array, with a zero-element input. XLA_TEST_F(SelectAndScatterTest, R1S0F32) { - const auto operand = builder_.ConstantR1({}); - const auto source = builder_.ConstantR1({}); - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3}, - /*window_strides=*/{3}, Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + const auto operand = ConstantR1(&builder_, {}); + const auto source = ConstantR1(&builder_, {}); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3}, + /*window_strides=*/{3}, Padding::kValid, source, + ConstantR0(&builder_, 0.0f), add_f32_); ComputeAndCompareR1(&builder_, {}, {}, ErrorSpec(1e-7)); } // Test for F32 1D array, when windows do not overlap. XLA_TEST_F(SelectAndScatterTest, R1F32) { const auto operand = - builder_.ConstantR1({1.f, 9.f, 3.f, 7.f, 5.f, 6.f}); - const auto source = builder_.ConstantR1({34.f, 42.f}); + ConstantR1(&builder_, {1.f, 9.f, 3.f, 7.f, 5.f, 6.f}); + const auto source = ConstantR1(&builder_, {34.f, 42.f}); const std::vector expected = {0.f, 34.f, 0.f, 42.f, 0.f, 0.f}; - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3}, - /*window_strides=*/{3}, Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3}, + /*window_strides=*/{3}, Padding::kValid, source, + ConstantR0(&builder_, 0.0f), add_f32_); ComputeAndCompareR1(&builder_, expected, {}, ErrorSpec(1e-7)); } // Test for S32 1D array, when windows do not overlap and the init value is 1. 
XLA_TEST_F(SelectAndScatterTest, R1S32) { - const auto operand = builder_.ConstantR1({-1, 0, 6, 4, -4, 10}); - const auto source = builder_.ConstantR1({-10, 20}); + const auto operand = ConstantR1(&builder_, {-1, 0, 6, 4, -4, 10}); + const auto source = ConstantR1(&builder_, {-10, 20}); const std::vector expected = {1, 1, -9, 1, 1, 21}; - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{3}, - /*window_strides=*/{3}, Padding::kValid, source, - builder_.ConstantR0(1), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{3}, + /*window_strides=*/{3}, Padding::kValid, source, + ConstantR0(&builder_, 1), add_s32_); ComputeAndCompareR1(&builder_, expected, {}); } // Test for S32 1D array, when windows overlap with each other. XLA_TEST_F(SelectAndScatterTest, R1S32OverlappingWindow) { - const auto operand = builder_.ConstantR1({1, 9, 3, 7, 5, 6}); - const auto source = builder_.ConstantR1({34, 42, 53, 19}); + const auto operand = ConstantR1(&builder_, {1, 9, 3, 7, 5, 6}); + const auto source = ConstantR1(&builder_, {34, 42, 53, 19}); const std::vector expected = {0, 76, 0, 72, 0, 0}; - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{3}, - /*window_strides=*/{1}, Padding::kValid, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{3}, + /*window_strides=*/{1}, Padding::kValid, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR1(&builder_, expected, {}); } // Test for S32 2D array, when windows do not overlap. 
XLA_TEST_F(SelectAndScatterTest, R2S32) { const auto operand = - builder_.ConstantR2({{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}}); - const auto source = builder_.ConstantR2({{2, 6}}); + ConstantR2(&builder_, {{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}}); + const auto source = ConstantR2(&builder_, {{2, 6}}); Array2D expected({{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}}); - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 3}, - /*window_strides=*/{2, 3}, Padding::kValid, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 3}, + /*window_strides=*/{2, 3}, Padding::kValid, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR2(&builder_, expected, {}); } // Test for tie breaking rule in ge_f32_. When a tie is present, the operand // that has the lower lexicographical order (smaller index) should be chosen. XLA_TEST_F(SelectAndScatterTest, R2F32Tie) { - const auto operand = builder_.ConstantR2( - {{0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}}); - const auto source = builder_.ConstantR2( - {{1.0f, 2.0f, 3.0f}, {4.f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); + const auto operand = ConstantR2( + &builder_, {{0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}, {0.f, 0.f, 0.f}}); + const auto source = ConstantR2( + &builder_, {{1.0f, 2.0f, 3.0f}, {4.f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); Array2D expected( {{12.f, 9.f, 0.f}, {15.f, 9.f, 0.f}, {0.f, 0.f, 0.f}}); - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3, 3}, - /*window_strides=*/{1, 1}, Padding::kSame, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{3, 3}, + /*window_strides=*/{1, 1}, Padding::kSame, source, + ConstantR0(&builder_, 0.0f), add_f32_); ComputeAndCompareR2(&builder_, expected, {}, ErrorSpec(1e-7)); } // Similar to SelectAndScatterTest.R2S32 but the input is transposed. 
XLA_TEST_F(SelectAndScatterTest, ReshapeR2S32) { - const auto operand = builder_.ConstantR2( - {{7, 3}, {2, 8}, {5, 9}, {3, 3}, {10, 4}, {2, 2}}); + const auto operand = ConstantR2( + &builder_, {{7, 3}, {2, 8}, {5, 9}, {3, 3}, {10, 4}, {2, 2}}); const auto reshape = - builder_.Reshape(operand, /*dimensions=*/{1, 0}, /*new_sizes=*/{2, 6}); - const auto source = builder_.ConstantR2({{2, 6}}); + Reshape(operand, /*dimensions=*/{1, 0}, /*new_sizes=*/{2, 6}); + const auto source = ConstantR2(&builder_, {{2, 6}}); Array2D expected({{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}}); - builder_.SelectAndScatter(reshape, ge_s32_, /*window_dimensions=*/{2, 3}, - /*window_strides=*/{2, 3}, Padding::kValid, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(reshape, ge_s32_, /*window_dimensions=*/{2, 3}, + /*window_strides=*/{2, 3}, Padding::kValid, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR2(&builder_, expected, {}); } // Test for S32 2D array, when windows overlap with each other. XLA_TEST_F(SelectAndScatterTest, R2S32OverlappingWindow) { const auto operand = - builder_.ConstantR2({{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); - const auto source = builder_.ConstantR2({{2, 6, 4}}); + ConstantR2(&builder_, {{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); + const auto source = ConstantR2(&builder_, {{2, 6, 4}}); Array2D expected({{0, 0, 0, 0, 0}, {0, 0, 12, 0, 0}}); - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 3}, - /*window_strides=*/{1, 1}, Padding::kValid, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 3}, + /*window_strides=*/{1, 1}, Padding::kValid, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR2(&builder_, expected, {}); } // Test for S32 2D array, when the padding is Padding::kSAME. 
XLA_TEST_F(SelectAndScatterTest, R2S32SamePadding) { const auto operand = - builder_.ConstantR2({{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); - const auto source = builder_.ConstantR2({{2, 6, 4}}); + ConstantR2(&builder_, {{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); + const auto source = ConstantR2(&builder_, {{2, 6, 4}}); Array2D expected({{0, 0, 0, 0, 4}, {0, 2, 6, 0, 0}}); - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 2}, - /*window_strides=*/{2, 2}, Padding::kSame, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 2}, + /*window_strides=*/{2, 2}, Padding::kSame, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR2(&builder_, expected, {}); } @@ -308,25 +308,26 @@ XLA_TEST_F(SelectAndScatterTest, R2S32SamePadding) { // with each other. XLA_TEST_F(SelectAndScatterTest, R2S32SamePaddingOverlappingWindow) { const auto operand = - builder_.ConstantR2({{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); + ConstantR2(&builder_, {{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}}); const auto source = - builder_.ConstantR2({{2, 6, 4, 7, 1}, {3, 5, 8, 9, 10}}); + ConstantR2(&builder_, {{2, 6, 4, 7, 1}, {3, 5, 8, 9, 10}}); Array2D expected({{0, 0, 0, 0, 8}, {0, 5, 23, 0, 19}}); - builder_.SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 2}, - /*window_strides=*/{1, 1}, Padding::kSame, source, - builder_.ConstantR0(0), add_s32_); + SelectAndScatter(operand, ge_s32_, /*window_dimensions=*/{2, 2}, + /*window_strides=*/{1, 1}, Padding::kSame, source, + ConstantR0(&builder_, 0), add_s32_); ComputeAndCompareR2(&builder_, expected, {}); } XLA_TEST_F(SelectAndScatterTest, R2F32OverlappingR2Source) { - const auto operand = builder_.ConstantR2( - {{1.5f, 2.5f, 1.5f}, {3.5f, 1.5f, 3.5f}, {4.5f, 2.5f, 4.5f}}); - const auto source = builder_.ConstantR2({{1.0f, 2.0f}, {3.0f, 4.0f}}); + const auto operand = ConstantR2( + &builder_, {{1.5f, 2.5f, 1.5f}, {3.5f, 1.5f, 3.5f}, {4.5f, 2.5f, 4.5f}}); + const auto source = + 
ConstantR2(&builder_, {{1.0f, 2.0f}, {3.0f, 4.0f}}); Array2D expected( {{0.0f, 0.0f, 0.0f}, {1.0f, 0.0f, 2.0f}, {3.0f, 0.0f, 4.0f}}); - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{2, 2}, - /*window_strides=*/{1, 1}, Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{2, 2}, + /*window_strides=*/{1, 1}, Padding::kValid, source, + ConstantR0(&builder_, 0.0f), add_f32_); ComputeAndCompareR2(&builder_, expected, {}, ErrorSpec(1e-7)); } @@ -342,16 +343,16 @@ TEST_F(SelectAndScatterTest, R4F32Valid) { {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}}; Array4D o(4, 6, 15, 220); o.FillWithPZ(pzo); - auto operand = builder_.ConstantR4FromArray4D(o); + auto operand = ConstantR4FromArray4D(&builder_, o); Array4D e(4, 6, 15, 220); e.FillWithPZ(pze); Array4D s(2, 2, 15, 220); s.FillWithPZ(pzs); - auto source = builder_.ConstantR4FromArray4D(s); + auto source = ConstantR4FromArray4D(&builder_, s); s.FillWithPZ(pzs); - builder_.SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 3, 1, 1}, - Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 3, 1, 1}, + Padding::kValid, source, ConstantR0(&builder_, 0.0f), + add_f32_); ComputeAndCompareR4(&builder_, e, {}, ErrorSpec(1e-7)); } @@ -367,16 +368,16 @@ TEST_F(SelectAndScatterTest, R4F32Overlap) { {0.0f, 0.0f, 0.0f, 1.0f, 0.0f}}; Array4D o(4, 5, 17, 128); o.FillWithPZ(pzo); - auto operand = builder_.ConstantR4FromArray4D(o); + auto operand = ConstantR4FromArray4D(&builder_, o); Array4D e(4, 5, 17, 128); e.FillWithPZ(pze); Array4D s(2, 2, 17, 128); s.FillWithPZ(pzs); - auto source = builder_.ConstantR4FromArray4D(s); + auto source = ConstantR4FromArray4D(&builder_, s); s.FillWithPZ(pzs); - builder_.SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 2, 1, 1}, - Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 2, 1, 
1}, + Padding::kValid, source, ConstantR0(&builder_, 0.0f), + add_f32_); ComputeAndCompareR4(&builder_, e, {}, ErrorSpec(1e-7)); } @@ -392,16 +393,16 @@ TEST_F(SelectAndScatterTest, R4F32OverlapSmall) { {0.0f, 0.0f, 0.0f, 1.0f, 0.0f}}; Array4D o(4, 5, 1, 1); o.FillWithPZ(pzo); - auto operand = builder_.ConstantR4FromArray4D(o); + auto operand = ConstantR4FromArray4D(&builder_, o); Array4D e(4, 5, 1, 1); e.FillWithPZ(pze); Array4D s(2, 2, 1, 1); s.FillWithPZ(pzs); - auto source = builder_.ConstantR4FromArray4D(s); + auto source = ConstantR4FromArray4D(&builder_, s); s.FillWithPZ(pzs); - builder_.SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 2, 1, 1}, - Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 2, 1, 1}, + Padding::kValid, source, ConstantR0(&builder_, 0.0f), + add_f32_); ComputeAndCompareR4(&builder_, e, {}, ErrorSpec(1e-7)); } @@ -414,39 +415,39 @@ TEST_F(SelectAndScatterTest, R4F32RefValidFixedSmall) { Array2D pzs = {{2.0f, 6.0f}, {3.0f, 1.0f}}; Array4D o(4, 6, 4, 4); o.FillWithPZ(pzo); - auto operand = builder_.ConstantR4FromArray4D(o); + auto operand = ConstantR4FromArray4D(&builder_, o); Array4D s(2, 2, 4, 4); s.FillWithPZ(pzs); - auto source = builder_.ConstantR4FromArray4D(s); + auto source = ConstantR4FromArray4D(&builder_, s); s.FillWithPZ(pzs); - builder_.SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 3, 1, 1}, - Padding::kValid, source, - builder_.ConstantR0(0.0f), add_f32_); + SelectAndScatter(operand, ge_f32_, {2, 3, 1, 1}, {2, 3, 1, 1}, + Padding::kValid, source, ConstantR0(&builder_, 0.0f), + add_f32_); auto e = ReferenceUtil::SelectAndScatter4DGePlus(o, s, 0.0f, {2, 3, 1, 1}, {2, 3, 1, 1}, false); ComputeAndCompareR4(&builder_, *e, {}, ErrorSpec(1e-7)); } XLA_TEST_F(SelectAndScatterTest, R1F32OverlappingWindowMaxScatter) { - const auto operand = builder_.ConstantR1({1, 2, 3, 100, 3, 2, 1}); - const auto source = builder_.ConstantR1({34, 42, 53, 19}); + const 
auto operand = ConstantR1(&builder_, {1, 2, 3, 100, 3, 2, 1}); + const auto source = ConstantR1(&builder_, {34, 42, 53, 19}); const std::vector expected = {0, 0, 0, 53, 0, 0, 0}; - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{4}, - /*window_strides=*/{1}, Padding::kValid, source, - builder_.ConstantR0(0), max_f32_); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{4}, + /*window_strides=*/{1}, Padding::kValid, source, + ConstantR0(&builder_, 0), max_f32_); ComputeAndCompareR1(&builder_, expected, {}, ErrorSpec(1e-7)); } XLA_TEST_F(SelectAndScatterTest, R1F32OverlappingWindowMinScatter) { - const auto operand = builder_.ConstantR1({1, 2, 3, 100, 3, 2, 1}); - const auto source = builder_.ConstantR1({34, 42, 53, 19}); + const auto operand = ConstantR1(&builder_, {1, 2, 3, 100, 3, 2, 1}); + const auto source = ConstantR1(&builder_, {34, 42, 53, 19}); const float max_float = std::numeric_limits::max(); const std::vector expected = {max_float, max_float, max_float, 19, max_float, max_float, max_float}; - builder_.SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{4}, - /*window_strides=*/{1}, Padding::kValid, source, - builder_.ConstantR0(max_float), min_f32_); + SelectAndScatter(operand, ge_f32_, /*window_dimensions=*/{4}, + /*window_strides=*/{1}, Padding::kValid, source, + ConstantR0(&builder_, max_float), min_f32_); ComputeAndCompareR1(&builder_, expected, {}, ErrorSpec(1e-7)); } diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc index 72707f224446c7585d1d90ac6681a7b38c41d5f1..1c01402798658877889527a5dd02d5c74787ff99 100644 --- a/tensorflow/compiler/xla/tests/select_test.cc +++ b/tensorflow/compiler/xla/tests/select_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -35,50 +35,52 @@ class SelectTest : public ClientLibraryTestBase { TEST_F(SelectTest, SelectScalarF32True) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto on_true = builder.ConstantR0(123.0f); - auto on_false = builder.ConstantR0(42.0f); - auto result = builder.Select(pred, on_true, on_false); + auto pred = ConstantR0(&builder, true); + auto on_true = ConstantR0(&builder, 123.0f); + auto on_false = ConstantR0(&builder, 42.0f); + Select(pred, on_true, on_false); ComputeAndCompareR0(&builder, 123.0f, {}, error_spec_); } TEST_F(SelectTest, SelectScalarS32True) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto on_true = builder.ConstantR0(-42); - auto on_false = builder.ConstantR0(42); - auto result = builder.Select(pred, on_true, on_false); + auto pred = ConstantR0(&builder, true); + auto on_true = ConstantR0(&builder, -42); + auto on_false = ConstantR0(&builder, 42); + Select(pred, on_true, on_false); ComputeAndCompareR0(&builder, -42, {}); } TEST_F(SelectTest, SelectScalarF32False) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto on_true = builder.ConstantR0(123.0f); - auto on_false = builder.ConstantR0(42.0f); - auto result = builder.Select(pred, on_true, on_false); + auto pred = ConstantR0(&builder, false); + auto on_true = ConstantR0(&builder, 123.0f); + auto on_false = ConstantR0(&builder, 42.0f); + Select(pred, on_true, on_false); ComputeAndCompareR0(&builder, 42.0f, {}, error_spec_); } XLA_TEST_F(SelectTest, SelectR1S0F32WithConstantR1S0PRED) { 
XlaBuilder builder(TestName()); - auto pred = builder.ConstantR1({}); - auto on_true = builder.ConstantR1({}); - auto on_false = builder.ConstantR1({}); - auto select = builder.Select(pred, on_true, on_false); + auto pred = ConstantR1(&builder, {}); + auto on_true = ConstantR1(&builder, {}); + auto on_false = ConstantR1(&builder, {}); + Select(pred, on_true, on_false); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } TEST_F(SelectTest, SelectR1F32WithConstantR1PRED) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR1({false, true, false, true, false}); - auto on_true = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); - auto on_false = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); - auto select = builder.Select(pred, on_true, on_false); + auto pred = ConstantR1(&builder, {false, true, false, true, false}); + auto on_true = + ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); + auto on_false = + ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); + Select(pred, on_true, on_false); ComputeAndCompareR1(&builder, {10.0f, 25.5f, 1.0f, -10.0f, -6.0f}, {}, error_spec_); @@ -88,12 +90,12 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithCmpR1S0S32s) { // Similar to SelectR1S0F32WithConstantR1S0PRED, except that the pred vector // is not a constant, but rather the result of comparing two other vectors. 
XlaBuilder builder(TestName()); - auto v1 = builder.ConstantR1({}); - auto v2 = builder.ConstantR1({}); - auto cmp = builder.Eq(v1, v2); - auto on_true = builder.ConstantR1({}); - auto on_false = builder.ConstantR1({}); - auto select = builder.Select(cmp, on_true, on_false); + auto v1 = ConstantR1(&builder, {}); + auto v2 = ConstantR1(&builder, {}); + auto cmp = Eq(v1, v2); + auto on_true = ConstantR1(&builder, {}); + auto on_false = ConstantR1(&builder, {}); + Select(cmp, on_true, on_false); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -102,12 +104,14 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) { // Similar to SelectR1F32WithConstantR1PRED, except that the pred vector is // not a constant, but rather the result of comparing two other vectors. XlaBuilder builder(TestName()); - auto v1 = builder.ConstantR1({1, 2, 3, 4, 5}); - auto v2 = builder.ConstantR1({9, 2, 9, 4, 9}); - auto cmp = builder.Eq(v1, v2); - auto on_true = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); - auto on_false = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); - auto select = builder.Select(cmp, on_true, on_false); + auto v1 = ConstantR1(&builder, {1, 2, 3, 4, 5}); + auto v2 = ConstantR1(&builder, {9, 2, 9, 4, 9}); + auto cmp = Eq(v1, v2); + auto on_true = + ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); + auto on_false = + ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); + Select(cmp, on_true, on_false); ComputeAndCompareR1(&builder, {10.0f, 25.5f, 1.0f, -10.0f, -6.0f}, {}, error_spec_); @@ -116,12 +120,14 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) { TEST_F(SelectTest, SelectR1F32WithCmpR1F32s) { // Similar to SelectR1F32WithCmpR1S32s, except "gt"-comparing two R1F32s. 
XlaBuilder builder(TestName()); - auto v1 = builder.ConstantR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - auto v2 = builder.ConstantR1({-1.0f, -2.0f, 13.0f, 14.0f, 4.4f}); - auto cmp = builder.Gt(v1, v2); - auto on_true = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); - auto on_false = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); - auto select = builder.Select(cmp, on_true, on_false); + auto v1 = ConstantR1(&builder, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); + auto v2 = ConstantR1(&builder, {-1.0f, -2.0f, 13.0f, 14.0f, 4.4f}); + auto cmp = Gt(v1, v2); + auto on_true = + ConstantR1(&builder, {-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); + auto on_false = + ConstantR1(&builder, {10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); + Select(cmp, on_true, on_false); ComputeAndCompareR1(&builder, {-2.5f, 25.5f, 1.0f, 10.0f, 6.0f}, {}, error_spec_); @@ -140,8 +146,8 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsSmall) { {21.0f, 22.0f, 23.0f, 24.0f}, /*parameter_number=*/1, /*name=*/"v2", /*builder=*/&builder, /*data_handle=*/&v2); - auto cmp = builder.Gt(v1, v2); - auto select = builder.Select(cmp, v1, v2); + auto cmp = Gt(v1, v2); + Select(cmp, v1, v2); ComputeAndCompareR1(&builder, {41.0f, 22.0f, 23.0f, 84.0f}, {param0_data.get(), param1_data.get()}, error_spec_); @@ -181,8 +187,8 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) { CreateR1Parameter(v2vec, /*parameter_number=*/1, /*name=*/"v2", /*builder=*/&builder, /*data_handle=*/&v2); - auto cmp = builder.Gt(v1, v2); - auto select = builder.Select(cmp, v1, v2); + auto cmp = Gt(v1, v2); + Select(cmp, v1, v2); ComputeAndCompareR1(&builder, expected_vec, {param0_data.get(), param1_data.get()}, error_spec_); @@ -192,14 +198,14 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32ToScalar) { // "gt"-compares a R1S32 with a S32 scalar, and uses the resulting R1PRED to // select between two R1F32s. 
XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1, -1, 2, -2}); - auto s = builder.ConstantR0(0); - auto cmp = builder.Gt(v, s); + auto v = ConstantR1(&builder, {1, -1, 2, -2}); + auto s = ConstantR0(&builder, 0); + auto cmp = Gt(v, s); - auto on_true = builder.ConstantR1({11.0f, 22.0f, 33.0f, 44.0f}); + auto on_true = ConstantR1(&builder, {11.0f, 22.0f, 33.0f, 44.0f}); auto on_false = - builder.ConstantR1({-111.0f, -222.0f, -333.0f, -444.0f}); - auto select = builder.Select(cmp, on_true, on_false); + ConstantR1(&builder, {-111.0f, -222.0f, -333.0f, -444.0f}); + Select(cmp, on_true, on_false); ComputeAndCompareR1(&builder, {11.0f, -222.0f, 33.0f, -444.0f}, {}, error_spec_); @@ -209,14 +215,14 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) { // "gt"-compares a R1F32 with a F32 scalar, and uses the resulting R1PRED to // select between two R1F32s. XlaBuilder builder(TestName()); - auto v = builder.ConstantR1({1.0f, 2.0f, 3.0f, 4.0f}); - auto s = builder.ConstantR0(2.5f); - auto cmp = builder.Gt(v, s); + auto v = ConstantR1(&builder, {1.0f, 2.0f, 3.0f, 4.0f}); + auto s = ConstantR0(&builder, 2.5f); + auto cmp = Gt(v, s); - auto on_true = builder.ConstantR1({11.0f, 22.0f, 33.0f, 44.0f}); + auto on_true = ConstantR1(&builder, {11.0f, 22.0f, 33.0f, 44.0f}); auto on_false = - builder.ConstantR1({-111.0f, -222.0f, -333.0f, -444.0f}); - auto select = builder.Select(cmp, on_true, on_false); + ConstantR1(&builder, {-111.0f, -222.0f, -333.0f, -444.0f}); + Select(cmp, on_true, on_false); ComputeAndCompareR1(&builder, {-111.0f, -222.0f, 33.0f, 44.0f}, {}, error_spec_); @@ -225,10 +231,10 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) { XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) { for (bool which : {false, true}) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(which); - auto on_true = builder.ConstantR1({}); - auto on_false = builder.ConstantR1({}); - auto select = builder.Select(pred, on_true, on_false); + auto pred 
= ConstantR0(&builder, which); + auto on_true = ConstantR1(&builder, {}); + auto on_false = ConstantR1(&builder, {}); + Select(pred, on_true, on_false); ComputeAndCompareR1(&builder, {}, {}, error_spec_); } @@ -236,20 +242,20 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) { TEST_F(SelectTest, SelectR1F32WithScalarPredicateTrue) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(true); - auto on_true = builder.ConstantR1({-2.5f, 25.5f}); - auto on_false = builder.ConstantR1({10.0f, 5.0f}); - auto select = builder.Select(pred, on_true, on_false); + auto pred = ConstantR0(&builder, true); + auto on_true = ConstantR1(&builder, {-2.5f, 25.5f}); + auto on_false = ConstantR1(&builder, {10.0f, 5.0f}); + Select(pred, on_true, on_false); ComputeAndCompareR1(&builder, {-2.5f, 25.5f}, {}, error_spec_); } TEST_F(SelectTest, SelectR1F32WithScalarPredicateFalse) { XlaBuilder builder(TestName()); - auto pred = builder.ConstantR0(false); - auto on_true = builder.ConstantR1({-2.5f, 25.5f}); - auto on_false = builder.ConstantR1({10.0f, 5.0f}); - auto select = builder.Select(pred, on_true, on_false); + auto pred = ConstantR0(&builder, false); + auto on_true = ConstantR1(&builder, {-2.5f, 25.5f}); + auto on_false = ConstantR1(&builder, {10.0f, 5.0f}); + Select(pred, on_true, on_false); ComputeAndCompareR1(&builder, {10.0f, 5.0f}, {}, error_spec_); } diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index 5653bf11a7364bf9ed79bcb6b53f7db31f454803..c9a58aefb4acc066c10e98aea46375523cf554d0 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -18,23 +18,24 @@ limitations under the License. 
#include #include +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace { -using ::tensorflow::str_util::Join; - class SliceTest : public ClientLibraryTestBase {}; TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) { @@ -42,8 +43,8 @@ TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) { values.FillIota(0); XlaBuilder builder(TestName()); - auto original = builder.ConstantR3FromArray3D(values); - builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1}); + auto original = ConstantR3FromArray3D(&builder, values); + Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1}); Array3D expected{ {{0.0}, {3.0}, {6.0}}, {{9.0}, {12.0}, {15.0}}, {{18.0}, {21.0}, {24.0}}}; @@ -55,8 +56,8 @@ TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) { values.FillIota(0); XlaBuilder builder(TestName()); - auto original = builder.ConstantR3FromArray3D(values); - builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1}); + auto original = ConstantR3FromArray3D(&builder, values); + Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1}); Array3D expected{ {{0.0, 1.0, 2.0}}, {{9.0, 10.0, 11.0}}, {{18.0, 19.0, 20.0}}}; @@ -68,8 +69,8 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) { values.FillIota(0); XlaBuilder builder(TestName()); - 
auto original = builder.ConstantR3FromArray3D(values); - builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1}); + auto original = ConstantR3FromArray3D(&builder, values); + Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1}); Array3D expected{ {{{0.0, 1.0, 2.0}, {3.0, 4.0, 5.0}, {6.0, 7.0, 8.0}}}}; @@ -78,24 +79,24 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) { XLA_TEST_F(SliceTest, Slice0x0to0x0F32) { XlaBuilder builder(TestName()); - auto original = builder.ConstantR2FromArray2D(Array2D(0, 0)); - builder.Slice(original, {0, 0}, {0, 0}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, Array2D(0, 0)); + Slice(original, {0, 0}, {0, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 0), {}); } XLA_TEST_F(SliceTest, Slice0x20to0x5F32) { XlaBuilder builder(TestName()); - auto original = builder.ConstantR2FromArray2D(Array2D(0, 20)); - builder.Slice(original, {0, 15}, {0, 20}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, Array2D(0, 20)); + Slice(original, {0, 15}, {0, 20}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 5), {}); } XLA_TEST_F(SliceTest, Slice3x0to2x0F32) { XlaBuilder builder(TestName()); - auto original = builder.ConstantR2FromArray2D(Array2D(3, 0)); - builder.Slice(original, {1, 0}, {3, 0}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, Array2D(3, 0)); + Slice(original, {1, 0}, {3, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(2, 0), {}); } @@ -109,8 +110,8 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) { } XlaBuilder builder(TestName()); - auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {128, 128}, {256, 256}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, values); + Slice(original, {128, 128}, {256, 256}, {1, 1}); Array2D expected(128, 128); for (int row = 0; row < 128; ++row) { @@ -127,8 +128,8 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) { std::iota(values.data(), values.data() + 4096, 0.0); XlaBuilder builder(TestName()); - auto original = 
builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, values); + Slice(original, {0, 3072}, {1, 4096}, {1, 1}); Array2D expected(1, 1024); std::iota(expected.data(), expected.data() + 1024, 3072.0); @@ -148,8 +149,8 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) { } } XlaBuilder builder(TestName()); - auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 0}, {16, 2}, {1, 1}); + auto original = ConstantR2FromArray2D(&builder, values); + Slice(original, {0, 0}, {16, 2}, {1, 1}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); } @@ -160,8 +161,8 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) { auto expected = ReferenceUtil::Slice4D( values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}}); XlaBuilder builder(TestName()); - auto original = builder.ConstantR4FromArray4D(values); - builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1}); + auto original = ConstantR4FromArray4D(&builder, values); + Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1}); ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001)); } @@ -170,11 +171,11 @@ XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) { values.FillRandom(3.14f); auto expected = ReferenceUtil::Slice4D(values, {{0, 0, 0, 0}}, {{2, 4, 6, 8}}, /*strides=*/{{1, 1, 2, 1}}); - auto expected_literal = Literal::CreateR4FromArray4DWithLayout( + auto expected_literal = LiteralUtil::CreateR4FromArray4DWithLayout( *expected, LayoutUtil::MakeLayout({0, 1, 2, 3})); XlaBuilder builder(TestName()); - auto original = builder.ConstantR4FromArray4D(values); - builder.Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1}); + auto original = ConstantR4FromArray4D(&builder, values); + Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1}); ComputeAndCompareLiteral(&builder, *expected_literal, {}, ErrorSpec(0.000001), &expected_literal->shape()); } @@ -193,19 
+194,19 @@ class SliceR1Test : public ClientLibraryTestBase, protected: template void Run(const R1Spec& spec) { - // This can't be an std::vector, since you can't grab an ArraySlice of a + // This can't be an std::vector, since you can't grab a Span of a // vector. - tensorflow::gtl::InlinedVector input(spec.input_dim0); + absl::InlinedVector input(spec.input_dim0); std::iota(input.begin(), input.end(), NativeT()); - auto literal = Literal::CreateR1(input); + auto literal = LiteralUtil::CreateR1(input); XlaBuilder builder(TestName()); - auto original = builder.Parameter(0, literal->shape(), "p0"); - builder.Slice(original, {spec.slice_start}, {spec.slice_limit}, - {spec.slice_stride}); + auto original = Parameter(&builder, 0, literal->shape(), "p0"); + Slice(original, {spec.slice_start}, {spec.slice_limit}, + {spec.slice_stride}); // Ditto. - tensorflow::gtl::InlinedVector expected; + absl::InlinedVector expected; for (int i = spec.slice_start; i < spec.slice_limit; i += spec.slice_stride) { expected.push_back(i); @@ -222,9 +223,8 @@ class SliceR1LargeTest : public SliceR1Test {}; string SliceR1TestDataToString(const ::testing::TestParamInfo& data) { const R1Spec& spec = data.param; - return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0, - spec.slice_start, spec.slice_limit, - spec.slice_stride); + return absl::StrFormat("%d_%d_%d_%d", spec.input_dim0, spec.slice_start, + spec.slice_limit, spec.slice_stride); } XLA_TEST_P(SliceR1Test, DoIt_F32) { Run(GetParam()); } @@ -344,7 +344,11 @@ INSTANTIATE_TEST_CASE_P( R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 2}, R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 8}, R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 7}, - R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 125} + R1Spec{1024 * 1024 + 71, 3, 1024 * 512 - 9, 125}, + R1Spec{16 * 1024 * 1024, 0, 16 * 1024 * 1024, 4097}, + R1Spec{16 * 1024 * 1024, 0, 16 * 1024 * 1024, 4093}, + R1Spec{16 * 1024 * 1024, 12 * 1024 + 17, 16 * 1024 * 1024 - 231, 4097}, + 
R1Spec{16 * 1024 * 1024, 12 * 1024 + 17, 16 * 1024 * 1024 - 231, 4093} ), SliceR1TestDataToString ); @@ -368,12 +372,12 @@ XLA_TEST_P(SliceR2Test, DoIt) { const R2Spec& spec = GetParam(); Array2D input(spec.input_dim0, spec.input_dim1); input.FillUnique(); - auto literal = Literal::CreateR2FromArray2DWithLayout( + auto literal = LiteralUtil::CreateR2FromArray2DWithLayout( input, LayoutUtil::MakeLayout(spec.layout)); XlaBuilder builder(TestName()); - auto a = builder.Parameter(0, literal->shape(), "p0"); - builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides); + auto a = Parameter(&builder, 0, literal->shape(), "p0"); + Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr arg, client_->TransferToServer(*literal)); @@ -444,13 +448,11 @@ struct R4Spec { string R4SpecToString(const ::testing::TestParamInfo& data) { const R4Spec& spec = data.param; - return tensorflow::strings::StrCat( // - "input_", Join(spec.input_dims, "x"), // - "__layout_", Join(spec.input_layout, ""), // - "__starts_", Join(spec.slice_starts, "x"), // - "__limits_", Join(spec.slice_limits, "x"), // - "__strides_", Join(spec.slice_strides, "x") // - ); + return absl::StrCat("input_", absl::StrJoin(spec.input_dims, "x"), + "__layout_", absl::StrJoin(spec.input_layout, ""), + "__starts_", absl::StrJoin(spec.slice_starts, "x"), + "__limits_", absl::StrJoin(spec.slice_limits, "x"), + "__strides_", absl::StrJoin(spec.slice_strides, "x")); } class SliceR4Test : public ClientLibraryTestBase, @@ -463,13 +465,12 @@ class SliceR4Test : public ClientLibraryTestBase, auto expected = ReferenceUtil::Slice4D( values, spec.slice_starts, spec.slice_limits, spec.slice_strides); XlaBuilder builder(TestName()); - auto literal = Literal::CreateR4FromArray4DWithLayout( + auto literal = LiteralUtil::CreateR4FromArray4DWithLayout( values, LayoutUtil::MakeLayout(spec.input_layout)); - auto parameter = builder.Parameter(0, literal->shape(), "p0"); + 
auto parameter = Parameter(&builder, 0, literal->shape(), "p0"); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr arg, client_->TransferToServer(*literal)); - builder.Slice(parameter, spec.slice_starts, spec.slice_limits, - spec.slice_strides); + Slice(parameter, spec.slice_starts, spec.slice_limits, spec.slice_strides); ComputeAndCompareR4(&builder, *expected, {arg.get()}, ErrorSpec(0.000001)); } }; diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc index be35ec6c6ee4c015755622b2dc9bb92e23af7c85..a9874a918659f1d7403ba0c5cb968e62d7091936 100644 --- a/tensorflow/compiler/xla/tests/test_macros.cc +++ b/tensorflow/compiler/xla/tests/test_macros.cc @@ -20,7 +20,9 @@ limitations under the License. #include #include -#include "tensorflow/core/lib/strings/str_util.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/regexp.h" @@ -44,7 +46,7 @@ ManifestT ReadManifest() { string contents((std::istreambuf_iterator(file_stream)), std::istreambuf_iterator()); - std::vector lines = tensorflow::str_util::Split(contents, '\n'); + std::vector lines = absl::StrSplit(contents, '\n'); for (string& line : lines) { auto comment = line.find("//"); if (comment != string::npos) { @@ -53,8 +55,8 @@ ManifestT ReadManifest() { if (line.empty()) { continue; } - tensorflow::str_util::StripTrailingWhitespace(&line); - std::vector pieces = tensorflow::str_util::Split(line, ' '); + absl::StripTrailingAsciiWhitespace(&line); + std::vector pieces = absl::StrSplit(line, ' '); CHECK_GE(pieces.size(), 1); auto& platforms = manifest[pieces[0]]; for (int64 i = 1; i < pieces.size(); ++i) { @@ -73,8 +75,7 @@ string PrependDisabledIfIndicated(const string& test_case_name, // First try full match: test_case_name.test_name // If that fails, try to find just the test_case_name; this would disable all // tests in the test case. 
- auto it = manifest.find( - tensorflow::strings::StrCat(test_case_name, ".", test_name)); + auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); if (it == manifest.end()) { it = manifest.find(test_case_name); if (it == manifest.end()) { diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index dd7c541733634213606b5a7983b59bb1f14bf75c..c20a7c8fe49cd6b9161251488b85e08459f68865 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/tests/test_utils.h" +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" namespace xla { @@ -25,145 +29,172 @@ namespace { template void PopulateWithRandomFloatingPointDataImpl(Literal* literal, - std::minstd_rand0* engine) { + std::minstd_rand0* engine, + bool no_duplicates) { CHECK(engine != nullptr); CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - // Create uniform numbers between 1 and 1.125 to avoid creating denormal - // numbers. - std::uniform_real_distribution generator(1.0f, 1.125f); - const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000; - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice indices) { - // Generate a random uniform number from -0.0625 and 0.0625 and bias it - // with a position dependent number with mean 0.037109375. 
These number - // should allow for long chains of accumulation without being too close - // to zero or too large to accumulate all numbers accurately. Only do - // this for large literals where the number of elements is much greater - // than 47 otherwise only negative values are produced. - // - // The value is positionally biased using a product of the indices. Add - // one to each index value to avoid collapsing to zero if any of the - // indices are zero. - int64 index_product = 1; - for (int64 i : indices) { - index_product *= (1 + i); - } - const int64 negative_bias = should_index_bias ? 47 : 0; - FloatT index_bias = - static_cast(index_product % 113 - negative_bias) / - static_cast(256.0f); - return static_cast(generator(*engine) - 1.0625f) + index_bias; - })); + if (no_duplicates) { + // Duplicates may be generated if the number of elements in the literal + // exceeds the number of positive values supported by the type. + FloatT next_value = std::numeric_limits::min(); + for (FloatT& value : literal->data()) { + value = next_value; + next_value = + std::nextafter(next_value, std::numeric_limits::max()); + } + std::shuffle(literal->data().begin(), literal->data().end(), + *engine); + } else { + std::uniform_real_distribution generator(-0.1f, 0.2f); + for (FloatT& value : literal->data()) { + value = static_cast(generator(*engine)); + } + } } template void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine) { + std::minstd_rand0* engine, + bool no_duplicates) { CHECK(engine != nullptr); - PopulateWithRandomFloatingPointDataImpl(literal, engine); + PopulateWithRandomFloatingPointDataImpl(literal, engine, + no_duplicates); } template <> void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine) { + std::minstd_rand0* engine, + bool no_duplicates) { + // no_duplicates is ignored for half types. 
Unique values can only be + // generated for arrays with fewer than ~2**16 elements and no_duplicates is + // best-effort anyway. CHECK(engine != nullptr); - PopulateWithRandomFloatingPointDataImpl(literal, engine); + std::uniform_real_distribution generator(-0.1f, 0.2f); + for (half& value : literal->data()) { + value = static_cast(generator(*engine)); + } } -// The standard library does not have a case for bfloat16, unsurprisingly, so we -// handle that one specially. template <> void PopulateWithRandomFloatingPointData(Literal* literal, - std::minstd_rand0* engine) { + std::minstd_rand0* engine, + bool no_duplicates) { + // no_duplicates is ignored for bfloat types. Unique values can only be + // generated for arrays with fewer than ~2**16 elements and no_duplicates is + // best-effort anyway. CHECK(engine != nullptr); - CHECK_EQ(literal->shape().element_type(), BF16); - std::uniform_real_distribution generator(-0.9f, 1.0f); - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice /*indices*/) { - return static_cast(generator(*engine)); - })); + std::uniform_real_distribution generator(-0.1f, 0.2f); + for (bfloat16& value : literal->data()) { + value = static_cast(generator(*engine)); + } } template -void PopulateWithRandomIntegralData(Literal* literal, - std::minstd_rand0* engine) { +void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine, + bool no_duplicates) { CHECK(engine != nullptr); CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - std::uniform_int_distribution generator( - std::numeric_limits::lowest(), std::numeric_limits::max()); - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice /*indices*/) { - return generator(*engine); - })); + if (no_duplicates && ShapeUtil::ElementsIn(literal->shape()) < + std::numeric_limits::max()) { + std::iota(literal->data().begin(), literal->data().end(), 0); + std::shuffle(literal->data().begin(), literal->data().end(), + *engine); + } 
else { + std::uniform_int_distribution generator( + std::numeric_limits::lowest(), std::numeric_limits::max()); + for (IntT& value : literal->data()) { + value = generator(*engine); + } + } } // Similar to MakeFakeLiteral but takes a random number generator engine to -// enable reusing the engine across randomly generated literals. +// enable reusing the engine across randomly generated literals. 'no_duplicates' +// indicates that there should be no duplicate values in each generated +// array. This is uniqueness is best-effort only. Some types (half and bfloat16) +// are not supported and uniqueness cannot be guaranteed if the number of +// elements exceeds the number of different values supported by the type. StatusOr> MakeFakeLiteralInternal( - const Shape& shape, std::minstd_rand0* engine) { + const Shape& shape, std::minstd_rand0* engine, bool no_duplicates) { if (ShapeUtil::IsTuple(shape)) { std::vector> elements; for (const Shape& element_shape : shape.tuple_shapes()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr element, - MakeFakeLiteralInternal(element_shape, engine)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr element, + MakeFakeLiteralInternal(element_shape, engine, no_duplicates)); elements.push_back(std::move(element)); } - return Literal::MakeTupleOwned(std::move(elements)); + return LiteralUtil::MakeTupleOwned(std::move(elements)); } if (engine == nullptr) { return Literal::CreateFromShape(shape); } - auto literal = MakeUnique(shape); + auto literal = absl::make_unique(shape); switch (shape.element_type()) { case BF16: - PopulateWithRandomFloatingPointData(literal.get(), engine); + PopulateWithRandomFloatingPointData(literal.get(), engine, + no_duplicates); break; case F16: - PopulateWithRandomFloatingPointData(literal.get(), engine); + PopulateWithRandomFloatingPointData(literal.get(), engine, + no_duplicates); break; case F32: - PopulateWithRandomFloatingPointData(literal.get(), engine); + PopulateWithRandomFloatingPointData(literal.get(), engine, + 
no_duplicates); break; case F64: - PopulateWithRandomFloatingPointData(literal.get(), engine); + PopulateWithRandomFloatingPointData(literal.get(), engine, + no_duplicates); break; case S8: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case U8: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case S16: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case U16: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case S32: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case U32: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case S64: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case U64: - PopulateWithRandomIntegralData(literal.get(), engine); + PopulateWithRandomIntegralData(literal.get(), engine, + no_duplicates); break; case PRED: { std::uniform_int_distribution generator(0, 1); - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice /*indices*/) { + TF_CHECK_OK( + literal->Populate([&](absl::Span /*indices*/) { return generator(*engine); })); break; } + // Token requires no data. + case TOKEN: + break; default: return Unimplemented("Unsupported type for fake literal generation: %s", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } return std::move(literal); } @@ -172,6 +203,7 @@ enum class ConstantType { kUnknown, kZero, kOne }; // Return the constant type required by this computation, if known. 
ConstantType GetInitValue(const HloComputation& computation) { + // TODO(b/77635120): Add init values, for min, max, and their arg variants. const HloInstruction* const root = computation.root_instruction(); if (computation.num_parameters() != 2 || root->operand_count() != 2 || root->operand(0)->opcode() != HloOpcode::kParameter || @@ -196,28 +228,24 @@ bool NeedsInitValue(const HloUse& use) { const HloInstruction* const instruction = use.instruction; const HloOpcode opcode = instruction->opcode(); const int64 op_num = use.operand_number; - return ( - ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) && - op_num == 1) || - (opcode == HloOpcode::kSelectAndScatter && op_num == 2)); + return ((opcode == HloOpcode::kReduceWindow && op_num == 1) || + (opcode == HloOpcode::kSelectAndScatter && op_num == 2) || + (opcode == HloOpcode::kReduce && + op_num >= instruction->operand_count() / 2)); } // Generate random values that are constrained to the input_shape minus the // output_shape so as not to produce wrapping slices, for instance. 
-std::unique_ptr MakeRandomNonwrappingSliceIndex( - const Shape& input_shape, const Shape& slice_shape, - std::minstd_rand0* engine) { - const int64 rank = ShapeUtil::Rank(input_shape); - std::vector start_indices(rank); +std::unique_ptr MakeRandomIndex(absl::Span index_space, + std::minstd_rand0* engine) { + std::vector start_indices(index_space.size()); if (engine != nullptr) { - for (int i = 0; i < rank; ++i) { - const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) - - ShapeUtil::GetDimension(slice_shape, i); - std::uniform_int_distribution generator(0, upper_bound); + for (int i = 0; i < index_space.size(); ++i) { + std::uniform_int_distribution generator(0, index_space[i]); start_indices[i] = generator(*engine); } } - return Literal::CreateR1(start_indices); + return LiteralUtil::CreateR1(start_indices); } // Use dataflow analysis on each parameter to see if there are uses that would @@ -250,6 +278,11 @@ std::vector FindConstrainedUses( auto converted_uses = FindConstrainedUses(dataflow, *instruction); constrained_uses.insert(constrained_uses.end(), converted_uses.begin(), converted_uses.end()); + } else if (opcode == HloOpcode::kSort && + instruction->operand_count() == 2 && op_num == 0) { + // Operand 0 of sort is the array of keys used for key/value + // (two-operand) kSort instructions. + constrained_uses.push_back(instruction); } } } @@ -261,62 +294,81 @@ std::vector FindConstrainedUses( // generate a constrained literal (either bounded in the case of indices, or // zero in the case of init_values for reductions). 
StatusOr> CreateLiteralForConstrainedUses( - const tensorflow::gtl::ArraySlice constrained_uses, + const absl::Span constrained_uses, const HloInstruction& param, std::minstd_rand0* engine) { - HloInstruction* needs_index = nullptr; - HloInstruction* needs_constant = nullptr; + std::vector index_space; + bool no_duplicates = false; + bool needs_constant = false; ConstantType constant_type = ConstantType::kUnknown; for (HloInstruction* use : constrained_uses) { switch (use->opcode()) { case HloOpcode::kDynamicSlice: - case HloOpcode::kDynamicUpdateSlice: - if (needs_index != nullptr && - !ShapeUtil::Equal(needs_index->shape(), use->shape())) { - return Unimplemented( - "Conflicting operand generation slice index constraints\n"); + case HloOpcode::kDynamicUpdateSlice: { + const Shape& indexed_shape = use->operand(0)->shape(); + const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice + ? use->shape() + : use->operand(1)->shape(); + const int64 rank = ShapeUtil::Rank(indexed_shape); + if (!index_space.empty()) { + TF_RET_CHECK(rank == index_space.size()); + for (int64 i = 0; i < rank; ++i) { + index_space[i] = std::min( + index_space[i], ShapeUtil::GetDimension(indexed_shape, i) - + ShapeUtil::GetDimension(slice_shape, i)); + } + } else { + index_space.resize(rank); + for (int64 i = 0; i < rank; ++i) { + index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) - + ShapeUtil::GetDimension(slice_shape, i); + } } - needs_index = use; break; - + } case HloOpcode::kReduce: case HloOpcode::kReduceWindow: - needs_constant = use; + needs_constant = true; constant_type = GetInitValue(*use->to_apply()); break; case HloOpcode::kSelectAndScatter: - needs_constant = use; + needs_constant = true; constant_type = GetInitValue(*use->scatter()); break; + case HloOpcode::kSort: + no_duplicates = true; + break; + default: return Unimplemented( "Constrained operand generation not implemented for %s.", - use->ToString().c_str()); + use->ToString()); } } - if (needs_index != 
nullptr && needs_constant != nullptr) { - return Unimplemented( - "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds " - "constant: %s\n", - needs_index->ToString().c_str(), needs_constant->ToString().c_str()); + int constraint_count = 0; + constraint_count += no_duplicates ? 1 : 0; + constraint_count += !index_space.empty() ? 1 : 0; + constraint_count += needs_constant ? 1 : 0; + if (constraint_count > 1) { + return Unimplemented("Conflicting operand generation constraints."); } - if (needs_index != nullptr) { - return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(), - needs_index->shape(), engine); - } else if (needs_constant != nullptr) { + if (!index_space.empty()) { + return MakeRandomIndex(index_space, engine); + } else if (needs_constant) { switch (constant_type) { case ConstantType::kZero: - return Literal::Zero(param.shape().element_type()).CloneToUnique(); + return LiteralUtil::Zero(param.shape().element_type()).CloneToUnique(); case ConstantType::kOne: - return Literal::One(param.shape().element_type()).CloneToUnique(); + return LiteralUtil::One(param.shape().element_type()).CloneToUnique(); case ConstantType::kUnknown: // We want the identity element for the computation, but we don't really // know what it is - so any value we generate will be just as wrong. - return MakeFakeLiteralInternal(param.shape(), engine); + return MakeFakeLiteralInternal(param.shape(), engine, + /*no_duplicates=*/false); } } else { - return MakeFakeLiteralInternal(param.shape(), engine); + return MakeFakeLiteralInternal(param.shape(), engine, no_duplicates); } } @@ -333,25 +385,36 @@ StatusOr> MakeConstrainedArgument( StatusOr> MakeFakeLiteral(const Shape& shape, bool pseudo_random) { - auto engine = pseudo_random ? MakeUnique() : nullptr; - return MakeFakeLiteralInternal(shape, engine.get()); + auto engine = + pseudo_random ? 
absl::make_unique() : nullptr; + return MakeFakeLiteralInternal(shape, engine.get(), /*no_duplicates=*/false); } StatusOr>> MakeFakeArguments( HloModule* const module, bool pseudo_random) { + auto engine = + pseudo_random ? absl::make_unique() : nullptr; + return MakeFakeArguments(module, engine.get()); +} + +StatusOr>> MakeFakeArguments( + HloModule* const module, std::minstd_rand0* engine) { TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(*module)); const auto params = module->entry_computation()->parameter_instructions(); - auto engine = pseudo_random ? MakeUnique() : nullptr; std::vector> arguments(params.size()); for (int i = 0; i < params.size(); ++i) { - TF_ASSIGN_OR_RETURN(arguments[i], MakeConstrainedArgument( - *dataflow, *params[i], engine.get())); + arguments[i] = + MakeConstrainedArgument(*dataflow, *params[i], engine).ValueOrDie(); } return std::move(arguments); } -Status VerifyHloModule(HloModule* const module, bool allow_mixed_precision) { - return HloVerifier(allow_mixed_precision).Run(module).status(); +Status VerifyHloModule(HloModule* const module, bool layout_sensitive, + bool allow_mixed_precision) { + return HloVerifier(/*layout_sensitive=*/layout_sensitive, + /*allow_mixed_precision=*/allow_mixed_precision) + .Run(module) + .status(); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h index a8689f64981569ceb7c8a712f8ece00c99e8cf2d..7790737c093ad8e5a15c017e3f7890b6f25cb6f8 100644 --- a/tensorflow/compiler/xla/tests/test_utils.h +++ b/tensorflow/compiler/xla/tests/test_utils.h @@ -20,12 +20,12 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/platform.h" @@ -63,8 +63,17 @@ StatusOr> MakeFakeLiteral(const Shape& shape, // Generates a vector of arguments containing fake data. The number, shape and // layout of the arguments is appropriate for given HLO module. // -// Will handle special cases such as making sure that indices used for dynamic -// slices are bounded, reduces that call adds use 0 as an init value, etc. +// A best-effort attempt is made to generate the data in a way which produce +// stable computation results across platforms. Specifically: +// +// (1) Init values of reductions should be the identity of the reduction +// computation. +// +// (2) Indices of dynamic slices and update slices should be in bounds. +// +// (3) Keys of key/value sorts should contain no duplicates. +// +// These constraints are best-effort only. // // If pseudo_random is true, the generated numbers will be generated // deterministically in a pseudo random way unless the values are constrated to @@ -78,10 +87,16 @@ StatusOr> MakeFakeLiteral(const Shape& shape, StatusOr>> MakeFakeArguments( HloModule* const module, bool pseudo_random = true); +// Overload which accepts a random number generator. This enables generation of +// different random values with sequential calls to MakeFakeArguments by reusing +// the same generator. +StatusOr>> MakeFakeArguments( + HloModule* const module, std::minstd_rand0* engine); + // Check that a given module satisfies various constraints before trying to // execute it. 
-Status VerifyHloModule(HloModule* const module, - bool allow_mixed_precision = false); +Status VerifyHloModule(HloModule* const module, bool layout_sensitive, + bool allow_mixed_precision); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc index 59afd28a80c0fbf3df38457cd05961c883769856..322c8ef090cf867f65cada5cb1dbae188f83bad6 100644 --- a/tensorflow/compiler/xla/tests/test_utils_test.cc +++ b/tensorflow/compiler/xla/tests/test_utils_test.cc @@ -15,10 +15,12 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -31,16 +33,16 @@ XLA_TEST_F(TestUtilsTest, UnusedParam) { XlaBuilder builder(TestName()); // Make the reduction lambda. Shape single_float = ShapeUtil::MakeShape(F32, {}); - builder.Parameter(0, single_float, "unused"); - builder.Parameter(1, single_float, "used"); + Parameter(&builder, 0, single_float, "unused"); + Parameter(&builder, 1, single_float, "used"); auto computation_status = builder.Build(); TF_ASSERT_OK(computation_status.status()); // Make the reduction. 
Shape pair_float = ShapeUtil::MakeShape(F32, {2}); - builder.Reduce(builder.Parameter(0, pair_float, "operand"), - builder.Parameter(1, single_float, "init"), - computation_status.ValueOrDie(), {0}); + Reduce(Parameter(&builder, 0, pair_float, "operand"), + Parameter(&builder, 1, single_float, "init"), + computation_status.ValueOrDie(), {0}); computation_status = builder.Build(); TF_ASSERT_OK(computation_status.status()); @@ -53,5 +55,124 @@ XLA_TEST_F(TestUtilsTest, UnusedParam) { TF_ASSERT_OK(MakeFakeArguments(&module).status()); } +XLA_TEST_F(TestUtilsTest, Token) { + auto module = ParseHloString( + R"(HloModule outfeed_module + + ENTRY InfeedToOutfeed { + token = token[] parameter(0) + infeed = ((u32[3]{0}, pred[]), token[]) infeed(token) + infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0 + outfeed = token[] outfeed(infeed.data, token) + ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token) + infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0 + infeed.1.token = token[] get-tuple-element(infeed.1), index=1 + outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token) + })") + .ValueOrDie(); + TF_ASSERT_OK(MakeFakeArguments(module.get()).status()); +} + +XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) { + auto module = ParseHloString( + R"(HloModule index_space_module + + ENTRY IndexSpace { + index_param = s32[3]{0} parameter(0) + array_param.1 = f32[123,4,789]{0,1,2} parameter(1) + array_param.2 = f32[3,3000,5]{0,1,2} parameter(2) + dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3} + ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2} + })") + .ValueOrDie(); + TF_ASSERT_OK_AND_ASSIGN(std::vector> args, + MakeFakeArguments(module.get())); + ASSERT_EQ(args.size(), 3); + const Literal& index_arg = *args[0]; + + EXPECT_EQ(index_arg.Get({0}), 0); + + EXPECT_GE(index_arg.Get({1}), 0); + 
EXPECT_LE(index_arg.Get({1}), 2); + + EXPECT_GE(index_arg.Get({2}), 0); + EXPECT_LE(index_arg.Get({2}), 3); +} + +XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) { + auto module = ParseHloString( + R"(HloModule index_space_module + + ENTRY IndexSpace { + index_param = s32[3]{0} parameter(0) + array_param.1 = f32[123,4,789]{0,1,2} parameter(1) + array_param.2 = f32[3,3000,5]{0,1,2} parameter(2) + update_param.1 = f32[1,2,3]{0,1,2} parameter(3) + update_param.2 = f32[3,2,2]{0,1,2} parameter(4) + + dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param) + ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param) + })") + .ValueOrDie(); + TF_ASSERT_OK_AND_ASSIGN(std::vector> args, + MakeFakeArguments(module.get())); + ASSERT_EQ(args.size(), 5); + const Literal& index_arg = *args[0]; + + EXPECT_EQ(index_arg.Get({0}), 0); + + EXPECT_GE(index_arg.Get({1}), 0); + EXPECT_LE(index_arg.Get({1}), 2); + + EXPECT_GE(index_arg.Get({2}), 0); + EXPECT_LE(index_arg.Get({2}), 3); +} + +XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) { + // Inputs which are sort keys in key/value sorts should have no duplicates. 
+ auto module = ParseHloString(R"( +HloModule sort.148.1589 + +ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) { + %parameter.0 = f32[1048576]{0} parameter(0) + %parameter.1 = s32[1048576]{0} parameter(1) + ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0} +} +)") + .ValueOrDie(); + TF_ASSERT_OK_AND_ASSIGN(std::vector> args, + MakeFakeArguments(module.get())); + ASSERT_EQ(args.size(), 2); + const Literal& key_arg = *args[0]; + + tensorflow::gtl::FlatSet key_set; + for (const float& value : key_arg.data()) { + EXPECT_TRUE(key_set.insert(tensorflow::bit_cast(value)).second); + } +} + +XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) { + // Inputs which are sort keys in key/value sorts should have no duplicates. + auto module = ParseHloString(R"( +HloModule sort.148.1589 + +ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) { + %parameter.0 = s32[1048576]{0} parameter(0) + %parameter.1 = s32[1048576]{0} parameter(1) + ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0} +} +)") + .ValueOrDie(); + TF_ASSERT_OK_AND_ASSIGN(std::vector> args, + MakeFakeArguments(module.get())); + ASSERT_EQ(args.size(), 2); + const Literal& key_arg = *args[0]; + + tensorflow::gtl::FlatSet key_set; + for (const int32& value : key_arg.data()) { + EXPECT_TRUE(key_set.insert(tensorflow::bit_cast(value)).second); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7eb9e2dbe0e27b7933f5861280a3401cd268c08 --- /dev/null +++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc @@ -0,0 +1,214 @@ +/* Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class TokenHloTest : public HloTestBase {}; + +XLA_TEST_F(TokenHloTest, SingleTokenInstruction) { + std::unique_ptr module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + builder.AddInstruction(HloInstruction::CreateToken()); + + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, + Execute(std::move(module), {})); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *LiteralUtil::CreateToken())); +} + +XLA_TEST_F(TokenHloTest, TokenTree) { + std::unique_ptr module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); + auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); + auto token2 = builder.AddInstruction(HloInstruction::CreateToken()); + builder.AddInstruction( + HloInstruction::CreateAfterAll({token0, token0, token1, token2})); + + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr 
result, + Execute(std::move(module), {})); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *LiteralUtil::CreateToken())); +} + +XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) { + std::unique_ptr module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); + builder.AddInstruction( + HloInstruction::CreateParameter(1, ShapeUtil::MakeTokenShape(), "p1")); + builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); + module->AddEntryComputation(builder.Build()); + + Status status = + HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false) + .Run(module.get()) + .status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT( + status.error_message(), + ::testing::HasSubstr("Entry parameter 1 is or contains a token shape")); +} + +XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) { + std::unique_ptr module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + builder.AddInstruction(HloInstruction::CreateParameter( + 0, + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {1, 2, 3}), ShapeUtil::MakeTokenShape()}), + "param")); + module->AddEntryComputation(builder.Build()); + + Status status = + HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false) + .Run(module.get()) + .status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT( + status.error_message(), + ::testing::HasSubstr("Entry parameter 0 is or contains a token shape")); +} + +XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) { + std::unique_ptr module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); + builder.AddInstruction(HloInstruction::CreateAfterAll({param})); + builder.AddInstruction( + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(123))); + module->AddEntryComputation(builder.Build()); + + Status status = + HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false) + .Run(module.get()) + .status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT(status.error_message(), + ::testing::HasSubstr( + "Operands of token instructions must be TOKEN types")); +} + +XLA_TEST_F(TokenHloTest, TokenInWhileLoop) { + // Thread a token around a while loop. Token is created and consumed by a + // AfterAll instruction in the while body. + string module_string = R"( +HloModule TokenInWhileLoop + +%Body (param.1: (s32[], token[])) -> (s32[], token[]) { + %param.1 = (s32[], token[]) parameter(0) + %get-tuple-element.1 = s32[] get-tuple-element((s32[], token[]) %param.1), index=0 + %constant.1 = s32[] constant(1) + %add = s32[] add(s32[] %get-tuple-element.1, s32[] %constant.1) + %get-tuple-element.2 = token[] get-tuple-element((s32[], token[]) %param.1), index=1 + %after-all = token[] after-all(token[] %get-tuple-element.2) + ROOT %tuple = (s32[], token[]) tuple(s32[] %add, token[] %after-all) +} + +%Cond (param: (s32[], token[])) -> pred[] { + %param = (s32[], token[]) parameter(0) + %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0 + %constant = s32[] constant(42) + ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant) +} + +ENTRY %TokenInWhileLoop () -> s32[] { + %zero = s32[] constant(0) + %init_token = token[] after-all() + %init_tuple = (s32[], token[]) tuple(s32[] %zero, token[] %init_token) + %while = (s32[], token[]) while((s32[], token[]) %init_tuple), condition=%Cond, body=%Body + ROOT %root = s32[] get-tuple-element((s32[], token[]) %while), index=0 +} +)"; + + DebugOptions debug_options = GetDebugOptionsForTest(); + // Module DCE pass removes the generate token instructions. 
+ debug_options.add_xla_disable_hlo_passes("hlo-module-dce"); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + HloRunner::CreateModuleFromString(module_string, debug_options)); + + EXPECT_TRUE(RunAndCompare(std::move(module), error_spec_)); +} + +XLA_TEST_F(TokenHloTest, TokenInConditional) { + string module_string = R"( +HloModule TokenInConditional + +%True (param.1: token[]) -> (s32[], token[]) { + %param.1 = token[] parameter(0) + %forty_two = s32[] constant(42) + ROOT %tuple = (s32[], token[]) tuple(s32[] %forty_two, token[] %param.1) +} + +%False (param.2: s32[]) -> (s32[], token[]) { + %param.2 = s32[] parameter(0) + %new_token = token[] after-all() + ROOT %tuple = (s32[], token[]) tuple(s32[] %param.2, token[] %new_token) +} + +ENTRY %TokenInConditional (param.3: pred[]) -> s32[] { + %param.3 = pred[] parameter(0) + %init_token = token[] after-all() + %seven = s32[] constant(7) + %cond = (s32[], token[]) conditional(pred[] %param.3, token[] %init_token, s32[] %seven), true_computation=True, false_computation=False + ROOT %root = s32[] get-tuple-element((s32[], token[]) %cond), index=0 +} +)"; + + DebugOptions debug_options = GetDebugOptionsForTest(); + // Module DCE pass removes the generate token instructions. + debug_options.add_xla_disable_hlo_passes("hlo-module-dce"); + + { + // True case. + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + HloRunner::CreateModuleFromString(module_string, debug_options)); + auto arg = LiteralUtil::CreateR0(true); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, + Execute(std::move(module), {arg.get()})); + EXPECT_EQ(42, result->Get({})); + } + + { + // False case. 
+ TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + HloRunner::CreateModuleFromString(module_string, debug_options)); + auto arg = LiteralUtil::CreateR0(false); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, + Execute(std::move(module), {arg.get()})); + EXPECT_EQ(7, result->Get({})); + } +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index 0063e7ad415e9b6718c164f415ced6fb76cbf44a..125513ddfd16cb4e742e7d589e22b721307621ee 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -18,10 +18,11 @@ limitations under the License. #include #include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -31,6 +32,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -41,7 +43,12 @@ class TransferManagerTest : public LocalClientTestBase { TransferManagerTest() : shape_size_fn_([this](const Shape& shape) { return transfer_manager_->GetByteSizeRequirement(shape); - }) {} + }) { + stream_ptr_ = local_client_->mutable_backend() + ->BorrowStream(stream_executor_) + .ValueOrDie(); + stream_ = stream_ptr_.get(); + } ~TransferManagerTest() override = default; @@ -53,37 +60,41 @@ class TransferManagerTest : public LocalClientTestBase { .ValueOrDie(); } + protected: + StreamPool::Ptr stream_ptr_; + se::Stream* stream_; + private: std::function shape_size_fn_; }; XLA_TEST_F(TransferManagerTest, TransferR0U32) { - std::unique_ptr literal = Literal::CreateR0(42); + std::unique_ptr literal = LiteralUtil::CreateR0(42); const Shape& shape = literal->shape(); auto device_buffer = AllocateDeviceBuffer(shape); // Round trip literal through device. - ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); LiteralTestUtil::ExpectR0Equal(42, *result); } XLA_TEST_F(TransferManagerTest, TransferR1F32) { std::unique_ptr literal = - Literal::CreateR1({1.25f, 2.5f, -17.0f, -20.125f}); + LiteralUtil::CreateR1({1.25f, 2.5f, -17.0f, -20.125f}); const Shape& shape = literal->shape(); auto device_buffer = AllocateDeviceBuffer(shape); // Round trip literal through device. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); LiteralTestUtil::ExpectR1Equal({1.25f, 2.5f, -17.0f, -20.125f}, *result); @@ -92,48 +103,48 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) { XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) { std::vector test_vector(1024 * 1024); std::iota(test_vector.begin(), test_vector.end(), 0); - std::unique_ptr literal = Literal::CreateR1(test_vector); + std::unique_ptr literal = LiteralUtil::CreateR1(test_vector); const Shape& shape = literal->shape(); auto device_buffer = AllocateDeviceBuffer(shape); // Round trip literal through device. - ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); LiteralTestUtil::ExpectR1Equal(test_vector, *result); } XLA_TEST_F(TransferManagerTest, TransferR1U8) { const char* test_string = "0123456789abcdef"; - std::unique_ptr literal = Literal::CreateR1U8(test_string); + std::unique_ptr literal = LiteralUtil::CreateR1U8(test_string); const Shape& shape = literal->shape(); auto device_buffer = AllocateDeviceBuffer(shape); // Round trip literal through device. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_EQ(result->GetR1U8AsString(), test_string); } XLA_TEST_F(TransferManagerTest, TransferR2F32) { std::unique_ptr literal = - Literal::CreateR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}); + LiteralUtil::CreateR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}); const Shape& shape = literal->shape(); auto device_buffer = AllocateDeviceBuffer(shape); // Round trip literal through device. - ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result); @@ -141,7 +152,7 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) { XLA_TEST_F(TransferManagerTest, TransferR2F32AndChangeLayoutTransferringToDevice) { - std::unique_ptr literal = Literal::CreateR2WithLayout( + std::unique_ptr literal = LiteralUtil::CreateR2WithLayout( {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, LayoutUtil::MakeLayout({0, 1})); const Shape ondevice_shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}); @@ -149,11 +160,11 @@ XLA_TEST_F(TransferManagerTest, // Round trip literal through device. Set the on-device layout to something // different than the literal layout. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_FALSE( LayoutUtil::Equal(result->shape().layout(), literal->shape().layout())); @@ -162,89 +173,237 @@ XLA_TEST_F(TransferManagerTest, } XLA_TEST_F(TransferManagerTest, TransferTuple) { - std::unique_ptr literal = Literal::MakeTuple( - {Literal::CreateR0(123.0f).get(), - Literal::CreateR2({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(), - Literal::CreateR1({44.0f, -10.0f, 3333333.3f}).get()}); + std::unique_ptr literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(123.0f).get(), + LiteralUtil::CreateR2({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(), + LiteralUtil::CreateR1({44.0f, -10.0f, 3333333.3f}).get()}); auto device_buffer = AllocateDeviceBuffer(literal->shape()); // Round trip literal through device. - ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) { - std::unique_ptr literal = Literal::MakeTuple({}); + std::unique_ptr literal = LiteralUtil::MakeTuple({}); auto device_buffer = AllocateDeviceBuffer(literal->shape()); // Round trip literal through device. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferNestedTuple) { - std::unique_ptr literal = Literal::MakeTuple( - {Literal::CreateR0(123.0f).get(), - Literal::MakeTuple( - {Literal::CreateR2({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(), - Literal::CreateR1({44.0f, -10.0f, 3333333.3f}).get()}) + std::unique_ptr literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(123.0f).get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(), + LiteralUtil::CreateR1({44.0f, -10.0f, 3333333.3f}).get()}) .get(), - Literal::CreateR1({-10.0f, 123.0f}).get()}); + LiteralUtil::CreateR1({-10.0f, 123.0f}).get()}); auto device_buffer = AllocateDeviceBuffer(literal->shape()); // Round trip literal through device. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferComplexValue) { - std::unique_ptr literal = Literal::CreateR1( + std::unique_ptr literal = LiteralUtil::CreateR1( {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)}); auto device_buffer = AllocateDeviceBuffer(literal->shape()); // Round trip literal through device. - ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { - std::unique_ptr literal = Literal::MakeTuple( - {Literal::CreateR1( + std::unique_ptr literal = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1( {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)}) .get(), - Literal::CreateR1({1, 2, 3, 4, 5, 6}).get(), - Literal::CreateR0(complex64(0.3f, -0.4f)).get()}); + LiteralUtil::CreateR1({1, 2, 3, 4, 5, 6}).get(), + LiteralUtil::CreateR0(complex64(0.3f, -0.4f)).get()}); auto device_buffer = AllocateDeviceBuffer(literal->shape()); // Round trip literal through device. 
- ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, device_buffer)); - TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, - transfer_manager_->TransferLiteralFromDevice( - stream_executor_, device_buffer)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } +XLA_TEST_F(TransferManagerTest, TransferTokenFromDevice) { + // "Copy" a token from the device. The token has no physical representation so + // no copying is actually performed, but it shouldn't fail. + // TODO(b/110532604): Add transferring the token to device when this is + // supported. + auto device_buffer = AllocateDeviceBuffer(ShapeUtil::MakeTokenShape()); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); + EXPECT_TRUE(LiteralTestUtil::Equal(*LiteralUtil::CreateToken(), *result)); +} + +XLA_TEST_F(TransferManagerTest, MultiStreamRoundTripSoak) { + const int64 kIterationCount = 5000; + std::unique_ptr literal1 = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(123.0f).get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(), + LiteralUtil::CreateR1({44.0f, -10.0f, 3333333.3f}).get()}) + .get(), + LiteralUtil::CreateR1({-10.0f, 123.0f}).get()}); + std::unique_ptr literal2 = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(456.0f).get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2({{5.0f, 7.0f}, {9.0f, 4.0f}}).get(), + LiteralUtil::CreateR1({44.0f, -11.0f, 3333333.3f}).get()}) + .get(), + LiteralUtil::CreateR1({-98.0f, 153.0f}).get()}); + + auto device_buffer1 = AllocateDeviceBuffer(literal1->shape()); + auto device_buffer2 = AllocateDeviceBuffer(literal2->shape()); + + auto stream1 = stream_; + auto stream2 = 
stream_->GetOrCreateSubStream(); + + std::unique_ptr result1, result2; + + // Round trip literals through device in multiple streams asynchronously. + for (int i = 0; i < kIterationCount; ++i) { + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream1, *literal1, + device_buffer1)); + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream2, *literal2, + device_buffer2)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr this_result1, + transfer_manager_->TransferLiteralFromDevice(stream1, device_buffer1)); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr this_result2, + transfer_manager_->TransferLiteralFromDevice(stream2, device_buffer2)); + result1 = std::move(this_result1); + result2 = std::move(this_result2); + } + + EXPECT_TRUE(LiteralTestUtil::Equal(*literal1, *result1)); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal2, *result2)); +} + +class TransferDeviceToHostBenchmark : public TransferManagerTest { + public: + using TransferManagerTest::TransferManagerTest; + ~TransferDeviceToHostBenchmark() override {} + + void Run(int iters, int num_tuple_elements, int array_size) { + tensorflow::testing::StopTiming(); + SetUp(); + + std::vector> tuple_elements; + for (int i = 0; i < num_tuple_elements; ++i) { + tuple_elements.push_back( + LiteralUtil::CreateR2F32Linspace(0.0f, 1.0f, array_size, array_size)); + } + std::unique_ptr literal = + LiteralUtil::MakeTupleOwned(std::move(tuple_elements)); + auto device_buffer = AllocateDeviceBuffer(literal->shape()); + TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); + } + tensorflow::testing::StopTiming(); + TearDown(); + } + + void TestBody() override {} +}; + +class TransferHostToDeviceBenchmark : public TransferManagerTest { + public: + using 
TransferManagerTest::TransferManagerTest; + ~TransferHostToDeviceBenchmark() override {} + + void Run(int iters, int num_tuple_elements, int array_size) { + tensorflow::testing::StopTiming(); + SetUp(); + + std::vector> tuple_elements; + for (int i = 0; i < num_tuple_elements; ++i) { + tuple_elements.push_back( + LiteralUtil::CreateR2F32Linspace(0.0f, 1.0f, array_size, array_size)); + } + std::unique_ptr literal = + LiteralUtil::MakeTupleOwned(std::move(tuple_elements)); + auto device_buffer = AllocateDeviceBuffer(literal->shape()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, *literal, + device_buffer)); + } + tensorflow::testing::StopTiming(); + TearDown(); + } + + void TestBody() override {} +}; + +void BM_TransferDeviceToHost(int iters, int num_tuple_elements, + int array_size) { + TransferDeviceToHostBenchmark bm; + bm.Run(iters, num_tuple_elements, array_size); +} + +void BM_TransferHostToDevice(int iters, int num_tuple_elements, + int array_size) { + TransferHostToDeviceBenchmark bm; + bm.Run(iters, num_tuple_elements, array_size); +} + +BENCHMARK(BM_TransferHostToDevice) + ->ArgPair(1, 256) + ->ArgPair(1, 257) + ->ArgPair(100, 256) + ->ArgPair(100, 257); + +BENCHMARK(BM_TransferDeviceToHost) + ->ArgPair(1, 256) + ->ArgPair(1, 257) + ->ArgPair(100, 256) + ->ArgPair(100, 257); + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + tensorflow::testing::RunBenchmarks(); + return RUN_ALL_TESTS(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc index fe1e3da7eca00e128377e6e56af877868aafa836..fbe9d1b64aa0c06d65b547c45cfa981800d40ff3 100644 --- a/tensorflow/compiler/xla/tests/transpose_test.cc +++ b/tensorflow/compiler/xla/tests/transpose_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -38,34 +38,35 @@ class TransposeTest : public ClientLibraryTestBase { XLA_TEST_F(TransposeTest, Transpose0x0) { XlaBuilder builder("Transpose"); - auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 0)); - auto result = builder.Transpose(lhs, {1, 0}); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 0)); + Transpose(lhs, {1, 0}); ComputeAndCompareR2(&builder, Array2D(0, 0), {}, error_spec_); } XLA_TEST_F(TransposeTest, Transpose0x42) { XlaBuilder builder("Transpose"); - auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 42)); - auto result = builder.Transpose(lhs, {1, 0}); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 42)); + Transpose(lhs, {1, 0}); ComputeAndCompareR2(&builder, Array2D(42, 0), {}, error_spec_); } XLA_TEST_F(TransposeTest, Transpose7x0) { XlaBuilder builder("Transpose"); - auto lhs = builder.ConstantR2FromArray2D(Array2D(7, 0)); - auto result = builder.Transpose(lhs, {1, 0}); + auto lhs = ConstantR2FromArray2D(&builder, Array2D(7, 0)); + Transpose(lhs, {1, 0}); ComputeAndCompareR2(&builder, Array2D(0, 7), {}, error_spec_); } TEST_F(TransposeTest, Transpose2x2) { XlaBuilder builder("Transpose"); - auto lhs = builder.ConstantR2({ - {1.0, 2.0}, {3.0, 4.0}, - }); - auto result = builder.Transpose(lhs, {1, 0}); + auto lhs = ConstantR2(&builder, { + {1.0, 2.0}, + {3.0, 4.0}, + }); + Transpose(lhs, {1, 0}); Array2D expected({{1.0f, 3.0f}, {2.0f, 4.0f}}); @@ -74,16 +75,18 @@ TEST_F(TransposeTest, Transpose2x2) { XLA_TEST_F(TransposeTest, Transpose0x2x3_2x3x0) { XlaBuilder builder("Transpose"); - auto operand = 
builder.ConstantR3FromArray3D(Array3D(0, 2, 3)); - auto result = builder.Transpose(operand, {1, 2, 0}); + auto operand = + ConstantR3FromArray3D(&builder, Array3D(0, 2, 3)); + Transpose(operand, {1, 2, 0}); ComputeAndCompareR3(&builder, Array3D(2, 3, 0), {}); } TEST_F(TransposeTest, Transpose1x2x3_2x3x1) { XlaBuilder builder("Transpose"); - auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); - auto result = builder.Transpose(operand, {1, 2, 0}); + auto operand = + ConstantR3FromArray3D(&builder, {{{1, 2, 3}, {4, 5, 6}}}); + Transpose(operand, {1, 2, 0}); Array3D expected({{{1}, {2}, {3}}, {{4}, {5}, {6}}}); @@ -92,8 +95,9 @@ TEST_F(TransposeTest, Transpose1x2x3_2x3x1) { TEST_F(TransposeTest, Transpose1x2x3_3x2x1) { XlaBuilder builder("Transpose"); - auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); - auto result = builder.Transpose(operand, {2, 1, 0}); + auto operand = + ConstantR3FromArray3D(&builder, {{{1, 2, 3}, {4, 5, 6}}}); + Transpose(operand, {2, 1, 0}); Array3D expected({{{1}, {4}}, {{2}, {5}}, {{3}, {6}}}); @@ -102,8 +106,9 @@ TEST_F(TransposeTest, Transpose1x2x3_3x2x1) { TEST_F(TransposeTest, Transpose1x2x3_1x2x3) { XlaBuilder builder("Transpose"); - auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); - auto result = builder.Transpose(operand, {0, 1, 2}); + auto operand = + ConstantR3FromArray3D(&builder, {{{1, 2, 3}, {4, 5, 6}}}); + Transpose(operand, {0, 1, 2}); Array3D expected({{{1, 2, 3}, {4, 5, 6}}}); @@ -116,9 +121,9 @@ TEST_F(TransposeTest, MultiTranspose3x2) { for (int transposes = 0; transposes <= 10; ++transposes) { XlaBuilder builder("Transpose"); - auto computed = builder.ConstantR2FromArray2D(input); + auto computed = ConstantR2FromArray2D(&builder, input); for (int i = 0; i < transposes; ++i) { - computed = builder.Transpose(computed, {1, 0}); + computed = Transpose(computed, {1, 0}); } const Array2D& expected = transposes % 2 == 0 ? 
input : transposed; ComputeAndCompareR2(&builder, expected, {}, error_spec_); @@ -130,8 +135,8 @@ TEST_F(TransposeTest, Small_1x1) { auto aoperand = MakeLinspaceArray2D(0.0, 1.0, 1, 1); XlaBuilder builder("transpose_1x1"); - auto operand = builder.ConstantR2FromArray2D(*aoperand); - builder.Transpose(operand, {1, 0}); + auto operand = ConstantR2FromArray2D(&builder, *aoperand); + Transpose(operand, {1, 0}); auto expected = ReferenceUtil::TransposeArray2D(*aoperand); ComputeAndCompareR2(&builder, *expected, {}, ErrorSpec(1e-4)); @@ -142,8 +147,8 @@ TEST_F(TransposeTest, Small_2x2) { auto aoperand = MakeLinspaceArray2D(0.0, 4.0, 2, 2); XlaBuilder builder("transpose_2x2"); - auto operand = builder.ConstantR2FromArray2D(*aoperand); - builder.Transpose(operand, {1, 0}); + auto operand = ConstantR2FromArray2D(&builder, *aoperand); + Transpose(operand, {1, 0}); auto expected = ReferenceUtil::TransposeArray2D(*aoperand); ComputeAndCompareR2(&builder, *expected, {}, ErrorSpec(1e-4)); @@ -162,8 +167,8 @@ void TransposeTest::TestTransposeConstant021(size_t n1, size_t n2, size_t n3) { } XlaBuilder builder(TestName()); - auto operand = builder.ConstantR3FromArray3D(aoperand); - builder.Transpose(operand, {0, 2, 1}); + auto operand = ConstantR3FromArray3D(&builder, aoperand); + Transpose(operand, {0, 2, 1}); ComputeAndCompareR3(&builder, expected, {}); } diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 41189231b90e842292830a932cf381af60456d4c..f2b3b49015c7d74d786f63776abff1d5181fd961 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -16,10 +16,11 @@ limitations under the License. 
#include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -49,12 +51,12 @@ XLA_TEST_F(TupleTest, TupleConstant) { {1.1f, 2.2f, 3.5f}, // row 0 {4.8f, 5.0f, 6.7f}, // row 1 }; - auto value = - Literal::MakeTuple({Literal::CreateR0(constant_scalar).get(), - Literal::CreateR1(constant_vector).get(), - Literal::CreateR2(constant_matrix).get()}); + auto value = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(constant_scalar).get(), + LiteralUtil::CreateR1(constant_vector).get(), + LiteralUtil::CreateR2(constant_matrix).get()}); - builder.ConstantLiteral(*value); + ConstantLiteral(&builder, *value); ComputeAndCompareTuple(&builder, *value, {}, error_spec_); } @@ -64,11 +66,11 @@ XLA_TEST_F(TupleTest, TupleScalarConstant) { const float constant_scalar1 = 7.3f; const float constant_scalar2 = 1.2f; - auto value = - Literal::MakeTuple({Literal::CreateR0(constant_scalar1).get(), - Literal::CreateR0(constant_scalar2).get()}); + auto value = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(constant_scalar1).get(), + LiteralUtil::CreateR0(constant_scalar2).get()}); - builder.ConstantLiteral(*value); + ConstantLiteral(&builder, *value); 
ComputeAndCompareTuple(&builder, *value, {}, error_spec_); } @@ -82,14 +84,14 @@ XLA_TEST_F(TupleTest, TupleCreate) { {1.1f, 2.2f, 3.5f}, // row 0 {4.8f, 5.0f, 6.7f}, // row 1 }; - builder.Tuple({builder.ConstantR0(constant_scalar), - builder.ConstantR1(constant_vector), - builder.ConstantR2(constant_matrix)}); - - auto expected = - Literal::MakeTuple({Literal::CreateR0(constant_scalar).get(), - Literal::CreateR1(constant_vector).get(), - Literal::CreateR2(constant_matrix).get()}); + Tuple(&builder, {ConstantR0(&builder, constant_scalar), + ConstantR1(&builder, constant_vector), + ConstantR2(&builder, constant_matrix)}); + + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0(constant_scalar).get(), + LiteralUtil::CreateR1(constant_vector).get(), + LiteralUtil::CreateR2(constant_matrix).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -97,19 +99,20 @@ XLA_TEST_F(TupleTest, TupleCreate) { XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) { XlaBuilder builder(TestName()); - builder.Tuple( - {builder.ConstantR0(7.0), builder.ConstantR1({})}); + Tuple(&builder, + {ConstantR0(&builder, 7.0), ConstantR1(&builder, {})}); - auto expected = Literal::MakeTuple({Literal::CreateR0(7.0).get(), - Literal::CreateR1({}).get()}); + auto expected = + LiteralUtil::MakeTuple({LiteralUtil::CreateR0(7.0).get(), + LiteralUtil::CreateR1({}).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } // Tests the creation of an empty tuple. 
XLA_TEST_F(TupleTest, EmptyTupleCreate) { XlaBuilder builder(TestName()); - builder.Tuple({}); - auto expected = Literal::MakeTuple({}); + Tuple(&builder, {}); + auto expected = LiteralUtil::MakeTuple({}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -121,9 +124,10 @@ XLA_TEST_F(TupleTest, GetTupleElement) { {1.f, 2.f, 3.f}, // row 0 {4.f, 5.f, 6.f}, // row 1 }; - auto tuple_data = builder.Tuple({builder.ConstantR1(constant_vector), - builder.ConstantR2(constant_matrix)}); - builder.GetTupleElement(tuple_data, 1); + auto tuple_data = + Tuple(&builder, {ConstantR1(&builder, constant_vector), + ConstantR2(&builder, constant_matrix)}); + GetTupleElement(tuple_data, 1); ComputeAndCompareR2(&builder, Array2D(constant_matrix), {}, error_spec_); } @@ -131,17 +135,18 @@ XLA_TEST_F(TupleTest, GetTupleElement) { // Trivial test for extracting a tuple element with GetTupleElement. XLA_TEST_F(TupleTest, GetTupleElementWithZeroElements) { XlaBuilder builder(TestName()); - auto tuple_data = builder.Tuple( - {builder.ConstantR1({}), - builder.ConstantR2FromArray2D(Array2D(0, 101))}); - builder.GetTupleElement(tuple_data, 1); + auto tuple_data = + Tuple(&builder, + {ConstantR1(&builder, {}), + ConstantR2FromArray2D(&builder, Array2D(0, 101))}); + GetTupleElement(tuple_data, 1); ComputeAndCompareR2(&builder, Array2D(0, 101), {}, error_spec_); } XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) { XlaBuilder builder(TestName()); - auto value = builder.ConstantR1({4.5f}); - builder.GetTupleElement(value, 1); + auto value = ConstantR1(&builder, {4.5f}); + GetTupleElement(value, 1); auto result_status = builder.Build(); EXPECT_FALSE(result_status.ok()); EXPECT_THAT( @@ -158,14 +163,15 @@ XLA_TEST_F(TupleTest, AddTupleElements) { {1.f, 2.f, 3.f}, // row 0 {4.f, 5.f, 6.f}, // row 1 }; - auto tuple_data = builder.Tuple({builder.ConstantR1(constant_vector), - builder.ConstantR2(constant_matrix)}); - auto vector_element = 
builder.GetTupleElement(tuple_data, 0); - auto matrix_element = builder.GetTupleElement(tuple_data, 1); + auto tuple_data = + Tuple(&builder, {ConstantR1(&builder, constant_vector), + ConstantR2(&builder, constant_matrix)}); + auto vector_element = GetTupleElement(tuple_data, 0); + auto matrix_element = GetTupleElement(tuple_data, 1); auto vector_shape = builder.GetShape(vector_element).ConsumeValueOrDie(); auto matrix_shape = builder.GetShape(matrix_element).ConsumeValueOrDie(); - builder.Add(matrix_element, vector_element, - /*broadcast_dimensions=*/{1}); + Add(matrix_element, vector_element, + /*broadcast_dimensions=*/{1}); Array2D expected({ {2.f, 4.f, 6.f}, // row 0 @@ -185,13 +191,14 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) { {1.f, 2.f, 3.f}, // row 0 {4.f, 5.f, 6.f}, // row 1 }; - auto tuple_data = builder.Tuple({builder.ConstantR1(constant_vector), - builder.ConstantR2(constant_matrix)}); - builder.Tuple({builder.GetTupleElement(tuple_data, 1), - builder.GetTupleElement(tuple_data, 0)}); - auto expected = - Literal::MakeTuple({Literal::CreateR2(constant_matrix).get(), - Literal::CreateR1(constant_vector).get()}); + auto tuple_data = + Tuple(&builder, {ConstantR1(&builder, constant_vector), + ConstantR2(&builder, constant_matrix)}); + Tuple(&builder, + {GetTupleElement(tuple_data, 1), GetTupleElement(tuple_data, 0)}); + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::CreateR2(constant_matrix).get(), + LiteralUtil::CreateR1(constant_vector).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -206,14 +213,14 @@ XLA_TEST_F(TupleTest, SelectBetweenPredTuples) { std::unique_ptr v2_data = CreateR0Parameter(1.0f, /*parameter_number=*/1, /*name=*/"v2", /*builder=*/&b, /*data_handle=*/&v2); - auto v1_gt = b.Gt(v1, v2); // false - auto v2_gt = b.Gt(v2, v1); // true - auto v1_v2 = b.Tuple({v1_gt, v2_gt}); // {false, true} - auto v2_v1 = b.Tuple({v2_gt, v1_gt}); // {true, false} - b.Select(direction ? 
v1_gt : v2_gt, v1_v2, v2_v1); + auto v1_gt = Gt(v1, v2); // false + auto v2_gt = Gt(v2, v1); // true + auto v1_v2 = Tuple(&b, {v1_gt, v2_gt}); // {false, true} + auto v2_v1 = Tuple(&b, {v2_gt, v1_gt}); // {true, false} + Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1); auto expected = - Literal::MakeTuple({Literal::CreateR0(direction).get(), - Literal::CreateR0(!direction).get()}); + LiteralUtil::MakeTuple({LiteralUtil::CreateR0(direction).get(), + LiteralUtil::CreateR0(!direction).get()}); ComputeAndCompareTuple(&b, *expected, {v1_data.get(), v2_data.get()}, error_spec_); @@ -243,22 +250,23 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) { {1.f, 2.f, 3.f}, // row 0 {4.f, 5.f, 6.f}, // row 1 }; - auto tuple_data = builder.Tuple({builder.ConstantR1(constant_vector), - builder.ConstantR2(constant_matrix)}); - auto new_tuple01 = builder.Tuple({builder.GetTupleElement(tuple_data, 0), - builder.GetTupleElement(tuple_data, 1)}); - auto new_tuple10 = builder.Tuple({builder.GetTupleElement(tuple_data, 1), - builder.GetTupleElement(tuple_data, 0)}); - auto vector_from_01 = builder.GetTupleElement(new_tuple01, 0); - auto vector_from_10 = builder.GetTupleElement(new_tuple10, 1); - auto matrix_from_01 = builder.GetTupleElement(new_tuple01, 1); - auto matrix_from_10 = builder.GetTupleElement(new_tuple10, 0); - - auto addvectors = builder.Add(vector_from_01, vector_from_10); - auto addmatrices = builder.Add(matrix_from_01, matrix_from_10); - - builder.Add(addmatrices, addvectors, - /*broadcast_dimensions=*/{1}); + auto tuple_data = + Tuple(&builder, {ConstantR1(&builder, constant_vector), + ConstantR2(&builder, constant_matrix)}); + auto new_tuple01 = Tuple(&builder, {GetTupleElement(tuple_data, 0), + GetTupleElement(tuple_data, 1)}); + auto new_tuple10 = Tuple(&builder, {GetTupleElement(tuple_data, 1), + GetTupleElement(tuple_data, 0)}); + auto vector_from_01 = GetTupleElement(new_tuple01, 0); + auto vector_from_10 = GetTupleElement(new_tuple10, 1); + auto matrix_from_01 = 
GetTupleElement(new_tuple01, 1); + auto matrix_from_10 = GetTupleElement(new_tuple10, 0); + + auto addvectors = Add(vector_from_01, vector_from_10); + auto addmatrices = Add(matrix_from_01, matrix_from_10); + + Add(addmatrices, addvectors, + /*broadcast_dimensions=*/{1}); Array2D expected({ {4.f, 8.f, 12.f}, // row 0 @@ -273,14 +281,15 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto tuple12 = builder.Tuple( - {builder.ConstantR1(vec1), builder.ConstantR1(vec2)}); - auto tuple21 = builder.Tuple( - {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); - - builder.Select(builder.ConstantR0(false), tuple12, tuple21); - auto expected = Literal::MakeTuple({Literal::CreateR1(vec2).get(), - Literal::CreateR1(vec1).get()}); + auto tuple12 = Tuple(&builder, {ConstantR1(&builder, vec1), + ConstantR1(&builder, vec2)}); + auto tuple21 = Tuple(&builder, {ConstantR1(&builder, vec2), + ConstantR1(&builder, vec1)}); + + Select(ConstantR0(&builder, false), tuple12, tuple21); + auto expected = + LiteralUtil::MakeTuple({LiteralUtil::CreateR1(vec2).get(), + LiteralUtil::CreateR1(vec1).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -292,22 +301,22 @@ XLA_TEST_F(TupleTest, TuplesInAMap) { // Need to put a select in there to prevent HLO-level optimizations from // optimizing out the tuples. 
XlaBuilder b("sort_square"); - auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto x2 = b.Mul(x, x); - auto x_smaller_tuple = b.Tuple({x, x2}); - auto x2_smaller_tuple = b.Tuple({x2, x}); - auto sorted = b.Select(b.Lt(x, x2), x_smaller_tuple, x2_smaller_tuple); - auto smaller = b.GetTupleElement(sorted, 0); - auto greater = b.GetTupleElement(sorted, 1); - b.Add(greater, b.Mul(b.ConstantR0(100.0f), smaller)); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto x2 = Mul(x, x); + auto x_smaller_tuple = Tuple(&b, {x, x2}); + auto x2_smaller_tuple = Tuple(&b, {x2, x}); + auto sorted = Select(Lt(x, x2), x_smaller_tuple, x2_smaller_tuple); + auto smaller = GetTupleElement(sorted, 0); + auto greater = GetTupleElement(sorted, 1); + Add(greater, Mul(ConstantR0(&b, 100.0f), smaller)); auto computation_status = b.Build(); ASSERT_IS_OK(computation_status.status()); tuple_computation = computation_status.ConsumeValueOrDie(); } XlaBuilder b(TestName()); - auto input = b.ConstantR1({-1.0f, 1.0f, 2.1f}); - b.Map({input}, tuple_computation, {0}); + auto input = ConstantR1(&b, {-1.0f, 1.0f, 2.1f}); + Map(&b, {input}, tuple_computation, {0}); ComputeAndCompareR1(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_); } @@ -317,14 +326,15 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesOnTrue) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto tuple12 = builder.Tuple( - {builder.ConstantR1(vec1), builder.ConstantR1(vec2)}); - auto tuple21 = builder.Tuple( - {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); - - builder.Select(builder.ConstantR0(true), tuple12, tuple21); - auto expected = Literal::MakeTuple({Literal::CreateR1(vec1).get(), - Literal::CreateR1(vec2).get()}); + auto tuple12 = Tuple(&builder, {ConstantR1(&builder, vec1), + ConstantR1(&builder, vec2)}); + auto tuple21 = Tuple(&builder, {ConstantR1(&builder, vec2), + ConstantR1(&builder, vec1)}); + + Select(ConstantR0(&builder, true), tuple12, 
tuple21); + auto expected = + LiteralUtil::MakeTuple({LiteralUtil::CreateR1(vec1).get(), + LiteralUtil::CreateR1(vec2).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -335,14 +345,13 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto tuple12 = builder.Tuple( - {builder.ConstantR1(vec1), builder.ConstantR1(vec2)}); - auto tuple21 = builder.Tuple( - {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); + auto tuple12 = Tuple(&builder, {ConstantR1(&builder, vec1), + ConstantR1(&builder, vec2)}); + auto tuple21 = Tuple(&builder, {ConstantR1(&builder, vec2), + ConstantR1(&builder, vec1)}); - auto select = - builder.Select(builder.ConstantR0(false), tuple12, tuple21); - builder.GetTupleElement(select, 0); + auto select = Select(ConstantR0(&builder, false), tuple12, tuple21); + GetTupleElement(select, 0); ComputeAndCompareR1(&builder, vec2, {}, error_spec_); } @@ -371,19 +380,16 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesCascaded) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto pred_tuple = builder.Tuple( - {builder.ConstantR0(true), builder.ConstantR0(false)}); - auto tuple12 = builder.Tuple( - {builder.ConstantR1(vec1), builder.ConstantR1(vec2)}); - auto tuple21 = builder.Tuple( - {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); + auto pred_tuple = Tuple(&builder, {ConstantR0(&builder, true), + ConstantR0(&builder, false)}); + auto tuple12 = Tuple(&builder, {ConstantR1(&builder, vec1), + ConstantR1(&builder, vec2)}); + auto tuple21 = Tuple(&builder, {ConstantR1(&builder, vec2), + ConstantR1(&builder, vec1)}); - auto select1 = - builder.Select(builder.GetTupleElement(pred_tuple, 0), tuple12, tuple21); - auto select2 = - builder.Select(builder.GetTupleElement(pred_tuple, 1), tuple21, select1); - builder.Add(builder.GetTupleElement(select2, 0), - builder.GetTupleElement(select2, 
1)); + auto select1 = Select(GetTupleElement(pred_tuple, 0), tuple12, tuple21); + auto select2 = Select(GetTupleElement(pred_tuple, 1), tuple21, select1); + Add(GetTupleElement(select2, 0), GetTupleElement(select2, 1)); ComputeAndCompareR1(&builder, {3.f, 6.f, 9.f}, {}, error_spec_); } @@ -395,31 +401,32 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesReuseConstants) { std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; - auto c1 = builder.ConstantR1(vec1); - auto c2 = builder.ConstantR1(vec2); - auto tuple12 = builder.Tuple({c1, c2}); - auto tuple21 = builder.Tuple({c2, c1}); + auto c1 = ConstantR1(&builder, vec1); + auto c2 = ConstantR1(&builder, vec2); + auto tuple12 = Tuple(&builder, {c1, c2}); + auto tuple21 = Tuple(&builder, {c2, c1}); - builder.Select(builder.ConstantR0(false), tuple12, tuple21); + Select(ConstantR0(&builder, false), tuple12, tuple21); - auto expected = Literal::MakeTuple({Literal::CreateR1(vec2).get(), - Literal::CreateR1(vec1).get()}); + auto expected = + LiteralUtil::MakeTuple({LiteralUtil::CreateR1(vec2).get(), + LiteralUtil::CreateR1(vec1).get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } XLA_TEST_F(TupleTest, NestedTuples) { XlaBuilder builder(TestName()); - auto inner_tuple = builder.Tuple( - {builder.ConstantR1({1.0, 2.0}), builder.ConstantR0(42.0)}); - builder.Tuple({inner_tuple, builder.ConstantR1({22.0, 44.0})}); + auto inner_tuple = Tuple(&builder, {ConstantR1(&builder, {1.0, 2.0}), + ConstantR0(&builder, 42.0)}); + Tuple(&builder, {inner_tuple, ConstantR1(&builder, {22.0, 44.0})}); - auto expected_v1 = Literal::CreateR1({1.0, 2.0}); - auto expected_s = Literal::CreateR0(42.0); + auto expected_v1 = LiteralUtil::CreateR1({1.0, 2.0}); + auto expected_s = LiteralUtil::CreateR0(42.0); auto expected_inner_tuple = - Literal::MakeTuple({expected_v1.get(), expected_s.get()}); - auto expected_v2 = Literal::CreateR1({22.0, 44.0}); + LiteralUtil::MakeTuple({expected_v1.get(), 
expected_s.get()}); + auto expected_v2 = LiteralUtil::CreateR1({22.0, 44.0}); auto expected = - Literal::MakeTuple({expected_inner_tuple.get(), expected_v2.get()}); + LiteralUtil::MakeTuple({expected_inner_tuple.get(), expected_v2.get()}); ComputeAndCompareTuple(&builder, *expected, {}, error_spec_); } @@ -432,21 +439,21 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) { Shape outer_tuple_shape = ShapeUtil::MakeTupleShape({inner_tuple_shape, data_shape}); - auto input = builder.Parameter(0, outer_tuple_shape, "input"); - auto gte0 = builder.GetTupleElement(input, 0); - auto gte1 = builder.GetTupleElement(gte0, 1); - builder.Add(gte1, builder.ConstantR1({10.0, 11.0, 12.0})); + auto input = Parameter(&builder, 0, outer_tuple_shape, "input"); + auto gte0 = GetTupleElement(input, 0); + auto gte1 = GetTupleElement(gte0, 1); + Add(gte1, ConstantR1(&builder, {10.0, 11.0, 12.0})); std::unique_ptr data = client_ - ->TransferToServer(*Literal::MakeTuple({ - Literal::MakeTuple( + ->TransferToServer(*LiteralUtil::MakeTuple({ + LiteralUtil::MakeTuple( { - Literal::CreateR1({1.0, 2.0, 3.0}).get(), - Literal::CreateR1({4.0, 5.0, 6.0}).get(), + LiteralUtil::CreateR1({1.0, 2.0, 3.0}).get(), + LiteralUtil::CreateR1({4.0, 5.0, 6.0}).get(), }) .get(), - Literal::CreateR1({7.0, 8.0, 9.0}).get(), + LiteralUtil::CreateR1({7.0, 8.0, 9.0}).get(), })) .ConsumeValueOrDie(); @@ -463,25 +470,26 @@ XLA_TEST_F(TupleTest, ComplexTuples) { Shape c64r2 = ShapeUtil::MakeShape(C64, {3, 2}); Shape arg0_shape = ShapeUtil::MakeTupleShape( {c64r0, ShapeUtil::MakeTupleShape({c64r1, c64r2})}); - auto input0 = builder.Parameter(0, arg0_shape, "input0"); - auto t0 = builder.GetTupleElement(input0, 0); - auto t1 = builder.GetTupleElement(input0, 1); - auto t10 = builder.GetTupleElement(t1, 0); - auto t11 = builder.GetTupleElement(t1, 1); - auto sum = builder.Add(builder.Add(t10, t11, {1}), t0); - auto input1 = builder.Parameter(1, c64r1, "input1"); - auto prod = builder.Mul(input1, sum, {1}); - 
builder.Tuple({builder.Tuple({prod, sum}), - builder.ConstantR0({123, 456})}); + auto input0 = Parameter(&builder, 0, arg0_shape, "input0"); + auto t0 = GetTupleElement(input0, 0); + auto t1 = GetTupleElement(input0, 1); + auto t10 = GetTupleElement(t1, 0); + auto t11 = GetTupleElement(t1, 1); + auto sum = Add(Add(t10, t11, {1}), t0); + auto input1 = Parameter(&builder, 1, c64r1, "input1"); + auto prod = Mul(input1, sum, {1}); + Tuple(&builder, {Tuple(&builder, {prod, sum}), + ConstantR0(&builder, {123, 456})}); } std::unique_ptr arg0 = client_ - ->TransferToServer(*Literal::MakeTuple( - {Literal::CreateR0({1, 2}).get(), - Literal::MakeTuple( - {Literal::CreateR1({{10, 20}, {30, 40}}).get(), - Literal::CreateR2( + ->TransferToServer(*LiteralUtil::MakeTuple( + {LiteralUtil::CreateR0({1, 2}).get(), + LiteralUtil::MakeTuple( + {LiteralUtil::CreateR1({{10, 20}, {30, 40}}) + .get(), + LiteralUtil::CreateR2( {{{100, 200}, {300, 400}}, {{1000, 2000}, {3000, 4000}}, {{10000, 20000}, {30000, 40000}}}) @@ -490,23 +498,25 @@ XLA_TEST_F(TupleTest, ComplexTuples) { .ConsumeValueOrDie(); std::unique_ptr arg1 = client_ - ->TransferToServer(*Literal::CreateR1({{1, 2}, {1, -2}})) + ->TransferToServer( + *LiteralUtil::CreateR1({{1, 2}, {1, -2}})) .ConsumeValueOrDie(); - auto sum = Literal::CreateR2({{{111, 222}, {331, 442}}, - {{1011, 2022}, {3031, 4042}}, - {{10011, 20022}, {30031, 40042}}}); - auto prod = MakeUnique(sum->shape()); + auto sum = + LiteralUtil::CreateR2({{{111, 222}, {331, 442}}, + {{1011, 2022}, {3031, 4042}}, + {{10011, 20022}, {30031, 40042}}}); + auto prod = absl::make_unique(sum->shape()); ASSERT_TRUE(prod->Populate( - [&sum](tensorflow::gtl::ArraySlice indexes) { + [&sum](absl::Span indexes) { return sum->Get(indexes) * (indexes[indexes.size() - 1] == 0 ? 
complex64(1, 2) : complex64(1, -2)); }) .ok()); - auto expected = - Literal::MakeTuple({Literal::MakeTuple({prod.get(), sum.get()}).get(), - Literal::CreateR0({123, 456}).get()}); + auto expected = LiteralUtil::MakeTuple( + {LiteralUtil::MakeTuple({prod.get(), sum.get()}).get(), + LiteralUtil::CreateR0({123, 456}).get()}); ComputeAndCompareTuple(&builder, *expected, {arg0.get(), arg1.get()}, error_spec_); } @@ -529,11 +539,58 @@ XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) { auto module = HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); - auto param = Literal::MakeTupleOwned(Literal::CreateR1({1, 2, 3})); + auto param = + LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({1, 2, 3})); auto result = ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( - *result, - *Literal::MakeTupleOwned(Literal::CreateR2({{1, 2, 3}})))); + *LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR2({{1, 2, 3}})), + *result)); +} + +// Disabled on interpreter due to lack of outfeed. +XLA_TEST_F(TupleHloTest, + DISABLED_ON_INTERPRETER(NonAmbiguousTopLevelAllocation)) { + const char* testcase = R"( + HloModule tuple + + ENTRY main { + a = f32[2] parameter(0) + b = f32[2] parameter(1) + c = f32[2] parameter(2) + d = f32[2] parameter(3) + cond = pred[] parameter(4) + + tup0 = (f32[2],f32[2]) tuple(a, b) + tup1 = (f32[2],f32[2]) tuple(c, d) + + s = (f32[2],f32[2]) tuple-select(cond, tup0, tup1) + gte = f32[2] get-tuple-element(s), index=0 + tuple = (f32[2]) tuple(gte) + token = token[] after-all() + ROOT outfeed = token[] outfeed(tuple, token) + } + )"; + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param0 = LiteralUtil::CreateR1({1, 2}); + auto param1 = LiteralUtil::CreateR1({2, 3}); + auto param4 = LiteralUtil::CreateR0(false); + // Put execution on a separate thread so we can block on outfeed. 
+ std::unique_ptr thread( + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions(), "execute_thread", [&] { + TF_EXPECT_OK(Execute(std::move(module), + {param0.get(), param1.get(), param1.get(), + param0.get(), param4.get()}) + .status()); + })); + auto expected = + LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({2, 3})); + auto literal = Literal::CreateFromShape(expected->shape()); + TF_EXPECT_OK(backend().transfer_manager()->TransferLiteralFromOutfeed( + backend().default_stream_executor(), expected->shape(), *literal)); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *literal)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc index c3abe22797f5eaa76ced2ad8534bd68c32983e60..8f80a9f3e466d73f2b718452d9a0d64a80c3b36f 100644 --- a/tensorflow/compiler/xla/tests/unary_op_test.cc +++ b/tensorflow/compiler/xla/tests/unary_op_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -38,8 +38,8 @@ class UnaryOpTest : public ClientLibraryTestBase { template void AbsSize0TestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1({}); - auto abs = builder.Abs(arg); + auto arg = ConstantR1(&builder, {}); + Abs(arg); if (primitive_util::NativeToPrimitiveType() == C64) { ComputeAndCompareR1(&builder, {}, {}); @@ -51,8 +51,8 @@ class UnaryOpTest : public ClientLibraryTestBase { template void AbsTestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1({-2, 25, 0, -123, inf(), -inf()}); - auto abs = builder.Abs(arg); + auto arg 
= ConstantR1(&builder, {-2, 25, 0, -123, inf(), -inf()}); + Abs(arg); ComputeAndCompareR1(&builder, {2, 25, 0, 123, inf(), inf()}, {}); } @@ -60,9 +60,9 @@ class UnaryOpTest : public ClientLibraryTestBase { template void SignTestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1( - {-2, 25, 0, static_cast(-0.0), -123, inf(), -inf()}); - auto sign = builder.Sign(arg); + auto arg = ConstantR1( + &builder, {-2, 25, 0, static_cast(-0.0), -123, inf(), -inf()}); + Sign(arg); ComputeAndCompareR1(&builder, {-1, 1, 0, 0, -1, 1, -1}, {}); } @@ -70,10 +70,10 @@ class UnaryOpTest : public ClientLibraryTestBase { template void SignAbsTestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1({-2, 25, 0, -123}); - auto sign = builder.Sign(arg); - auto abs = builder.Abs(arg); - builder.Sub(builder.Mul(sign, abs), arg); + auto arg = ConstantR1(&builder, {-2, 25, 0, -123}); + auto sign = Sign(arg); + auto abs = Abs(arg); + Sub(Mul(sign, abs), arg); ComputeAndCompareR1(&builder, {0, 0, 0, 0}, {}); } @@ -92,27 +92,28 @@ int64 UnaryOpTest::inf() { template <> void UnaryOpTest::AbsTestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1({{-2, 0}, - {0, 25}, - {0, 0}, - {-0.3f, 0.4f}, - {0, inf()}, - {-inf(), 0}}); - auto abs = builder.Abs(arg); + auto arg = ConstantR1(&builder, {{-2, 0}, + {0, 25}, + {0, 0}, + {-0.3f, 0.4f}, + {0, inf()}, + {-inf(), 0}}); + Abs(arg); std::unique_ptr expected = - Literal::CreateR1({2, 25, 0, 0.5, inf(), inf()}); + LiteralUtil::CreateR1({2, 25, 0, 0.5, inf(), inf()}); ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6f)); } template <> void UnaryOpTest::SignTestHelper() { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1( + auto arg = ConstantR1( + &builder, {{-2, 0}, {0, 25}, {0, 0}, {static_cast(-0.0), 0}, {-1, 1}}); - auto sign = builder.Sign(arg); + Sign(arg); - std::unique_ptr expected = Literal::CreateR1( + std::unique_ptr expected = LiteralUtil::CreateR1( 
{{-1, 0}, {0, 1}, {0, 0}, {0, 0}, {-std::sqrt(0.5f), std::sqrt(0.5f)}}); ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6f)); } @@ -121,13 +122,13 @@ template <> void UnaryOpTest::SignAbsTestHelper() { XlaBuilder builder(TestName()); auto arg = - builder.ConstantR1({{-2, 0}, {0, 25}, {0, 0}, {-0.4, 0.3}}); - auto sign = builder.Sign(arg); - auto abs = builder.Abs(arg); - builder.Sub(builder.Mul(sign, builder.ConvertElementType(abs, C64)), arg); + ConstantR1(&builder, {{-2, 0}, {0, 25}, {0, 0}, {-0.4, 0.3}}); + auto sign = Sign(arg); + auto abs = Abs(arg); + Sub(Mul(sign, ConvertElementType(abs, C64)), arg); std::unique_ptr expected = - Literal::CreateR1({0, 0, 0, 0}); + LiteralUtil::CreateR1({0, 0, 0, 0}); ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6f)); } @@ -145,37 +146,34 @@ XLA_TEST_F(UnaryOpTest, AbsTestR1) { XLA_TEST_F(UnaryOpTest, AbsTestR0) { XlaBuilder builder(TestName()); - auto argi = builder.ConstantR0(-5); - auto absi = builder.Abs(argi); - auto argf = builder.ConstantR0(-3.0f); - auto absf = builder.Abs(argf); - auto argf0 = builder.ConstantR0(-0.0f); - auto absf0 = builder.Abs(argf0); - auto argc = builder.ConstantR0({-0.3f, 0.4f}); - auto absc = builder.Abs(argc); - builder.Add(builder.Add(absc, absf0), - builder.Add(absf, builder.ConvertElementType(absi, F32))); + auto argi = ConstantR0(&builder, -5); + auto absi = Abs(argi); + auto argf = ConstantR0(&builder, -3.0f); + auto absf = Abs(argf); + auto argf0 = ConstantR0(&builder, -0.0f); + auto absf0 = Abs(argf0); + auto argc = ConstantR0(&builder, {-0.3f, 0.4f}); + auto absc = Abs(argc); + Add(Add(absc, absf0), Add(absf, ConvertElementType(absi, F32))); ComputeAndCompareR0(&builder, 8.5f, {}); } XLA_TEST_F(UnaryOpTest, SignTestR0) { XlaBuilder builder(TestName()); - auto argi = builder.ConstantR0(-5); - auto sgni = builder.Sign(argi); // -1 - auto argf = builder.ConstantR0(-4.0f); - auto sgnf = builder.Sign(argf); // -1 - auto argf0 = builder.ConstantR0(-0.0f); - 
auto sgnf0 = builder.Sign(argf0); // 0 - auto argc = builder.ConstantR0({-.3, .4}); - auto sgnc = builder.Sign(argc); // (-.6, .8) - builder.Add(sgnc, builder.ConvertElementType( - builder.Add(builder.Add(sgnf0, sgnf), - builder.ConvertElementType(sgni, F32)), - C64)); + auto argi = ConstantR0(&builder, -5); + auto sgni = Sign(argi); // -1 + auto argf = ConstantR0(&builder, -4.0f); + auto sgnf = Sign(argf); // -1 + auto argf0 = ConstantR0(&builder, -0.0f); + auto sgnf0 = Sign(argf0); // 0 + auto argc = ConstantR0(&builder, {-.3, .4}); + auto sgnc = Sign(argc); // (-.6, .8) + Add(sgnc, ConvertElementType( + Add(Add(sgnf0, sgnf), ConvertElementType(sgni, F32)), C64)); std::unique_ptr expected = - Literal::CreateR0({-2.6f, 0.8f}); + LiteralUtil::CreateR0({-2.6f, 0.8f}); ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6f)); } @@ -192,49 +190,30 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR1) { SignAbsTestHelper(); } -XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) { - XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1( - {2, 25, 0, 123, std::numeric_limits::max()}); - auto abs = builder.Abs(arg); - - ComputeAndCompareR1( - &builder, {2, 25, 0, 123, std::numeric_limits::max()}, {}); -} - -XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) { - XlaBuilder builder(TestName()); - auto arg = builder.ConstantR1( - {2, 25, 0, 123, std::numeric_limits::max()}); - auto sign = builder.Sign(arg); - - ComputeAndCompareR1(&builder, {1, 1, 0, 1, 1}, {}); -} - XLA_TEST_F(UnaryOpTest, SignAbsTestR2) { XlaBuilder builder(TestName()); - auto arg = builder.ConstantR2({{1.0, -2.0}, {-3.0, 4.0}}); - auto sign = builder.Sign(arg); - auto abs = builder.Abs(arg); - builder.Sub(builder.Mul(sign, abs), arg); + auto arg = ConstantR2(&builder, {{1.0, -2.0}, {-3.0, 4.0}}); + auto sign = Sign(arg); + auto abs = Abs(arg); + Sub(Mul(sign, abs), arg); ComputeAndCompareR2(&builder, {{0, 0}, {0, 0}}, {}); } XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) { XlaBuilder builder(TestName()); - 
auto lhs = builder.ConstantR1({0, 1}); - auto rhs = builder.ConstantR1({1, 1}); - builder.ConvertElementType(builder.Eq(lhs, rhs), S32); + auto lhs = ConstantR1(&builder, {0, 1}); + auto rhs = ConstantR1(&builder, {1, 1}); + ConvertElementType(Eq(lhs, rhs), S32); ComputeAndCompareR1(&builder, {0, 1}, {}); } XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) { XlaBuilder builder(TestName()); - auto lhs = builder.ConstantR1({0, 1}); - auto rhs = builder.ConstantR1({1, 1}); - builder.ConvertElementType(builder.Eq(lhs, rhs), F32); + auto lhs = ConstantR1(&builder, {0, 1}); + auto rhs = ConstantR1(&builder, {1, 1}); + ConvertElementType(Eq(lhs, rhs), F32); ComputeAndCompareR1(&builder, {0.0, 1.0}, {}); } diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc index 82d301983fc7885ef5c1c1ed05b74fc017bb7727..ef1b1445bbe555da00db4446d59439b752735a80 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -46,7 +46,7 @@ class VecOpsReduceTest : public ClientLibraryTestBase { {{1.0, 2.0, 3.0}, // } plane 2 in dim 0 {4.0, 5.0, 6.0}}}); // clang-format on - return builder_.ConstantR3FromArray3D(x3d); + return ConstantR3FromArray3D(&builder_, x3d); } XlaBuilder builder_; @@ -56,11 +56,10 @@ class VecOpsReduceTest : public ClientLibraryTestBase { TEST_F(VecOpsReduceTest, AddReduceR1F32) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); - auto x = builder_.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0}); + auto x = ConstantR1( + &builder_, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0}); ComputeAndCompareR0(&builder_, -4.2f, {}, errspec_); } @@ -71,10 +70,9 @@ TEST_F(VecOpsReduceTest, AddReduceBigR1F32) { std::vector input(3000); std::iota(input.begin(), input.end(), 100.0f); - auto x = builder_.ConstantR1(input); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0}); + auto x = ConstantR1(&builder_, input); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0}); float expected = std::accumulate(input.begin(), input.end(), 0.0f); ComputeAndCompareR0(&builder_, expected, {}, errspec_); @@ -83,11 +81,10 @@ TEST_F(VecOpsReduceTest, AddReduceBigR1F32) { TEST_F(VecOpsReduceTest, 
MaxReduceR1F32) { auto max_reducer = CreateScalarMax(); - auto x = builder_.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto max_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), max_reducer, - /*dimensions_to_reduce=*/{0}); + auto x = ConstantR1( + &builder_, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Reduce(x, ConstantR0(&builder_, 0.0f), max_reducer, + /*dimensions_to_reduce=*/{0}); ComputeAndCompareR0(&builder_, 2.6f, {}, errspec_); } @@ -95,11 +92,10 @@ TEST_F(VecOpsReduceTest, MaxReduceR1F32) { TEST_F(VecOpsReduceTest, MaxReduceR1F32WithNontrivialInit) { auto max_reducer = CreateScalarMax(); - auto x = builder_.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto max_reduce = - builder_.Reduce(x, builder_.ConstantR0(4.0f), max_reducer, - /*dimensions_to_reduce=*/{0}); + auto x = ConstantR1( + &builder_, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Reduce(x, ConstantR0(&builder_, 4.0f), max_reducer, + /*dimensions_to_reduce=*/{0}); ComputeAndCompareR0(&builder_, 4.0f, {}, errspec_); } @@ -108,15 +104,14 @@ TEST_F(VecOpsReduceTest, AddReduceR2F32Dim1) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); // clang-format off - auto x = builder_.ConstantR2({ + auto x = ConstantR2(&builder_, { {1.0, 2.0, 3.0}, // | dim 0 {4.0, 5.0, 6.0}}); // | // ------ dim 1 ---------- // clang-format on - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{1}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{1}); ComputeAndCompareR1(&builder_, {6.0, 15.0}, {}, errspec_); } @@ -125,13 +120,12 @@ TEST_F(VecOpsReduceTest, AddReduceR2F32Dim0) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); // clang-format off - auto x = builder_.ConstantR2({ + auto x = ConstantR2(&builder_, { {1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}); // clang-format on - auto add_reduce = - builder_.Reduce(x, 
builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0}); ComputeAndCompareR1(&builder_, {5.0, 7.0, 9.0}, {}, errspec_); } @@ -139,9 +133,8 @@ TEST_F(VecOpsReduceTest, AddReduceR2F32Dim0) { TEST_F(VecOpsReduceTest, AddReduceR3F32Dim2) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{2}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{2}); Array2D expected_array({{6.0f, 15.0f}, {6.0f, 15.0f}, {6.0f, 15.0f}}); @@ -151,9 +144,8 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32Dim2) { TEST_F(VecOpsReduceTest, AddReduceR3F32Dim1) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{1}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{1}); Array2D expected_array( {{5.0f, 7.0f, 9.0f}, {5.0f, 7.0f, 9.0f}, {5.0f, 7.0f, 9.0f}}); @@ -164,9 +156,8 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32Dim1) { TEST_F(VecOpsReduceTest, AddReduceR3F32Dim0) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0}); Array2D expected_array({{3.0f, 6.0f, 9.0f}, {12.0f, 15.0f, 18.0f}}); @@ -176,9 +167,8 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32Dim0) { TEST_F(VecOpsReduceTest, AddReduceR3F32Dims1and2) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - 
/*dimensions_to_reduce=*/{1, 2}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{1, 2}); ComputeAndCompareR1(&builder_, {21.0, 21.0, 21.0}, {}, errspec_); } @@ -186,9 +176,8 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32Dims1and2) { XLA_TEST_F(VecOpsReduceTest, AddReduceR3F32Dims0and2) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0, 2}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0, 2}); ComputeAndCompareR1(&builder_, {18.0, 45.0}, {}, errspec_); } @@ -196,9 +185,8 @@ XLA_TEST_F(VecOpsReduceTest, AddReduceR3F32Dims0and2) { TEST_F(VecOpsReduceTest, AddReduceR3F32Dims0and1) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0, 1}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0, 1}); ComputeAndCompareR1(&builder_, {15.0, 21.0, 27.0}, {}, errspec_); } @@ -206,9 +194,8 @@ TEST_F(VecOpsReduceTest, AddReduceR3F32Dims0and1) { TEST_F(VecOpsReduceTest, AddReduceR3F32AllDims) { auto sum_reducer = CreateScalarAddComputation(F32, &builder_); auto x = BuildSampleConstantCube(); - auto add_reduce = - builder_.Reduce(x, builder_.ConstantR0(0.0f), sum_reducer, - /*dimensions_to_reduce=*/{0, 1, 2}); + Reduce(x, ConstantR0(&builder_, 0.0f), sum_reducer, + /*dimensions_to_reduce=*/{0, 1, 2}); ComputeAndCompareR0(&builder_, 63.0, {}, errspec_); } diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 5cce7a2bf82c1a8403536a91e67910f949ef185a..3848ec1684cdc9186e14ac0b60315b7520d127f3 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ 
b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -21,8 +21,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -50,9 +50,9 @@ class VecOpsSimpleTest : public ClientLibraryTestBase { XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto exp = builder.Exp(x); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Exp(x); std::vector expected = {8.1662, 7.4274e-02, 13.4637, 1.8316e-02, 8.1662, 9.9742, 6.7379e-03, 4.0657e-01, @@ -69,8 +69,8 @@ XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) { for (int i = 0; i < count; ++i) { exponents.push_back(i / static_cast(count)); } - auto x = builder.ConstantR1(exponents); - auto exp = builder.Exp(x); + auto x = ConstantR1(&builder, exponents); + Exp(x); std::vector expected; expected.reserve(exponents.size()); @@ -98,8 +98,8 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { Array4D expected(2, 2, 2, 2, expected_vector); - auto x = builder.ConstantR4FromArray4D(exponents); - auto exp = builder.Exp(x); + auto x = ConstantR4FromArray4D(&builder, exponents); + Exp(x); ComputeAndCompareR4(&builder, expected, {}, ErrorSpec(/*aabs=*/1e-2, /*arel=*/1e-3)); @@ -107,9 +107,9 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { XlaBuilder builder(TestName()); - auto x = 
builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - builder.Neg(x); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Neg(x); std::vector expected = {-2.1, 2.6, -2.6, 4.0, -2.1, -2.3, 5.0, 0.9, 2.4, -1.6}; @@ -118,8 +118,8 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({2, -2, 12, -4, 5, 20, -15, 0, -2, 1}); - builder.Neg(x); + auto x = ConstantR1(&builder, {2, -2, 12, -4, 5, 20, -15, 0, -2, 1}); + Neg(x); std::vector expected = {-2, 2, -12, 4, -5, -20, 15, 0, 2, -1}; ComputeAndCompareR1(&builder, expected, {}); @@ -127,59 +127,19 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {0, 1, 42, static_cast(-1), static_cast(-12)}); - builder.Neg(x); + auto x = ConstantR1( + &builder, {0, 1, 42, static_cast(-1), static_cast(-12)}); + Neg(x); std::vector expected = {0, static_cast(-1), static_cast(-42), 1, 12}; ComputeAndCompareR1(&builder, expected, {}); } -XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) { - XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - builder.SquareF32(x); - - std::vector expected = {4.41, 6.76, 6.76, 16., 4.41, - 5.29, 25., 0.81, 5.76, 2.56}; - ComputeAndCompareR1(&builder, expected, {}, error_spec_); -} - -XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { - XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - builder.ReciprocalF32(x); - - std::vector expected = { - 0.47619048, -0.38461538, 0.38461538, -0.25, 0.47619048, - 0.43478261, -0.2, -1.11111111, -0.41666667, 0.625}; - ComputeAndCompareR1(&builder, expected, {}, error_spec_); -} - -XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { - 
XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({0.0, -0.0}); - auto exp = builder.SqrtF32(x); - - ComputeAndCompareR1(&builder, {0, 0}, {}, error_spec_); -} - -XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { - XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345}); - auto exp = builder.SqrtF32(x); - - std::vector expected = {4, 1, 32, 0.4, 0.4472, 111.1080}; - ComputeAndCompareR1(&builder, expected, {}, error_spec_); -} - XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { XlaBuilder builder(TestName()); - auto x = - builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345}); - auto exp = builder.Pow(x, builder.ConstantR0(-.5f)); + auto x = ConstantR1(&builder, + {16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345}); + Pow(x, ConstantR0(&builder, -.5f)); std::vector expected = {.25, 1, .03125, 2.5, 2.23607, .009000, .900025}; @@ -191,11 +151,11 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto y = builder.ConstantR1( - {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 0.6}); - auto max = builder.Map({x, y}, add, {0}); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + auto y = ConstantR1( + &builder, {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 0.6}); + Map(&builder, {x, y}, add, {0}); std::vector expected = {1.7, -3.2, -0.4, -3.8, 5.9, 0.1, -6.8, 4., -1., 2.2}; @@ -204,11 +164,11 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto y = builder.ConstantR1( - {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 0.6}); - auto max = builder.Max(x, y); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, 
-4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + auto y = ConstantR1( + &builder, {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 0.6}); + Max(x, y); std::vector expected = {2.1, -0.6, 2.6, 0.2, 3.8, 2.3, -1.8, 4.9, 1.4, 1.6}; @@ -227,7 +187,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) { {21.0f, 22.0f, 23.0f, 24.0f}, /*parameter_number=*/1, /*name=*/"v2", /*builder=*/&builder, /*data_handle=*/&v2); - auto max = builder.Max(v1, v2); + Max(v1, v2); ComputeAndCompareR1(&builder, {41.0f, 22.0f, 23.0f, 84.0f}, {param0_data.get(), param1_data.get()}, error_spec_); @@ -267,7 +227,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { CreateR1Parameter(v2vec, /*parameter_number=*/1, /*name=*/"v2", /*builder=*/&builder, /*data_handle=*/&v2); - auto max = builder.Max(v1, v2); + Max(v1, v2); ComputeAndCompareR1(&builder, expected_vec, {param0_data.get(), param1_data.get()}, error_spec_); @@ -275,10 +235,10 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto y = builder.ConstantR0(0); - auto max = builder.Max(x, y); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + auto y = ConstantR0(&builder, 0); + Max(x, y); std::vector expected = {2.1, 0.0, 2.6, 0.0, 2.1, 2.3, 0.0, 0.0, 0.0, 1.6}; @@ -287,11 +247,11 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto y = builder.ConstantR1( - {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 0.6}); - auto min = builder.Min(x, y); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + auto y = ConstantR1( + &builder, {-0.4, -0.6, -3.0, 0.2, 3.8, -2.2, -1.8, 4.9, 1.4, 
0.6}); + Min(x, y); std::vector expected = {-0.4, -2.6, -3.0, -4.0, 2.1, -2.2, -5.0, -0.9, -2.4, 0.6}; @@ -300,11 +260,11 @@ XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR0(0); - auto one = builder.ConstantR0(1); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); - auto clamp = builder.Min(builder.Max(x, zero), one); + auto zero = ConstantR0(&builder, 0); + auto one = ConstantR0(&builder, 1); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); + Min(Max(x, zero), one); std::vector expected = {1.0, 0.0, 1.0, 0.3, 1.0, 0.9, 0.0, 0.1, 0.0, 0.6}; @@ -313,11 +273,11 @@ XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR0(0); - auto one = builder.ConstantR0(1); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); - auto clamp = builder.Clamp(zero, x, one); + auto zero = ConstantR0(&builder, 0); + auto one = ConstantR0(&builder, 1); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); + Clamp(zero, x, one); std::vector expected = {1.0, 0.0, 1.0, 0.3, 1.0, 0.9, 0.0, 0.1, 0.0, 0.6}; @@ -326,10 +286,10 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR1({0.0f, 0.0f}); - auto one = builder.ConstantR1({1.0f, 1.0f}); - auto x = builder.ConstantR1({2.1, -2.6}); - auto clamp = builder.Clamp(zero, x, one); + auto zero = ConstantR1(&builder, {0.0f, 0.0f}); + auto one = ConstantR1(&builder, {1.0f, 1.0f}); + auto x = ConstantR1(&builder, {2.1, -2.6}); + Clamp(zero, x, one); std::vector expected = {1.0, 0.0}; ComputeAndCompareR1(&builder, expected, {}); @@ -337,11 +297,11 @@ 
XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XlaBuilder builder(TestName()); - auto one = builder.ConstantR0(1); - auto two = builder.ConstantR0(2); - auto x = builder.ConstantR1( - {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); - auto clamp = builder.Clamp(one, x, two); + auto one = ConstantR0(&builder, 1); + auto two = ConstantR0(&builder, 2); + auto x = ConstantR1( + &builder, {2.1, -2.6, 2.6, 0.3, 3.1, 0.9, -5.0, 0.1, -2.4, 0.6}); + Clamp(one, x, two); std::vector expected = {2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0}; @@ -350,10 +310,10 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) { XlaBuilder builder(TestName()); - auto zero = builder.ConstantR0(0); - auto one = builder.ConstantR0(10); - auto x = builder.ConstantR1({-3, 3, 9, 13}); - auto clamp = builder.Clamp(zero, x, one); + auto zero = ConstantR0(&builder, 0); + auto one = ConstantR0(&builder, 10); + auto x = ConstantR1(&builder, {-3, 3, 9, 13}); + Clamp(zero, x, one); std::vector expected = {0, 3, 9, 10}; ComputeAndCompareR1(&builder, expected, {}); @@ -365,9 +325,9 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { // add_half(x) = x + 0.5 XlaBuilder builder("add_half"); auto x_value = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value"); - auto half = builder.ConstantR0(0.5); - builder.Add(x_value, half); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x_value"); + auto half = ConstantR0(&builder, 0.5); + Add(x_value, half); auto computation_status = builder.Build(); ASSERT_IS_OK(computation_status.status()); add_half = computation_status.ConsumeValueOrDie(); @@ -378,9 +338,9 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { // clamp(y) = clamp<0,5>(y) XlaBuilder builder("clamp"); auto y_value = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y_value"); - auto zero = builder.ConstantR0(0.0); - auto clamped = 
builder.Clamp(zero, y_value, builder.ConstantR0(5)); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "y_value"); + auto zero = ConstantR0(&builder, 0.0); + Clamp(zero, y_value, ConstantR0(&builder, 5)); auto computation_status = builder.Build(); ASSERT_IS_OK(computation_status.status()); clamp = computation_status.ConsumeValueOrDie(); @@ -391,13 +351,13 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { // mult_relu_add(z) = clamp(add_half(2 * max(z, 0))) XlaBuilder builder("mult_relu_add"); auto z_value = - builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value"); - auto zero = builder.ConstantR0(0.0); - auto two = builder.ConstantR0(2.0); - auto max = builder.Max(z_value, zero); - auto mult = builder.Mul(two, max); - auto inner = builder.Map({mult}, add_half, {}); - builder.Map({inner}, clamp, {}); + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "z_value"); + auto zero = ConstantR0(&builder, 0.0); + auto two = ConstantR0(&builder, 2.0); + auto max = Max(z_value, zero); + auto mult = Mul(two, max); + auto inner = Map(&builder, {mult}, add_half, {}); + Map(&builder, {inner}, clamp, {}); auto computation_status = builder.Build(); ASSERT_IS_OK(computation_status.status()); mult_relu_add = computation_status.ConsumeValueOrDie(); @@ -405,9 +365,9 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { XlaBuilder builder("map10"); { - auto x = builder.ConstantR1( - {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); - auto activations = builder.Map({x}, mult_relu_add, {0}); + auto x = ConstantR1( + &builder, {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); + Map(&builder, {x}, mult_relu_add, {0}); } std::vector expected = {4.7, 0.5, 5.0, 0.5, 4.7, @@ -417,9 +377,9 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}); - auto y = builder.ConstantR0(3); - builder.Rem(x, y); + auto x = 
ConstantR1(&builder, {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}); + auto y = ConstantR0(&builder, 3); + Rem(x, y); std::vector expected = {-2, -1, 0, -2, -1, 0, 1, 2, 0, 1}; ComputeAndCompareR1(&builder, expected, {}); @@ -427,9 +387,9 @@ XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({false, true}); - auto y = builder.ConstantR1({true, false}); - builder.Eq(x, y); + auto x = ConstantR1(&builder, {false, true}); + auto y = ConstantR1(&builder, {true, false}); + Eq(x, y); std::array expected = {{false, false}}; ComputeAndCompareR1(&builder, expected, {}); @@ -437,9 +397,9 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) { XlaBuilder builder(TestName()); - auto x = builder.ConstantR1({false, true}); - auto y = builder.ConstantR1({true, false}); - builder.Ne(x, y); + auto x = ConstantR1(&builder, {false, true}); + auto y = ConstantR1(&builder, {true, false}); + Ne(x, y); std::array expected = {{true, true}}; ComputeAndCompareR1(&builder, expected, {}); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index c463f3eac55e5b8ab32dc52d5a38e7840241bc58..1bdf1867b9330b715b0ba4aca71d56307883c775 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -20,9 +20,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -55,8 +55,8 @@ TEST_F(WhileTest, WhileWithScalarS32Result) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Gt(builder.ConstantR0(5), prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Gt(ConstantR0(&builder, 5), prev); condition = builder.Build().ConsumeValueOrDie(); } @@ -64,16 +64,16 @@ TEST_F(WhileTest, WhileWithScalarS32Result) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR0(1); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR0(&builder, 1); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder(TestName()); - auto init = builder.ConstantR0(0); - builder.While(condition, body, init); + auto init = ConstantR0(&builder, 0); + While(condition, body, init); ComputeAndCompareR0(&builder, 5, {}); } @@ -91,8 +91,8 @@ TEST_F(WhileTest, WhileWithScalarS64Result) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Gt(builder.ConstantR0(5), prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Gt(ConstantR0(&builder, 5), prev); condition = builder.Build().ConsumeValueOrDie(); } @@ -100,16 +100,16 @@ TEST_F(WhileTest, WhileWithScalarS64Result) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR0(1); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR0(&builder, 1); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder(TestName()); - auto init = builder.ConstantR0(0); - builder.While(condition, body, init); + auto init = ConstantR0(&builder, 0); + While(condition, body, init); ComputeAndCompareR0(&builder, 5, {}); } @@ -122,8 +122,8 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Gt(builder.ConstantR0(5), prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Gt(ConstantR0(&builder, 5), prev); condition = builder.Build().ConsumeValueOrDie(); } @@ -131,18 +131,18 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR0(1); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR0(&builder, 1); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder(TestName()); - auto init = builder.Reduce(builder.ConstantR1(2, 1), - builder.ConstantR0(0), - CreateScalarAddComputation(S32, &builder), {0}); - builder.While(condition, body, init); + auto init = + Reduce(ConstantR1(&builder, 2, 1), ConstantR0(&builder, 0), + CreateScalarAddComputation(S32, &builder), {0}); + While(condition, body, init); ComputeAndCompareR0(&builder, 5, {}); } @@ -154,8 +154,8 @@ TEST_F(WhileTest, WhileWithPredicateResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Ne(builder.ConstantR0(true), prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Ne(ConstantR0(&builder, true), prev); condition = builder.Build().ConsumeValueOrDie(); } @@ -163,16 +163,16 @@ TEST_F(WhileTest, WhileWithPredicateResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Or(prev, builder.ConstantR0(true)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Or(prev, ConstantR0(&builder, true)); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder(TestName()); - auto init = builder.Ne(builder.ConstantR0(false), - builder.ConstantR0(true)); - builder.While(condition, body, init); + auto init = + Ne(ConstantR0(&builder, false), ConstantR0(&builder, true)); + While(condition, body, init); ComputeAndCompareR0(&builder, true, {}); } @@ -184,17 +184,16 @@ TEST_F(WhileTest, WhileWithPredicateResult) { // while (result.sum() < 15.5f) { // result = result + vector(0); // } -// TODO(b/29185393): does not terminate on CPU. -TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) { +TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithEmptyVectorResult)) { Shape result_shape = ShapeUtil::MakeShape(F32, {0}); // Create a computation for the reduction. 
XlaComputation add; { XlaBuilder builder("add"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Add(x, y); add = builder.Build().ConsumeValueOrDie(); } @@ -203,10 +202,10 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto sum = builder.Reduce(prev, builder.ConstantR0(0.0f), add, - /*dimensions_to_reduce=*/{0}); - builder.Gt(builder.ConstantR0(15.5f), sum); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto sum = Reduce(prev, ConstantR0(&builder, 0.0f), add, + /*dimensions_to_reduce=*/{0}); + Gt(ConstantR0(&builder, 15.5f), sum); condition = builder.Build().ConsumeValueOrDie(); } @@ -215,16 +214,16 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR1({}); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR1(&builder, {}); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder("while"); - auto init = builder.ConstantR1({}); - auto result = builder.While(condition, body, init); + auto init = ConstantR1(&builder, {}); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -246,9 +245,9 @@ TEST_F(WhileTest, WhileWithVectorResult) { XlaComputation add; { XlaBuilder builder("add"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Add(x, y); add = builder.Build().ConsumeValueOrDie(); } @@ -257,10 +256,10 @@ TEST_F(WhileTest, WhileWithVectorResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto sum = builder.Reduce(prev, builder.ConstantR0(0.0f), add, - /*dimensions_to_reduce=*/{0}); - builder.Gt(builder.ConstantR0(15.5f), sum); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto sum = Reduce(prev, ConstantR0(&builder, 0.0f), add, + /*dimensions_to_reduce=*/{0}); + Gt(ConstantR0(&builder, 15.5f), sum); condition = builder.Build().ConsumeValueOrDie(); } @@ -269,16 +268,16 @@ TEST_F(WhileTest, WhileWithVectorResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR1(8, 0.125f); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR1(&builder, 8, 0.125f); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder("while"); - auto init = builder.ConstantR1(8, 0.f); - auto result = builder.While(condition, body, init); + auto init = ConstantR1(&builder, 8, 0.f); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -306,9 +305,9 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) { XlaComputation add; { XlaBuilder builder("add"); - auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); - auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); - builder.Add(x, y); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {}), "y"); + Add(x, y); add = builder.Build().ConsumeValueOrDie(); } @@ -317,10 +316,10 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto sum = builder.Reduce(prev, builder.ConstantR0(0.0f), add, - /*dimensions_to_reduce=*/{0}); - builder.Gt(builder.ConstantR0(15.5f), sum); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto sum = Reduce(prev, ConstantR0(&builder, 0.0f), add, + /*dimensions_to_reduce=*/{0}); + Gt(ConstantR0(&builder, 15.5f), sum); condition = builder.Build().ConsumeValueOrDie(); } @@ -329,27 +328,27 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR1(8, 0.125f); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR1(&builder, 8, 0.125f); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder("while"); - auto init = builder.ConstantR1(8, 0.f); - auto result = builder.While(condition, body, init); + auto init = ConstantR1(&builder, 8, 0.f); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - builder.Tuple({result}); + Tuple(&builder, {result}); // Individual elements with increase by 1/8 each time through the loop, so // the sum will increase by 1.0. It will first be >15.5 when the elements // have all reached 2.0. auto expected_data = - Literal::CreateR1({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}); - auto expected = Literal::MakeTuple({expected_data.get()}); + LiteralUtil::CreateR1({2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}); + auto expected = LiteralUtil::MakeTuple({expected_data.get()}); VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } @@ -366,9 +365,9 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Gt(builder.ConstantR0(N), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, N), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -377,32 +376,34 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto w1 = builder.GetTupleElement(prev, 1); - auto w2 = builder.GetTupleElement(prev, 2); - auto w3 = builder.GetTupleElement(prev, 3); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), w3, w1, w2}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = 
GetTupleElement(prev, 0); + auto w1 = GetTupleElement(prev, 1); + auto w2 = GetTupleElement(prev, 2); + auto w3 = GetTupleElement(prev, 3); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), w3, w1, w2}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(3, 1.f), - builder.ConstantR1(3, 2.f), builder.ConstantR1(3, 3.f)}); - auto result = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 3, 1.f), + ConstantR1(&builder, 3, 2.f), + ConstantR1(&builder, 3, 3.f)}); + auto result = While(condition, body, init); VLOG(2) << "result = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - auto expected_counter = Literal::CreateR0(N); - auto expected_w1 = Literal::CreateR1({1.0f, 1.0f, 1.0f}); - auto expected_w2 = Literal::CreateR1({2.0f, 2.0f, 2.0f}); - auto expected_w3 = Literal::CreateR1({3.0f, 3.0f, 3.0f}); - auto expected = Literal::MakeTuple({expected_counter.get(), expected_w2.get(), - expected_w3.get(), expected_w1.get()}); + auto expected_counter = LiteralUtil::CreateR0(N); + auto expected_w1 = LiteralUtil::CreateR1({1.0f, 1.0f, 1.0f}); + auto expected_w2 = LiteralUtil::CreateR1({2.0f, 2.0f, 2.0f}); + auto expected_w3 = LiteralUtil::CreateR1({3.0f, 3.0f, 3.0f}); + auto expected = + LiteralUtil::MakeTuple({expected_counter.get(), expected_w2.get(), + expected_w3.get(), expected_w1.get()}); VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } @@ -419,9 +420,9 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - 
builder.Gt(builder.ConstantR0(N), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, N), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -430,26 +431,27 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto w1 = builder.GetTupleElement(prev, 1); - auto w2 = builder.GetTupleElement(prev, 2); - auto w3 = builder.GetTupleElement(prev, 3); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), w3, w1, w2}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto w1 = GetTupleElement(prev, 1); + auto w2 = GetTupleElement(prev, 2); + auto w3 = GetTupleElement(prev, 3); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), w3, w1, w2}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(3, 1.f), - builder.ConstantR1(3, 2.f), builder.ConstantR1(3, 3.f)}); - auto xla_while = builder.While(condition, body, init); - - auto add12 = builder.Add(builder.GetTupleElement(xla_while, 1), - builder.GetTupleElement(xla_while, 2)); - auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3)); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 3, 1.f), + ConstantR1(&builder, 3, 2.f), + ConstantR1(&builder, 3, 3.f)}); + auto xla_while = While(condition, body, init); + + auto add12 = + Add(GetTupleElement(xla_while, 1), GetTupleElement(xla_while, 2)); + auto result = Add(add12, GetTupleElement(xla_while, 3)); VLOG(2) << "result = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -474,9 +476,9 @@ TEST_F(WhileTest, WhileWithTupleResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Gt(builder.ConstantR0(5), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, 5), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -486,30 +488,30 @@ TEST_F(WhileTest, WhileWithTupleResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto weights = builder.GetTupleElement(prev, 1); - auto input = builder.ConstantR1(10, 1.f); - auto new_weights = builder.Add(weights, input); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto weights = GetTupleElement(prev, 1); + auto input = ConstantR1(&builder, 10, 1.f); + auto 
new_weights = Add(weights, input); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), new_weights}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); - auto result = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 10, 0.f)}); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - auto expected_counter = Literal::CreateR0(5); - auto expected_data = Literal::CreateR1( + auto expected_counter = LiteralUtil::CreateR0(5); + auto expected_data = LiteralUtil::CreateR1( {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f}); auto expected = - Literal::MakeTuple({expected_counter.get(), expected_data.get()}); + LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()}); VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } @@ -524,9 +526,9 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Gt(builder.ConstantR0(5), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, 5), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -535,29 +537,28 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto pred = builder.GetTupleElement(prev, 1); - auto new_pred = builder.Or(pred, 
builder.ConstantR0(true)); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_pred}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto pred = GetTupleElement(prev, 1); + auto new_pred = Or(pred, ConstantR0(&builder, true)); + Tuple(&builder, {Add(iteration, ConstantR0(&builder, 1)), new_pred}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple({builder.ConstantR0(0), - builder.Ne(builder.ConstantR0(false), - builder.ConstantR0(true))}); - auto result = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + Ne(ConstantR0(&builder, false), + ConstantR0(&builder, true))}); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - auto expected_counter = Literal::CreateR0(5); - auto expected_predicate = Literal::CreateR0(true); - auto expected = - Literal::MakeTuple({expected_counter.get(), expected_predicate.get()}); + auto expected_counter = LiteralUtil::CreateR0(5); + auto expected_predicate = LiteralUtil::CreateR0(true); + auto expected = LiteralUtil::MakeTuple( + {expected_counter.get(), expected_predicate.get()}); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0)); } @@ -571,9 +572,9 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Gt(builder.ConstantR0(5), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, 5), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -583,26 +584,26 @@ TEST_F(WhileTest, 
WhileWithTupleConstantScalarResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Tuple({builder.Add(iteration, builder.ConstantR0(1)), - builder.ConstantR0(7)}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Tuple(&builder, {Add(iteration, ConstantR0(&builder, 1)), + ConstantR0(&builder, 7)}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR0(7)}); - auto result = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR0(&builder, 7)}); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - auto expected_counter = Literal::CreateR0(5); - auto expected_data = Literal::CreateR0(7); + auto expected_counter = LiteralUtil::CreateR0(5); + auto expected_data = LiteralUtil::CreateR0(7); auto expected = - Literal::MakeTuple({expected_counter.get(), expected_data.get()}); + LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()}); VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } @@ -632,9 +633,9 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) { const int c1 = 5; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c1)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c1)); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } @@ -642,9 +643,9 @@ 
TEST_F(WhileTest, TwoWhileWithTupleResult) { const int c2 = 7; { XlaBuilder builder("condition2"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c2)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c2)); TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build()); } @@ -654,43 +655,43 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto weights = builder.GetTupleElement(prev, 1); - auto input = builder.ConstantR1(10, 1.f); - auto new_weights = builder.Add(weights, input); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto weights = GetTupleElement(prev, 1); + auto input = ConstantR1(&builder, 10, 1.f); + auto new_weights = Add(weights, input); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), new_weights}); TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } XlaComputation body2; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto weights = builder.GetTupleElement(prev, 1); - auto input = builder.ConstantR1(10, 1.f); - auto new_weights = builder.Add(weights, input); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto weights = GetTupleElement(prev, 1); + auto input = ConstantR1(&builder, 10, 1.f); + auto new_weights = Add(weights, input); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), new_weights}); 
TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build()); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); - auto while1 = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 10, 0.f)}); + auto while1 = While(condition, body, init); - auto while2 = builder.While(condition2, body2, while1); + auto while2 = While(condition2, body2, while1); - auto while_result1 = builder.GetTupleElement(while1, 1); - auto while_result2 = builder.GetTupleElement(while2, 1); + auto while_result1 = GetTupleElement(while1, 1); + auto while_result2 = GetTupleElement(while2, 1); VLOG(2) << "while_result2 = " << ShapeUtil::HumanString( builder.GetShape(while_result2).ConsumeValueOrDie()); - auto result = builder.Add(while_result1, while_result2); + auto result = Add(while_result1, while_result2); VLOG(2) << "result = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -711,9 +712,9 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) { const int c1 = 5; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c1)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c1)); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } @@ -721,9 +722,9 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) { const int c2 = 7; { XlaBuilder builder("condition2"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c2)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c2)); 
TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build()); } @@ -733,30 +734,30 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto weights = builder.GetTupleElement(prev, 1); - auto input = builder.ConstantR1(10, 1.f); - auto new_weights = builder.Add(weights, input); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto weights = GetTupleElement(prev, 1); + auto input = ConstantR1(&builder, 10, 1.f); + auto new_weights = Add(weights, input); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), new_weights}); TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); - auto while1 = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 10, 0.f)}); + auto while1 = While(condition, body, init); - auto while2 = builder.While(condition2, body, while1); + auto while2 = While(condition2, body, while1); - auto while_result1 = builder.GetTupleElement(while1, 1); - auto while_result2 = builder.GetTupleElement(while2, 1); + auto while_result1 = GetTupleElement(while1, 1); + auto while_result2 = GetTupleElement(while2, 1); VLOG(2) << "while_result2 = " << ShapeUtil::HumanString( builder.GetShape(while_result2).ConsumeValueOrDie()); - auto result = builder.Add(while_result1, while_result2); + auto result = Add(while_result1, while_result2); VLOG(2) << "result = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -778,9 +779,9 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) { const 
int c1 = 5; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c1)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c1)); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } @@ -788,9 +789,9 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) { const int c2 = 7; { XlaBuilder builder("condition2"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(c2)); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, c2)); TF_ASSERT_OK_AND_ASSIGN(condition2, builder.Build()); } @@ -800,29 +801,29 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - auto weights = builder.GetTupleElement(prev, 1); - auto input = builder.ConstantR1(10, 1.f); - auto new_weights = builder.Add(weights, input); - builder.Tuple( - {builder.Add(iteration, builder.ConstantR0(1)), new_weights}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + auto weights = GetTupleElement(prev, 1); + auto input = ConstantR1(&builder, 10, 1.f); + auto new_weights = Add(weights, input); + Tuple(&builder, + {Add(iteration, ConstantR0(&builder, 1)), new_weights}); TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); - auto while1 = builder.While(condition, body, init); - auto while2 = builder.While(condition2, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 10, 0.f)}); + auto while1 = While(condition, body, init); + auto while2 = While(condition2, body, init); - auto while_result1 = builder.GetTupleElement(while1, 1); - auto while_result2 = builder.GetTupleElement(while2, 1); + auto while_result1 = GetTupleElement(while1, 1); + auto while_result2 = GetTupleElement(while2, 1); VLOG(2) << "while_result2 = " << ShapeUtil::HumanString( builder.GetShape(while_result2).ConsumeValueOrDie()); - auto result = builder.Add(while_result1, while_result2); + auto result = Add(while_result1, while_result2); VLOG(2) << "result = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); @@ -844,9 +845,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Gt(builder.ConstantR0(5), iteration); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Gt(ConstantR0(&builder, 5), iteration); condition = builder.Build().ConsumeValueOrDie(); } @@ -856,38 +857,37 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); + auto prev = Parameter(&builder, 0, result_shape, "prev"); // TupleElement 0 - auto iteration = builder.GetTupleElement(prev, 0); - auto out0 = builder.Add(iteration, builder.ConstantR0(1)); + auto iteration = GetTupleElement(prev, 0); + auto out0 = Add(iteration, ConstantR0(&builder, 1)); // TupleElement 1 - auto input = builder.GetTupleElement(prev, 1); + auto input = 
GetTupleElement(prev, 1); // Update. - auto update = builder.ConvertElementType(builder.Broadcast(out0, {2}), F32); + auto update = ConvertElementType(Broadcast(out0, {2}), F32); // Starts = iteration * 2; - auto starts = builder.Reshape( - builder.Mul(iteration, builder.ConstantR0(2)), {1}); + auto starts = Reshape(Mul(iteration, ConstantR0(&builder, 2)), {1}); // UpdateSlice. - auto out1 = builder.DynamicUpdateSlice(input, update, starts); + auto out1 = DynamicUpdateSlice(input, update, starts); - builder.Tuple({out0, out1}); + Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. XlaBuilder builder("while"); - auto init = builder.Tuple( - {builder.ConstantR0(0), builder.ConstantR1(10, 0.f)}); - auto result = builder.While(condition, body, init); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), + ConstantR1(&builder, 10, 0.f)}); + auto result = While(condition, body, init); VLOG(2) << "while = " << ShapeUtil::HumanString( builder.GetShape(result).ConsumeValueOrDie()); - auto expected_counter = Literal::CreateR0(5); - auto expected_data = Literal::CreateR1( + auto expected_counter = LiteralUtil::CreateR0(5); + auto expected_data = LiteralUtil::CreateR1( {1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f}); auto expected = - Literal::MakeTuple({expected_counter.get(), expected_data.get()}); + LiteralUtil::MakeTuple({expected_counter.get(), expected_data.get()}); VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape()); ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001)); } @@ -913,10 +913,9 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) { // Create a computation for the condition: repeat for count iterations. 
auto build_condition = [this, v6s32](int count) { XlaBuilder builder(TestName()); - auto prev = builder.Reshape( - builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0}, - {}); - builder.Gt(builder.ConstantR0(count), prev); + auto prev = Reshape( + Slice(Parameter(&builder, 0, v6s32, "prev"), {0}, {1}, {1}), {0}, {}); + Gt(ConstantR0(&builder, count), prev); return builder.Build().ConsumeValueOrDie(); }; @@ -924,22 +923,22 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, v6s32, "prev"); - auto inc = builder.ConcatInDim( - {builder.ConstantR1({1}), - builder.RngUniform(builder.ConstantR0(0), - builder.ConstantR0(100), - ShapeUtil::MakeShape(S32, {5}))}, - 0); - builder.Add(inc, prev); + auto prev = Parameter(&builder, 0, v6s32, "prev"); + auto inc = ConcatInDim(&builder, + {ConstantR1(&builder, {1}), + RngUniform(ConstantR0(&builder, 0), + ConstantR0(&builder, 100), + ShapeUtil::MakeShape(S32, {5}))}, + 0); + Add(inc, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
auto while_loop = [this, &body, build_condition](int count) { XlaBuilder builder(TestName()); - auto init = builder.ConstantR1({0, 0, 0, 0, 0, 0}); - builder.While(build_condition(count), body, init); + auto init = ConstantR1(&builder, {0, 0, 0, 0, 0, 0}); + While(build_condition(count), body, init); return builder.Build(); }; @@ -958,33 +957,30 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) { auto element_shape = ShapeUtil::MakeShape(F32, {2}); XlaBuilder outer("outer"); - auto p = outer.Parameter(0, element_shape, "param"); - auto t = outer.Tuple({p, outer.ConstantR1({1, 1})}); + auto p = Parameter(&outer, 0, element_shape, "param"); + auto t = Tuple(&outer, {p, ConstantR1(&outer, {1, 1})}); TF_ASSERT_OK_AND_ASSIGN(Shape tuple_shape, outer.GetShape(t)); XlaBuilder cond("cond"); - auto cond_t = cond.Parameter(0, tuple_shape, "t"); - TF_ASSERT_OK(Any(cond.Eq(cond.GetTupleElement(cond_t, 0), - cond.ConstantR1({42, 42})), - &cond) - .status()); + auto cond_t = Parameter(&cond, 0, tuple_shape, "t"); + Any(Eq(GetTupleElement(cond_t, 0), ConstantR1(&cond, {42, 42}))); XlaBuilder body("body"); - auto body_t = body.Parameter(0, tuple_shape, "t"); - auto e = body.GetTupleElement(body_t, 1); - body.Tuple({e, e}); + auto body_t = Parameter(&body, 0, tuple_shape, "t"); + auto e = GetTupleElement(body_t, 1); + Tuple(&body, {e, e}); TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build()); TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build()); - outer.While(cond_computation, body_computation, t); + While(cond_computation, body_computation, t); - auto expected_element = Literal::CreateR1({1, 1}); + auto expected_element = LiteralUtil::CreateR1({1, 1}); auto expected = - Literal::MakeTuple({expected_element.get(), expected_element.get()}); + LiteralUtil::MakeTuple({expected_element.get(), expected_element.get()}); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr parameter_data, - client_->TransferToServer(*Literal::CreateR1({42, 42}))); + 
client_->TransferToServer(*LiteralUtil::CreateR1({42, 42}))); ComputeAndCompareTuple(&outer, *expected, {parameter_data.get()}, ErrorSpec(1e-6)); } @@ -993,24 +989,23 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) { auto element_shape = ShapeUtil::MakeShape(F32, {2}); XlaBuilder outer("outer"); - auto p = outer.Parameter(0, element_shape, "param"); + auto p = Parameter(&outer, 0, element_shape, "param"); XlaBuilder cond("cond"); - auto cond_t = cond.Parameter(0, element_shape, "t"); - TF_ASSERT_OK( - Any(cond.Eq(cond_t, cond.ConstantR1({42, 42})), &cond).status()); + auto cond_t = Parameter(&cond, 0, element_shape, "t"); + Any(Eq(cond_t, ConstantR1(&cond, {42, 42}))); XlaBuilder body("body"); - auto body_t = body.Parameter(0, element_shape, "t"); - auto e = body.Broadcast(body.ConstantR0(1.0), {2}); + Parameter(&body, 0, element_shape, "t"); + Broadcast(ConstantR0(&body, 1.0), {2}); TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build()); TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build()); - outer.While(cond_computation, body_computation, p); + While(cond_computation, body_computation, p); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr parameter_data, - client_->TransferToServer(*Literal::CreateR1({42, 42}))); + client_->TransferToServer(*LiteralUtil::CreateR1({42, 42}))); ComputeAndCompareR1(&outer, {1.0f, 1.0f}, {parameter_data.get()}, ErrorSpec(1e-6)); } @@ -1019,25 +1014,24 @@ TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) { auto element_shape = ShapeUtil::MakeShape(F32, {}); XlaBuilder outer("outer"); - auto p = outer.Parameter(0, element_shape, "param"); + auto p = Parameter(&outer, 0, element_shape, "param"); XlaBuilder cond("cond"); - auto cond_t = cond.Parameter(0, element_shape, "t"); - cond.Eq(cond_t, cond.ConstantR0(42)); + auto cond_t = Parameter(&cond, 0, element_shape, "t"); + Eq(cond_t, ConstantR0(&cond, 42)); XlaBuilder body("body"); - auto body_t = body.Parameter(0, element_shape, "t"); - auto tuple = - 
body.Tuple({body_t, body.Add(body_t, body.ConstantR0(1))}); - auto e = body.GetTupleElement(tuple, 1); + auto body_t = Parameter(&body, 0, element_shape, "t"); + auto tuple = Tuple(&body, {body_t, Add(body_t, ConstantR0(&body, 1))}); + GetTupleElement(tuple, 1); TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build()); TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build()); - outer.While(cond_computation, body_computation, p); + While(cond_computation, body_computation, p); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr parameter_data, - client_->TransferToServer(*Literal::CreateR0(42))); + client_->TransferToServer(*LiteralUtil::CreateR0(42))); ComputeAndCompareR0(&outer, 43.0f, {parameter_data.get()}, ErrorSpec(1e-6)); } @@ -1056,33 +1050,31 @@ TEST_F(WhileTest, WhileWithMixedTupleElements) { XlaBuilder outer("outer"); auto p = - outer.Tuple({outer.ConstantR0(0), - outer.Parameter(0, ShapeUtil::MakeShape(S32, {}), "t")}); + Tuple(&outer, {ConstantR0(&outer, 0), + Parameter(&outer, 0, ShapeUtil::MakeShape(S32, {}), "t")}); XlaBuilder cond("cond"); - auto params = cond.Parameter(0, result_shape, "prev"); - auto cond_t = cond.Add(cond.GetTupleElement(params, 1), - cond.GetTupleElement(params, 0)); - cond.Lt(cond_t, cond.ConstantR0(30)); + auto params = Parameter(&cond, 0, result_shape, "prev"); + auto cond_t = Add(GetTupleElement(params, 1), GetTupleElement(params, 0)); + Lt(cond_t, ConstantR0(&cond, 30)); XlaBuilder body("body"); - auto body_t = body.Parameter(0, result_shape, "t"); + auto body_t = Parameter(&body, 0, result_shape, "t"); - auto tuple = body.Tuple( - {body.Add(body.GetTupleElement(body_t, 0), body.ConstantR0(1)), - body.Add(body.GetTupleElement(body_t, 1), body.ConstantR0(1))}); + Tuple(&body, {Add(GetTupleElement(body_t, 0), ConstantR0(&body, 1)), + Add(GetTupleElement(body_t, 1), ConstantR0(&body, 1))}); TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build()); TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build()); - 
outer.While(cond_computation, body_computation, p); + While(cond_computation, body_computation, p); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr parameter_data, - client_->TransferToServer(*Literal::CreateR0(1))); + client_->TransferToServer(*LiteralUtil::CreateR0(1))); - auto add1 = Literal::CreateR0(15); - auto add2 = Literal::CreateR0(16); - auto expected = Literal::MakeTuple({add1.get(), add2.get()}); + auto add1 = LiteralUtil::CreateR0(15); + auto add2 = LiteralUtil::CreateR0(16); + auto expected = LiteralUtil::MakeTuple({add1.get(), add2.get()}); ComputeAndCompareTuple(&outer, *expected, {parameter_data.get()}, ErrorSpec(1e-6)); } @@ -1105,9 +1097,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) { XlaComputation inner_condition; { XlaBuilder builder("inner_condition"); - auto params = builder.Parameter(0, inner_result_shape, "prev"); - auto i = builder.GetTupleElement(params, 0); - builder.Lt(i, builder.ConstantR0(7)); + auto params = Parameter(&builder, 0, inner_result_shape, "prev"); + auto i = GetTupleElement(params, 0); + Lt(i, ConstantR0(&builder, 7)); inner_condition = builder.Build().ConsumeValueOrDie(); } @@ -1116,8 +1108,8 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) { XlaComputation outer_condition; { XlaBuilder builder("outer_condition"); - auto prev = builder.Parameter(0, outer_result_shape, "prev"); - builder.Lt(prev, builder.ConstantR0(30)); + auto prev = Parameter(&builder, 0, outer_result_shape, "prev"); + Lt(prev, ConstantR0(&builder, 30)); outer_condition = builder.Build().ConsumeValueOrDie(); } @@ -1126,12 +1118,12 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) { XlaComputation inner_body; { XlaBuilder builder("inner_body"); - auto params = builder.Parameter(0, inner_result_shape, "prev"); - auto i = builder.GetTupleElement(params, 0); - auto result = builder.GetTupleElement(params, 1); - i = builder.Add(builder.ConstantR0(1), i); - result = builder.Add(builder.ConstantR0(2), result); - builder.Tuple({i, result}); + auto 
params = Parameter(&builder, 0, inner_result_shape, "prev"); + auto i = GetTupleElement(params, 0); + auto result = GetTupleElement(params, 1); + i = Add(ConstantR0(&builder, 1), i); + result = Add(ConstantR0(&builder, 2), result); + Tuple(&builder, {i, result}); inner_body = builder.Build().ConsumeValueOrDie(); } @@ -1139,17 +1131,17 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) { XlaComputation outer_body; { XlaBuilder builder("outer_body"); - auto prev = builder.Parameter(0, outer_result_shape, "prev"); - auto init = builder.Tuple({builder.ConstantR0(0), prev}); - auto result = builder.While(inner_condition, inner_body, init); - builder.GetTupleElement(result, 1); + auto prev = Parameter(&builder, 0, outer_result_shape, "prev"); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), prev}); + auto result = While(inner_condition, inner_body, init); + GetTupleElement(result, 1); outer_body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder(TestName()); - auto init = builder.ConstantR0(0); - builder.While(outer_condition, outer_body, init); + auto init = ConstantR0(&builder, 0); + While(outer_condition, outer_body, init); ComputeAndCompareR0(&builder, 42, {}); } @@ -1167,8 +1159,8 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) { XlaComputation condition_callee; { XlaBuilder builder("condition_callee"); - auto prev = builder.Parameter(0, result_shape, "prev"); - builder.Tuple({builder.Gt(builder.ConstantR0(5), prev)}); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + Tuple(&builder, {Gt(ConstantR0(&builder, 5), prev)}); condition_callee = builder.Build().ConsumeValueOrDie(); } @@ -1176,9 +1168,9 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto result = builder.Call(condition_callee, {prev}); - builder.GetTupleElement(result, 0); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto result = Call(&builder, condition_callee, {prev}); + GetTupleElement(result, 0); condition = builder.Build().ConsumeValueOrDie(); } @@ -1186,16 +1178,16 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, result_shape, "prev"); - auto input = builder.ConstantR0(1); - builder.Add(input, prev); + auto prev = Parameter(&builder, 0, result_shape, "prev"); + auto input = ConstantR0(&builder, 1); + Add(input, prev); body = builder.Build().ConsumeValueOrDie(); } // Create a While node with computations for the condition and the body. 
XlaBuilder builder(TestName()); - auto init = builder.ConstantR0(0); - builder.While(condition, body, init); + auto init = ConstantR0(&builder, 0); + While(condition, body, init); ComputeAndCompareR0(&builder, 5, {}); } @@ -1210,40 +1202,69 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) { XlaComputation condition; { XlaBuilder builder("condition"); - auto state = builder.Parameter(0, while_shape, "state"); - builder.Gt(builder.ConstantR0(5), builder.GetTupleElement(state, 0)); + auto state = Parameter(&builder, 0, while_shape, "state"); + Gt(ConstantR0(&builder, 5), GetTupleElement(state, 0)); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } XlaComputation body; { XlaBuilder builder("body"); - auto state = builder.Parameter(0, while_shape, "state"); - auto indvar = builder.GetTupleElement(state, 0); - auto input_0 = builder.GetTupleElement(state, 1); - auto input_1 = builder.GetTupleElement(state, 2); - auto output = builder.Tanh(builder.Dot(input_0, input_1)); - auto indvar_next = builder.Add(indvar, builder.ConstantR0(1)); - builder.Tuple({indvar_next, input_0, input_1, output}); + auto state = Parameter(&builder, 0, while_shape, "state"); + auto indvar = GetTupleElement(state, 0); + auto input_0 = GetTupleElement(state, 1); + auto input_1 = GetTupleElement(state, 2); + auto output = Tanh(Dot(input_0, input_1)); + auto indvar_next = Add(indvar, ConstantR0(&builder, 1)); + Tuple(&builder, {indvar_next, input_0, input_1, output}); TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } XlaBuilder builder(TestName()); - auto matrix_input = builder.Parameter(0, matrix_shape, "matrix"); - auto init = builder.Tuple( - {builder.ConstantR0(0), matrix_input, matrix_input, matrix_input}); - auto while_instruction = builder.While(condition, body, init); - builder.GetTupleElement(while_instruction, 3); + auto matrix_input = Parameter(&builder, 0, matrix_shape, "matrix"); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), matrix_input, + matrix_input, 
matrix_input}); + auto while_instruction = While(condition, body, init); + GetTupleElement(while_instruction, 3); - TF_ASSERT_OK_AND_ASSIGN(auto param_value, - client_->TransferToServer(*Literal::CreateR2( - {{1.0, 2.0}, {-1.0, -2.0}}))); + TF_ASSERT_OK_AND_ASSIGN( + auto param_value, client_->TransferToServer(*LiteralUtil::CreateR2( + {{1.0, 2.0}, {-1.0, -2.0}}))); ComputeAndCompareR2( &builder, {{-0.76159416, -0.96402758}, {0.76159416, 0.96402758}}, {param_value.get()}, ErrorSpec(4e-5)); } +TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) { + auto while_shape = ShapeUtil::MakeShape(S32, {}); + + XlaComputation condition; + { + XlaBuilder builder("condition"); + Parameter(&builder, 0, while_shape, "state"); + Infeed(&builder, ShapeUtil::MakeShape(PRED, {})); + TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); + } + + XlaComputation body; + { + XlaBuilder builder("body"); + auto indvar = Parameter(&builder, 0, while_shape, "state"); + Add(indvar, ConstantR0(&builder, 1)); + TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); + } + + XlaBuilder builder(TestName()); + While(condition, body, ConstantR0(&builder, 0)); + + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(true))); + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(true))); + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(false))); + + ComputeAndCompareR0(&builder, 2, {}); +} + void BM_WhileLoop(int num_iters) { // Benchmark a simple kernel to measure while loop overheads. 
tensorflow::testing::StopTiming(); @@ -1264,9 +1285,9 @@ void BM_WhileLoop(int num_iters) { XlaComputation condition; { XlaBuilder builder("condition"); - auto prev = builder.Parameter(0, loop_state_shape, "prev"); - auto iteration = builder.GetTupleElement(prev, 0); - builder.Lt(iteration, builder.ConstantR0(loop_limit)); + auto prev = Parameter(&builder, 0, loop_state_shape, "prev"); + auto iteration = GetTupleElement(prev, 0); + Lt(iteration, ConstantR0(&builder, loop_limit)); condition = builder.Build().ConsumeValueOrDie(); } @@ -1274,29 +1295,29 @@ void BM_WhileLoop(int num_iters) { XlaComputation body; { XlaBuilder builder("body"); - auto prev = builder.Parameter(0, loop_state_shape, "prev"); + auto prev = Parameter(&builder, 0, loop_state_shape, "prev"); // TupleElement 0 - auto iteration = builder.GetTupleElement(prev, 0); - auto out0 = builder.Add(iteration, builder.ConstantR0(1)); + auto iteration = GetTupleElement(prev, 0); + auto out0 = Add(iteration, ConstantR0(&builder, 1)); // TupleElement 1 - auto input = builder.GetTupleElement(prev, 1); + auto input = GetTupleElement(prev, 1); // Update. - auto one = builder.ConstantR0(1.0); - auto update = builder.Broadcast(one, {1, 1024, 1024}); + auto one = ConstantR0(&builder, 1.0); + auto update = Broadcast(one, {1, 1024, 1024}); // Starts = iteration * 2; - auto starts = builder.ConstantR1({0, 0, 0}); + auto starts = ConstantR1(&builder, {0, 0, 0}); // UpdateSlice. - auto out1 = builder.DynamicUpdateSlice(input, update, starts); - builder.Tuple({out0, out1}); + auto out1 = DynamicUpdateSlice(input, update, starts); + Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); } // Create a While instruction. 
XlaBuilder builder("while"); - auto zero = builder.ConstantR0(0.0); - auto input = builder.Broadcast(zero, {seq_len, 1024, 1024}); - auto init = builder.Tuple({builder.ConstantR0(0), input}); - builder.While(condition, body, init); + auto zero = ConstantR0(&builder, 0.0); + auto input = Broadcast(zero, {seq_len, 1024, 1024}); + auto init = Tuple(&builder, {ConstantR0(&builder, 0), input}); + While(condition, body, init); auto computation = builder.Build().ConsumeValueOrDie(); std::unique_ptr executable = diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 3c9a01653c67203cbc962a3d3d967142f7a2102c..7fd42944debe38abbf6f0ca36bc5c7ecb1aeaf97 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -16,19 +16,23 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/gtl/flatmap.h" -#include "tensorflow/core/lib/strings/str_util.h" #include 
"tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -79,10 +83,11 @@ struct ParsedProfileOutputLine { Status ParseOneProfileOutputLine( const string& line, bool expect_hlo, - gtl::FlatMap* parsed_results) { + gtl::FlatMap* parsed_results, + absl::Span opcodes_to_ignore = {}) { string separator = "[^:]*:: +"; - string match_percentage = "\\d+\\.\\d\\d%"; - string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)"; + string match_percentage = R"(\d+\.\d*% +\d+Σ)"; + string match_cycles = R"((\d+) cycles +\( *()" + match_percentage + R"()\))"; string match_usecs = "([0-9.]+) usec"; string match_flops = "([^ ]*)"; string match_trops = "([^ ]*)"; @@ -96,7 +101,7 @@ Status ParseOneProfileOutputLine( string match_opcode = expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])"; - string regexp_pattern = tensorflow::strings::StrCat( + string regexp_pattern = absl::StrCat( " +", match_cycles, separator, match_usecs, separator, match_flops, separator, match_trops, separator, match_bytes_per_sec, separator, match_bytes_per_cycle, separator, match_opcode); @@ -113,7 +118,9 @@ Status ParseOneProfileOutputLine( ", Regexp: ", regexp_pattern); } - InsertOrDie(parsed_results, parsed_line.opcode, parsed_line); + if (!absl::c_linear_search(opcodes_to_ignore, parsed_line.opcode)) { + InsertOrDie(parsed_results, parsed_line.opcode, parsed_line); + } return Status::OK(); } @@ -128,20 +135,23 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, se::StreamExecutor* executor = backend->default_stream_executor(); DeviceMemoryAllocator* allocator = backend->memory_allocator(); auto* transfer_manager = backend->transfer_manager(); + TF_ASSERT_OK_AND_ASSIGN( + StreamPool::Ptr stream_ptr, + backend->BorrowStream(backend->default_device_ordinal())); TF_ASSERT_OK_AND_ASSIGN( ScopedShapedBuffer lhs_arg, transfer_manager->AllocateScopedShapedBuffer( lhs_arg_shape, allocator, 
backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg)); + stream_ptr.get(), *Literal::CreateFromShape(lhs_arg_shape), lhs_arg)); TF_ASSERT_OK_AND_ASSIGN( ScopedShapedBuffer rhs_arg, transfer_manager->AllocateScopedShapedBuffer( rhs_arg_shape, allocator, backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); + stream_ptr.get(), *Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr local_executable, @@ -153,9 +163,6 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, &executable->hlo_profile_printer_data(), &executable->hlo_profile_index_map()); - TF_ASSERT_OK_AND_ASSIGN( - Backend::StreamPtr stream_ptr, - backend->BorrowStream(backend->default_device_ordinal())); ExecutableRunOptions exec_run_options; exec_run_options.set_stream(stream_ptr.get()); exec_run_options.set_allocator(backend->memory_allocator()); @@ -164,10 +171,11 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, ServiceExecutableRunOptions run_options( exec_run_options, /*borrow_stream=*/nullptr, backend->eigen_intra_op_thread_pool()); + std::vector args = {&lhs_arg, &rhs_arg}; TF_ASSERT_OK_AND_ASSIGN( auto execution_result, - executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg}, - &hlo_execution_profile)); + executable->ExecuteOnStream(&run_options, args, &hlo_execution_profile)); + TF_ASSERT_OK(stream_ptr->BlockHostUntilDone()); (void)execution_result; *profile_output = @@ -187,9 +195,9 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { ClientLibrary::GetOrCreateLocalClient(platform)); XlaBuilder builder(TestName()); - auto result = builder.Tanh(builder.Add( - builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), - builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), 
"dot_rhs"))); + Tanh(Add( + Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), + Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs"))); TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); @@ -198,7 +206,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { rhs_shape); std::vector profile_output_lines = - tensorflow::str_util::Split(profile_output, '\n'); + absl::StrSplit(profile_output, '\n'); gtl::FlatMap parsed_profile_lines; @@ -219,7 +227,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { MaybeFind(parsed_profile_lines, "tanh")); EXPECT_GT(total_profile.cycles, 0); - EXPECT_EQ(total_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ"); EXPECT_TRUE(HasFlops(total_profile)); EXPECT_TRUE(HasTrops(total_profile)); @@ -239,9 +247,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { EXPECT_TRUE(HasTrops(tanh_profile)); } -// TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo -// instructions "interior" to while nodes. 
-XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { +XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { const int64 size = 256; Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size}); Shape while_result_shape = @@ -255,30 +261,30 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { XlaComputation condition; { XlaBuilder builder("condition"); - auto state = builder.Parameter(0, while_result_shape, "state"); - auto iteration = builder.GetTupleElement(state, 0); - builder.Gt(builder.ConstantR0(5), iteration); + auto state = Parameter(&builder, 0, while_result_shape, "state"); + auto iteration = GetTupleElement(state, 0); + Gt(ConstantR0(&builder, 5), iteration); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } XlaComputation body; { XlaBuilder builder("body"); - auto state = builder.Parameter(0, while_result_shape, "state"); - auto matrix = builder.GetTupleElement(state, 1); - auto next_iteration = builder.Add(builder.GetTupleElement(state, 0), - builder.ConstantR0(1)); - builder.Tuple({next_iteration, builder.Add(matrix, matrix)}); + auto state = Parameter(&builder, 0, while_result_shape, "state"); + auto matrix = GetTupleElement(state, 1); + auto next_iteration = + Add(GetTupleElement(state, 0), ConstantR0(&builder, 1)); + Tuple(&builder, {next_iteration, Mul(matrix, matrix)}); TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } XlaBuilder builder(TestName()); auto initial_while_state = - builder.Tuple({builder.ConstantR0(0), - builder.Parameter(0, matrix_shape, "initial_value")}); - auto while_result = builder.While(condition, body, initial_while_state); - builder.Add(builder.GetTupleElement(while_result, 1), - builder.Parameter(1, matrix_shape, "other_value")); + Tuple(&builder, {ConstantR0(&builder, 0), + Parameter(&builder, 0, matrix_shape, "initial_value")}); + auto while_result = While(condition, body, initial_while_state); + Add(GetTupleElement(while_result, 1), + Parameter(&builder, 1, matrix_shape, 
"other_value")); TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); @@ -287,39 +293,51 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { matrix_shape); std::vector profile_output_lines = - tensorflow::str_util::Split(profile_output, '\n'); + absl::StrSplit(profile_output, '\n'); auto while_body_profile_start = - std::find_if(profile_output_lines.begin(), profile_output_lines.end(), - [](tensorflow::StringPiece s) { - return tensorflow::str_util::StartsWith( - s, "Execution profile for body"); - }); + absl::c_find_if(profile_output_lines, [](absl::string_view s) { + return absl::StartsWith(s, "Execution profile for body"); + }); - ASSERT_NE(while_body_profile_start, profile_output_lines.end()); + ASSERT_NE(while_body_profile_start, profile_output_lines.cend()); - gtl::FlatMap parsed_profile_lines; + auto while_body_profile_end = std::find_if( + while_body_profile_start, profile_output_lines.end(), + [](absl::string_view s) { + return absl::StartsWith(s, "********** microseconds report **********"); + }); + + // We emit a blank line before the "********** microseconds report **********" + // line. + while_body_profile_end--; - TF_ASSERT_OK( - ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1), - /*expect_hlo=*/false, &parsed_profile_lines)); + ASSERT_NE(while_body_profile_end, profile_output_lines.end()); - TF_ASSERT_OK( - ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2), - /*expect_hlo=*/true, &parsed_profile_lines)); + gtl::FlatMap parsed_profile_lines; + + for (auto while_body_profile_i = while_body_profile_start + 1; + while_body_profile_i != while_body_profile_end; while_body_profile_i++) { + // There are multiple "get-tuple-element" instructions in the while body so + // we ignore them -- we don't want parsed_profile_lines to be a multi-map. 
+ TF_ASSERT_OK(ParseOneProfileOutputLine( + *while_body_profile_i, + /*expect_hlo=*/while_body_profile_i != (while_body_profile_start + 1), + &parsed_profile_lines, {"get-tuple-element"})); + } TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile, MaybeFind(parsed_profile_lines, "[total]")); - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile, - MaybeFind(parsed_profile_lines, "add")); + TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine multiply_profile, + MaybeFind(parsed_profile_lines, "multiply")); EXPECT_GT(total_while_body_profile.cycles, 0); EXPECT_EQ(total_while_body_profile.opcode, "[total]"); - EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ"); - EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles); - EXPECT_NE(dot_profile.cycles_percentage, "0.00%"); - EXPECT_NE(dot_profile.cycles_percentage, "100.00%"); + EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles); + EXPECT_NE(multiply_profile.cycles_percentage, "0.00%"); + EXPECT_NE(multiply_profile.cycles_percentage, "100.00%"); } } // namespace } // namespace xla @@ -336,8 +354,11 @@ static std::pair AddXlaHloProfileFlag(int argc, char** argv) { new_argv[argc] = strdup("--xla_hlo_profile"); // Fusion can change the Hlo instructions that show up in the final Hlo - // executable, so block it here. - new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion"); + // executable, so block it here. Also block the WhileLoopInvariantCodeMotion + // pass, otherwise a while loop is transformed and we could not match the + // original name in the ProfileWhileComputation test. 
+ new_argv[argc + 1] = strdup( + "--xla_disable_hlo_passes=fusion,while-loop-invariant-code-motion"); return {argc + 2, new_argv}; } diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc index a9f2915b458b1816926de727b3da21982d06f6c0..15603619b62d8f45cdce97ac7d83924a78f88cf3 100644 --- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc +++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -32,16 +32,14 @@ GTEST_API_ int main(int argc, char** argv) { // If the --benchmarks flag is passed in then only run the benchmarks, not the // tests. for (int i = 1; i < argc; i++) { - tensorflow::StringPiece arg(argv[i]); - if (arg == "--benchmarks" || - tensorflow::str_util::StartsWith(arg, "--benchmarks=")) { + absl::string_view arg(argv[i]); + if (arg == "--benchmarks" || absl::StartsWith(arg, "--benchmarks=")) { const char* pattern = nullptr; - if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) { + if (absl::StartsWith(arg, "--benchmarks=")) { pattern = argv[i] + strlen("--benchmarks="); } else { // Handle flag of the form '--benchmarks foo' (no '='). 
- if (i + 1 >= argc || - tensorflow::str_util::StartsWith(argv[i + 1], "--")) { + if (i + 1 >= argc || absl::StartsWith(argv[i + 1], "--")) { LOG(ERROR) << "--benchmarks flag requires an argument."; return 2; } @@ -49,6 +47,7 @@ GTEST_API_ int main(int argc, char** argv) { } // Unfortunately Google's internal benchmark infrastructure has a // different API than Tensorflow's. + testing::InitGoogleTest(&argc, argv); #if defined(PLATFORM_GOOGLE) base::SetFlag(&FLAGS_benchmarks, pattern); RunSpecifiedBenchmarks(); diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc index 56702feab9a4e8d00df3a165ab994aef2d42d830..442e66321ee732f3d9cdfe4931433bd864b7fa82 100644 --- a/tensorflow/compiler/xla/text_literal_reader.cc +++ b/tensorflow/compiler/xla/text_literal_reader.cc @@ -20,25 +20,28 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" +#include "absl/memory/memory.h" +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/buffered_inputstream.h" #include "tensorflow/core/lib/io/random_inputstream.h" -#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" namespace xla { StatusOr> TextLiteralReader::ReadPath( - tensorflow::StringPiece path) { - CHECK(!tensorflow::str_util::EndsWith(path, ".gz")) + absl::string_view path) { + CHECK(!absl::EndsWith(path, ".gz")) << 
"TextLiteralReader no longer supports reading .gz files"; std::unique_ptr file; Status s = @@ -54,33 +57,6 @@ StatusOr> TextLiteralReader::ReadPath( TextLiteralReader::TextLiteralReader(tensorflow::RandomAccessFile* file) : file_(file) {} -namespace { -// This is an optimized version of tensorflow::str_util::Split which uses -// StringPiece for the delimited strings and uses an out parameter for the -// result to avoid vector creation/destruction. -void SplitByDelimToStringPieces(tensorflow::StringPiece text, char delim, - std::vector* result) { - result->clear(); - - if (text.empty()) { - return; - } - - // The following loop is a little strange: its bound is text.size() + 1 - // instead of the more typical text.size(). - // The final iteration of the loop (when i is equal to text.size()) handles - // the trailing token. - size_t token_start = 0; - for (size_t i = 0; i < text.size() + 1; i++) { - if (i == text.size() || text[i] == delim) { - tensorflow::StringPiece token(text.data() + token_start, i - token_start); - result->push_back(token); - token_start = i + 1; - } - } -} -} // namespace - StatusOr> TextLiteralReader::ReadAllLines() { tensorflow::io::RandomAccessInputStream stream(file_.get()); tensorflow::io::BufferedInputStream buf(&stream, 65536); @@ -90,61 +66,55 @@ StatusOr> TextLiteralReader::ReadAllLines() { return s; } - tensorflow::StringPiece sp(shape_string); - if (tensorflow::str_util::RemoveWhitespaceContext(&sp) > 0) { - string tmp = std::string(sp); - shape_string = tmp; - } + absl::StripAsciiWhitespace(&shape_string); TF_ASSIGN_OR_RETURN(Shape shape, ShapeUtil::ParseShapeString(shape_string)); if (shape.element_type() != F32) { return Unimplemented( "unsupported element type for text literal reading: %s", - ShapeUtil::HumanString(shape).c_str()); + ShapeUtil::HumanString(shape)); } - auto result = MakeUnique(shape); + auto result = absl::make_unique(shape); const float fill = std::numeric_limits::quiet_NaN(); result->PopulateWithValue(fill); - 
std::vector pieces; - std::vector coordinates; + std::vector pieces; + std::vector coordinates; std::vector coordinate_values; string line; while (buf.ReadLine(&line).ok()) { - SplitByDelimToStringPieces(line, ':', &pieces); - tensorflow::StringPiece coordinates_string = pieces[0]; - tensorflow::StringPiece value_string = pieces[1]; - tensorflow::str_util::RemoveWhitespaceContext(&coordinates_string); - tensorflow::str_util::RemoveWhitespaceContext(&value_string); - if (!tensorflow::str_util::ConsumePrefix(&coordinates_string, "(")) { + pieces = absl::StrSplit(line, ':'); + absl::string_view coordinates_string = + absl::StripAsciiWhitespace(pieces[0]); + absl::string_view value_string = absl::StripAsciiWhitespace(pieces[1]); + if (!absl::ConsumePrefix(&coordinates_string, "(")) { return InvalidArgument( - "expected '(' at the beginning of coordinates: \"%s\"", line.c_str()); + "expected '(' at the beginning of coordinates: \"%s\"", line); } - if (!tensorflow::str_util::ConsumeSuffix(&coordinates_string, ")")) { + if (!absl::ConsumeSuffix(&coordinates_string, ")")) { return InvalidArgument("expected ')' at the end of coordinates: \"%s\"", - line.c_str()); + line); } float value; - if (!tensorflow::strings::safe_strtof(std::string(value_string).c_str(), - &value)) { + if (!absl::SimpleAtof(value_string, &value)) { return InvalidArgument("could not parse value as float: \"%s\"", - std::string(value_string).c_str()); + value_string); } - SplitByDelimToStringPieces(coordinates_string, ',', &coordinates); + coordinates = absl::StrSplit(coordinates_string, ','); coordinate_values.clear(); - for (tensorflow::StringPiece piece : coordinates) { + for (absl::string_view piece : coordinates) { int64 coordinate_value; - if (!tensorflow::strings::safe_strto64(piece, &coordinate_value)) { + if (!absl::SimpleAtoi(piece, &coordinate_value)) { return InvalidArgument( "could not parse coordinate member as int64: \"%s\"", - std::string(piece).c_str()); + std::string(piece)); } 
coordinate_values.push_back(coordinate_value); } if (coordinate_values.size() != shape.dimensions_size()) { return InvalidArgument( - "line did not have expected number of coordinates; want %d got %zu: " + "line did not have expected number of coordinates; want %d got %u: " "\"%s\"", - shape.dimensions_size(), coordinate_values.size(), line.c_str()); + shape.dimensions_size(), coordinate_values.size(), line); } result->Set(coordinate_values, value); } diff --git a/tensorflow/compiler/xla/text_literal_reader.h b/tensorflow/compiler/xla/text_literal_reader.h index e45e5291c9b10803f5e5008b72c7dd0116a0dea0..b265640802c88847ce57e9f942f9f0859b873ae8 100644 --- a/tensorflow/compiler/xla/text_literal_reader.h +++ b/tensorflow/compiler/xla/text_literal_reader.h @@ -18,11 +18,11 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" @@ -41,8 +41,7 @@ class TextLiteralReader { public: // See class comment -- reads a file in its entirety (there must be only one // literal in the text file path provided). - static StatusOr> ReadPath( - tensorflow::StringPiece path); + static StatusOr> ReadPath(absl::string_view path); private: // Ownership of file is transferred. diff --git a/tensorflow/compiler/xla/text_literal_reader_test.cc b/tensorflow/compiler/xla/text_literal_reader_test.cc index 23070b663870a2b78b38663e09a32fcb28d9c2dc..92f9b4f9f0efa2dc08287bdcbefc88f879164308 100644 --- a/tensorflow/compiler/xla/text_literal_reader_test.cc +++ b/tensorflow/compiler/xla/text_literal_reader_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc index 373c0d2d8d8ab05dec11e51f265d41b91e7920bf..7289ae7df65e56652eeeb67e536e4c721d97d999 100644 --- a/tensorflow/compiler/xla/text_literal_writer.cc +++ b/tensorflow/compiler/xla/text_literal_writer.cc @@ -17,23 +17,23 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/types.h" namespace xla { -/* static */ Status TextLiteralWriter::WriteToPath( - const Literal& literal, tensorflow::StringPiece path) { +/* static */ Status TextLiteralWriter::WriteToPath(const Literal& literal, + absl::string_view path) { std::unique_ptr f; - auto s = tensorflow::Env::Default()->NewWritableFile(std::string(path), &f); + auto s = tensorflow::Env::Default()->NewWritableFile(string(path), &f); if (!s.ok()) { return s; } @@ -46,16 +46,14 @@ namespace xla { Status status; tensorflow::WritableFile* f_ptr = f.get(); literal.EachCellAsString( - [f_ptr, &status](tensorflow::gtl::ArraySlice indices, - const string& value) { + [f_ptr, &status](absl::Span indices, const string& value) { if (!status.ok()) { return; } - string coordinates = 
tensorflow::strings::StrCat( - "(", tensorflow::str_util::Join(indices, ", "), ")"); + string coordinates = + absl::StrCat("(", absl::StrJoin(indices, ", "), ")"); - status = f_ptr->Append( - tensorflow::strings::StrCat(coordinates, ": ", value, "\n")); + status = f_ptr->Append(absl::StrCat(coordinates, ": ", value, "\n")); }); auto ignored = f->Close(); return status; diff --git a/tensorflow/compiler/xla/text_literal_writer.h b/tensorflow/compiler/xla/text_literal_writer.h index 0a1235b5e04675da0f412bafab6c4ecf04367787..34de8572d638067b327711017ee173b16c8da21e 100644 --- a/tensorflow/compiler/xla/text_literal_writer.h +++ b/tensorflow/compiler/xla/text_literal_writer.h @@ -16,11 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_ #define TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_ -#include "tensorflow/compiler/xla/literal_util.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/macros.h" namespace xla { @@ -37,8 +37,7 @@ namespace xla { // This should be readable by xla::TextLiteralReader. class TextLiteralWriter { public: - static Status WriteToPath(const Literal& literal, - tensorflow::StringPiece path); + static Status WriteToPath(const Literal& literal, absl::string_view path); private: TF_DISALLOW_COPY_AND_ASSIGN(TextLiteralWriter); diff --git a/tensorflow/compiler/xla/text_literal_writer_test.cc b/tensorflow/compiler/xla/text_literal_writer_test.cc index 70cf2fb1b8a1b4f2ecfdaeaef3a00ddc974e2652..4ea02faffcd52065b05c0444202bd1a3d9d87ee6 100644 --- a/tensorflow/compiler/xla/text_literal_writer_test.cc +++ b/tensorflow/compiler/xla/text_literal_writer_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -30,8 +31,9 @@ namespace xla { namespace { TEST(TextLiteralWriterTest, WritesFloatLiteral) { - auto literal = Literal::CreateR2({ - {3.14, 2.17}, {1.23, 4.56}, + auto literal = LiteralUtil::CreateR2({ + {3.14, 2.17}, + {1.23, 4.56}, }); string path = tensorflow::io::JoinPath(tensorflow::testing::TmpDir(), "/whatever"); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 15b9cd42650af7db8624abe1bfb6f31292ff67c1..3a086c66bbb37965b1ad7c83a93f0054ae723e87 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -24,6 +24,8 @@ tf_cc_binary( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "@com_google_absl//absl/base", + "@com_google_absl//absl/strings", ], ) @@ -37,10 +39,12 @@ cc_library( "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -66,6 +70,7 @@ tf_cc_binary( "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -74,7 +79,7 @@ cc_library( srcs = ["replay_computation.cc"], deps = [ "//tensorflow/compiler/xla:execution_options_util", - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -84,12 +89,16 @@ 
cc_library( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:testing", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service/gpu:infeed_manager", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], alwayslink = True, ) @@ -122,7 +131,7 @@ tf_cc_binary( name = "show_literal", srcs = ["show_literal.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", @@ -135,7 +144,7 @@ tf_cc_binary( deps = [ "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", ], ) @@ -144,7 +153,7 @@ tf_cc_binary( name = "show_text_literal", srcs = ["show_text_literal.cc"], deps = [ - "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:text_literal_reader", "//tensorflow/compiler/xla:types", @@ -163,11 +172,12 @@ tf_cc_binary( "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service", - "//tensorflow/compiler/xla/service:computation_tracker", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) @@ -181,11 
+191,15 @@ tf_cc_binary( "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", ], ) @@ -198,12 +212,14 @@ tf_cc_binary( "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo_graph_dumper", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/tools/convert_computation.cc b/tensorflow/compiler/xla/tools/convert_computation.cc index fe03a6e7bdfe99877c250fe1ae22beee4c8018a2..14d01b5bfb067cc39abc4d6e0605007624b6e0ae 100644 --- a/tensorflow/compiler/xla/tools/convert_computation.cc +++ b/tensorflow/compiler/xla/tools/convert_computation.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/env.h" @@ -33,7 +33,7 @@ namespace xla { namespace tools { void RealMain(const string& mode, const string& path) { - SessionModule module; + HloSnapshot module; tensorflow::Env* env = tensorflow::Env::Default(); if (mode == "txt2bin") { TF_CHECK_OK(tensorflow::ReadTextProto(env, path, &module)); diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc index befb55453777dce30af89bcaad2ffe1647097576..c866a13de7543fc948311f94708bc6b904717b62 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc @@ -28,16 +28,17 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -45,7 +46,7 @@ limitations under the License. 
namespace xla { namespace tools { -void RealMain(tensorflow::gtl::ArraySlice args) { +void RealMain(absl::Span args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { HloSnapshot module; @@ -76,8 +77,8 @@ int main(int argc, char** argv) { } tensorflow::port::InitMain(argv[0], &argc, &argv); - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index cfb8f37487d6499b803438a135be54524fcf17d2..4375e7c138c9e8d193feaa7a39d63946c4ea3086 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -19,18 +19,19 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -43,16 +44,14 @@ class OperationDumper : public DfsHloVisitorWithDefault { explicit OperationDumper(const string& path) : path_(path) {} Status DefaultAction(HloInstruction* hlo) override { - string params = tensorflow::str_util::Join( + string params = absl::StrJoin( hlo->operands(), ", ", [](string* out, const HloInstruction* operand) { - tensorflow::strings::StrAppend( - out, ShapeUtil::HumanString(operand->shape())); + absl::StrAppend(out, ShapeUtil::HumanString(operand->shape())); }); // Spit `op_name(params...) -> result_type :: path` to stdout. 
- std::cout << tensorflow::strings::Printf( - "%s :: (%s) -> %s :: %s\n", HloOpcodeString(hlo->opcode()).c_str(), - params.c_str(), ShapeUtil::HumanString(hlo->shape()).c_str(), - path_.c_str()); + std::cout << absl::StrFormat("%s :: (%s) -> %s :: %s\n", + HloOpcodeString(hlo->opcode()), params, + ShapeUtil::HumanString(hlo->shape()), path_); return Status::OK(); } @@ -60,7 +59,7 @@ class OperationDumper : public DfsHloVisitorWithDefault { string path_; }; -void RealMain(tensorflow::gtl::ArraySlice args) { +void RealMain(absl::Span args) { LocalClient* client = ClientLibrary::LocalClientOrDie(); LocalService* local_service = ClientLibrary::GetXlaService(client->platform()); @@ -105,8 +104,8 @@ void RealMain(tensorflow::gtl::ArraySlice args) { int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index b815bbf854b82b323da7879c230a1026cae96625..723569862c7550387e95003e3a673743464b67b8 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -17,16 +17,16 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -34,7 +34,7 @@ limitations under the License. namespace xla { namespace tools { -void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { +void RealMain(absl::Span args, bool compile) { LocalClient* client = ClientLibrary::LocalClientOrDie(); LocalService* local_service = ClientLibrary::GetXlaService(client->platform()); @@ -102,8 +102,8 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(usage.c_str(), &argc, &argv); QCHECK(argc > 1) << "\nERROR: must specify at least one module\n" << usage; - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args, compile); return 0; } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc index a5dce20456c6a2402f425ebb3d575d1bb625f839..07ef5ff656bb48519a700a1d7d6c60b655a40ed6 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc @@ -26,15 +26,16 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -44,7 +45,7 @@ using tensorflow::Env; namespace xla { namespace tools { -void RealMain(tensorflow::gtl::ArraySlice args) { +void RealMain(absl::Span args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { HloSnapshot module; @@ -77,8 +78,8 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc index eb7bff053b1fc028fdb6930dbc496c3b6d9fae47..23ce1d235b9f2613505f8a3bfbd1a4c1162debd4 100644 --- a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc +++ b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc @@ -17,10 +17,10 @@ limitations under the License. 
#include #include +#include "absl/base/casts.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/buffered_inputstream.h" #include "tensorflow/core/lib/io/random_inputstream.h" #include "tensorflow/core/platform/env.h" @@ -67,9 +67,8 @@ int main(int argc, char** argv) { floats.push_back(value); } - tensorflow::StringPiece content( - tensorflow::bit_cast(floats.data()), - floats.size() * sizeof(float)); + absl::string_view content(absl::bit_cast(floats.data()), + floats.size() * sizeof(float)); TF_CHECK_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(), output_file, content)); return 0; diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD deleted file mode 100644 index 0fa4b98d0a41a1e7c681bb2302da3b752315867b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/tools/parser/BUILD +++ /dev/null @@ -1,72 +0,0 @@ -# Build file for the Hlo parser. - -licenses(["notice"]) # Apache 2.0 - -package( - default_visibility = [":friends"], -) - -package_group( - name = "friends", - includes = [ - "//tensorflow/compiler/xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. 
-filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -load("//tensorflow:tensorflow.bzl", "tf_cc_test") - -cc_library( - name = "hlo_lexer", - srcs = ["hlo_lexer.cc"], - hdrs = [ - "hlo_lexer.h", - "hlo_token.h", - ], - deps = [ - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", - ], -) - -cc_library( - name = "hlo_parser", - srcs = ["hlo_parser.cc"], - hdrs = ["hlo_parser.h"], - deps = [ - ":hlo_lexer", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - ], -) - -tf_cc_test( - name = "hlo_parser_test", - size = "small", - srcs = ["hlo_parser_test.cc"], - deps = [ - ":hlo_parser", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], -) diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc deleted file mode 100644 index 134978d21f29e700f01fedca4086f1da10d6931b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ /dev/null @@ -1,2718 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" - -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" - -namespace xla { -namespace tools { - -namespace { - -using tensorflow::StringPiece; -using tensorflow::gtl::optional; -using tensorflow::str_util::Join; -using tensorflow::str_util::Split; -using tensorflow::str_util::SplitAndParseAsInts; -using tensorflow::strings::Printf; -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; - -const double kF16max = 65504; - -// Parser for the HloModule::ToString() format text. -class HloParser { - public: - using LocTy = HloLexer::LocTy; - - explicit HloParser(StringPiece str, const HloModuleConfig& config) - : lexer_(str), config_(config) {} - - // Runs the parser. Returns false if an error occurred. - bool Run(); - - // Returns the parsed HloModule. - std::unique_ptr ConsumeHloModule() { return std::move(module_); } - - // Returns the error information. - string GetError() const { return Join(error_, "\n"); } - - // Stand alone parsing for sharding. The parser string is supposed to - // contain the body of the sharding, i.e. just the rhs of the "sharding={...}" - // attribute string. 
- StatusOr ParseShardingOnly(); - - private: - // ParseXXX returns false if an error occurred. - bool ParseHloModule(); - bool ParseComputations(); - bool ParseComputation(HloComputation** entry_computation); - bool ParseInstructionList(HloComputation::Builder* builder, - string* root_name); - bool ParseInstruction(HloComputation::Builder* builder, string* root_name); - bool ParseControlPredecessors(HloInstruction* instruction); - bool ParseLiteral(std::unique_ptr* literal, const Shape& shape); - bool ParseTupleLiteral(std::unique_ptr* literal, const Shape& shape); - bool ParseNonTupleLiteral(std::unique_ptr* literal, - const Shape& shape); - bool ParseDenseLiteral(std::unique_ptr* literal, const Shape& shape); - bool ParseSparseLiteral(std::unique_ptr* literal, - const Shape& shape); - template - bool ParseSparseLiteralHelper(std::unique_ptr* literal, - const Shape& shape); - - // Sets the sub-value of literal at the given index to the given value. The - // literal's shape must have the default layout. - bool SetValueInLiteral(int64 value, int64 linear_index, Literal* literal); - bool SetValueInLiteral(double value, int64 linear_index, Literal* literal); - bool SetValueInLiteral(bool value, int64 linear_index, Literal* literal); - template - bool SetValueInLiteralHelper(ParsedElemT value, int64 linear_index, - Literal* literal); - - bool ParseOperands(std::vector* operands); - // Fills parsed operands into 'operands' and expects a certain number of - // operands. - bool ParseOperands(std::vector* operands, - const int expected_size); - - // Describes the start, limit, and stride on every dimension of the operand - // being sliced. - struct SliceRanges { - std::vector starts; - std::vector limits; - std::vector strides; - }; - - // Types of attributes. 
- enum class AttrTy { - kInt64, - kInt32, - kFloat, - kString, - kBracedInt64List, - kHloComputation, - kFftType, - kWindow, - kConvolutionDimensionNumbers, - kSharding, - kInstructionList, - kSliceRanges, - kPaddingConfig, - kMetadata, - kFusionKind, - kDistribution, - }; - - struct AttrConfig { - bool required; // whether it's required or optional - AttrTy attr_type; // what type it is - void* result; // where to store the parsed result. - }; - - // attributes ::= (',' attribute)* - // - // Parses attributes given names and configs of the attributes. Each parsed - // result is passed back through the result pointer in corresponding - // AttrConfig. Note that the result pointer must point to a optional typed - // variable which outlives this function. Returns false on error. You should - // not use the any of the results if this function failed. - // - // Example usage: - // - // std::unordered_map attrs; - // optional foo; - // attrs["foo"] = {/*required=*/false, AttrTy::kInt64, &foo}; - // optional bar; - // attrs["bar"] = {/*required=*/true, AttrTy::kWindow, &bar}; - // if (!ParseAttributes(attrs)) { - // return false; // Do not use 'foo' 'bar' if failed. - // } - // // Do something with 'bar'. - // if (foo) { // If attr foo is seen, do something with 'foo'. } - // - bool ParseAttributes(const std::unordered_map& attrs); - - // sub_attributes ::= '{' (','? attribute)* '}' - // - // Usage is the same as ParseAttributes. See immediately above. - bool ParseSubAttributes(const std::unordered_map& attrs); - - // Parses one attribute. If it has already been seen, return error. Returns - // true and adds to seen_attrs on success. - // - // Do not call this except in ParseAttributes or ParseSubAttributes. - bool ParseAttributeHelper(const std::unordered_map& attrs, - std::unordered_set* seen_attrs); - - // Parses a name and finds the corresponding hlo computation. 
- bool ParseComputationName(HloComputation** value); - // Parses a list of names and finds the corresponding hlo instructions. - bool ParseInstructionNames(std::vector* instructions); - bool ParseWindow(Window* window); - bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums); - bool ParsePaddingConfig(PaddingConfig* padding); - bool ParseMetadata(OpMetadata* metadata); - bool ParseSharding(OpSharding* sharding); - bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed); - - // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3. - bool ParseDxD(const string& name, std::vector* result); - // Parses window's pad sub-attriute, e.g., pad=0_0x3x3. - bool ParseWindowPad(std::vector>* pad); - - bool ParseSliceRanges(SliceRanges* result); - bool ParseInt64List(const TokKind start, const TokKind end, - const TokKind delim, std::vector* result); - - bool ParseParamListToShape(Shape* shape, LocTy* shape_loc); - bool ParseParamList(); - bool ParseName(string* result); - bool ParseAttributeName(string* result); - bool ParseString(string* result); - bool ParseShape(Shape* result); - bool ParseOpcode(HloOpcode* result); - bool ParseFftType(FftType* result); - bool ParseFusionKind(HloInstruction::FusionKind* result); - bool ParseRandomDistribution(RandomDistribution* result); - bool ParseInt64(int64* result); - bool ParseDouble(double* result); - bool ParseBool(bool* result); - bool ParseToken(TokKind kind, const string& msg); - - // Returns true if the current token is the beginning of a shape. - bool CanBeShape(); - // Returns true if the current token is the beginning of a - // param_list_to_shape. - bool CanBeParamListToShape(); - - // Logs the current parsing line and the given message. Always returns false. - bool TokenError(StringPiece msg); - bool Error(LocTy loc, StringPiece msg); - - // If the current token is 'kind', eats it (i.e. lexes the next token) and - // returns true. 
- bool EatIfPresent(TokKind kind); - // Parses a shape, and returns true if the result is compatible with the given - // shape. - bool EatShapeAndCheckCompatible(const Shape& shape); - - // Adds the instruction to the pool. Returns false and emits an error if the - // instruction already exists. - bool AddInstruction(const string& name, HloInstruction* instruction, - LocTy name_loc); - // Adds the computation to the pool. Returns false and emits an error if the - // computation already exists. - bool AddComputation(const string& name, HloComputation* computation, - LocTy name_loc); - - // The map from the instruction/computation name to the - // instruction/computation itself and it's location. This does not own the - // pointers. - std::unordered_map> - instruction_pool_; - std::unordered_map> - computation_pool_; - - HloLexer lexer_; - std::unique_ptr module_; - std::vector> computations_; - const HloModuleConfig config_; - std::vector error_; -}; - -bool HloParser::Error(LocTy loc, StringPiece msg) { - auto line_col = lexer_.GetLineAndColumn(loc); - const unsigned line = line_col.first; - const unsigned col = line_col.second; - std::vector error_lines; - error_lines.push_back( - StrCat("was parsing ", line, ":", col, ": error: ", msg)); - error_lines.push_back(std::string(lexer_.GetLine(loc))); - error_lines.push_back(col == 0 ? 
"" : StrCat(string(col - 1, ' '), "^")); - - error_.push_back(Join(error_lines, "\n")); - VLOG(1) << "Error: " << error_.back(); - return false; -} - -bool HloParser::TokenError(StringPiece msg) { - return Error(lexer_.GetLoc(), msg); -} - -bool HloParser::Run() { - lexer_.Lex(); - return ParseHloModule(); -} - -// ::= 'HloModule' name computations -bool HloParser::ParseHloModule() { - if (lexer_.GetKind() != TokKind::kw_HloModule) { - return TokenError("expects HloModule"); - } - // Eat 'HloModule' - lexer_.Lex(); - - string name; - if (!ParseName(&name)) { - return false; - } - - module_ = MakeUnique(name, config_); - - return ParseComputations(); -} - -// computations ::= (computation)+ -bool HloParser::ParseComputations() { - HloComputation* entry_computation = nullptr; - do { - if (!ParseComputation(&entry_computation)) { - return false; - } - } while (lexer_.GetKind() != TokKind::kEof); - - for (int i = 0; i < computations_.size(); i++) { - // If entry_computation is not nullptr, it means the computation it pointed - // to is marked with "ENTRY"; otherwise, no computation is marked with - // "ENTRY", and we use the last computation as the entry computation. We - // add the non-entry computations as embedded computations to the module. - if ((entry_computation != nullptr && - computations_[i].get() != entry_computation) || - (entry_computation == nullptr && i != computations_.size() - 1)) { - module_->AddEmbeddedComputation(std::move(computations_[i])); - continue; - } - auto computation = - module_->AddEntryComputation(std::move(computations_[i])); - // The parameters and result layouts were set to default layout. Here we - // set the layouts to what the hlo text says. 
- for (int p = 0; p < computation->num_parameters(); p++) { - const Shape& param_shape = computation->parameter_instruction(p)->shape(); - TF_CHECK_OK(module_->mutable_host_entry_computation_layout() - ->mutable_parameter_layout(p) - ->CopyLayoutFromShape(param_shape)); - TF_CHECK_OK(module_->mutable_device_entry_computation_layout() - ->mutable_parameter_layout(p) - ->CopyLayoutFromShape(param_shape)); - } - const Shape& result_shape = computation->root_instruction()->shape(); - TF_CHECK_OK(module_->mutable_host_entry_computation_layout() - ->mutable_result_layout() - ->CopyLayoutFromShape(result_shape)); - TF_CHECK_OK(module_->mutable_device_entry_computation_layout() - ->mutable_result_layout() - ->CopyLayoutFromShape(result_shape)); - } - - return true; -} - -// computation ::= ('ENTRY')? name (param_list_to_shape)? instruction_list -bool HloParser::ParseComputation(HloComputation** entry_computation) { - LocTy maybe_entry_loc = lexer_.GetLoc(); - const bool is_entry_computation = EatIfPresent(TokKind::kw_ENTRY); - - string name; - LocTy name_loc = lexer_.GetLoc(); - if (!ParseName(&name)) { - return false; - } - auto builder = MakeUnique(name); - - LocTy shape_loc = nullptr; - Shape shape; - if (CanBeParamListToShape() && !ParseParamListToShape(&shape, &shape_loc)) { - return false; - } - - string root_name; - if (!ParseInstructionList(builder.get(), &root_name)) { - return false; - } - - std::pair* root_node = - tensorflow::gtl::FindOrNull(instruction_pool_, root_name); - // This means some instruction was marked as ROOT but we didn't find it in the - // pool, which should not happen. - if (!root_name.empty() && root_node == nullptr) { - LOG(FATAL) << "instruction " << root_name - << " was marked as ROOT but the parser has not seen it before"; - } - - HloInstruction* root = root_node == nullptr ? nullptr : root_node->first; - // Now root can be either an existing instruction or a nullptr. 
If it's a - // nullptr, the implementation of Builder will set the last instruction as - // root instruction. - computations_.emplace_back(builder->Build(root)); - HloComputation* computation = computations_.back().get(); - - if (!root) { - root = computation->root_instruction(); - } else { - CHECK_EQ(root, computation->root_instruction()); - } - - // If param_list_to_shape was present, check compatibility. - if (shape_loc != nullptr && !ShapeUtil::Compatible(root->shape(), shape)) { - return Error( - shape_loc, - StrCat("Shape of computation ", name, ", ", - ShapeUtil::HumanString(shape), - ", is not compatible with that of its root instruction ", - root_name, ", ", ShapeUtil::HumanString(root->shape()))); - } - - if (is_entry_computation) { - if (*entry_computation != nullptr) { - return Error(maybe_entry_loc, "expects only one ENTRY"); - } - *entry_computation = computation; - } - instruction_pool_.clear(); - - return AddComputation(name, computation, name_loc); -} - -// instruction_list ::= '{' instruction_list1 '}' -// instruction_list1 ::= (instruction)+ -bool HloParser::ParseInstructionList(HloComputation::Builder* builder, - string* root_name) { - if (!ParseToken(TokKind::kLbrace, - "expects '{' at the beginning of instruction list.")) { - return false; - } - do { - if (!ParseInstruction(builder, root_name)) { - return false; - } - } while (lexer_.GetKind() != TokKind::kRbrace); - return ParseToken(TokKind::kRbrace, - "expects '}' at the end of instruction list."); -} - -// instruction ::= ('ROOT')? 
name '=' shape opcode operands (attribute)* -bool HloParser::ParseInstruction(HloComputation::Builder* builder, - string* root_name) { - string name; - Shape shape; - HloOpcode opcode; - std::vector operands; - - LocTy maybe_root_loc = lexer_.GetLoc(); - bool is_root = EatIfPresent(TokKind::kw_ROOT); - - const LocTy name_loc = lexer_.GetLoc(); - if (!ParseName(&name) || - !ParseToken(TokKind::kEqual, "expects '=' in instruction") || - !ParseShape(&shape) || !ParseOpcode(&opcode)) { - return false; - } - - if (is_root) { - if (!root_name->empty()) { - return Error(maybe_root_loc, "one computation should have only one ROOT"); - } - *root_name = name; - } - - // Add optional attributes. - std::unordered_map attrs; - optional sharding; - attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding}; - optional> predecessors; - attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList, - &predecessors}; - optional metadata; - attrs["metadata"] = {/*required=*/false, AttrTy::kMetadata, &metadata}; - - optional backend_config; - attrs["backend_config"] = {/*required=*/false, AttrTy::kString, - &backend_config}; - - HloInstruction* instruction; - switch (opcode) { - case HloOpcode::kParameter: { - int64 parameter_number; - if (!ParseToken(TokKind::kLparen, - "expects '(' before parameter number") || - !ParseInt64(¶meter_number) || - !ParseToken(TokKind::kRparen, "expects ')' after parameter number") || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateParameter(parameter_number, shape, name)); - break; - } - case HloOpcode::kConstant: { - std::unique_ptr literal; - if (!ParseToken(TokKind::kLparen, - "expects '(' before constant literal") || - !ParseLiteral(&literal, shape) || - !ParseToken(TokKind::kRparen, "expects ')' after constant literal") || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - 
HloInstruction::CreateConstant(std::move(literal))); - break; - } - // Unary ops. - case HloOpcode::kAbs: - case HloOpcode::kRoundNearestAfz: - case HloOpcode::kBitcast: - case HloOpcode::kCeil: - case HloOpcode::kClz: - case HloOpcode::kCopy: - case HloOpcode::kCos: - case HloOpcode::kDomain: - case HloOpcode::kExp: - case HloOpcode::kExpm1: - case HloOpcode::kImag: - case HloOpcode::kIsFinite: - case HloOpcode::kFloor: - case HloOpcode::kLog: - case HloOpcode::kLog1p: - case HloOpcode::kNot: - case HloOpcode::kNegate: - case HloOpcode::kReal: - case HloOpcode::kSign: - case HloOpcode::kSin: - case HloOpcode::kSort: - case HloOpcode::kTanh: { - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateUnary(shape, opcode, operands[0])); - break; - } - // Binary ops. - case HloOpcode::kAdd: - case HloOpcode::kDivide: - case HloOpcode::kMultiply: - case HloOpcode::kSubtract: - case HloOpcode::kAtan2: - case HloOpcode::kComplex: - case HloOpcode::kEq: - case HloOpcode::kGe: - case HloOpcode::kGt: - case HloOpcode::kLe: - case HloOpcode::kLt: - case HloOpcode::kNe: - case HloOpcode::kMaximum: - case HloOpcode::kMinimum: - case HloOpcode::kPower: - case HloOpcode::kRemainder: - case HloOpcode::kAnd: - case HloOpcode::kOr: - case HloOpcode::kShiftLeft: - case HloOpcode::kShiftRightArithmetic: - case HloOpcode::kShiftRightLogical: { - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateBinary( - shape, opcode, operands[0], operands[1])); - break; - } - // Ternary ops. 
- case HloOpcode::kClamp: - case HloOpcode::kSelect: { - if (!ParseOperands(&operands, /*expected_size=*/3) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateTernary( - shape, opcode, operands[0], operands[1], operands[2])); - break; - } - // Other supported ops. - case HloOpcode::kConvert: { - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateConvert(shape, operands[0])); - break; - } - case HloOpcode::kBitcastConvert: { - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateBitcastConvert(shape, operands[0])); - break; - } - case HloOpcode::kCrossReplicaSum: { - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateCrossReplicaSum(shape, operands)); - break; - } - case HloOpcode::kReshape: { - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateReshape(shape, operands[0])); - break; - } - case HloOpcode::kTuple: { - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateTuple(operands)); - break; - } - case HloOpcode::kWhile: { - optional condition; - optional body; - attrs["condition"] = {/*required=*/true, AttrTy::kHloComputation, - &condition}; - attrs["body"] = {/*required=*/true, AttrTy::kHloComputation, &body}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateWhile( - shape, *condition, *body, /*init=*/operands[0])); - break; - } - case HloOpcode::kRecv: { - 
optional channel_id; - attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; - if (!ParseOperands(&operands, /*expected_size=*/0) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateRecv(shape.tuple_shapes(0), *channel_id)); - break; - } - case HloOpcode::kRecvDone: { - optional channel_id; - attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - if (channel_id != operands[0]->channel_id()) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateRecvDone(operands[0])); - break; - } - case HloOpcode::kSend: { - optional channel_id; - attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateSend(operands[0], *channel_id)); - break; - } - case HloOpcode::kSendDone: { - optional channel_id; - attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - if (channel_id != operands[0]->channel_id()) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateSendDone(operands[0])); - break; - } - case HloOpcode::kGetTupleElement: { - optional index; - attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateGetTupleElement(shape, operands[0], *index)); - break; - } - case HloOpcode::kCall: { - optional to_apply; - attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, - &to_apply}; - if (!ParseOperands(&operands) || 
!ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateCall(shape, operands, *to_apply)); - break; - } - case HloOpcode::kReduceWindow: { - optional reduce_computation; - optional window; - attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; - attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, - &reduce_computation}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - if (!window) { - window.emplace(); - } - instruction = builder->AddInstruction(HloInstruction::CreateReduceWindow( - shape, /*operand=*/operands[0], /*init_value=*/operands[1], *window, - *reduce_computation)); - break; - } - case HloOpcode::kConvolution: { - optional window; - optional dnums; - attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; - attrs["dim_labels"] = {/*required=*/true, - AttrTy::kConvolutionDimensionNumbers, &dnums}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - if (!window) { - window.emplace(); - } - instruction = builder->AddInstruction(HloInstruction::CreateConvolve( - shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums)); - break; - } - case HloOpcode::kFft: { - optional fft_type; - optional> fft_length; - attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type}; - attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List, - &fft_length}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateFft( - shape, operands[0], *fft_type, *fft_length)); - break; - } - case HloOpcode::kBroadcast: { - optional> broadcast_dimensions; - attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, - &broadcast_dimensions}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - 
instruction = builder->AddInstruction(HloInstruction::CreateBroadcast( - shape, operands[0], *broadcast_dimensions)); - break; - } - case HloOpcode::kConcatenate: { - optional> dimensions; - attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, - &dimensions}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs) || - dimensions->size() != 1) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateConcatenate( - shape, operands, dimensions->at(0))); - break; - } - case HloOpcode::kMap: { - optional to_apply; - attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, - &to_apply}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateMap(shape, operands, *to_apply)); - break; - } - case HloOpcode::kReduce: { - optional reduce_computation; - attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, - &reduce_computation}; - optional> dimensions_to_reduce; - attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, - &dimensions_to_reduce}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateReduce( - shape, /*operand=*/operands[0], /*init_value=*/operands[1], - *dimensions_to_reduce, *reduce_computation)); - break; - } - case HloOpcode::kReverse: { - optional> dimensions; - attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, - &dimensions}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateReverse(shape, operands[0], *dimensions)); - break; - } - case HloOpcode::kSelectAndScatter: { - optional select; - attrs["select"] = {/*required=*/true, AttrTy::kHloComputation, &select}; - optional scatter; - attrs["scatter"] = {/*required=*/true, AttrTy::kHloComputation, 
&scatter}; - optional window; - attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; - if (!ParseOperands(&operands, /*expected_size=*/3) || - !ParseAttributes(attrs)) { - return false; - } - if (!window) { - window.emplace(); - } - instruction = - builder->AddInstruction(HloInstruction::CreateSelectAndScatter( - shape, /*operand=*/operands[0], *select, *window, - /*source=*/operands[1], /*init_value=*/operands[2], *scatter)); - break; - } - case HloOpcode::kSlice: { - optional slice_ranges; - attrs["slice"] = {/*required=*/true, AttrTy::kSliceRanges, &slice_ranges}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateSlice( - shape, operands[0], slice_ranges->starts, slice_ranges->limits, - slice_ranges->strides)); - break; - } - case HloOpcode::kDynamicSlice: { - optional> dynamic_slice_sizes; - attrs["dynamic_slice_sizes"] = { - /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice( - shape, /*operand=*/operands[0], /*start_indices=*/operands[1], - *dynamic_slice_sizes)); - break; - } - case HloOpcode::kDynamicUpdateSlice: { - if (!ParseOperands(&operands, /*expected_size=*/3) || - !ParseAttributes(attrs)) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - shape, /*operand=*/operands[0], /*update=*/operands[1], - /*start_indices=*/operands[2])); - break; - } - case HloOpcode::kTranspose: { - optional> dimensions; - attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, - &dimensions}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateTranspose(shape, 
operands[0], *dimensions)); - break; - } - case HloOpcode::kBatchNormTraining: { - optional epsilon; - attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; - attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, - &feature_index}; - if (!ParseOperands(&operands, /*expected_size=*/3) || - !ParseAttributes(attrs)) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateBatchNormTraining( - shape, /*operand=*/operands[0], /*scale=*/operands[1], - /*offset=*/operands[2], *epsilon, *feature_index)); - break; - } - case HloOpcode::kBatchNormInference: { - optional epsilon; - attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; - attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, - &feature_index}; - if (!ParseOperands(&operands, /*expected_size=*/5) || - !ParseAttributes(attrs)) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateBatchNormInference( - shape, /*operand=*/operands[0], /*scale=*/operands[1], - /*offset=*/operands[2], /*mean=*/operands[3], - /*variance=*/operands[4], *epsilon, *feature_index)); - break; - } - case HloOpcode::kBatchNormGrad: { - optional epsilon; - attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; - attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, - &feature_index}; - if (!ParseOperands(&operands, /*expected_size=*/5) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateBatchNormGrad( - shape, /*operand=*/operands[0], /*scale=*/operands[1], - /*mean=*/operands[2], /*variance=*/operands[3], - /*grad_output=*/operands[4], *epsilon, *feature_index)); - break; - } - case HloOpcode::kPad: { - optional padding; - attrs["padding"] = {/*required=*/true, AttrTy::kPaddingConfig, &padding}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return 
false; - } - instruction = builder->AddInstruction(HloInstruction::CreatePad( - shape, operands[0], /*padding_value=*/operands[1], *padding)); - break; - } - case HloOpcode::kFusion: { - optional fusion_computation; - attrs["calls"] = {/*required=*/true, AttrTy::kHloComputation, - &fusion_computation}; - optional fusion_kind; - attrs["kind"] = {/*required=*/true, AttrTy::kFusionKind, &fusion_kind}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateFusion( - shape, *fusion_kind, operands, *fusion_computation)); - break; - } - case HloOpcode::kInfeed: { - optional config; - attrs["infeed_config"] = {/*required=*/false, AttrTy::kString, &config}; - if (!ParseOperands(&operands, /*expected_size=*/0) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateInfeed(shape, config ? *config : "")); - break; - } - case HloOpcode::kOutfeed: { - optional config; - attrs["outfeed_config"] = {/*required=*/false, AttrTy::kString, &config}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateOutfeed( - operands[0]->shape(), operands[0], config ? 
*config : "")); - break; - } - case HloOpcode::kRng: { - optional distribution; - attrs["distribution"] = {/*required=*/true, AttrTy::kDistribution, - &distribution}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateRng(shape, *distribution, operands)); - break; - } - case HloOpcode::kReducePrecision: { - optional exponent_bits; - optional mantissa_bits; - attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64, - &exponent_bits}; - attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64, - &mantissa_bits}; - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = - builder->AddInstruction(HloInstruction::CreateReducePrecision( - shape, operands[0], static_cast(*exponent_bits), - static_cast(*mantissa_bits))); - break; - } - case HloOpcode::kConditional: { - optional true_computation; - optional false_computation; - attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation, - &true_computation}; - attrs["false_computation"] = {/*required=*/true, AttrTy::kHloComputation, - &false_computation}; - if (!ParseOperands(&operands, /*expected_size=*/3) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateConditional( - shape, /*pred=*/operands[0], - /*true_computation_arg=*/operands[1], *true_computation, - /*false_computation_arg=*/operands[2], *false_computation)); - break; - } - case HloOpcode::kCustomCall: { - optional custom_call_target; - attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString, - &custom_call_target}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateCustomCall( - shape, operands, *custom_call_target)); - break; - } - case HloOpcode::kHostCompute: { - optional channel_name; - optional cost_estimate_ns; - 
attrs["channel_name"] = {/*required=*/true, AttrTy::kString, - &channel_name}; - attrs["cost_estimate_ns"] = {/*required=*/true, AttrTy::kInt64, - &cost_estimate_ns}; - if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction(HloInstruction::CreateHostCompute( - shape, operands, *channel_name, *cost_estimate_ns)); - break; - } - case HloOpcode::kDot: { - optional> lhs_contracting_dims; - attrs["lhs_contracting_dims"] = { - /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims}; - optional> rhs_contracting_dims; - attrs["rhs_contracting_dims"] = { - /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims}; - optional> lhs_batch_dims; - attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, - &lhs_batch_dims}; - optional> rhs_batch_dims; - attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, - &rhs_batch_dims}; - - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - - DotDimensionNumbers dnum; - if (lhs_contracting_dims) { - *dnum.mutable_lhs_contracting_dimensions() = { - lhs_contracting_dims->begin(), lhs_contracting_dims->end()}; - } - if (rhs_contracting_dims) { - *dnum.mutable_rhs_contracting_dimensions() = { - rhs_contracting_dims->begin(), rhs_contracting_dims->end()}; - } - if (lhs_batch_dims) { - *dnum.mutable_lhs_batch_dimensions() = {lhs_batch_dims->begin(), - lhs_batch_dims->end()}; - } - if (rhs_batch_dims) { - *dnum.mutable_rhs_batch_dimensions() = {rhs_batch_dims->begin(), - rhs_batch_dims->end()}; - } - - instruction = builder->AddInstruction( - HloInstruction::CreateDot(shape, operands[0], operands[1], dnum)); - break; - } - case HloOpcode::kGather: { - optional> output_window_dims; - attrs["output_window_dims"] = { - /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims}; - optional> elided_window_dims; - attrs["elided_window_dims"] = { - /*required=*/true, 
AttrTy::kBracedInt64List, &elided_window_dims}; - optional> gather_dims_to_operand_dims; - attrs["gather_dims_to_operand_dims"] = {/*required=*/true, - AttrTy::kBracedInt64List, - &gather_dims_to_operand_dims}; - optional index_vector_dim; - attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64, - &index_vector_dim}; - optional> window_bounds; - attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List, - &window_bounds}; - - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { - return false; - } - - GatherDimensionNumbers dim_numbers = HloInstruction::MakeGatherDimNumbers( - /*output_window_dims=*/*output_window_dims, - /*elided_window_dims=*/*elided_window_dims, - /*gather_dims_to_operand_dims=*/*gather_dims_to_operand_dims, - /*index_vector_dim=*/*index_vector_dim); - - instruction = builder->AddInstruction(HloInstruction::CreateGather( - shape, /*operand=*/operands[0], /*gather_indices=*/operands[1], - dim_numbers, *window_bounds)); - break; - } - case HloOpcode::kTrace: - return TokenError(StrCat("parsing not yet implemented for op: ", - HloOpcodeString(opcode))); - } - - instruction->set_name(name); - - // Add shared attributes like metadata to the instruction, if they were seen. 
- if (sharding) { - instruction->set_sharding( - HloSharding::FromProto(sharding.value()).ValueOrDie()); - } - if (predecessors) { - for (auto* pre : *predecessors) { - Status status = pre->AddControlDependencyTo(instruction); - if (!status.ok()) { - return Error(name_loc, StrCat("error adding control dependency for: ", - name, " status: ", status.ToString())); - } - } - } - if (metadata) { - instruction->set_metadata(*metadata); - } - if (backend_config) { - instruction->set_backend_config(std::move(*backend_config)); - } - return AddInstruction(name, instruction, name_loc); -} // NOLINT(readability/fn_size) - -// ::= '{' (single_sharding | tuple_sharding) '}' -// -// tuple_sharding ::= single_sharding* (',' single_sharding)* -bool HloParser::ParseSharding(OpSharding* sharding) { - // A single sharding starts with '{' and is not followed by '{'. - // A tuple sharding starts with '{' and is followed by '{', or is '{''}' for - // an empty tuple. - if (!ParseToken(TokKind::kLbrace, - "expected '{' to start sharding attribute")) { - return false; - } - - if (lexer_.GetKind() != TokKind::kLbrace && - lexer_.GetKind() != TokKind::kRbrace) { - return ParseSingleSharding(sharding, /*lbrace_pre_lexed=*/true); - } - - // Tuple sharding. - // Allow empty tuple shardings. - if (lexer_.GetKind() != TokKind::kRbrace) { - do { - if (!ParseSingleSharding(sharding->add_tuple_shardings(), - /*lbrace_pre_lexed=*/false)) { - return false; - } - } while (EatIfPresent(TokKind::kComma)); - } - sharding->set_type(OpSharding::Type::OpSharding_Type_TUPLE); - - return ParseToken(TokKind::kRbrace, "expected '}' to end sharding attribute"); -} - -// ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape? -// ('devices=' ('[' dims ']')* device_list)? 
'}' -// dims ::= int_list device_list ::= int_list -bool HloParser::ParseSingleSharding(OpSharding* sharding, - bool lbrace_pre_lexed) { - if (!lbrace_pre_lexed && - !ParseToken(TokKind::kLbrace, - "expected '{' to start sharding attribute")) { - return false; - } - - LocTy loc = lexer_.GetLoc(); - bool maximal = false; - bool replicated = false; - std::vector devices; - std::vector tile_assignment_dimensions; - Shape tile_shape; - while (lexer_.GetKind() != TokKind::kRbrace) { - switch (lexer_.GetKind()) { - case TokKind::kw_maximal: - maximal = true; - lexer_.Lex(); - break; - case TokKind::kw_replicated: - replicated = true; - lexer_.Lex(); - break; - case TokKind::kAttributeName: { - if (lexer_.GetStrVal() == "device") { - if (lexer_.Lex() != TokKind::kInt) { - return TokenError("device= attribute must be an integer"); - } - devices = {lexer_.GetInt64Val()}; - lexer_.Lex(); - } else if (lexer_.GetStrVal() == "devices") { - lexer_.Lex(); - if (!ParseToken(TokKind::kLsquare, - "expected '[' to start sharding devices shape")) { - return false; - } - - do { - int64 dim; - if (!ParseInt64(&dim)) { - return false; - } - tile_assignment_dimensions.push_back(dim); - } while (EatIfPresent(TokKind::kComma)); - - if (!ParseToken(TokKind::kRsquare, - "expected ']' to start sharding devices shape")) { - return false; - } - do { - int64 device; - if (!ParseInt64(&device)) { - return false; - } - devices.push_back(device); - } while (EatIfPresent(TokKind::kComma)); - } else { - return TokenError( - "unknown attribute in sharding: expected device= or devices="); - } - break; - } - case TokKind::kShape: - tile_shape = lexer_.GetShapeVal(); - lexer_.Lex(); - break; - case TokKind::kRbrace: - break; - default: - return TokenError("unexpected token"); - } - } - - if (replicated) { - if (!devices.empty()) { - return Error(loc, - "replicated shardings should not have any devices assigned"); - } - if (!ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, - "replicated 
shardings should not have any tile shape set"); - } - sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED); - } else if (maximal) { - if (devices.size() != 1) { - return Error(loc, - "maximal shardings should have exactly one device assigned"); - } - if (!ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, "maximal shardings should not have any tile shape set"); - } - sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL); - sharding->add_tile_assignment_devices(devices[0]); - } else { - if (devices.size() <= 1) { - return Error( - loc, "non-maximal shardings must have more than one device assigned"); - } - if (ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, "non-maximal shardings should have a tile shape set"); - } - if (tile_assignment_dimensions.empty()) { - return Error( - loc, - "non-maximal shardings must have a tile assignment list including " - "dimensions"); - } - sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER); - *sharding->mutable_tile_shape() = tile_shape; - for (int64 dim : tile_assignment_dimensions) { - sharding->add_tile_assignment_dimensions(dim); - } - for (int64 device : devices) { - sharding->add_tile_assignment_devices(device); - } - } - - lexer_.Lex(); - return true; -} - -// '{' name+ '}' -bool HloParser::ParseInstructionNames( - std::vector* instructions) { - if (!ParseToken(TokKind::kLbrace, - "expects '{' at the beginning of instruction name list")) { - return false; - } - LocTy loc = lexer_.GetLoc(); - do { - string name; - if (!ParseName(&name)) { - return Error(loc, "expects a instruction name"); - } - std::pair* instr = - tensorflow::gtl::FindOrNull(instruction_pool_, name); - if (!instr) { - return TokenError( - Printf("instruction '%s' is not defined", name.c_str())); - } - instructions->push_back(instr->first); - } while (EatIfPresent(TokKind::kComma)); - - return ParseToken(TokKind::kRbrace, - "expects '}' at the end of instruction name list"); -} - -bool 
HloParser::SetValueInLiteral(int64 value, int64 linear_index, - Literal* literal) { - const Shape& shape = literal->shape(); - switch (shape.element_type()) { - case S8: - return SetValueInLiteralHelper(value, linear_index, literal); - case S16: - return SetValueInLiteralHelper(value, linear_index, literal); - case S32: - return SetValueInLiteralHelper(value, linear_index, literal); - case S64: - return SetValueInLiteralHelper(value, linear_index, literal); - case U8: - return SetValueInLiteralHelper(value, linear_index, literal); - case U16: - return SetValueInLiteralHelper(value, linear_index, literal); - case U32: - return SetValueInLiteralHelper(value, linear_index, literal); - case U64: - return SetValueInLiteralHelper(value, linear_index, literal); - default: - LOG(FATAL) << "unknown integral primitive type " - << PrimitiveType_Name(shape.element_type()); - } -} - -bool HloParser::SetValueInLiteral(double value, int64 linear_index, - Literal* literal) { - const Shape& shape = literal->shape(); - switch (shape.element_type()) { - case F16: - return SetValueInLiteralHelper(value, linear_index, literal); - case BF16: - return SetValueInLiteralHelper(value, linear_index, literal); - case F32: - return SetValueInLiteralHelper(value, linear_index, literal); - case F64: - return SetValueInLiteralHelper(value, linear_index, literal); - default: - LOG(FATAL) << "unknown floating point primitive type " - << PrimitiveType_Name(shape.element_type()); - } -} - -bool HloParser::SetValueInLiteral(bool value, int64 linear_index, - Literal* literal) { - const Shape& shape = literal->shape(); - switch (shape.element_type()) { - case PRED: - return SetValueInLiteralHelper(value, linear_index, literal); - default: - LOG(FATAL) << PrimitiveType_Name(shape.element_type()) - << " is not PRED type"; - } -} - -template -bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index, - Literal* literal) { - // Check that linear_index is in range. 
- if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) { - return TokenError( - StrCat("trys to set value ", value, " to a literal in shape ", - ShapeUtil::HumanString(literal->shape()), " at linear index ", - linear_index, ", but the index is out of range")); - } - - if (std::isnan(value) || - (std::numeric_limits::has_infinity && - (std::numeric_limits::infinity() == value || - -std::numeric_limits::infinity() == value))) { - // Skip range checking for non-finite value. - } else if (literal->shape().element_type() == F16 || - literal->shape().element_type() == BF16) { - if (value > kF16max || value < -kF16max) { - return TokenError(StrCat( - "value ", value, " is out of range for literal's primitive type ", - PrimitiveType_Name(literal->shape().element_type()))); - } - } else if (value > static_cast( - std::numeric_limits::max()) || - value < static_cast( - std::numeric_limits::lowest())) { - // Value is out of range for LiteralNativeT. - return TokenError(StrCat( - "value ", value, " is out of range for literal's primitive type ", - PrimitiveType_Name(literal->shape().element_type()))); - } - - literal->data().at(linear_index) = - static_cast(value); - return true; -} - -bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) { - Shape new_shape; - if (!ParseShape(&new_shape)) { - return TokenError(StrCat("expects shape ", ShapeUtil::HumanString(shape))); - } - if (!ShapeUtil::Compatible(shape, new_shape)) { - return TokenError(StrCat( - "expects shape ", ShapeUtil::HumanString(shape), - ", but sees a different shape: ", ShapeUtil::HumanString(new_shape))); - } - return true; -} - -// literal -// ::= tuple -// ::= non_tuple -bool HloParser::ParseLiteral(std::unique_ptr* literal, - const Shape& shape) { - return ShapeUtil::IsTuple(shape) ? 
ParseTupleLiteral(literal, shape) - : ParseNonTupleLiteral(literal, shape); -} - -// tuple -// ::= shape '(' literal_list ')' -// literal_list -// ::= /*empty*/ -// ::= literal (',' literal)* -bool HloParser::ParseTupleLiteral(std::unique_ptr* literal, - const Shape& shape) { - if (!EatShapeAndCheckCompatible(shape)) { - return TokenError(StrCat("expects tuple constant in shape ", - ShapeUtil::HumanString(shape))); - } - if (!ParseToken(TokKind::kLparen, "expects '(' in front of tuple elements")) { - return false; - } - std::vector> elements( - ShapeUtil::TupleElementCount(shape)); - - if (lexer_.GetKind() == TokKind::kRparen) { - // empty - } else { - // literal, (',' literal)* - for (int i = 0; i < elements.size(); i++) { - if (i > 0) { - ParseToken(TokKind::kComma, "exepcts ',' to separate tuple elements"); - } - if (!ParseLiteral(&elements[i], - ShapeUtil::GetTupleElementShape(shape, i))) { - return TokenError(StrCat("expects the ", i, "th element")); - } - } - } - *literal = Literal::MakeTupleOwned(std::move(elements)); - return ParseToken(TokKind::kRparen, - StrCat("expects ')' at the end of the tuple with ", - ShapeUtil::TupleElementCount(shape), "elements")); -} - -// non_tuple -// ::= rank01 -// ::= rank2345 -// rank2345 ::= shape sparse_or_nested_array -bool HloParser::ParseNonTupleLiteral(std::unique_ptr* literal, - const Shape& shape) { - if (LayoutUtil::IsSparseArray(shape)) { - return ParseSparseLiteral(literal, shape); - } - - CHECK(LayoutUtil::IsDenseArray(shape)); - return ParseDenseLiteral(literal, shape); -} - -bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, - const Shape& shape) { - const int64 rank = ShapeUtil::Rank(shape); - if (rank > 1 && !EatShapeAndCheckCompatible(shape)) { - return false; - } - - // Create a literal with the given shape in default layout. 
- *literal = Literal::CreateFromDimensions(shape.element_type(), - AsInt64Slice(shape.dimensions())); - int64 nest_level = 0; - int64 linear_index = 0; - // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for - // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}}, - // when we are parsing the 2nd '{' (right before '1'), we are seeing a - // sub-array of the dimension 0, so elems_seen_per_dim[0]++. When we are at - // the first '}' (right after '3'), it means the sub-array ends, and the - // sub-array is supposed to contain exactly 3 elements, so check if - // elems_seen_per_dim[1] is 3. - std::vector elems_seen_per_dim(rank); - auto get_index_str = [&elems_seen_per_dim](int dim) -> string { - std::vector elems_seen_until_dim(elems_seen_per_dim.begin(), - elems_seen_per_dim.begin() + dim); - return StrCat("[", - Join(elems_seen_until_dim, ",", - [](string* out, const int64& num_elems) { - tensorflow::strings::StrAppend(out, num_elems - 1); - }), - "]"); - }; - do { - switch (lexer_.GetKind()) { - default: - return TokenError("unexpected token type in a literal"); - case TokKind::kLbrace: { - nest_level++; - if (nest_level > rank) { - return TokenError(Printf( - "expects nested array in rank %lld, but sees larger", rank)); - } - if (nest_level > 1) { - elems_seen_per_dim[nest_level - 2]++; - if (elems_seen_per_dim[nest_level - 2] > - shape.dimensions(nest_level - 2)) { - return TokenError(Printf( - "expects %lld elements in the %sth element, but sees more", - shape.dimensions(nest_level - 2), - get_index_str(nest_level - 2).c_str())); - } - } - lexer_.Lex(); - break; - } - case TokKind::kRbrace: { - nest_level--; - if (elems_seen_per_dim[nest_level] != shape.dimensions(nest_level)) { - return TokenError(Printf( - "expects %lld elements in the %sth element, but sees %lld", - shape.dimensions(nest_level), get_index_str(nest_level).c_str(), - elems_seen_per_dim[nest_level])); - } - elems_seen_per_dim[nest_level] = 0; - 
lexer_.Lex(); - break; - } - case TokKind::kComma: - case TokKind::kComment: - // Skip. - lexer_.Lex(); - break; - case TokKind::kw_true: - case TokKind::kw_false: - case TokKind::kInt: - case TokKind::kDecimal: - case TokKind::kw_nan: - case TokKind::kw_inf: - case TokKind::kNegInf: { - if (rank > 0) { - if (nest_level != rank) { - return TokenError( - Printf("expects nested array in rank %lld, but sees %lld", rank, - nest_level)); - } - elems_seen_per_dim[rank - 1]++; - if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) { - return TokenError( - Printf("expects %lld elements on the minor-most dimension, but " - "sees more", - shape.dimensions(rank - 1))); - } - } - if (lexer_.GetKind() == TokKind::kw_true || - lexer_.GetKind() == TokKind::kw_false) { - // TODO(congliu): bool type literals with rank >= 1 are actually - // printed in a compact form instead of "true" or "false". Fix that. - if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true, - linear_index++, literal->get())) { - return false; - } - lexer_.Lex(); - } else if (primitive_util::IsIntegralType(shape.element_type())) { - LocTy loc = lexer_.GetLoc(); - int64 value; - if (!ParseInt64(&value)) { - return Error(loc, StrCat("expects integer for primitive type: ", - PrimitiveType_Name(shape.element_type()))); - } - if (!SetValueInLiteral(value, linear_index++, literal->get())) { - return false; - } - } else if (primitive_util::IsFloatingPointType(shape.element_type())) { - LocTy loc = lexer_.GetLoc(); - double value; - if (!ParseDouble(&value)) { - return Error( - loc, StrCat("expect floating point value for primitive type: ", - PrimitiveType_Name(shape.element_type()))); - } - if (!SetValueInLiteral(value, linear_index++, literal->get())) { - return false; - } - } else { - return TokenError(StrCat("unsupported primitive type ", - PrimitiveType_Name(shape.element_type()))); - } - break; - } - } // end of switch - } while (nest_level > 0); - - *literal = (*literal)->Relayout(shape.layout()); 
- return true; -} - -bool HloParser::ParseSparseLiteral(std::unique_ptr* literal, - const Shape& shape) { - if (!EatShapeAndCheckCompatible(shape)) { - return false; - } - - switch (shape.element_type()) { - case PRED: - return ParseSparseLiteralHelper(literal, shape); - case S8: - return ParseSparseLiteralHelper(literal, shape); - case S16: - return ParseSparseLiteralHelper(literal, shape); - case S32: - return ParseSparseLiteralHelper(literal, shape); - case S64: - return ParseSparseLiteralHelper(literal, shape); - case U8: - return ParseSparseLiteralHelper(literal, shape); - case U16: - return ParseSparseLiteralHelper(literal, shape); - case U32: - return ParseSparseLiteralHelper(literal, shape); - case U64: - return ParseSparseLiteralHelper(literal, shape); - case F16: - return ParseSparseLiteralHelper(literal, shape); - case F32: - return ParseSparseLiteralHelper(literal, shape); - case BF16: - return ParseSparseLiteralHelper(literal, shape); - case F64: - return ParseSparseLiteralHelper(literal, shape); - default: - return Error(lexer_.GetLoc(), - StrCat("invalid primitive type for sparse literal: ", - PrimitiveType_Name(shape.element_type()))); - } -} - -template -bool HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, - const Shape& shape) { - std::vector index; - - int64 rank = ShapeUtil::Rank(shape); - - *literal = MakeUnique(shape); - - if (!ParseToken(TokKind::kLbrace, - "expects '{' at the beginning of a sparse literal")) { - return false; - } - - for (;;) { - if (lexer_.GetKind() == TokKind::kRbrace) { - lexer_.Lex(); - break; - } - - LocTy index_loc = lexer_.GetLoc(); - index.clear(); - if (lexer_.GetKind() == TokKind::kInt) { - int64 single_index = lexer_.GetInt64Val(); - lexer_.Lex(); - if (rank != 1) { - return Error( - index_loc, - StrCat("invalid single-dimensional index for shape with rank ", - rank, ": ", single_index)); - } - index.push_back(single_index); - } else { - if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, 
TokKind::kComma, - &index)) { - return false; - } - if (index.size() != rank) { - return Error( - index_loc, - StrCat("invalid multi-dimension index for shape with rank ", rank, - ": [", Join(index, ", "), "]")); - } - } - if (!ParseToken(TokKind::kColon, - "expects ':' after after the sparse array index and before " - "the sparse array value")) { - return false; - } - LocTy value_loc = lexer_.GetLoc(); - LiteralNativeT value; - if (lexer_.GetKind() == TokKind::kw_true || - lexer_.GetKind() == TokKind::kw_false) { - value = static_cast(lexer_.GetKind() == TokKind::kw_true); - lexer_.Lex(); - } else if (primitive_util::IsIntegralType(shape.element_type())) { - int64 value_s64; - if (!ParseInt64(&value_s64)) { - return Error(value_loc, - StrCat("expects integer for primitive type: ", - PrimitiveType_Name(shape.element_type()))); - } - value = static_cast(value_s64); - } else if (primitive_util::IsFloatingPointType(shape.element_type())) { - double value_f64; - if (!ParseDouble(&value_f64)) { - return Error(value_loc, - StrCat("expects floating point value for primitive type: ", - PrimitiveType_Name(shape.element_type()))); - } - value = static_cast(value_f64); - } else { - LOG(FATAL) << "Unexpected element type: " - << PrimitiveType_Name(shape.element_type()); - } - if (lexer_.GetKind() != TokKind::kRbrace && - !ParseToken(TokKind::kComma, - "expects ',' separator between sparse array elements")) { - return false; - } - - if ((*literal)->sparse_element_count() + 1 == - LayoutUtil::MaxSparseElements(shape.layout())) { - return Error( - lexer_.GetLoc(), - StrCat("number of sparse elements exceeds maximum for layout: ", - ShapeUtil::HumanStringWithLayout(shape))); - } - - (*literal)->AppendSparseElement(index, value); - } - - (*literal)->SortSparseElements(); - return true; -} - -// operands ::= '(' operands1 ')' -// operands1 -// ::= /*empty*/ -// ::= operand (, operand)* -// operand ::= (shape)? 
name -bool HloParser::ParseOperands(std::vector* operands) { - if (!ParseToken(TokKind::kLparen, - "expects '(' at the beginning of operands")) { - return false; - } - if (lexer_.GetKind() == TokKind::kRparen) { - // empty - } else { - do { - LocTy loc = lexer_.GetLoc(); - string name; - if (CanBeShape()) { - Shape shape; - if (!ParseShape(&shape)) { - return false; - } - } - if (!ParseName(&name)) { - return false; - } - std::pair* instruction = - tensorflow::gtl::FindOrNull(instruction_pool_, name); - if (!instruction) { - return Error(loc, StrCat("instruction does not exist: ", name)); - } - operands->push_back(instruction->first); - } while (EatIfPresent(TokKind::kComma)); - } - return ParseToken(TokKind::kRparen, "expects ')' at the end of operands"); -} - -bool HloParser::ParseOperands(std::vector* operands, - const int expected_size) { - LocTy loc = lexer_.GetLoc(); - if (!ParseOperands(operands)) { - return false; - } - if (expected_size != operands->size()) { - return Error(loc, StrCat("expects ", expected_size, " operands, but has ", - operands->size(), " operands")); - } - return true; -} - -// sub_attributes ::= '{' (','? attribute)* '}' -bool HloParser::ParseSubAttributes( - const std::unordered_map& attrs) { - LocTy loc = lexer_.GetLoc(); - if (!ParseToken(TokKind::kLbrace, "expects '{' to start sub attributes")) { - return false; - } - std::unordered_set seen_attrs; - if (lexer_.GetKind() == TokKind::kRbrace) { - // empty - } else { - do { - EatIfPresent(TokKind::kComma); - if (!ParseAttributeHelper(attrs, &seen_attrs)) { - return false; - } - } while (lexer_.GetKind() != TokKind::kRbrace); - } - // Check that all required attrs were seen. 
- for (const auto& attr_it : attrs) { - if (attr_it.second.required && - seen_attrs.find(attr_it.first) == seen_attrs.end()) { - return Error(loc, Printf("sub-attribute %s is expected but not seen", - attr_it.first.c_str())); - } - } - return ParseToken(TokKind::kRbrace, "expects '}' to end sub attributes"); -} - -// attributes ::= (',' attribute)* -bool HloParser::ParseAttributes( - const std::unordered_map& attrs) { - LocTy loc = lexer_.GetLoc(); - std::unordered_set seen_attrs; - while (EatIfPresent(TokKind::kComma)) { - if (!ParseAttributeHelper(attrs, &seen_attrs)) { - return false; - } - } - // Check that all required attrs were seen. - for (const auto& attr_it : attrs) { - if (attr_it.second.required && - seen_attrs.find(attr_it.first) == seen_attrs.end()) { - return Error(loc, Printf("attribute %s is expected but not seen", - attr_it.first.c_str())); - } - } - return true; -} - -bool HloParser::ParseAttributeHelper( - const std::unordered_map& attrs, - std::unordered_set* seen_attrs) { - LocTy loc = lexer_.GetLoc(); - string name; - if (!ParseAttributeName(&name)) { - return Error(loc, "error parsing attributes"); - } - VLOG(1) << "Parsing attribute " << name; - if (!seen_attrs->insert(name).second) { - return Error(loc, Printf("attribute %s already exists", name.c_str())); - } - auto attr_it = attrs.find(name); - if (attr_it == attrs.end()) { - string allowed_attrs; - if (attrs.empty()) { - allowed_attrs = "No attributes are allowed here."; - } else { - allowed_attrs = StrCat( - "Allowed attributes: ", - Join(attrs, ", ", - [&](string* out, const std::pair& kv) { - StrAppend(out, kv.first); - })); - } - return Error(loc, Printf("unexpected attribute \"%s\". 
%s", name.c_str(), - allowed_attrs.c_str())); - } - AttrTy attr_type = attr_it->second.attr_type; - void* attr_out_ptr = attr_it->second.result; - bool success = [&] { - LocTy attr_loc = lexer_.GetLoc(); - switch (attr_type) { - case AttrTy::kInt64: { - int64 result; - if (!ParseInt64(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kInt32: { - int64 result; - if (!ParseInt64(&result)) { - return false; - } - if (result != static_cast(result)) { - return Error(attr_loc, "value out of range for int32"); - } - static_cast*>(attr_out_ptr) - ->emplace(static_cast(result)); - return true; - } - case AttrTy::kFloat: { - double result; - if (!ParseDouble(&result)) { - return false; - } - if (result > std::numeric_limits::max() || - result < std::numeric_limits::lowest()) { - return Error(attr_loc, "value out of range for float"); - } - static_cast*>(attr_out_ptr) - ->emplace(static_cast(result)); - return true; - } - case AttrTy::kHloComputation: { - HloComputation* result; - if (!ParseComputationName(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kFftType: { - FftType result; - if (!ParseFftType(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kWindow: { - Window result; - if (!ParseWindow(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kConvolutionDimensionNumbers: { - ConvolutionDimensionNumbers result; - if (!ParseConvolutionDimensionNumbers(&result)) { - return false; - } - static_cast*>(attr_out_ptr) - ->emplace(result); - return true; - } - case AttrTy::kSharding: { - OpSharding sharding; - if (!ParseSharding(&sharding)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(sharding); - return true; - } - case AttrTy::kInstructionList: { - std::vector result; - if (!ParseInstructionNames(&result)) { 
- return false; - } - static_cast>*>(attr_out_ptr) - ->emplace(result); - return true; - } - case AttrTy::kFusionKind: { - HloInstruction::FusionKind result; - if (!ParseFusionKind(&result)) { - return false; - } - static_cast*>(attr_out_ptr) - ->emplace(result); - return true; - } - case AttrTy::kBracedInt64List: { - std::vector result; - if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, - &result)) { - return false; - } - static_cast>*>(attr_out_ptr) - ->emplace(result); - return true; - } - case AttrTy::kSliceRanges: { - SliceRanges result; - if (!ParseSliceRanges(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kPaddingConfig: { - PaddingConfig result; - if (!ParsePaddingConfig(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kString: { - string result; - if (!ParseString(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kMetadata: { - OpMetadata result; - if (!ParseMetadata(&result)) { - return false; - } - static_cast*>(attr_out_ptr)->emplace(result); - return true; - } - case AttrTy::kDistribution: { - RandomDistribution result; - if (!ParseRandomDistribution(&result)) { - return false; - } - static_cast*>(attr_out_ptr) - ->emplace(result); - return true; - } - } - }(); - if (!success) { - return Error(loc, Printf("error parsing attribute %s", name.c_str())); - } - return true; -} - -bool HloParser::ParseComputationName(HloComputation** value) { - string name; - LocTy loc = lexer_.GetLoc(); - if (!ParseName(&name)) { - return Error(loc, "expects computation name"); - } - std::pair* computation = - tensorflow::gtl::FindOrNull(computation_pool_, name); - if (computation == nullptr) { - return Error(loc, StrCat("computation does not exist: ", name)); - } - *value = computation->first; - return true; -} - -// ::= '{' size stride? pad? lhs_dilate? 
rhs_dilate? '}' -// The subattributes can appear in any order. 'size=' is required, others are -// optional. -bool HloParser::ParseWindow(Window* window) { - LocTy loc = lexer_.GetLoc(); - if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) { - return false; - } - - std::vector size; - std::vector stride; - std::vector> pad; - std::vector lhs_dilate; - std::vector rhs_dilate; - std::vector rhs_reversal; - while (lexer_.GetKind() != TokKind::kRbrace) { - LocTy attr_loc = lexer_.GetLoc(); - string field_name; - if (!ParseAttributeName(&field_name)) { - return Error(attr_loc, "expects sub-attributes in window"); - } - bool ok = [&] { - if (field_name == "size") { - return ParseDxD("size", &size); - } - if (field_name == "stride") { - return ParseDxD("stride", &stride); - } - if (field_name == "lhs_dilate") { - return ParseDxD("lhs_dilate", &lhs_dilate); - } - if (field_name == "rhs_dilate") { - return ParseDxD("rls_dilate", &rhs_dilate); - } - if (field_name == "pad") { - return ParseWindowPad(&pad); - } - if (field_name == "rhs_reversal") { - return ParseDxD("rhs_reversal", &rhs_reversal); - } - return Error(attr_loc, StrCat("unexpected attribute name: ", field_name)); - }(); - if (!ok) { - return false; - } - } - - if (size.empty()) { - return Error(loc, - "sub-attribute 'size=' is required in the window attribute"); - } - if (!stride.empty() && stride.size() != size.size()) { - return Error(loc, "expects 'stride=' has the same size as 'size='"); - } - if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) { - return Error(loc, "expects 'lhs_dilate=' has the same size as 'size='"); - } - if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) { - return Error(loc, "expects 'rhs_dilate=' has the same size as 'size='"); - } - if (!pad.empty() && pad.size() != size.size()) { - return Error(loc, "expects 'pad=' has the same size as 'size='"); - } - - for (int i = 0; i < size.size(); i++) { - 
window->add_dimensions()->set_size(size[i]); - if (!pad.empty()) { - window->mutable_dimensions(i)->set_padding_low(pad[i][0]); - window->mutable_dimensions(i)->set_padding_high(pad[i][1]); - } - // If some field is not present, it has the default value. - window->mutable_dimensions(i)->set_stride(stride.empty() ? 1 : stride[i]); - window->mutable_dimensions(i)->set_base_dilation( - lhs_dilate.empty() ? 1 : lhs_dilate[i]); - window->mutable_dimensions(i)->set_window_dilation( - rhs_dilate.empty() ? 1 : rhs_dilate[i]); - window->mutable_dimensions(i)->set_window_reversal( - rhs_reversal.empty() ? false : (rhs_reversal[i] == 1)); - } - return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute"); -} - -// This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString. -// The string looks like "dim_labels=0bf_0io->0bf". -bool HloParser::ParseConvolutionDimensionNumbers( - ConvolutionDimensionNumbers* dnums) { - if (lexer_.GetKind() != TokKind::kDimLabels) { - return TokenError("expects dim labels pattern, e.g., 'bf0_0io->0bf'"); - } - string str = lexer_.GetStrVal(); - - // The str is expected to have 3 items, lhs, rhs, out, and it must looks like - // lhs_rhs->out, that is, the first separator is "_" and the second is "->". - // So we replace the "->" with "_" and then split on "_". 
- str = tensorflow::str_util::StringReplace(str, /*oldsub=*/"->", - /*newsub=*/"_", - /*replace_all=*/false); - std::vector lhs_rhs_out = Split(str, "_"); - if (lhs_rhs_out.size() != 3) { - LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees " - << str; - } - - const int64 rank = lhs_rhs_out[0].length(); - if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) { - return TokenError( - "convolution lhs, rhs, and output must have the same rank"); - } - if (rank < 2) { - return TokenError("convolution rank must >=2"); - } - - auto is_unique = [](string str) -> bool { - std::sort(str.begin(), str.end()); - return std::unique(str.begin(), str.end()) == str.end(); - }; - - // lhs - { - const string& lhs = lhs_rhs_out[0]; - if (!is_unique(lhs)) { - return TokenError( - StrCat("expects unique lhs dimension numbers, but sees ", lhs)); - } - for (int i = 0; i < rank - 2; i++) { - dnums->add_input_spatial_dimensions(-1); - } - for (int i = 0; i < rank; i++) { - char c = lhs[i]; - if (c == 'b') { - dnums->set_input_batch_dimension(i); - } else if (c == 'f') { - dnums->set_input_feature_dimension(i); - } else if (c < '0' + rank && c >= '0') { - dnums->set_input_spatial_dimensions(c - '0', i); - } else { - return TokenError( - Printf("expects [0-%lldbf] in lhs dimension numbers", rank - 1)); - } - } - } - // rhs - { - const string& rhs = lhs_rhs_out[1]; - if (!is_unique(rhs)) { - return TokenError( - StrCat("expects unique rhs dimension numbers, but sees ", rhs)); - } - for (int i = 0; i < rank - 2; i++) { - dnums->add_kernel_spatial_dimensions(-1); - } - for (int i = 0; i < rank; i++) { - char c = rhs[i]; - if (c == 'i') { - dnums->set_kernel_input_feature_dimension(i); - } else if (c == 'o') { - dnums->set_kernel_output_feature_dimension(i); - } else if (c < '0' + rank && c >= '0') { - dnums->set_kernel_spatial_dimensions(c - '0', i); - } else { - return TokenError( - Printf("expects [0-%lldio] in rhs dimension numbers", rank - 1)); - } - } - } 
- // output - { - const string& out = lhs_rhs_out[2]; - if (!is_unique(out)) { - return TokenError( - StrCat("expects unique output dimension numbers, but sees ", out)); - } - for (int i = 0; i < rank - 2; i++) { - dnums->add_output_spatial_dimensions(-1); - } - for (int i = 0; i < rank; i++) { - char c = out[i]; - if (c == 'b') { - dnums->set_output_batch_dimension(i); - } else if (c == 'f') { - dnums->set_output_feature_dimension(i); - } else if (c < '0' + rank && c >= '0') { - dnums->set_output_spatial_dimensions(c - '0', i); - } else { - return TokenError( - Printf("expects [0-%lldbf] in output dimension numbers", rank - 1)); - } - } - } - - lexer_.Lex(); - return true; -} - -// ::= '{' ranges '}' -// ::= /*empty*/ -// ::= range (',' range)* -// range ::= '[' start ':' limit (':' stride)? ']' -// -// The slice ranges are printed as: -// -// {[dim0_start:dim0_limit:dim0stride], [dim1_start:dim1_limit], ...} -// -// This function extracts the starts, limits, and strides as 3 vectors to the -// result. If stride is not present, stride is 1. 
For example, if the slice -// ranges is printed as: -// -// {[2:3:4], [5:6:7], [8:9]} -// -// The parsed result will be: -// -// {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}} -// -bool HloParser::ParseSliceRanges(SliceRanges* result) { - if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) { - return false; - } - std::vector> ranges; - if (lexer_.GetKind() == TokKind::kRbrace) { - // empty - return ParseToken(TokKind::kRbrace, "expects '}' to end ranges"); - } - do { - LocTy loc = lexer_.GetLoc(); - ranges.emplace_back(); - if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon, - &ranges.back())) { - return false; - } - const auto& range = ranges.back(); - if (range.size() != 2 && range.size() != 3) { - return Error(loc, Printf("expects [start:limit:step] or [start:limit], " - "but sees %ld elements.", - range.size())); - } - } while (EatIfPresent(TokKind::kComma)); - - for (const auto& range : ranges) { - result->starts.push_back(range[0]); - result->limits.push_back(range[1]); - result->strides.push_back(range.size() == 3 ? 
range[2] : 1); - } - return ParseToken(TokKind::kRbrace, "expects '}' to end ranges"); -} - -// int64list ::= start int64_elements end -// int64_elements -// ::= /*empty*/ -// ::= int64_val (delim int64_val)* -bool HloParser::ParseInt64List(const TokKind start, const TokKind end, - const TokKind delim, - std::vector* result) { - if (!ParseToken(start, StrCat("expects an int64 list starting with ", - TokKindToString(start)))) { - return false; - } - if (lexer_.GetKind() == end) { - // empty - } else { - do { - int64 i; - if (!ParseInt64(&i)) { - return false; - } - result->push_back(i); - } while (EatIfPresent(delim)); - } - return ParseToken( - end, StrCat("expects an int64 list to end with ", TokKindToString(end))); -} - -// param_list_to_shape ::= param_list '->' shape -bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) { - if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) { - return false; - } - *shape_loc = lexer_.GetLoc(); - return ParseShape(shape); -} - -bool HloParser::CanBeParamListToShape() { - return lexer_.GetKind() == TokKind::kLparen; -} - -// param_list ::= '(' param_list1 ')' -// param_list1 -// ::= /*empty*/ -// ::= param (',' param)* -// param ::= name shape -bool HloParser::ParseParamList() { - if (!ParseToken(TokKind::kLparen, - "expects '(' at the beginning of param list")) { - return false; - } - - if (lexer_.GetKind() == TokKind::kRparen) { - // empty - } else { - do { - Shape shape; - string name; - if (!ParseName(&name) || !ParseShape(&shape)) { - return false; - } - } while (EatIfPresent(TokKind::kComma)); - } - return ParseToken(TokKind::kRparen, "expects ')' at the end of param list"); -} - -// shape ::= shape_val_ -// shape ::= '(' tuple_elements ')' -// tuple_elements -// ::= /*empty*/ -// ::= shape (',' shape)* -bool HloParser::ParseShape(Shape* result) { - if (EatIfPresent(TokKind::kLparen)) { // Tuple - std::vector shapes; - if (lexer_.GetKind() == TokKind::kRparen) { - /*empty*/ - } else { - 
// shape (',' shape)* - do { - shapes.emplace_back(); - if (!ParseShape(&shapes.back())) { - return false; - } - } while (EatIfPresent(TokKind::kComma)); - } - *result = ShapeUtil::MakeTupleShape(shapes); - return ParseToken(TokKind::kRparen, "expects ')' at the end of tuple."); - } - - if (lexer_.GetKind() != TokKind::kShape) { - return TokenError("expects shape"); - } - *result = lexer_.GetShapeVal(); - lexer_.Lex(); - return true; -} - -bool HloParser::CanBeShape() { - // A non-tuple shape starts with a kShape token; a tuple shape starts with - // '('. - return lexer_.GetKind() == TokKind::kShape || - lexer_.GetKind() == TokKind::kLparen; -} - -bool HloParser::ParseName(string* result) { - VLOG(1) << "ParseName"; - if (lexer_.GetKind() != TokKind::kIdent && - lexer_.GetKind() != TokKind::kName) { - return TokenError("expects name"); - } - *result = lexer_.GetStrVal(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseAttributeName(string* result) { - if (lexer_.GetKind() != TokKind::kAttributeName) { - return TokenError("expects attribute name"); - } - *result = lexer_.GetStrVal(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseString(string* result) { - VLOG(1) << "ParseString"; - if (lexer_.GetKind() != TokKind::kString) { - return TokenError("expects string"); - } - *result = lexer_.GetStrVal(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseDxD(const string& name, std::vector* result) { - LocTy loc = lexer_.GetLoc(); - if (!result->empty()) { - return Error(loc, - Printf("sub-attribute '%s=' already exists", name.c_str())); - } - // 1D - if (lexer_.GetKind() == TokKind::kInt) { - int64 number; - if (!ParseInt64(&number)) { - return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str())); - } - result->push_back(number); - return true; - } - // 2D or higher. 
- if (lexer_.GetKind() == TokKind::kDxD) { - string str = lexer_.GetStrVal(); - if (!SplitAndParseAsInts(str, 'x', result)) { - return Error(loc, - Printf("expects sub-attribute '%s=ixj...'", name.c_str())); - } - lexer_.Lex(); - return true; - } - return TokenError("expects token type kInt or kDxD"); -} - -bool HloParser::ParseWindowPad(std::vector>* pad) { - LocTy loc = lexer_.GetLoc(); - if (!pad->empty()) { - return Error(loc, "sub-attribute 'pad=' already exists"); - } - if (lexer_.GetKind() != TokKind::kPad) { - return TokenError("expects window pad pattern, e.g., '0_0x3_3'"); - } - string str = lexer_.GetStrVal(); - std::vector padding_str = Split(str, 'x'); - for (int i = 0; i < padding_str.size(); i++) { - std::vector low_high; - if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) || - low_high.size() != 2) { - return Error(loc, - "expects padding_low and padding_high separated by '_'"); - } - pad->push_back(low_high); - } - lexer_.Lex(); - return true; -} - -// This is the inverse xla::ToString(PaddingConfig). The padding config string -// looks like "0_0_0x3_3_1". The string is first separated by 'x', each -// substring represents one PaddingConfigDimension. The substring is 3 (or 2) -// numbers joined by '_'. 
-bool HloParser::ParsePaddingConfig(PaddingConfig* padding) { - if (lexer_.GetKind() != TokKind::kPad) { - return TokenError("expects padding config, e.g., '0_0_0x3_3_1'"); - } - LocTy loc = lexer_.GetLoc(); - string str = lexer_.GetStrVal(); - std::vector padding_str = Split(str, 'x'); - for (const auto& padding_dim_str : padding_str) { - std::vector padding_dim; - if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) || - (padding_dim.size() != 2 && padding_dim.size() != 3)) { - return Error(loc, - "expects padding config pattern like 'low_high_interior' or " - "'low_high'"); - } - auto* dim = padding->add_dimensions(); - dim->set_edge_padding_low(padding_dim[0]); - dim->set_edge_padding_high(padding_dim[1]); - dim->set_interior_padding(padding_dim.size() == 3 ? padding_dim[2] : 0); - } - lexer_.Lex(); - return true; -} - -// '{' metadata_string '}' -bool HloParser::ParseMetadata(OpMetadata* metadata) { - std::unordered_map attrs; - optional op_type; - optional op_name; - optional source_file; - optional source_line; - attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type}; - attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name}; - attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file}; - attrs["source_line"] = {/*required=*/false, AttrTy::kInt32, &source_line}; - if (!ParseSubAttributes(attrs)) { - return false; - } - if (op_type) { - metadata->set_op_type(*op_type); - } - if (op_name) { - metadata->set_op_name(*op_name); - } - if (source_file) { - metadata->set_source_file(*source_file); - } - if (source_line) { - metadata->set_source_line(*source_line); - } - return true; -} - -bool HloParser::ParseOpcode(HloOpcode* result) { - VLOG(1) << "ParseOpcode"; - if (lexer_.GetKind() != TokKind::kIdent) { - return TokenError("expects opcode"); - } - string val = lexer_.GetStrVal(); - auto status_or_result = StringToHloOpcode(val); - if (!status_or_result.ok()) { - return TokenError( - Printf("expects opcode but sees: 
%s, error: %s", val.c_str(), - status_or_result.status().error_message().c_str())); - } - *result = status_or_result.ValueOrDie(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseFftType(FftType* result) { - VLOG(1) << "ParseFftType"; - if (lexer_.GetKind() != TokKind::kIdent) { - return TokenError("expects fft type"); - } - string val = lexer_.GetStrVal(); - if (!FftType_Parse(val, result) || !FftType_IsValid(*result)) { - return TokenError(Printf("expects fft type but sees: %s", val.c_str())); - } - lexer_.Lex(); - return true; -} - -bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) { - VLOG(1) << "ParseFusionKind"; - if (lexer_.GetKind() != TokKind::kIdent) { - return TokenError("expects fusion kind"); - } - string val = lexer_.GetStrVal(); - auto status_or_result = StringToFusionKind(val); - if (!status_or_result.ok()) { - return TokenError( - Printf("expects fusion kind but sees: %s, error: %s", val.c_str(), - status_or_result.status().error_message().c_str())); - } - *result = status_or_result.ValueOrDie(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseRandomDistribution(RandomDistribution* result) { - VLOG(1) << "ParseRandomDistribution"; - if (lexer_.GetKind() != TokKind::kIdent) { - return TokenError("expects random distribution"); - } - string val = lexer_.GetStrVal(); - auto status_or_result = StringToRandomDistribution(val); - if (!status_or_result.ok()) { - return TokenError( - Printf("expects random distribution but sees: %s, error: %s", - val.c_str(), status_or_result.status().error_message().c_str())); - } - *result = status_or_result.ValueOrDie(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseInt64(int64* result) { - VLOG(1) << "ParseInt64"; - if (lexer_.GetKind() != TokKind::kInt) { - return TokenError("expects integer"); - } - *result = lexer_.GetInt64Val(); - lexer_.Lex(); - return true; -} - -bool HloParser::ParseDouble(double* result) { - switch (lexer_.GetKind()) { - case TokKind::kDecimal: 
- *result = lexer_.GetDecimalVal(); - break; - case TokKind::kInt: - *result = static_cast(lexer_.GetInt64Val()); - break; - case TokKind::kw_nan: - *result = std::numeric_limits::quiet_NaN(); - break; - case TokKind::kw_inf: - *result = std::numeric_limits::infinity(); - break; - case TokKind::kNegInf: - *result = -std::numeric_limits::infinity(); - break; - default: - return TokenError("expects decimal or integer"); - } - lexer_.Lex(); - return true; -} - -bool HloParser::ParseBool(bool* result) { - if (lexer_.GetKind() != TokKind::kw_true && - lexer_.GetKind() != TokKind::kw_false) { - return TokenError("expects true or false"); - } - *result = lexer_.GetKind() == TokKind::kw_true; - lexer_.Lex(); - return true; -} - -bool HloParser::ParseToken(TokKind kind, const string& msg) { - VLOG(1) << "ParseToken " << TokKindToString(kind) << " " << msg; - if (lexer_.GetKind() != kind) { - return TokenError(msg); - } - lexer_.Lex(); - return true; -} - -bool HloParser::EatIfPresent(TokKind kind) { - if (lexer_.GetKind() != kind) { - return false; - } - lexer_.Lex(); - return true; -} - -bool HloParser::AddInstruction(const string& name, HloInstruction* instruction, - LocTy name_loc) { - auto result = instruction_pool_.insert({name, {instruction, name_loc}}); - if (!result.second) { - Error(name_loc, StrCat("instruction already exists: ", name)); - return Error(/*loc=*/result.first->second.second, - "instruction previously defined here"); - } - return true; -} - -bool HloParser::AddComputation(const string& name, HloComputation* computation, - LocTy name_loc) { - auto result = computation_pool_.insert({name, {computation, name_loc}}); - if (!result.second) { - Error(name_loc, StrCat("computation already exists: ", name)); - return Error(/*loc=*/result.first->second.second, - "computation previously defined here"); - } - return true; -} - -StatusOr HloParser::ParseShardingOnly() { - lexer_.Lex(); - OpSharding op_sharding; - if (!ParseSharding(&op_sharding)) { - return 
InvalidArgument("Syntax error:\n%s", GetError().c_str()); - } - if (lexer_.GetKind() != TokKind::kEof) { - return InvalidArgument("Syntax error:\nExtra content after sharding"); - } - return HloSharding::FromProto(op_sharding); -} - -} // namespace - -StatusOr> Parse(StringPiece str, - const HloModuleConfig& config) { - HloParser parser(str, config); - if (!parser.Run()) { - return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str()); - } - return parser.ConsumeHloModule(); -} - -StatusOr> Parse(StringPiece str) { - HloModuleConfig config; - return Parse(str, config); -} - -StatusOr ParseSharding(tensorflow::StringPiece str) { - HloModuleConfig config; - HloParser parser(str, config); - return parser.ParseShardingOnly(); -} - -} // namespace tools -} // namespace xla diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.h b/tensorflow/compiler/xla/tools/parser/hlo_parser.h deleted file mode 100644 index f7854f403e00c9ca2de1a6634f2d5813b9661525..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" - -namespace xla { -namespace tools { - -// The api of the hlo parser. Given a string in the HloModule::ToString() -// format, parses the string and creates a HloModule with the given config. -StatusOr> Parse(tensorflow::StringPiece str, - const HloModuleConfig& config); - -// The api of the hlo parser. Given a string in the HloModule::ToString() -// format, parses the string and creates a HloModule with default config. -StatusOr> Parse(tensorflow::StringPiece str); - -// Parse sharding from str. str is supposed to contain the body of the -// sharding, i.e. just the rhs of the "sharding={...}" attribute string. -StatusOr ParseSharding(tensorflow::StringPiece str); - -} // namespace tools -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc deleted file mode 100644 index 183b1121cd9ca2da8ad88c7d7e58f8ee79e90662..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc +++ /dev/null @@ -1,1354 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" - -#include -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/test.h" - -namespace xla { -namespace tools { -namespace { - -using tensorflow::StringPiece; - -struct TestData { - string test_name; - string module_string; -}; - -string TestDataToString(const ::testing::TestParamInfo& data) { - return data.param.test_name; -} - -// For each string below, we check that: -// - we parse it to an HloModule successfully, and -// - the stringification of the resulting HloModule is equal to our original -// string. 
-std::vector CreateTestCases() { - // clang-format off - return std::vector({ -// ax + y -{ -"AxpyParam", -R"(HloModule axpy_module - -ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { - %alpha = f32[] parameter(0) - %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={} - %x = f32[2,4]{1,0} parameter(1) - %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x) - %y = f32[2,4]{1,0} parameter(2) - ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y) -} - -)" -}, -// pred constant -{ -"ConstantPred", -R"(HloModule constant_pred_module - -ENTRY %constant_pred () -> pred[] { - ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}, backend_config="foo\" bar" -} - -)" -}, -// s32 constant -{ -"ConstantS32", -R"(HloModule constant_s32_module - -ENTRY %constant_s32 () -> s32[] { - ROOT %constant = s32[] constant(-42) -} - -)" -}, -// f32 constant, but the value is not a decimal and there is a backend -// configuration -{ -"ConstantF32", -R"(HloModule ConstantF32_module - -ENTRY %ConstantF32.v4 () -> f32[] { - ROOT %constant = f32[] constant(42), backend_config="this is a configuration" -} - -)" -}, -// f32 constant, rank 1 empty array. -{ -"ConstantF32R1Empty", -R"(HloModule ConstantF32Empty_module - -ENTRY %ConstantF32Empty.v4 () -> f32[0] { - ROOT %constant = f32[0]{0} constant({}) -} - -)" -}, -// f32 constant, rank 4 empty array. 
-{ -"ConstantF32R4Empty", -R"(HloModule ConstantF32R4Empty_module - -ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] { - ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } }) -} - -)" -}, -// constant 4D -{ -"Constant4D", -R"(HloModule Small_3x2x1x1_module - -ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] { - ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } }) -} - -)" -}, -// non-finite constants: nan, inf, -inf -{ -"ConstantNonFinite", -R"(HloModule IsFiniteR1F32s_module - -ENTRY %IsFiniteR1F32s.v2 () -> pred[6] { - %constant = f32[6]{0} constant({nan, 7, nan, -1, inf, -inf}) - ROOT %is-finite = pred[6]{0} is-finite(f32[6]{0} %constant) -} - -)" -}, -// constant f16 -{ -"ConstantF16", -R"(HloModule ConstantF16_module - -ENTRY %ConstantF16.v4 () -> f16[] { - ROOT %constant = f16[] constant(500) -} - -)" -}, -// bf16 -{ -"BF16", -R"(HloModule BF16 - -ENTRY %BF16.v4 () -> bf16[] { - ROOT %constant = bf16[] constant(500) -} - -)" -}, -// constant + constant -{ -"AddConstants", -R"(HloModule add_constants_module - -ENTRY %add_constants () -> f32[] { - %constant = f32[] constant(3.14) - ROOT %add = f32[] add(f32[] %constant, f32[] %constant) -} - -)" -}, -// tuple constant -{ -"TupleConstant", -R"(HloModule TupleConstant_module - -ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) { - ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} )) -} - -)" -}, -// v1 > v2 ? 
v1 : v2 -{ -"SelectR1F32", -R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module - -ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] { - %v1 = f32[4]{0} parameter(0), sharding={maximal device=1} - %v2 = f32[4]{0} parameter(1), sharding={maximal device=1} - %greater-than = pred[4]{0} greater-than(f32[4]{0} %v1, f32[4]{0} %v2), sharding={replicated} - ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={} -} - -)" -}, -// empty tuple -{ -"EmptyTupleCreate", -R"(HloModule EmptyTupleCreate_module - -ENTRY %EmptyTupleCreate.v1 () -> () { - ROOT %tuple = () tuple() -} - -)" -}, -// tuple -{ -"TupleCreate", -R"(HloModule TupleCreate_module - -ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) { - %v1 = f32[] parameter(0) - %v2 = f32[3]{0} parameter(1) - %v3 = f32[2,3]{1,0} parameter(2) - ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3) -} - -)" -}, -{ -"ShardedTupleCreate", -R"(HloModule ShardedTupleCreate_module - -ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) { - %v1 = f32[] parameter(0) - %v2 = f32[3]{0} parameter(1) - %v3 = f32[2,3]{1,0} parameter(2) - ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}} -} - -)" -}, -// int32 result = 0; -// while (result < 5) { result = result + 1; } -{ -"WhileWithScalarS32Result", -R"(HloModule WhileWithScalarS32Result_module - -%body.v3 (prev.1: s32[]) -> s32[] { - %constant = s32[] constant(1) - %prev.1 = s32[] parameter(0) - ROOT %add = s32[] add(s32[] %constant, s32[] %prev.1) -} - -%condition.v3 (prev.2: s32[]) -> pred[] { - %constant.1 = s32[] constant(5) - %prev.2 = s32[] parameter(0) - ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %prev.2) -} - -ENTRY 
%WhileWithScalarS32Result.v2 () -> s32[] { - %constant.2 = s32[] constant(0) - ROOT %while = s32[] while(s32[] %constant.2), condition=%condition.v3, body=%body.v3 -} - -)" -}, -// send and recv -{ -"SendRecv", -R"(HloModule TwoSendRecvBothWayRecvFist_module - -ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { - %recv = (f32[], u32[]) recv(), channel_id=15, sharding={maximal device=1} - ROOT %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15, sharding={maximal device=1} - %constant = f32[] constant(2.1), sharding={maximal device=0} - %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv} - %send-done = () send-done((f32[], u32[]) %send), channel_id=16, sharding={maximal device=0} -} - -)" -}, -// get-tuple-element -{ -"GetTupleElement", -R"(HloModule GetTupleElement_module - -ENTRY %GetTupleElement.v4 () -> s32[2,3] { - %constant = f32[3]{0} constant({1, 2, 3}) - %constant.1 = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 4, 5, 6 } }) - %tuple = (f32[3]{0}, s32[2,3]{1,0}) tuple(f32[3]{0} %constant, s32[2,3]{1,0} %constant.1) - ROOT %get-tuple-element = s32[2,3]{1,0} get-tuple-element((f32[3]{0}, s32[2,3]{1,0}) %tuple), index=1, sharding={maximal device=0} -} - -)" -}, -// call -{ -"Call", -R"(HloModule CallR0F32IdentityScalar_module - -%Identity.v1 (x: f32[]) -> f32[] { - ROOT %x = f32[] parameter(0) -} - -ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] { - %constant = f32[] constant(42) - ROOT %call = f32[] call(f32[] %constant), to_apply=%Identity.v1 -} - -)" -}, -// reduce window -{ -"ReduceWindow", -R"(HloModule R4UnitWindow_module - -%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { - %lhs = f32[] parameter(0) - %rhs = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) -} - -ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] { - %operand = f32[13,12,8,15]{0,3,2,1} parameter(0) - %constant = f32[] constant(0) - ROOT %reduce-window = 
f32[13,3,8,15]{0,3,2,1} reduce-window(f32[13,12,8,15]{0,3,2,1} %operand, f32[] %constant), window={size=1x1x7x1 stride=1x4x1x1 pad=0_0x0_0x3_3x0_0}, to_apply=%add_F32.v3 -} - -)" -}, -// reduce window on scalar -{ -"ReduceWindowScalar", -R"(HloModule reduce_window_scalar - -%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { - %lhs = f32[] parameter(0) - %rhs = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) -} - -ENTRY %R4UnitWindowScalar () -> f32[] { - %constant = f32[] constant(42) - %constant.1 = f32[] constant(1) - ROOT %reduce-window = f32[] reduce-window(f32[] %constant, f32[] %constant.1), to_apply=%add_F32.v3 -} - -)" -}, -// convolution -{ -"Convolution", -R"(HloModule Convolve1D1Window_0_module - -ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { - %input = f32[1,2,1]{2,1,0} parameter(0) - %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) - %filter = f32[1,1,1]{2,1,0} parameter(1) - ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f -} - -)" -}, -// convolution rank 2 -{ -"ConvolutionR2", -R"(HloModule ConvolveR2_module - -ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] { - %input = f32[1,2]{1,0} parameter(0) - %filter = f32[1,1]{1,0} parameter(1) - ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf -} - -)" -}, -// convolution backward -{ -"ConvolutionBackward", -R"(HloModule ConvolveBackward_module - -ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] { - %input = f32[128,7,7,512]{0,3,2,1} parameter(0) - %filter = f32[3,3,512,512]{3,2,1,0} parameter(1) - ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, 
dim_labels=b01f_01oi->b01f -} - -)" -}, -// reverse(constant) -{ -"Reverse4D", -R"(HloModule Reverse4DFloatArrayOnDim01_module - -ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] { - %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } }) - ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1} -} - -)" -}, -// concat -{ -"Concat", -R"(HloModule Concat2x3With2x5_module - -ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] { - %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } }) - %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } }) - ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1} -} - -)" -}, -// select and scatter -{ -"SelectAndScatter", -R"(HloModule R4F32OverlapSmall_module - -%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { - %lhs = f32[] parameter(0) - %rhs = f32[] parameter(1) - ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs) -} - -%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { - %lhs.1 = f32[] parameter(0) - %rhs.1 = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1) -} - -ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] { - %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { 
/*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } }) - %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } }) - %constant.2 = f32[] constant(0) - ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3 -} - -)" -}, -// select and scatter on scalar -{ -"SelectAndScatterScalar", -R"(HloModule select_and_scatter_scalar - -%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] { - %lhs = f32[] parameter(0) - %rhs = f32[] parameter(1) - ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs) -} - -%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] { - %lhs.1 = f32[] parameter(0) - %rhs.1 = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1) -} - -ENTRY %SelectAndScatterScalar () -> f32[] { - %constant = f32[] constant(42) - %constant.1 = f32[] constant(1) - %constant.2 = f32[] constant(2) - ROOT %select-and-scatter = f32[] select-and-scatter(f32[] %constant, f32[] %constant.1, f32[] %constant.2), select=%ge_F32.v3, scatter=%add_F32.v3 -} - -)" -}, -// slice -{ -"Slice", -R"(HloModule slice_module - -ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] { - %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0) - ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3:1], [0:3:1], [0:4:2], [0:4:1]} -} - -)" -}, -// slice, no stride -{ -"SliceNoStride", -R"(HloModule Slice3x3x3_To_1x3x3_F32_module - -ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] { - %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } }) - ROOT %slice = 
f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]} -} - -)" -}, -// slice R0 -{ -"SliceR0", -R"(HloModule SliceR0_module - -ENTRY %SliceR0.v2 () -> s32[] { - %constant = s32[] constant(1) - ROOT %slice = s32[] slice(s32[] %constant), slice={} -} - -)" -}, -// transpose -{ -"Transpose", -R"(HloModule Transpose_module - -ENTRY %Transpose.v2 () -> s32[1,2,3] { - %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } }) - ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2} -} - -)" -}, -// Dynamic slice -{ -"DynamicSlice", -R"(HloModule DynamicSlice_module - -ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] { - %original_parameter = s32[2,2,258]{2,1,0} parameter(0) - %constant = s32[1]{0} constant({0}) - %start_index = s32[1]{0} parameter(1) - %concatenate = s32[3]{0} concatenate(s32[1]{0} %constant, s32[1]{0} %constant, s32[1]{0} %start_index), dimensions={0} - ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258} -} - -)" -}, -// Dynamic update slice -{ -"DynamicUpdateSlice", -R"(HloModule DynamicUpdateSlice_module - -ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] { - %input = s32[1,1,25,1]{3,2,1,0} parameter(0) - %update = s32[1,1,2,1]{3,2,1,0} parameter(1) - %start_indices = s32[4]{0} parameter(2) - ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices) -} - -)" -}, -// batch norm training -{ -"BatchNormTraining", -R"(HloModule BasicTraining_module - -ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) { - %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} 
}, { /*i1=1*/ {7, 8} } } }) - %constant.1 = f32[2]{0} constant({2, 3}) - %constant.2 = f32[2]{0} constant({1, 2}) - ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3 -} - -)" -}, -// batch norm inference -{ -"BatchNormInference", -R"(HloModule BatchNormInference_module - -ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] { - %input = f32[2,2,2,2]{3,2,1,0} parameter(0) - %offset = f32[2]{0} parameter(1) - %scale = f32[2]{0} parameter(2) - %mean = f32[2]{0} parameter(3) - %variance = f32[2]{0} parameter(4) - ROOT %batch-norm-inference = f32[2,2,2,2]{3,2,1,0} batch-norm-inference(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %offset, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance), epsilon=0.001, feature_index=0 -} - -)" -}, -// batch norm grad -{ -"BatchNormGrad", -R"(HloModule BatchNormGrad_module - -ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) { - %input = f32[2,2,2,2]{3,2,1,0} parameter(0) - %scale = f32[2]{0} parameter(1) - %mean = f32[2]{0} parameter(2) - %variance = f32[2]{0} parameter(3) - %grad_output = f32[2,2,2,2]{3,2,1,0} parameter(4) - ROOT %batch-norm-grad = (f32[2,2,2,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-grad(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance, f32[2,2,2,2]{3,2,1,0} %grad_output), epsilon=0.001, feature_index=0 -} - -)" -}, -// fft -{ -"Fft", -R"(HloModule Fft_module - -ENTRY %Fft (input: c64[8,32]) -> c64[8,32] { - %input = c64[8,32]{1,0} parameter(0) - ROOT %fft = c64[8,32]{1,0} fft(c64[8,32]{1,0} %input), fft_type=FFT, fft_length={32} -} - -)" -}, -// ifft -{ -"Ifft2d", -R"(HloModule Ifft2d_module - -ENTRY %Ifft2d (input: c64[5,8,32]) -> c64[5,8,32] { - 
%input = c64[5,8,32]{2,1,0} parameter(0) - ROOT %fft = c64[5,8,32]{2,1,0} fft(c64[5,8,32]{2,1,0} %input), fft_type=IFFT, fft_length={8,32} -} - -)" -}, -// rfft2d -{ -"Rfft2d", -R"(HloModule Rfft2d_module - -ENTRY %Rfft2d (input: f32[5,64,32]) -> c64[5,64,17] { - %input = f32[5,64,32]{2,1,0} parameter(0) - ROOT %fft = c64[5,64,17]{2,1,0} fft(f32[5,64,32]{2,1,0} %input), fft_type=RFFT, fft_length={64,32} -} - -)" -}, -// irfft3d -{ -"Irfft3d", -R"(HloModule Irfft3d_module - -ENTRY %Irfft3d (input: c64[5,64,128,33]) -> f32[5,64,128,64] { - %input = c64[5,64,128,33]{3,2,1,0} parameter(0) - ROOT %fft = f32[5,64,128,64]{3,2,1,0} fft(c64[5,64,128,33]{3,2,1,0} %input), fft_type=IRFFT, fft_length={64,128,64} -} - -)" -}, -// pad -{ -"Pad", -R"(HloModule Pad1DS3Array_module - -ENTRY %Pad1DS3Array.v3 () -> f32[8] { - %constant = f32[3]{0} constant({1, 2, 3}) - %constant.1 = f32[] constant(0.1) - ROOT %pad = f32[8]{0} pad(f32[3]{0} %constant, f32[] %constant.1), padding=3_1 -} - -)" -}, -// pad has interior -{ -"PadHasInterior", -R"(HloModule PadHasInterior_module - -ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] { - %input = f32[1,25,7,7]{3,2,1,0} parameter(0) - %constant = f32[] constant(-5.123) - ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0 -} - -)" -}, -// Negative padding -{ -"PadHasNegativePadding", -R"(HloModule PadHasNegativePadding_module - -ENTRY %PadHasNegativePadding (input: f32[1,25,7,7,10]) -> f32[1,15,6,3,29] { - %input = f32[1,25,7,7,10]{4,3,2,1,0} parameter(0) - %constant = f32[] constant(-5.123) - ROOT %pad = f32[1,15,6,3,29]{4,3,2,1,0} pad(f32[1,25,7,7,10]{4,3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_-10_0x0_-1_0x-2_-2_0x-1_-1_3 -} - -)" -}, -// fusion -{ -"Fusion", -R"(HloModule fusion_module - -%fused_computation (constant.param_0: f32[3,2,1,1], constant.1.param_1: f32[2]) -> f32[3,2,1,1] { - %constant.param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0) - 
%constant.1.param_1 = f32[2]{0} parameter(1) - %broadcast = f32[3,2,1,1]{3,2,1,0} broadcast(f32[2]{0} %constant.1.param_1), dimensions={1} - ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %constant.param_0, f32[3,2,1,1]{3,2,1,0} %broadcast) -} - -ENTRY %fusion.v3 () -> f32[3,2,1,1] { - %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } }) - %constant.1 = f32[2]{0} constant({3.14, 4.25}) - ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation -} - -)" -}, -{ -"Sparse", -R"(HloModule sparse_f32 - -ENTRY %sparse () -> f32[2,3,4] { - ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3}) -} - -)" -}, -{ -"SparseEmpty", -R"(HloModule sparse_f32_empty - -ENTRY %sparse_f32_empty () -> f32[2,3,4] { - ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{}) -} - -)" -}, -{ -"SparseR1", -R"(HloModule sparse_f32_r1 - -ENTRY %sparse_f32_r1 () -> f32[9] { - ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6}) -} - -)" -}, -{ -"gather", -R"(HloModule StringifyGather - -ENTRY %Gather (input_tensor: f32[50,49,48,47,46], gather_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] { - %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) - %gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) - ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26} -} - -)" -}, - }); - // clang-format on -} - -std::vector CreateShortTestCases() { - // clang-format off - return std::vector({ -// map -{ -"Map", -R"(HloModule 
MapBinaryAdder_module - -add_F32.v3 { - lhs = f32[] parameter(0) - rhs = f32[] parameter(1) - ROOT add = f32[] add(lhs, rhs) -} - -ENTRY MapBinaryAdder.v3 { - param0 = f32[4]{0} parameter(0) - param1 = f32[4]{0} parameter(1) - ROOT map = f32[4]{0} map(param0, param1), to_apply=add_F32.v3 -} - -)" -}, -// reduce -{ -"Reduce", -R"(HloModule ReduceR3ToR2_module - -add_F32.v3 { - lhs = f32[] parameter(0) - rhs = f32[] parameter(1) - ROOT add = f32[] add(lhs, rhs) -} - -ENTRY ReduceR3ToR2.v3 { - input = f32[8,16,256]{2,1,0} parameter(0) - constant = f32[] constant(0) - ROOT reduce = f32[8,16]{1,0} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 -} - -)" -}, -// infeed/outfeed -{ -"InfeedOutfeed", -R"(HloModule outfeed_module - -ENTRY InfeedToOutfeed { - infeed = (u32[3]{0}, pred[]) infeed() - outfeed = () outfeed(infeed) - ROOT infeed.1 = (u32[3]{0}, pred[]) infeed() - outfeed.1 = () outfeed(infeed.1) -} - -)" -}, -// Rng -{ -"Rng", -R"(HloModule rng_module - -ENTRY Rng { - constant = f32[] constant(0) - constant.1 = f32[] constant(1) - ROOT rng = f32[8]{0} rng(constant, constant.1), distribution=rng_uniform -} - -)" -}, -// Reduce precision -{ -"ReducePrevison", -R"(HloModule reduce_precision - -ENTRY ReducePrecision { - constant = f32[1]{0} constant({3.14159}) - ROOT reduce-precision = f32[1]{0} reduce-precision(constant), exponent_bits=8, mantissa_bits=10 -} - -)" -}, -// Conditional -{ -"Conditional", -R"(HloModule conditional - -Negate { - x = f32[] parameter(0) - ROOT negate = f32[] negate(x) -} - -Identity { - y = f32[] parameter(0) - ROOT copy = f32[] copy(y) -} - -ENTRY Parameters1.v4 { - constant = pred[] constant(true) - constant.1 = f32[] constant(56) - constant.2 = f32[] constant(12) - ROOT conditional = f32[] conditional(constant, constant.1, constant.2), true_computation=Negate, false_computation=Identity -} - -)" -}, -// CustomCall -{ -"CustomCall", -R"(HloModule custom_call - -ENTRY CustomCall { - constant = f32[1]{0} constant({12345}) - 
ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar" -} - -)" -}, -// Variables with non-default names -{ -"NonDefaultNames", -R"(HloModule add_constants_module - -ENTRY add_constants { - foo = f32[] constant(3.14) - ROOT bar = f32[] add(foo, foo) -} - -)" -}, -{ -"Dot", -R"(HloModule dot - -ENTRY dot { - a = f32[2,10]{1,0} parameter(0) - b = f32[10,3]{1,0} parameter(1) - ROOT dot = f32[2,3]{1,0} dot(a, b), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={0} -} - -)" -}, -{ -"gather", -R"(HloModule gather - -ENTRY Gather { - input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) - gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) - ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26} -} - -)" -}, - }); - // clang-format on -} - -class HloParserTest : public ::testing::Test, - public ::testing::WithParamInterface { - protected: - static void ExpectHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected)) - << "'" << s << "' does not contain '" << expected << "'"; - } - - // Expects "ToString(Parse(string)) == string", that is, parses the string, - // asserts that it succeeded, stringifies the parsed module, and checks that - // the it equals the original string. 
- void ExpectEqual() { - const string& original = GetParam().module_string; - auto result = Parse(original); - TF_ASSERT_OK(result.status()); - EXPECT_EQ(original, result.ValueOrDie()->ToString( - HloPrintOptions().set_print_large_constants(true))); - } -}; - -class HloParserShortTest : public HloParserTest { - protected: - void ExpectEqualShort() { - const string& original = GetParam().module_string; - auto result = Parse(original); - TF_ASSERT_OK(result.status()); - EXPECT_EQ(original, - result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable())); - } -}; - -TEST_P(HloParserTest, Run) { ExpectEqual(); } - -TEST_P(HloParserShortTest, Run) { ExpectEqualShort(); } - -INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest, - ::testing::ValuesIn(CreateTestCases()), - TestDataToString); - -INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest, - ::testing::ValuesIn(CreateShortTestCases()), - TestDataToString); - -TEST_F(HloParserTest, Empty) { - const string original = ""; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, Garbage) { - const string original = "HloModule thi$ str1ng makes# N0 sen$e @all!*&^%$"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, WrongOpcode) { - const string original = R"(HloModule wrong_opcode: - -ENTRY %blabla (x: f32[], y: f32[]) -> f32[] { - %x = f32[]{} parameter(0) - %y = f32[]{} parameter(1) - %le = pred[]{} le(f32[]{} %x, f32[]{} %y) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, WrongShape) { - const string original = R"(HloModule wrong_opcode: - -ENTRY %blabla (x: g32[]) -> g32[] { - %x = g32[]{} parameter(0) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, WrongOperandsSize) { - const string original = R"(HloModule wrong_opcode: - -ENTRY 
%blabla (x: f32[]) -> pred[] { - %x = f32[]{} parameter(0) - %eq = pred[]{} equal-to(f32[]{} %x) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, OperandNotFound) { - const string original = R"(HloModule operand_not_found: -ENTRY %blabla (x: f32[]) -> pred[] { - %x = f32[]{} parameter(0) - %eq = pred[]{} equal-to(f32[]{} %x, f32[]{} %y) -} -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); -} - -TEST_F(HloParserTest, MoreConstants) { - const string original = R"(HloModule SelectScalarS32True_module - -ENTRY %SelectScalarS32True.v4 () -> s32[] { - %constant.2 = pred[] constant(true) - %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,3]1,2,3,4} - %constant = s32[] constant(42) - %select = s32[] select(pred[] %constant.2, s32[] %constant.1, s32[] %constant) -} - -)"; - auto result = Parse(original); - TF_EXPECT_OK(result.status()); - // Constant instructions have no name. The string will be parsed successfully - // but the constant names will not be exactly the same. 
-} - -TEST_F(HloParserTest, ConfigurationField) { - const string original = R"(HloModule AModule -ENTRY %configuration_test() -> s32[] { - %constant = s32[] constant(42), backend_config="foo bar" -})"; - auto result = Parse(original); - TF_ASSERT_OK(result.status()); - EXPECT_EQ("foo bar", result.ValueOrDie() - ->entry_computation() - ->root_instruction() - ->backend_config()); -} - -TEST_F(HloParserTest, LiteralDimensionsMismatch_1) { - const string original = R"(HloModule some_2_module - -ENTRY %some_2 () -> f32[2] { - ROOT %constant = f32[2]{0} constant({1,{2}}) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); - ExpectHasSubstr(result.status().error_message(), - "expects nested array in rank 1, but sees larger"); -} - -TEST_F(HloParserTest, LiteralDimensionsMismatch_2) { - const string original = R"(HloModule some_2x3_module - -ENTRY %some_2x3 () -> f32[2,3] { - ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6}) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); - ExpectHasSubstr(result.status().error_message(), - "expects nested array in rank 2, but sees 1"); -} - -TEST_F(HloParserTest, LiteralDimensionsMismatch_3) { - const string original = R"(HloModule some_2x3x2_module - -ENTRY %some_2x3x2 () -> f32[2,3,2] { - ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}}) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); - ExpectHasSubstr(result.status().error_message(), - "expects 3 elements in the [0]th element"); -} - -TEST_F(HloParserTest, ConstantF16Overflow) { - const string original = - R"(HloModule ConstantF16Overflow_module - -ENTRY %ConstantF16Overflow.v4 () -> f16[] { - ROOT %constant = f16[] constant(-65505) -} - -)"; - auto result = Parse(original); - EXPECT_NE(Status::OK(), result.status()); - ExpectHasSubstr(result.status().error_message(), - "is out of range for 
literal's primitive type F16"); -} - -TEST_F(HloParserTest, ConstantWithExp) { - const string original = R"(HloModule ConstantWithExp_module - -ENTRY %ConstantWithExp.v4 () -> f32[] { - %constant.1 = f32[] constant(3e+2) -} - -)"; - auto result = Parse(original); - TF_EXPECT_OK(result.status()); - // The string will be parsed successfully but the output strings are not - // exactly the same, because "3e2" is parsed into value 300 and will be - // printed as "300". -} - -TEST_F(HloParserTest, AttibutesAnyOrder) { - const string original = R"(HloModule any_order_module - -ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { - %input = f32[1,2,1]{2,1,0} parameter(0) - %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) - %filter = f32[1,1,1]{2,1,0} parameter(1) - ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2} -} - -)"; - TF_EXPECT_OK(Parse(original).status()); -} - -TEST_F(HloParserTest, InvalidDimLabels) { - string prefix = R"(HloModule invalid_dim_labels_module - -ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { - %input = f32[1,2,1]{2,1,0} parameter(0) - %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) - %filter = f32[1,1,1]{2,1,0} parameter(1) - ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1} )"; - string suffix = R"( -} - -)"; - - ExpectHasSubstr( - Parse(tensorflow::strings::StrCat(prefix, ",dim_labels=00_01_10", suffix)) - .status() - .error_message(), - "expects dim labels pattern"); - - ExpectHasSubstr(Parse(tensorflow::strings::StrCat( - prefix, ",dim_labels=010_1100->010", suffix)) - .status() - .error_message(), - "must have the same rank"); -} - -TEST_F(HloParserTest, UnexpectedAttribute) { - const string original = R"(HloModule unexpected_attr_module 
- -ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { - %recv = (f32[], u32[]) recv(), channel_id=15 - %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15 - ROOT %constant = f32[] constant(2.1) - %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, calls=%recv - %send-done = () send-done((f32[], u32[]) %send), channel_id=16 -} - -)"; - ExpectHasSubstr(Parse(original).status().error_message(), - "unexpected attribute \"calls\""); -} - -TEST_F(HloParserTest, MissingAttribute) { - const string original = R"(HloModule missing_attr_module - -ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { - %recv = (f32[], u32[]) recv(), channel_id=15 - %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15 - ROOT %constant = f32[] constant(-2.1) - %send = (f32[], u32[]) send(f32[] %constant) - %send-done = () send-done((f32[], u32[]) %send), channel_id=16 -} - -)"; - ExpectHasSubstr(Parse(original).status().error_message(), - "attribute channel_id is expected but not seen"); -} - -TEST_F(HloParserTest, PredecessorUndefined) { - const string original = R"(HloModule pre_not_found_module - -ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { - %recv = (f32[], u32[]) recv(), channel_id=15 - %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15 - ROOT %constant = f32[] constant(2.1) - %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, control-predecessors={%done} - %send-done = () send-done((f32[], u32[]) %send), channel_id=16 -} - -)"; - ExpectHasSubstr(Parse(original).status().error_message(), - "'done' is not defined"); -} - -TEST_F(HloParserTest, SliceAllowOmitStride1) { - const string original = R"(HloModule slice_module - -ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] { - %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0) - ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3], [0:3], [0:4:2], [0:4]} -} - -)"; - TF_EXPECT_OK(Parse(original).status()); -} - -TEST_F(HloParserTest, 
PaddingConfigIsNotWindowPad) { - const string original = R"(HloModule window_pad_module - -ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] { - %input = f32[1,2,1]{2,1,0} parameter(0) - %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) - %filter = f32[1,1,1]{2,1,0} parameter(1) - ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), dim_labels=b0f_0io->b0f, window={pad=1_1_0 size=1} -} - -)"; - ExpectHasSubstr(Parse(original).status().error_message(), - "expects padding_low and padding_high separated by '_'"); -} - -TEST_F(HloParserTest, CommaBetweenSubAttributes) { - const string original = R"(HloModule test_comma_module - -ENTRY %test_comma.v4 () -> f32[] { - ROOT %constant = f32[] constant(-4.2), metadata={source_line=5, op_type="::const"} -} - -)"; - TF_EXPECT_OK(Parse(original).status()); -} - -TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) { - const string original = R"(HloModule custom_call: - -ENTRY %CustomCall () -> f32[1] { - %constant = f32[1]{0} constant({12345}) - ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar" -})"; - ExpectHasSubstr(Parse(original).status().error_message(), - "Shape of computation CustomCall, f32[1], is not compatible " - "with that of its root instruction foo, f32[1,2,3]"); -} - -TEST_F(HloParserTest, EntryComputationWithLayout) { - const string original = R"(HloModule layout: -add_F32.v3 { - lhs = f32[] parameter(0) - rhs = f32[] parameter(1) - ROOT add = f32[] add(lhs, rhs) -} - -ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] { - input = f32[8,16,256]{0,1,2} parameter(0) - constant = f32[] constant(0) - ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 -})"; - - auto module = Parse(original); - TF_ASSERT_OK(module.status()); - auto program_layout = module.ValueOrDie()->host_entry_computation_layout(); - 
ASSERT_EQ(program_layout.parameter_count(), 1); - auto param_layout = program_layout.parameter_layout(0).layout(); - auto result_layout = program_layout.result_layout().layout(); - EXPECT_TRUE( - LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}), param_layout)) - << "actual layout of parameter(0) is " - << LayoutUtil::HumanString(param_layout); - EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1}), result_layout)) - << "actual layout of result is " - << LayoutUtil::HumanString(result_layout); -} - -TEST_F(HloParserTest, NoEntry) { - const string original = R"(HloModule no_entry: -c1 { - const1 = f32[1]{0} constant({12345}) -} -c2 { - const2 = f32[1]{0} constant({67890}) -})"; - auto module = Parse(original); - TF_ASSERT_OK(module.status()); - EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2"); -} - -TEST_F(HloParserTest, NoRoot) { - const string original = R"(HloModule no_root: -ENTRY consts { - first = f32[1]{0} constant({12345}) - last = f32[1]{0} constant({67890}) -})"; - auto module = Parse(original); - TF_ASSERT_OK(module.status()); - EXPECT_EQ( - module.ValueOrDie()->entry_computation()->root_instruction()->name(), - "last"); -} - -TEST_F(HloParserTest, MultipleEntries) { - const string original = R"(HloModule multiple_entries: -ENTRY c1 { - const1 = f32[1]{0} constant({12345}) -} -ENTRY c2 { - const2 = f32[1]{0} constant({67890}) -})"; - ExpectHasSubstr(Parse(original).status().error_message(), - "expects only one ENTRY"); -} - -TEST_F(HloParserTest, MultipleRoots) { - const string original = R"(HloModule multiple_roots: -ENTRY consts { - ROOT const1 = f32[1]{0} constant({12345}) - ROOT const2 = f32[1]{0} constant({12345}) -})"; - ExpectHasSubstr(Parse(original).status().error_message(), - "one computation should have only one ROOT"); -} - -TEST_F(HloParserTest, ComputationExists) { - const string original = R"(HloModule comp_exists -comp { - const1 = f32[1]{0} constant({12345}) -} -comp { - const2 = f32[1]{0} constant({67890}) 
-})"; - ExpectHasSubstr(Parse(original).status().error_message(), - R"(was parsing 2:1: error: computation previously defined here -comp { -^)"); -} - -TEST_F(HloParserTest, CrossComputationLookup) { - const string original = R"(HloModule cross_computation_lookup: -tcalla (a: (s32[], s32[])) -> (s32[], s32[]) { - ROOT aparam = (s32[], s32[]) parameter(0) -} - -tcallb (b: (s32[], s32[])) -> s32[] { - rparam = (s32[], s32[]) parameter(0) - ROOT gte0 = s32[] get-tuple-element(aparam), index=0 -} - -ENTRY entry { - param = (s32[], s32[]) parameter(0) - call0 = (s32[], s32[]) call(param), to_apply=tcalla - ROOT call1 = s32[] call(param), to_apply=tcallb -})"; - ExpectHasSubstr( - Parse(original).status().error_message(), - "was parsing 8:39: error: instruction does not exist: aparam"); -} - -} // namespace -} // namespace tools -} // namespace xla diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 2349fa919ec033dcebaec5b5d5b21833e0c31360..ba814af4769f43dbe96190c902cf6f52ca5659bb 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -24,9 +24,15 @@ limitations under the License. // passing --use_fake_data on the command line. If the real data is available // in the proto and --use_fake_data is false, the real data is used. // +// Input can be a binary HloSnapshot proto, a binary HloProto proto, or a +// textual HLO string. +// // The output format is: // // file_path: computation_name :: type:literal_str +// +// Note: If you pass multiple modules, they will be compiled in parallel but run +// in series. #include #include @@ -34,15 +40,19 @@ limitations under the License. 
#include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/testing.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/execution_options_util.h" -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -50,7 +60,6 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -68,9 +77,20 @@ struct Options { bool use_fake_data = false; bool print_result = true; int num_runs = 1; - bool xla_hlo_profile_last_run = false; }; +std::unique_ptr CompileExecutable(const HloSnapshot& module, + LocalClient* client) { + XlaComputation computation(module.hlo().hlo_module()); + std::vector argument_layouts; + for (const auto& param : computation.proto().program_shape().parameters()) { + argument_layouts.push_back(&param); + } + return client + ->Compile(computation, argument_layouts, ExecutableBuildOptions()) + .ValueOrDie(); +} + // Invokes the given computation passing arbitrary data for every (unbound) // parameter if use_fake_data, Otherwise use recorded data if
available. // @@ -80,21 +100,36 @@ struct Options { // // If neither generate_fake_infeed is true nor a fake_infeed_shape is provided, // no infeed is performed. -StatusOr> ReplayComputation(const HloSnapshot& module, - Client* client, - const Options& opts) { - TF_ASSIGN_OR_RETURN(auto computation, client->LoadSnapshot(module)); +StatusOr ReplayComputation(const HloSnapshot& module, + LocalExecutable* executable, + LocalClient* client, const Options& opts) { + XlaComputation computation(module.hlo().hlo_module()); - std::vector> arguments; + // Build the `argument_ptrs` vector, which contains ShapedBuffer*s to our + // arguments. This is a bit involved, because we may have to convert from + // GlobalData to ShapedBuffer*, and we have to manage the lifetime of all our + // objects. + std::vector scoped_shaped_buffer_arguments; + std::vector> global_data_arguments; + std::vector argument_ptrs; if (opts.use_fake_data) { - arguments = MakeFakeArgumentsOrDie(computation, client); + global_data_arguments = MakeFakeArgumentsOrDie(computation, client); + for (const auto& data : global_data_arguments) { + argument_ptrs.push_back( + client->GlobalDataToShapedBuffer(data->handle(), /*device_ordinal=*/0) + .ValueOrDie()); + } } else { // use recorded data if available for (const auto& proto : module.arguments()) { TF_ASSIGN_OR_RETURN(std::unique_ptr literal, Literal::CreateFromProto(proto)); - TF_ASSIGN_OR_RETURN(std::unique_ptr data, - client->TransferToServer(*literal)); - arguments.push_back(std::move(data)); + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer data, + client->LiteralToShapedBuffer(*literal, /*device_ordinal=*/0)); + scoped_shaped_buffer_arguments.push_back(std::move(data)); + } + for (const auto& argument : scoped_shaped_buffer_arguments) { + argument_ptrs.push_back(&argument); } } @@ -125,7 +160,7 @@ StatusOr> ReplayComputation(const HloSnapshot& module, // concurrent infeed occur via the fake_infeed_shape, or when // --generate_fake_infeed is passed and there 
exists an infeed operation in // the HloSnapshot. - tensorflow::gtl::optional pool; + absl::optional pool; std::unique_ptr data; if (provide_infeed) { data = std::move(MakeFakeLiteral(infeed_shape)).ValueOrDie(); @@ -149,68 +184,126 @@ StatusOr> ReplayComputation(const HloSnapshot& module, }); } - std::vector execute_arguments; - execute_arguments.reserve(arguments.size()); - for (auto& argument : arguments) { - execute_arguments.push_back(argument.get()); + // Do not attempt to run the executable if num_runs is less than 1. + if (opts.num_runs < 1) { + return Cancelled("Cancelled after compilation since --num_runs < 1."); } // Run the computation num_runs times, and return the result from the last // execution. - std::unique_ptr result; + const bool xla_hlo_profile = + legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile(); + StreamExecutorMemoryAllocator allocator( + client->platform(), + {client->platform()->ExecutorForDevice(0).ValueOrDie()}); + absl::optional result; for (int i = 0; i < opts.num_runs; ++i) { - ExecutionProfile profile; - ExecutionOptions execution_options = CreateDefaultExecutionOptions(); - if (opts.xla_hlo_profile_last_run && i == opts.num_runs - 1) { - execution_options.mutable_debug_options()->set_xla_hlo_profile(true); + // If xla_hlo_profile is enabled, print a noisy message before the last run, + // making it easier to separate this profile from the others in the logspam. + if (xla_hlo_profile && i == opts.num_runs - 1) { + LOG(INFO) << "\n\n***** Final run below ******"; } + ExecutionProfile profile; + ExecutableRunOptions run_options; + run_options.set_execution_profile(&profile); + run_options.set_allocator(&allocator); - if (opts.print_result) { - TF_ASSIGN_OR_RETURN( - result, client->ExecuteAndTransfer(computation, execute_arguments, - &execution_options, &profile)); - } else { - // If we're not printing the result, execute the computation but don't - // bother retrieving the result. This can be a significant speedup. 
- TF_RETURN_IF_ERROR(client - ->Execute(computation, execute_arguments, - &execution_options, &profile) - .status()); - } - LOG(INFO) << "Execution took " - << static_cast(profile.compute_time_ns()) / 1e9 << "s"; + TF_ASSIGN_OR_RETURN(result, executable->Run(argument_ptrs, run_options)); + LOG(INFO) << "Done executing in " + << static_cast(profile.compute_time_ns()) / 1e9 + << "s: " << module.hlo().hlo_module().name(); } - return std::move(result); + TF_ASSIGN_OR_RETURN(std::unique_ptr result_literal, + client->ShapedBufferToLiteral(*result)); + return std::move(*result_literal); } -int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { - Client* client = ClientLibrary::LocalClientOrDie(); +StatusOr ParseInputFile(const string& filename, + const Options& opts) { tensorflow::Env* env = tensorflow::Env::Default(); + HloSnapshot snapshot; + auto s = tensorflow::ReadBinaryProto(env, filename, &snapshot); + if (s.ok()) { + return snapshot; + } + if (s.code() == tensorflow::error::NOT_FOUND) { + return s; + } + CHECK(opts.use_fake_data) + << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto " + "and textual HLO don't carry real data."; + fprintf(stderr, "%s: is not HloSnapshot. Trying HloProto.\n", + filename.c_str()); + + if (tensorflow::ReadBinaryProto(env, filename, snapshot.mutable_hlo()).ok()) { + return snapshot; + } + fprintf(stderr, "%s: is not HloProto. Trying HLO text.\n", filename.c_str()); + string contents; + TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, filename, &contents)); + StatusOr> module = ParseHloString(contents); + if (module.ok()) { + *snapshot.mutable_hlo()->mutable_hlo_module() = + module.ValueOrDie()->ToProto(); + return snapshot; + } + fprintf(stderr, "%s: is not HLO text. 
Nothing left to try.\n", + filename.c_str()); + return InvalidArgument("Could not parse %s.", filename); +} + +int RealMain(absl::Span args, const Options& opts) { + LocalClient* client = ClientLibrary::LocalClientOrDie(); int exit_status = EXIT_SUCCESS; + + std::vector snapshots; for (char* arg : args) { - HloSnapshot snapshot; - auto status = tensorflow::ReadBinaryProto(env, arg, &snapshot); - if (!status.ok()) { - fprintf(stderr, "%s: is not HloSnapshot: %s.\n", arg, - status.ToString().c_str()); - continue; + StatusOr maybe_snapshot = ParseInputFile(arg, opts); + if (maybe_snapshot.ok()) { + snapshots.push_back(std::move(maybe_snapshot).ValueOrDie()); + } else { + LOG(ERROR) << "Can't handle file " << arg << ": " + << maybe_snapshot.status(); + } + } + + // Compile all the modules in parallel. + LOG(INFO) << "Compiling " << snapshots.size() << " modules in parallel."; + std::vector> executables; + { + // ThreadPool CHECK-fails if we give it 0 threads. + tensorflow::thread::ThreadPool thread_pool( + tensorflow::Env::Default(), tensorflow::ThreadOptions(), + "compile_modules", std::max(size_t{1}, snapshots.size()), + /*low_latency_hint=*/false); + executables.resize(snapshots.size()); + for (int64 i = 0; i < snapshots.size(); ++i) { + thread_pool.Schedule([&snapshots, &executables, client, i] { + executables[i] = CompileExecutable(snapshots[i], client); + }); } - StatusOr> result_status = - ReplayComputation(snapshot, client, opts); + } + LOG(INFO) << "Done compiling; now running the modules."; + + for (int64 i = 0; i < executables.size(); ++i) { + LocalExecutable* executable = executables[i].get(); + StatusOr result_status = + ReplayComputation(snapshots[i], executable, client, opts); if (!result_status.ok()) { - fprintf(stderr, "%s: error: %s\n", arg, + fprintf(stderr, "%s: error: %s\n", args[i], result_status.status().ToString().c_str()); exit_status = EXIT_FAILURE; continue; } - std::unique_ptr result = result_status.ConsumeValueOrDie(); - if (result != 
nullptr) { - fprintf(stdout, "%s: %s :: %s:%s\n", arg, - snapshot.hlo().hlo_module().name().c_str(), - ShapeUtil::HumanString(result->shape()).c_str(), - result->ToString().c_str()); + if (opts.print_result) { + Literal result = std::move(result_status).ValueOrDie(); + fprintf(stdout, "%s: %s :: %s:%s\n", args[i], + executable->executable()->module().name().c_str(), + ShapeUtil::HumanString(result.shape()).c_str(), + result.ToString().c_str()); + auto& snapshot = snapshots[i]; if (snapshot.has_result()) { std::unique_ptr literal = Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie(); @@ -243,9 +336,6 @@ int main(int argc, char** argv) { tensorflow::Flag("generate_fake_infeed", &opts.generate_fake_infeed, "Whether a fake infeed shape should be generated " "derived from the computation"), - tensorflow::Flag( - "xla_hlo_profile_last_run", &opts.xla_hlo_profile_last_run, - "Pass --xla_hlo_profile the last time we run the computation."), }; xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); @@ -254,7 +344,7 @@ int main(int argc, char** argv) { LOG(QFATAL) << usage; } - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] return xla::tools::RealMain(args, opts); } diff --git a/tensorflow/compiler/xla/tools/show_literal.cc b/tensorflow/compiler/xla/tools/show_literal.cc index fe8e72ba32bb4493b2751cfdfeb977f271092f9c..51909190a3ef20c3df78d08796e88bdbb650609d 100644 --- a/tensorflow/compiler/xla/tools/show_literal.cc +++ b/tensorflow/compiler/xla/tools/show_literal.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc index 4e53fafcc97ff53afc5713e7ed8ee5222fac316b..cdf306dfd1027cf6022c5d8ae844b4308f580e8d 100644 --- a/tensorflow/compiler/xla/tools/show_signature.cc +++ b/tensorflow/compiler/xla/tools/show_signature.cc @@ -29,6 +29,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -37,7 +38,6 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -45,7 +45,7 @@ limitations under the License. 
namespace xla { namespace tools { -void RealMain(tensorflow::gtl::ArraySlice args) { +void RealMain(absl::Span args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { HloSnapshot module; @@ -66,8 +66,8 @@ void RealMain(tensorflow::gtl::ArraySlice args) { int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); - tensorflow::gtl::ArraySlice args(argv, argc); - args.pop_front(); // Pop off the binary name, argv[0] + absl::Span args(argv, argc); + args.remove_prefix(1); // Pop off the binary name, argv[0] xla::tools::RealMain(args); return 0; } diff --git a/tensorflow/compiler/xla/tools/show_text_literal.cc b/tensorflow/compiler/xla/tools/show_text_literal.cc index 8525873e913185554d18df8c8c3584bfcdcdcabe..48c837481181f6ad8f864569fd62e0e23fa02ecd 100644 --- a/tensorflow/compiler/xla/tools/show_text_literal.cc +++ b/tensorflow/compiler/xla/tools/show_text_literal.cc @@ -20,7 +20,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/text_literal_reader.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index e43498e381b8e63543e2ddda08ca7c0df91817e4..68cab7387cf1576072f96878b50f07def6862d8b 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -18,12 +18,13 @@ limitations under the License. 
#include #include +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stacktrace.h" @@ -54,111 +55,28 @@ ScopedLoggingTimer::~ScopedLoggingTimer() { } } -Status AddStatus(Status prior, tensorflow::StringPiece context) { +Status AddStatus(Status prior, absl::string_view context) { CHECK(!prior.ok()); - return Status{prior.code(), tensorflow::strings::StrCat( - context, ": ", prior.error_message())}; + return Status{prior.code(), + absl::StrCat(context, ": ", prior.error_message())}; } -Status AppendStatus(Status prior, tensorflow::StringPiece context) { +Status AppendStatus(Status prior, absl::string_view context) { CHECK(!prior.ok()); - return Status{prior.code(), tensorflow::strings::StrCat(prior.error_message(), - ": ", context)}; + return Status{prior.code(), + absl::StrCat(prior.error_message(), ": ", context)}; } -// Implementation note: we can't common these out (without using macros) because -// they all need to va_start/va_end their varargs in their frame. - -Status InvalidArgumentV(const char* format, va_list args) { - string message; - tensorflow::strings::Appendv(&message, format, args); - return WithLogBacktrace(tensorflow::errors::InvalidArgument(message)); -} - -Status InvalidArgument(const char* format, ...) { - va_list args; - va_start(args, format); - Status result = InvalidArgumentV(format, args); - va_end(args); - return result; -} - -Status Unimplemented(const char* format, ...) 
{ - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Unimplemented(message)); -} - -Status InternalError(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Internal(message)); -} - -Status FailedPrecondition(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::FailedPrecondition(message)); -} - -Status Cancelled(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Cancelled(message)); -} - -Status ResourceExhausted(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::ResourceExhausted(message)); -} - -Status NotFound(const char* format, ...) { - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::NotFound(message)); -} - -Status Unavailable(const char* format, ...) 
{ - string message; - va_list args; - va_start(args, format); - tensorflow::strings::Appendv(&message, format, args); - va_end(args); - return WithLogBacktrace(tensorflow::errors::Unavailable(message)); -} - -string Reindent(tensorflow::StringPiece original, - const tensorflow::StringPiece indentation) { - std::vector pieces = tensorflow::str_util::Split( - tensorflow::StringPiece(original.data(), original.size()), '\n'); - return tensorflow::str_util::Join( - pieces, "\n", [indentation](string* out, string s) { - tensorflow::StringPiece piece(s); - tensorflow::str_util::RemoveWhitespaceContext(&piece); - tensorflow::strings::StrAppend(out, indentation, piece); - }); +string Reindent(absl::string_view original, + const absl::string_view indentation) { + std::vector pieces = + absl::StrSplit(absl::string_view(original.data(), original.size()), '\n'); + return absl::StrJoin(pieces, "\n", [indentation](string* out, string s) { + absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(s)); + }); } -bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank) { +bool IsPermutation(absl::Span permutation, int64 rank) { if (rank != permutation.size()) { return false; } @@ -172,7 +90,7 @@ bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank) { } std::vector InversePermutation( - tensorflow::gtl::ArraySlice input_permutation) { + absl::Span input_permutation) { DCHECK(IsPermutation(input_permutation, input_permutation.size())); std::vector output_permutation(input_permutation.size(), -1); for (size_t i = 0; i < input_permutation.size(); ++i) { @@ -181,8 +99,8 @@ std::vector InversePermutation( return output_permutation; } -std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, - tensorflow::gtl::ArraySlice p2) { +std::vector ComposePermutations(absl::Span p1, + absl::Span p2) { CHECK_EQ(p1.size(), p2.size()); std::vector output; for (size_t i = 0; i < p1.size(); ++i) { @@ -191,7 +109,7 @@ std::vector 
ComposePermutations(tensorflow::gtl::ArraySlice p1, return output; } -bool IsIdentityPermutation(tensorflow::gtl::ArraySlice permutation) { +bool IsIdentityPermutation(absl::Span permutation) { for (int64 i = 0; i < permutation.size(); ++i) { if (permutation[i] != i) { return false; @@ -212,7 +130,7 @@ PaddingConfig MakeNoPaddingConfig(int64 rank) { } PaddingConfig MakeEdgePaddingConfig( - tensorflow::gtl::ArraySlice> padding) { + absl::Span> padding) { PaddingConfig padding_config; for (const std::pair& dim : padding) { auto dimension = padding_config.add_dimensions(); @@ -234,20 +152,20 @@ bool HasInteriorPadding(const PaddingConfig& config) { namespace { string HumanReadableNumOps(double flops, double nanoseconds, - tensorflow::StringPiece op_prefix) { + absl::string_view op_prefix) { if (nanoseconds == 0) { - return tensorflow::strings::StrCat("NaN ", op_prefix, "OP/s"); + return absl::StrCat("NaN ", op_prefix, "OP/s"); } double nano_flops = flops / nanoseconds; string throughput = tensorflow::strings::HumanReadableNum( static_cast(nano_flops * 1e9)); - tensorflow::StringPiece sp(throughput); + absl::string_view sp(throughput); // Use the more common "G(FLOPS)", rather than "B(FLOPS)" - if (tensorflow::str_util::EndsWith(sp, "B") || // Ends in 'B', ignoring case - tensorflow::str_util::EndsWith(sp, "b")) { + if (absl::EndsWith(sp, "B") || // Ends in 'B', ignoring case + absl::EndsWith(sp, "b")) { *throughput.rbegin() = 'G'; } - throughput += tensorflow::strings::StrCat(op_prefix, "OP/s"); + throughput += absl::StrCat(op_prefix, "OP/s"); return throughput; } } // namespace @@ -260,8 +178,7 @@ string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) { return HumanReadableNumOps(trops, nanoseconds, "TR"); } -void LogLines(int sev, tensorflow::StringPiece text, const char* fname, - int lineno) { +void LogLines(int sev, absl::string_view text, const char* fname, int lineno) { const int orig_sev = sev; if (sev == tensorflow::FATAL) { sev = 
tensorflow::ERROR; @@ -275,7 +192,7 @@ void LogLines(int sev, tensorflow::StringPiece text, const char* fname, size_t cur = 0; while (cur < text.size()) { size_t eol = text.find('\n', cur); - if (eol == tensorflow::StringPiece::npos) { + if (eol == absl::string_view::npos) { eol = text.size(); } auto msg = text.substr(cur, eol - cur); @@ -290,14 +207,13 @@ void LogLines(int sev, tensorflow::StringPiece text, const char* fname, } } -int64 Product(tensorflow::gtl::ArraySlice xs) { +int64 Product(absl::Span xs) { return std::accumulate(xs.begin(), xs.end(), static_cast(1), std::multiplies()); } -std::vector> CommonFactors( - tensorflow::gtl::ArraySlice a, - tensorflow::gtl::ArraySlice b) { +std::vector> CommonFactors(absl::Span a, + absl::Span b) { CHECK_EQ(Product(a), Product(b)); if (0 == Product(a)) { return {std::make_pair(0, 0), std::make_pair(a.size(), b.size())}; diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 73036407262c617b1b49511d319386db187ff8e5..8ce741647414a1fa75e6d706ec1e719ace7b7cc8 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -24,16 +24,20 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" @@ -53,7 +57,7 @@ Status WithLogBacktrace(const Status& status); // the InlinedVector will just behave like an std::vector<> and allocate the // memory to store its values. static constexpr int kInlineRank = 8; -using DimensionVector = tensorflow::gtl::InlinedVector; +using DimensionVector = absl::InlinedVector; // RAII timer that logs with a given label the wall clock time duration in human // readable form. This differs from base's ElapsedTimer primarily in that it @@ -97,65 +101,63 @@ struct ScopedLoggingTimer { uint64 start_micros; }; -// Given a vector, returns a MutableArraySlice that points at its +// Given a vector, returns a Span that points at its // internals. // // Warning: if the vector is updated its storage pointer may change, so use this // with caution (ideally in limited scopes with temporary lifetimes). 
template -tensorflow::gtl::MutableArraySlice MutableByteSlice(std::vector* v) { - return tensorflow::gtl::MutableArraySlice( - reinterpret_cast(v->data()), v->size() * sizeof(T)); +absl::Span MutableByteSlice(std::vector* v) { + return absl::Span(reinterpret_cast(v->data()), + v->size() * sizeof(T)); } // Turns an immutable slice of type T into an immutable slice of bytes with the // same byte size. template -tensorflow::gtl::ArraySlice CastToByteSlice( - tensorflow::gtl::ArraySlice slice) { - return tensorflow::gtl::ArraySlice( - reinterpret_cast(slice.data()), slice.size() * sizeof(T)); +absl::Span CastToByteSlice(absl::Span slice) { + return absl::Span(reinterpret_cast(slice.data()), + slice.size() * sizeof(T)); } // Casts a byte slice to a non-byte type T, checking that the original slice // length is a multiple of sizeof(T). template -tensorflow::gtl::ArraySlice CastByteSlice( - tensorflow::gtl::ArraySlice slice) { +absl::Span CastByteSlice(absl::Span slice) { CHECK_EQ(0, slice.size() % sizeof(T)); - return tensorflow::gtl::ArraySlice( - reinterpret_cast(slice.data()), slice.size() / sizeof(T)); + return absl::Span(reinterpret_cast(slice.data()), + slice.size() / sizeof(T)); } // Convenience function to force a vector to convert to an immutable slice. template -tensorflow::gtl::ArraySlice AsSlice(const std::vector& v) { - return tensorflow::gtl::ArraySlice(v); +absl::Span AsSlice(const std::vector& v) { + return absl::Span(v); } -// Converts a mutable vector pointer into a MutableArraySlice of the same +// Converts a mutable vector pointer into a Span of the same // type. template -tensorflow::gtl::MutableArraySlice AsMutableSlice(std::vector* v) { - return tensorflow::gtl::MutableArraySlice(v->data(), v->size()); +absl::Span AsMutableSlice(std::vector* v) { + return absl::Span(v->data(), v->size()); } // xla::int64 is not the same type as tensorflow::protobuf_int64 in open-source. 
// Wrapper function that gives an int64 array slice view of a repeated int64 // protobuf field. -static inline tensorflow::gtl::ArraySlice AsInt64Slice( +static inline absl::Span AsInt64Slice( const tensorflow::protobuf::RepeatedField& v) { - tensorflow::gtl::ArraySlice slice(v); - return tensorflow::gtl::ArraySlice( - reinterpret_cast(slice.data()), slice.size()); + absl::Span slice(v); + return absl::Span(reinterpret_cast(slice.data()), + slice.size()); } // As above, but for uint64 types. -static inline tensorflow::gtl::ArraySlice AsUInt64Slice( +static inline absl::Span AsUInt64Slice( const tensorflow::protobuf::RepeatedField& v) { - tensorflow::gtl::ArraySlice slice(v); - return tensorflow::gtl::ArraySlice( - reinterpret_cast(slice.data()), slice.size()); + absl::Span slice(v); + return absl::Span(reinterpret_cast(slice.data()), + slice.size()); } // Compares two containers for equality. Returns true iff the two containers @@ -171,7 +173,7 @@ template bool ContainersEqual(const Container1T& c1, std::initializer_list il) { - tensorflow::gtl::ArraySlice c2{il}; + absl::Span c2{il}; return ContainersEqual(c1, c2); } @@ -189,9 +191,9 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2, // source and destination. The source starting index is src_base, while the // destination one is dest_base. template -void StridedCopy(tensorflow::gtl::MutableArraySlice dest, int64 dest_base, - int64 dest_stride, tensorflow::gtl::ArraySlice src, - int64 src_base, int64 src_stride, int64 count) { +void StridedCopy(absl::Span dest, int64 dest_base, int64 dest_stride, + absl::Span src, int64 src_base, int64 src_stride, + int64 count) { for (; count > 0; --count, dest_base += dest_stride, src_base += src_stride) { dest[dest_base] = static_cast(src[src_base]); } @@ -200,46 +202,76 @@ void StridedCopy(tensorflow::gtl::MutableArraySlice dest, int64 dest_base, // Adds some context information to the error message in a // Status. 
This is useful as Statuses are // propagated upwards. -Status AddStatus(Status prior, tensorflow::StringPiece context); -Status AppendStatus(Status prior, tensorflow::StringPiece context); - -// Status error shorthands -- printfs the arguments to be -// used as an error message and returns a status in the canonical -// error space. -Status InvalidArgument(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status Unimplemented(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status InternalError(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status FailedPrecondition(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status Cancelled(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status ResourceExhausted(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status NotFound(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); -Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); - -// Passed-varargs variant of the InvalidArgument factory above. -Status InvalidArgumentV(const char* format, va_list args); +Status AddStatus(Status prior, absl::string_view context); +Status AppendStatus(Status prior, absl::string_view context); + +// Status error shorthands -- StrFormat's the arguments to be used as an error +// message and returns a status in the canonical error space. +template +Status InvalidArgument(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::InvalidArgument(absl::StrFormat(format, args...))); +} +template +Status Unimplemented(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Unimplemented(absl::StrFormat(format, args...))); +} +template +Status InternalError(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Internal(absl::StrFormat(format, args...))); +} +template +Status FailedPrecondition(const absl::FormatSpec& format, + const Args&... 
args) { + return WithLogBacktrace( + tensorflow::errors::FailedPrecondition(absl::StrFormat(format, args...))); +} +template +Status Cancelled(const absl::FormatSpec& format, const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Cancelled(absl::StrFormat(format, args...))); +} +template +Status ResourceExhausted(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::ResourceExhausted(absl::StrFormat(format, args...))); +} +template +Status NotFound(const absl::FormatSpec& format, const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::NotFound(absl::StrFormat(format, args...))); +} +template +Status Unavailable(const absl::FormatSpec& format, + const Args&... args) { + return WithLogBacktrace( + tensorflow::errors::Unavailable(absl::StrFormat(format, args...))); +} template Status InvalidArgumentStrCat(Args&&... concat) { - return InvalidArgument( - "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); + return InvalidArgument("%s", absl::StrCat(std::forward(concat)...)); } template Status UnimplementedStrCat(Args&&... concat) { - return Unimplemented( - "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); + return Unimplemented("%s", absl::StrCat(std::forward(concat)...)); } template Status InternalErrorStrCat(Args&&... concat) { - return InternalError( - "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); + return InternalError("%s", absl::StrCat(std::forward(concat)...)); } template Status ResourceExhaustedStrCat(Args&&... concat) { - return ResourceExhausted( - "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); + return ResourceExhausted("%s", absl::StrCat(std::forward(concat)...)); } // Splits the lines of the original, replaces leading whitespace with the prefix @@ -248,11 +280,10 @@ Status ResourceExhaustedStrCat(Args&&... 
concat) { // // Note: even different amounts of leading whitespace on different lines will be // uniformly replaced with "indentation". -string Reindent(tensorflow::StringPiece original, - tensorflow::StringPiece indentation); +string Reindent(absl::string_view original, absl::string_view indentation); // Checks whether permutation is a permutation of the [0, rank) integer range. -bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank); +bool IsPermutation(absl::Span permutation, int64 rank); // Applies `permutation` on `input` and returns the permuted array. // For each i, output[permutation[i]] = input[i]. @@ -260,10 +291,11 @@ bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank); // Precondition: // 1. `permutation` is a permutation of 0..permutation.size()-1. // 2. permutation.size() == input.size(). -template